diff options
Diffstat (limited to 'include/linux')
-rw-r--r-- | include/linux/fs.h | 120 | ||||
-rw-r--r-- | include/linux/hmm.h | 9 | ||||
-rw-r--r-- | include/linux/huge_mm.h | 59 | ||||
-rw-r--r-- | include/linux/hugetlb.h | 5 | ||||
-rw-r--r-- | include/linux/ksm.h | 6 | ||||
-rw-r--r-- | include/linux/memremap.h | 27 | ||||
-rw-r--r-- | include/linux/mm.h | 296 | ||||
-rw-r--r-- | include/linux/mm_inline.h | 11 | ||||
-rw-r--r-- | include/linux/mm_types.h | 26 | ||||
-rw-r--r-- | include/linux/pagemap.h | 146 | ||||
-rw-r--r-- | include/linux/rmap.h | 76 | ||||
-rw-r--r-- | include/linux/swap.h | 9 |
12 files changed, 409 insertions, 381 deletions
diff --git a/include/linux/fs.h b/include/linux/fs.h index 4fb6d5a50be7..60462181e9b2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2753,54 +2753,6 @@ extern void init_special_inode(struct inode *, umode_t, dev_t); extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); -unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end); - -void invalidate_mapping_pagevec(struct address_space *mapping, - pgoff_t start, pgoff_t end, - unsigned long *nr_pagevec); - -static inline void invalidate_remote_inode(struct inode *inode) -{ - if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) - invalidate_mapping_pages(inode->i_mapping, 0, -1); -} -extern int invalidate_inode_pages2(struct address_space *mapping); -extern int invalidate_inode_pages2_range(struct address_space *mapping, - pgoff_t start, pgoff_t end); -extern int write_inode_now(struct inode *, int); -extern int filemap_fdatawrite(struct address_space *); -extern int filemap_flush(struct address_space *); -extern int filemap_fdatawait_keep_errors(struct address_space *mapping); -extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, - loff_t lend); -extern int filemap_fdatawait_range_keep_errors(struct address_space *mapping, - loff_t start_byte, loff_t end_byte); - -static inline int filemap_fdatawait(struct address_space *mapping) -{ - return filemap_fdatawait_range(mapping, 0, LLONG_MAX); -} - -extern bool filemap_range_has_page(struct address_space *, loff_t lstart, - loff_t lend); -extern int filemap_write_and_wait_range(struct address_space *mapping, - loff_t lstart, loff_t lend); -extern int __filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end, int sync_mode); -extern int filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end); -extern int filemap_check_errors(struct address_space *mapping); -extern void __filemap_set_wb_err(struct address_space *mapping, int err); -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc); - -static inline int filemap_write_and_wait(struct address_space *mapping) -{ - return filemap_write_and_wait_range(mapping, 0, LLONG_MAX); -} - extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, loff_t lend); extern int __must_check file_check_and_advance_wb_err(struct file *file); @@ -2812,67 +2764,6 @@ static inline int file_write_and_wait(struct file *file) return file_write_and_wait_range(file, 0, LLONG_MAX); } -/** - * filemap_set_wb_err - set a writeback error on an address_space - * @mapping: mapping in which to set writeback error - * @err: error to be set in mapping - * - * When writeback fails in some way, we must record that error so that - * userspace can be informed when fsync and the like are called. We endeavor - * to report errors on any file that was open at the time of the error. Some - * internal callers also need to know when writeback errors have occurred. - * - * When a writeback error occurs, most filesystems will want to call - * filemap_set_wb_err to record the error in the mapping so that it will be - * automatically reported whenever fsync is called on the file. - */ -static inline void filemap_set_wb_err(struct address_space *mapping, int err) -{ - /* Fastpath for common case of no error */ - if (unlikely(err)) - __filemap_set_wb_err(mapping, err); -} - -/** - * filemap_check_wb_err - has an error occurred since the mark was sampled? - * @mapping: mapping to check for writeback errors - * @since: previously-sampled errseq_t - * - * Grab the errseq_t value from the mapping, and see if it has changed "since" - * the given value was sampled. - * - * If it has then report the latest error set, otherwise return 0. - */ -static inline int filemap_check_wb_err(struct address_space *mapping, - errseq_t since) -{ - return errseq_check(&mapping->wb_err, since); -} - -/** - * filemap_sample_wb_err - sample the current errseq_t to test for later errors - * @mapping: mapping to be sampled - * - * Writeback errors are always reported relative to a particular sample point - * in the past. This function provides those sample points. - */ -static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) -{ - return errseq_sample(&mapping->wb_err); -} - -/** - * file_sample_sb_err - sample the current errseq_t to test for later errors - * @file: file pointer to be sampled - * - * Grab the most current superblock-level errseq_t value for the given - * struct file. - */ -static inline errseq_t file_sample_sb_err(struct file *file) -{ - return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); -} - extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); @@ -3627,15 +3518,4 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); -/* - * Flush file data before changing attributes. Caller must hold any locks - * required to prevent further writes to this file until we're done setting - * flags. - */ -static inline int inode_drain_writes(struct inode *inode) -{ - inode_dio_wait(inode); - return filemap_write_and_wait(inode->i_mapping); -} - #endif /* _LINUX_FS_H */ diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 2fd2e91d5107..d5a6f101f843 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -9,14 +9,9 @@ #ifndef LINUX_HMM_H #define LINUX_HMM_H -#include <linux/kconfig.h> -#include <linux/pgtable.h> +#include <linux/mm.h> -#include <linux/device.h> -#include <linux/migrate.h> -#include <linux/memremap.h> -#include <linux/completion.h> -#include <linux/mmu_notifier.h> +struct mmu_interval_notifier; /* * On output: diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e4c18ba8d3bf..0734aff8fa19 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -185,7 +185,7 @@ void prep_transhuge_page(struct page *page); void free_transhuge_page(struct page *page); bool is_transparent_hugepage(struct page *page); -bool can_split_huge_page(struct page *page, int *pextra_pins); +bool can_split_folio(struct folio *folio, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { @@ -194,7 +194,7 @@ static inline int split_huge_page(struct page *page) void deferred_split_huge_page(struct page *page); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address, bool freeze, struct page *page); + unsigned long address, bool freeze, struct folio *folio); #define split_huge_pmd(__vma, __pmd, __address) \ do { \ @@ -207,7 +207,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, - bool freeze, struct page *page); + bool freeze, struct folio *folio); void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address); @@ -251,30 +251,6 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, } /** - * thp_order - Order of a transparent huge page. - * @page: Head page of a transparent huge page. - */ -static inline unsigned int thp_order(struct page *page) -{ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - if (PageHead(page)) - return HPAGE_PMD_ORDER; - return 0; -} - -/** - * thp_nr_pages - The number of regular pages in this huge page. - * @page: The head page of a huge page. - */ -static inline int thp_nr_pages(struct page *page) -{ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - if (PageHead(page)) - return HPAGE_PMD_NR; - return 1; -} - -/** * folio_test_pmd_mappable - Can we map this folio with a PMD? * @folio: The folio to test */ @@ -336,18 +312,6 @@ static inline struct list_head *page_deferred_list(struct page *page) #define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) -static inline unsigned int thp_order(struct page *page) -{ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - return 0; -} - -static inline int thp_nr_pages(struct page *page) -{ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - return 1; -} - static inline bool folio_test_pmd_mappable(struct folio *folio) { return false; @@ -387,7 +351,7 @@ static inline bool is_transparent_hugepage(struct page *page) #define thp_get_unmapped_area NULL static inline bool -can_split_huge_page(struct page *page, int *pextra_pins) +can_split_folio(struct folio *folio, int *pextra_pins) { BUILD_BUG(); return false; @@ -406,9 +370,9 @@ static inline void deferred_split_huge_page(struct page *page) {} do { } while (0) static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address, bool freeze, struct page *page) {} + unsigned long address, bool freeze, struct folio *folio) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, - unsigned long address, bool freeze, struct page *page) {} + unsigned long address, bool freeze, struct folio *folio) {} #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) @@ -483,15 +447,10 @@ static inline bool thp_migration_supported(void) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/** - * thp_size - Size of a transparent huge page. - * @page: Head page of a transparent huge page. - * - * Return: Number of bytes in this page. - */ -static inline unsigned long thp_size(struct page *page) +static inline int split_folio_to_list(struct folio *folio, + struct list_head *list) { - return PAGE_SIZE << thp_order(page); + return split_huge_page_to_list(&folio->page, list); } #endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 08357b4c7be7..53c1b6082a4c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -970,6 +970,11 @@ static inline struct hstate *page_hstate(struct page *page) return NULL; } +static inline struct hstate *size_to_hstate(unsigned long size) +{ + return NULL; +} + static inline unsigned long huge_page_size(struct hstate *h) { return PAGE_SIZE; diff --git a/include/linux/ksm.h b/include/linux/ksm.h index a38a5bca1ba5..0630e545f4cb 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -51,7 +51,7 @@ static inline void ksm_exit(struct mm_struct *mm) struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); -void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); #else /* !CONFIG_KSM */ @@ -78,8 +78,8 @@ static inline struct page *ksm_might_need_to_copy(struct page *page, return page; } -static inline void rmap_walk_ksm(struct page *page, - struct rmap_walk_control *rwc) +static inline void rmap_walk_ksm(struct folio *folio, + const struct rmap_walk_control *rwc) { } diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 1fafcc38acba..8af304f6b504 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ + +#include <linux/mm.h> #include <linux/range.h> #include <linux/ioport.h> #include <linux/percpu-refcount.h> @@ -66,9 +68,9 @@ enum memory_type { struct dev_pagemap_ops { /* - * Called once the page refcount reaches 1. (ZONE_DEVICE pages never - * reach 0 refcount unless there is a refcount bug. This allows the - * device driver to implement its own memory management.) + * Called once the page refcount reaches 0. The reference count will be + * reset to one by the core code after the method is called to prepare + * for handing out the page again. */ void (*page_free)(struct page *page); @@ -129,6 +131,25 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) return 1 << pgmap->vmemmap_shift; } +static inline bool is_device_private_page(const struct page *page) +{ + return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && + is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PRIVATE; +} + +static inline bool folio_is_device_private(const struct folio *folio) +{ + return is_device_private_page(&folio->page); +} + +static inline bool is_pci_p2pdma_page(const struct page *page) +{ + return IS_ENABLED(CONFIG_PCI_P2PDMA) && + is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; +} + #ifdef CONFIG_ZONE_DEVICE void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e4fd101616e..b8f9ba93a162 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3,9 +3,6 @@ #define _LINUX_MM_H #include <linux/errno.h> - -#ifdef __KERNEL__ - #include <linux/mmdebug.h> #include <linux/gfp.h> #include <linux/bug.h> @@ -26,7 +23,6 @@ #include <linux/err.h> #include <linux/page-flags.h> #include <linux/page_ref.h> -#include <linux/memremap.h> #include <linux/overflow.h> #include <linux/sizes.h> #include <linux/sched.h> @@ -216,8 +212,10 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) +#define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) #else #define nth_page(page,n) ((page) + (n)) +#define folio_page_idx(folio, p) ((p) - &(folio)->page) #endif /* to align the pointer to the (next) page boundary */ @@ -227,6 +225,10 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) #define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) +static inline struct folio *lru_to_folio(struct list_head *head) +{ + return list_entry((head)->prev, struct folio, lru); +} void setup_initial_init_mm(void *start_code, void *end_code, void *end_data, void *brk); @@ -775,21 +777,26 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif -static inline int head_compound_mapcount(struct page *head) +/* + * How many times the entire folio is mapped as a single unit (eg by a + * PMD or PUD entry). This is probably not what you want, except for + * debugging purposes; look at folio_mapcount() or page_mapcount() + * instead. + */ +static inline int folio_entire_mapcount(struct folio *folio) { - return atomic_read(compound_mapcount_ptr(head)) + 1; + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + return atomic_read(folio_mapcount_ptr(folio)) + 1; } /* * Mapcount of compound page as a whole, does not include mapped sub-pages. * - * Must be called only for compound pages or any their tail sub-pages. + * Must be called only for compound pages. */ static inline int compound_mapcount(struct page *page) { - VM_BUG_ON_PAGE(!PageCompound(page), page); - page = compound_head(page); - return head_compound_mapcount(page); + return folio_entire_mapcount(page_folio(page)); } /* @@ -819,8 +826,14 @@ static inline int page_mapcount(struct page *page) return atomic_read(&page->_mapcount) + 1; } +int folio_mapcount(struct folio *folio); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE -int total_mapcount(struct page *page); +static inline int total_mapcount(struct page *page) +{ + return folio_mapcount(page_folio(page)); +} + int page_trans_huge_mapcount(struct page *page); #else static inline int total_mapcount(struct page *page) @@ -890,33 +903,17 @@ static inline void destroy_compound_page(struct page *page) compound_page_dtors[page[1].compound_dtor](page); } -static inline bool hpage_pincount_available(struct page *page) -{ - /* - * Can the page->hpage_pinned_refcount field be used? That field is in - * the 3rd page of the compound page, so the smallest (2-page) compound - * pages cannot support it. - */ - page = compound_head(page); - return PageCompound(page) && compound_order(page) > 1; -} - static inline int head_compound_pincount(struct page *head) { return atomic_read(compound_pincount_ptr(head)); } -static inline int compound_pincount(struct page *page) -{ - VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); - page = compound_head(page); - return head_compound_pincount(page); -} - static inline void set_compound_order(struct page *page, unsigned int order) { page[1].compound_order = order; +#ifdef CONFIG_64BIT page[1].compound_nr = 1U << order; +#endif } /* Returns the number of pages in this potentially compound page. */ @@ -924,7 +921,11 @@ static inline unsigned long compound_nr(struct page *page) { if (!PageHead(page)) return 1; +#ifdef CONFIG_64BIT return page[1].compound_nr; +#else + return 1UL << compound_order(page); +#endif } /* Returns the number of bytes in this potentially compound page. */ @@ -939,6 +940,37 @@ static inline unsigned int page_shift(struct page *page) return PAGE_SHIFT + compound_order(page); } +/** + * thp_order - Order of a transparent huge page. + * @page: Head page of a transparent huge page. + */ +static inline unsigned int thp_order(struct page *page) +{ + VM_BUG_ON_PGFLAGS(PageTail(page), page); + return compound_order(page); +} + +/** + * thp_nr_pages - The number of regular pages in this huge page. + * @page: The head page of a huge page. + */ +static inline int thp_nr_pages(struct page *page) +{ + VM_BUG_ON_PGFLAGS(PageTail(page), page); + return compound_nr(page); +} + +/** + * thp_size - Size of a transparent huge page. + * @page: Head page of a transparent huge page. + * + * Return: Number of bytes in this page. + */ +static inline unsigned long thp_size(struct page *page) +{ + return PAGE_SIZE << thp_order(page); +} + void free_compound_page(struct page *page); #ifdef CONFIG_MMU @@ -1090,59 +1122,35 @@ static inline bool is_zone_device_page(const struct page *page) } #endif +static inline bool folio_is_zone_device(const struct folio *folio) +{ + return is_zone_device_page(&folio->page); +} + static inline bool is_zone_movable_page(const struct page *page) { return page_zonenum(page) == ZONE_MOVABLE; } -#ifdef CONFIG_DEV_PAGEMAP_OPS -void free_devmap_managed_page(struct page *page); +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); -static inline bool page_is_devmap_managed(struct page *page) +bool __put_devmap_managed_page(struct page *page); +static inline bool put_devmap_managed_page(struct page *page) { if (!static_branch_unlikely(&devmap_managed_key)) return false; if (!is_zone_device_page(page)) return false; - switch (page->pgmap->type) { - case MEMORY_DEVICE_PRIVATE: - case MEMORY_DEVICE_FS_DAX: - return true; - default: - break; - } - return false; + return __put_devmap_managed_page(page); } -void put_devmap_managed_page(struct page *page); - -#else /* CONFIG_DEV_PAGEMAP_OPS */ -static inline bool page_is_devmap_managed(struct page *page) +#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ +static inline bool put_devmap_managed_page(struct page *page) { return false; } - -static inline void put_devmap_managed_page(struct page *page) -{ -} -#endif /* CONFIG_DEV_PAGEMAP_OPS */ - -static inline bool is_device_private_page(const struct page *page) -{ - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_DEVICE_PRIVATE) && - is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PRIVATE; -} - -static inline bool is_pci_p2pdma_page(const struct page *page) -{ - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_PCI_P2PDMA) && - is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; -} +#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ @@ -1168,9 +1176,6 @@ static inline void get_page(struct page *page) } bool __must_check try_grab_page(struct page *page, unsigned int flags); -struct page *try_grab_compound_head(struct page *page, int refs, - unsigned int flags); - static inline __must_check bool try_get_page(struct page *page) { @@ -1225,16 +1230,11 @@ static inline void put_page(struct page *page) struct folio *folio = page_folio(page); /* - * For devmap managed pages we need to catch refcount transition from - * 2 to 1, when refcount reach one it means the page is free and we - * need to inform the device driver through callback. See - * include/linux/memremap.h and HMM for details. + * For some devmap managed pages we need to catch refcount transition + * from 2 to 1: */ - if (page_is_devmap_managed(&folio->page)) { - put_devmap_managed_page(&folio->page); + if (put_devmap_managed_page(&folio->page)) return; - } - folio_put(folio); } @@ -1264,10 +1264,9 @@ static inline void put_page(struct page *page) * applications that don't have huge page reference counts, this won't be an * issue. * - * Locking: the lockless algorithm described in page_cache_get_speculative() - * and page_cache_gup_pin_speculative() provides safe operation for - * get_user_pages and page_mkclean and other calls that race to set up page - * table entries. + * Locking: the lockless algorithm described in folio_try_get_rcu() + * provides safe operation for get_user_pages(), page_mkclean() and + * other calls that race to set up page table entries. */ #define GUP_PIN_COUNTING_BIAS (1U << 10) @@ -1278,70 +1277,11 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); -/** - * page_maybe_dma_pinned - Report if a page is pinned for DMA. - * @page: The page. - * - * This function checks if a page has been pinned via a call to - * a function in the pin_user_pages() family. - * - * For non-huge pages, the return value is partially fuzzy: false is not fuzzy, - * because it means "definitely not pinned for DMA", but true means "probably - * pinned for DMA, but possibly a false positive due to having at least - * GUP_PIN_COUNTING_BIAS worth of normal page references". - * - * False positives are OK, because: a) it's unlikely for a page to get that many - * refcounts, and b) all the callers of this routine are expected to be able to - * deal gracefully with a false positive. - * - * For huge pages, the result will be exactly correct. That's because we have - * more tracking data available: the 3rd struct page in the compound page is - * used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS - * scheme). - * - * For more information, please see Documentation/core-api/pin_user_pages.rst. - * - * Return: True, if it is likely that the page has been "dma-pinned". - * False, if the page is definitely not dma-pinned. - */ -static inline bool page_maybe_dma_pinned(struct page *page) -{ - if (hpage_pincount_available(page)) - return compound_pincount(page) > 0; - - /* - * page_ref_count() is signed. If that refcount overflows, then - * page_ref_count() returns a negative value, and callers will avoid - * further incrementing the refcount. - * - * Here, for that overflow case, use the signed bit to count a little - * bit higher via unsigned math, and thus still get an accurate result. - */ - return ((unsigned int)page_ref_count(compound_head(page))) >= - GUP_PIN_COUNTING_BIAS; -} - static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } -/* - * This should most likely only be called during fork() to see whether we - * should break the cow immediately for a page on the src mm. - */ -static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, - struct page *page) -{ - if (!is_cow_mapping(vma->vm_flags)) - return false; - - if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) - return false; - - return page_maybe_dma_pinned(page); -} - #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif @@ -1586,6 +1526,74 @@ static inline unsigned long folio_pfn(struct folio *folio) return page_to_pfn(&folio->page); } +static inline atomic_t *folio_pincount_ptr(struct folio *folio) +{ + return &folio_page(folio, 1)->compound_pincount; +} + +/** + * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. + * @folio: The folio. + * + * This function checks if a folio has been pinned via a call to + * a function in the pin_user_pages() family. + * + * For small folios, the return value is partially fuzzy: false is not fuzzy, + * because it means "definitely not pinned for DMA", but true means "probably + * pinned for DMA, but possibly a false positive due to having at least + * GUP_PIN_COUNTING_BIAS worth of normal folio references". + * + * False positives are OK, because: a) it's unlikely for a folio to + * get that many refcounts, and b) all the callers of this routine are + * expected to be able to deal gracefully with a false positive. + * + * For large folios, the result will be exactly correct. That's because + * we have more tracking data available: the compound_pincount is used + * instead of the GUP_PIN_COUNTING_BIAS scheme. + * + * For more information, please see Documentation/core-api/pin_user_pages.rst. + * + * Return: True, if it is likely that the page has been "dma-pinned". + * False, if the page is definitely not dma-pinned. + */ +static inline bool folio_maybe_dma_pinned(struct folio *folio) +{ + if (folio_test_large(folio)) + return atomic_read(folio_pincount_ptr(folio)) > 0; + + /* + * folio_ref_count() is signed. If that refcount overflows, then + * folio_ref_count() returns a negative value, and callers will avoid + * further incrementing the refcount. + * + * Here, for that overflow case, use the sign bit to count a little + * bit higher via unsigned math, and thus still get an accurate result. + */ + return ((unsigned int)folio_ref_count(folio)) >= + GUP_PIN_COUNTING_BIAS; +} + +static inline bool page_maybe_dma_pinned(struct page *page) +{ + return folio_maybe_dma_pinned(page_folio(page)); +} + +/* + * This should most likely only be called during fork() to see whether we + * should break the cow immediately for a page on the src mm. + */ +static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, + struct page *page) +{ + if (!is_cow_mapping(vma->vm_flags)) + return false; + + if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) + return false; + + return page_maybe_dma_pinned(page); +} + /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ #ifdef CONFIG_MIGRATION static inline bool is_pinnable_page(struct page *page) @@ -1600,6 +1608,11 @@ static inline bool is_pinnable_page(struct page *page) } #endif +static inline bool folio_is_pinnable(struct folio *folio) +{ + return is_pinnable_page(&folio->page); +} + static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); @@ -1749,7 +1762,6 @@ static inline void *folio_address(const struct folio *folio) } extern void *page_rmapping(struct page *page); -extern struct anon_vma *page_anon_vma(struct page *page); extern pgoff_t __page_file_index(struct page *page); /* @@ -1855,7 +1867,6 @@ extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int generic_error_remove_page(struct address_space *mapping, struct page *page); -int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, @@ -2921,13 +2932,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO * and return without waiting upon it */ -#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */ #define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ -#define FOLL_MLOCK 0x1000 /* lock present pages */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ @@ -3381,5 +3390,4 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, } #endif -#endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index cf90b1fa2c60..ac32125745ab 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -99,7 +99,8 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); - list_add(&folio->lru, &lruvec->lists[lru]); + if (lru != LRU_UNEVICTABLE) + list_add(&folio->lru, &lruvec->lists[lru]); } static __always_inline void add_page_to_lru_list(struct page *page, @@ -115,6 +116,7 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); + /* This is not expected to be used on LRU_UNEVICTABLE */ list_add_tail(&folio->lru, &lruvec->lists[lru]); } @@ -127,8 +129,11 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { - list_del(&folio->lru); - update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio), + enum lru_list lru = folio_lru_list(folio); + + if (lru != LRU_UNEVICTABLE) + list_del(&folio->lru); + update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5f7a33890b0f..8834e38c06a4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -85,7 +85,16 @@ struct page { * lruvec->lru_lock. Sometimes used as a generic list * by the page owner. */ - struct list_head lru; + union { + struct list_head lru; + /* Or, for the Unevictable "LRU list" slot */ + struct { + /* Always even, to negate PageTail */ + void *__filler; + /* Count page's or folio's mlocks */ + unsigned int mlock_count; + }; + }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; pgoff_t index; /* Our offset within mapping. */ @@ -126,11 +135,14 @@ struct page { unsigned char compound_dtor; unsigned char compound_order; atomic_t compound_mapcount; + atomic_t compound_pincount; +#ifdef CONFIG_64BIT |