diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-10 11:18:00 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-10 11:18:00 -0700 |
| commit | b1701d5e29eb0a102aa3393319b3e4eb1a19c6ea (patch) | |
| tree | 7bcb08dc82b47c81ac39b329fa3e5b41485cc054 /mm | |
| parent | c235698355fa94df7073b51befda7d4be00a0e23 (diff) | |
| parent | a9e9c93966afdaae74a6a7533552391646b93f2c (diff) | |
| download | linux-b1701d5e29eb0a102aa3393319b3e4eb1a19c6ea.tar.gz linux-b1701d5e29eb0a102aa3393319b3e4eb1a19c6ea.tar.bz2 linux-b1701d5e29eb0a102aa3393319b3e4eb1a19c6ea.zip | |
Merge tag 'mm-stable-2022-08-09' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull remaining MM updates from Andrew Morton:
"Three patch series - two that perform cleanups and one feature:
- hugetlb_vmemmap cleanups from Muchun Song
- hardware poisoning support for 1GB hugepages, from Naoya Horiguchi
- highmem documentation fixups from Fabio De Francesco"
* tag 'mm-stable-2022-08-09' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (23 commits)
Documentation/mm: add details about kmap_local_page() and preemption
highmem: delete a sentence from kmap_local_page() kdocs
Documentation/mm: rrefer kmap_local_page() and avoid kmap()
Documentation/mm: avoid invalid use of addresses from kmap_local_page()
Documentation/mm: don't kmap*() pages which can't come from HIGHMEM
highmem: specify that kmap_local_page() is callable from interrupts
highmem: remove unneeded spaces in kmap_local_page() kdocs
mm, hwpoison: enable memory error handling on 1GB hugepage
mm, hwpoison: skip raw hwpoison page in freeing 1GB hugepage
mm, hwpoison: make __page_handle_poison returns int
mm, hwpoison: set PG_hwpoison for busy hugetlb pages
mm, hwpoison: make unpoison aware of raw error info in hwpoisoned hugepage
mm, hwpoison, hugetlb: support saving mechanism of raw error pages
mm/hugetlb: make pud_huge() and follow_huge_pud() aware of non-present pud entry
mm/hugetlb: check gigantic_page_runtime_supported() in return_unused_surplus_pages()
mm: hugetlb_vmemmap: use PTRS_PER_PTE instead of PMD_SIZE / PAGE_SIZE
mm: hugetlb_vmemmap: move code comments to vmemmap_dedup.rst
mm: hugetlb_vmemmap: improve hugetlb_vmemmap code readability
mm: hugetlb_vmemmap: replace early_param() with core_param()
mm: hugetlb_vmemmap: move vmemmap code related to HugeTLB to hugetlb_vmemmap.c
...
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/hugetlb.c | 73 | ||||
| -rw-r--r-- | mm/hugetlb_vmemmap.c | 589 | ||||
| -rw-r--r-- | mm/hugetlb_vmemmap.h | 45 | ||||
| -rw-r--r-- | mm/memory-failure.c | 179 | ||||
| -rw-r--r-- | mm/sparse-vmemmap.c | 399 |
5 files changed, 684 insertions, 601 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f044962ad9df..0aee2f3ae15c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1535,7 +1535,14 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - if (hugetlb_vmemmap_alloc(h, page)) { + /* + * If we don't know which subpages are hwpoisoned, we can't free + * the hugepage, so it's leaked intentionally. + */ + if (HPageRawHwpUnreliable(page)) + return; + + if (hugetlb_vmemmap_restore(h, page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1547,6 +1554,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page) return; } + /* + * Move PageHWPoison flag from head page to the raw error pages, + * which makes any healthy subpages reusable. + */ + if (unlikely(PageHWPoison(page))) + hugetlb_clear_page_hwpoison(page); + for (i = 0; i < pages_per_huge_page(h); i++, subpage = mem_map_next(subpage, page, i)) { subpage->flags &= ~(1 << PG_locked | 1 << PG_error | @@ -1612,7 +1626,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn); static inline void flush_free_hpage_work(struct hstate *h) { - if (hugetlb_optimize_vmemmap_pages(h)) + if (hugetlb_vmemmap_optimizable(h)) flush_work(&free_hpage_work); } @@ -1734,7 +1748,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) static void __prep_new_huge_page(struct hstate *h, struct page *page) { - hugetlb_vmemmap_free(h, page); + hugetlb_vmemmap_optimize(h, page); INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); @@ -2107,17 +2121,8 @@ retry: * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. */ - rc = hugetlb_vmemmap_alloc(h, head); + rc = hugetlb_vmemmap_restore(h, head); if (!rc) { - /* - * Move PageHWPoison flag from head page to the raw - * error page, which makes any subpages rather than - * the error page reusable. - */ - if (PageHWPoison(head) && page != head) { - SetPageHWPoison(page); - ClearPageHWPoison(head); - } update_and_free_page(h, head, false); } else { spin_lock_irq(&hugetlb_lock); @@ -2432,8 +2437,7 @@ static void return_unused_surplus_pages(struct hstate *h, /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; - /* Cannot return gigantic pages currently */ - if (hstate_is_gigantic(h)) + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) goto out; /* @@ -3182,8 +3186,10 @@ static void __init report_hugepages(void) char buf[32]; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); - pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", + pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", buf, h->free_huge_pages); + pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", + hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); } } @@ -3421,7 +3427,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) remove_hugetlb_page_for_demote(h, page, false); spin_unlock_irq(&hugetlb_lock); - rc = hugetlb_vmemmap_alloc(h, page); + rc = hugetlb_vmemmap_restore(h, page); if (rc) { /* Allocation of vmemmmap failed, we can not demote page */ spin_lock_irq(&hugetlb_lock); @@ -4111,7 +4117,6 @@ void __init hugetlb_add_hstate(unsigned int order) h->next_nid_to_free = first_memory_node; snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); - hugetlb_vmemmap_init(h); parsed_hstate = h; } @@ -6985,10 +6990,38 @@ struct page * __weak follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int flags) { - if (flags & (FOLL_GET | FOLL_PIN)) + struct page *page = NULL; + spinlock_t *ptl; + pte_t pte; + + if (WARN_ON_ONCE(flags & FOLL_PIN)) return NULL; - return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); +retry: + ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud); + if (!pud_huge(*pud)) + goto out; + pte = huge_ptep_get((pte_t *)pud); + if (pte_present(pte)) { + page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(!try_grab_page(page, flags))) { + page = NULL; + goto out; + } + } else { + if (is_hugetlb_entry_migration(pte)) { + spin_unlock(ptl); + __migration_entry_wait(mm, (pte_t *)pud, ptl); + goto retry; + } + /* + * hwpoisoned entry is treated as no_page_table in + * follow_page_mask(). + */ + } +out: + spin_unlock(ptl); + return page; } struct page * __weak diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 1362feb3c6c9..20f414c0379f 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Optimize vmemmap pages associated with HugeTLB + * HugeTLB Vmemmap Optimization (HVO) * - * Copyright (c) 2020, Bytedance. All rights reserved. + * Copyright (c) 2020, ByteDance. All rights reserved. * * Author: Muchun Song <songmuchun@bytedance.com> * @@ -10,84 +10,443 @@ */ #define pr_fmt(fmt) "HugeTLB: " fmt -#include <linux/memory.h> +#include <linux/pgtable.h> +#include <linux/bootmem_info.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> #include "hugetlb_vmemmap.h" -/* - * There are a lot of struct page structures associated with each HugeTLB page. - * For tail pages, the value of compound_head is the same. So we can reuse first - * page of head page structures. We map the virtual addresses of all the pages - * of tail page structures to the head page struct, and then free these page - * frames. Therefore, we need to reserve one pages as vmemmap areas. +/** + * struct vmemmap_remap_walk - walk vmemmap page table + * + * @remap_pte: called for each lowest-level entry (PTE). + * @nr_walked: the number of walked pte. + * @reuse_page: the page which is reused for the tail vmemmap pages. + * @reuse_addr: the virtual address of the @reuse_page page. + * @vmemmap_pages: the list head of the vmemmap pages that can be freed + * or is mapped from. */ -#define RESERVE_VMEMMAP_NR 1U -#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) - -enum vmemmap_optimize_mode { - VMEMMAP_OPTIMIZE_OFF, - VMEMMAP_OPTIMIZE_ON, +struct vmemmap_remap_walk { + void (*remap_pte)(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk); + unsigned long nr_walked; + struct page *reuse_page; + unsigned long reuse_addr; + struct list_head *vmemmap_pages; }; -DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, - hugetlb_optimize_vmemmap_key); -EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); +static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +{ + pmd_t __pmd; + int i; + unsigned long addr = start; + struct page *page = pmd_page(*pmd); + pte_t *pgtable = pte_alloc_one_kernel(&init_mm); + + if (!pgtable) + return -ENOMEM; + + pmd_populate_kernel(&init_mm, &__pmd, pgtable); + + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { + pte_t entry, *pte; + pgprot_t pgprot = PAGE_KERNEL; + + entry = mk_pte(page + i, pgprot); + pte = pte_offset_kernel(&__pmd, addr); + set_pte_at(&init_mm, addr, pte, entry); + } -static enum vmemmap_optimize_mode vmemmap_optimize_mode = - IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_leaf(*pmd))) { + /* + * Higher order allocations from buddy allocator must be able to + * be treated as indepdenent small pages (as they can be freed + * individually). + */ + if (!PageReserved(page)) + split_page(page, get_order(PMD_SIZE)); + + /* Make pte visible before pmd. See comment in pmd_install(). */ + smp_wmb(); + pmd_populate_kernel(&init_mm, pmd, pgtable); + flush_tlb_kernel_range(start, start + PMD_SIZE); + } else { + pte_free_kernel(&init_mm, pgtable); + } + spin_unlock(&init_mm.page_table_lock); + + return 0; +} -static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { - if (vmemmap_optimize_mode == to) - return; + int leaf; - if (to == VMEMMAP_OPTIMIZE_OFF) - static_branch_dec(&hugetlb_optimize_vmemmap_key); - else - static_branch_inc(&hugetlb_optimize_vmemmap_key); - WRITE_ONCE(vmemmap_optimize_mode, to); + spin_lock(&init_mm.page_table_lock); + leaf = pmd_leaf(*pmd); + spin_unlock(&init_mm.page_table_lock); + + if (!leaf) + return 0; + + return __split_vmemmap_huge_pmd(pmd, start); +} + +static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + + /* + * The reuse_page is found 'first' in table walk before we start + * remapping (which is calling @walk->remap_pte). + */ + if (!walk->reuse_page) { + walk->reuse_page = pte_page(*pte); + /* + * Because the reuse address is part of the range that we are + * walking, skip the reuse address range. + */ + addr += PAGE_SIZE; + pte++; + walk->nr_walked++; + } + + for (; addr != end; addr += PAGE_SIZE, pte++) { + walk->remap_pte(pte, addr, walk); + walk->nr_walked++; + } } -static int __init hugetlb_vmemmap_early_param(char *buf) +static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) { - bool enable; - enum vmemmap_optimize_mode mode; + pmd_t *pmd; + unsigned long next; - if (kstrtobool(buf, &enable)) - return -EINVAL; + pmd = pmd_offset(pud, addr); + do { + int ret; - mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF; - vmemmap_optimize_mode_switch(mode); + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + if (ret) + return ret; + + next = pmd_addr_end(addr, end); + vmemmap_pte_range(pmd, addr, next, walk); + } while (pmd++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(p4d, addr); + do { + int ret; + + next = pud_addr_end(addr, end); + ret = vmemmap_pmd_range(pud, addr, next, walk); + if (ret) + return ret; + } while (pud++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_offset(pgd, addr); + do { + int ret; + + next = p4d_addr_end(addr, end); + ret = vmemmap_pud_range(p4d, addr, next, walk); + if (ret) + return ret; + } while (p4d++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_remap_range(unsigned long start, unsigned long end, + struct vmemmap_remap_walk *walk) +{ + unsigned long addr = start; + unsigned long next; + pgd_t *pgd; + + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); + + pgd = pgd_offset_k(addr); + do { + int ret; + + next = pgd_addr_end(addr, end); + ret = vmemmap_p4d_range(pgd, addr, next, walk); + if (ret) + return ret; + } while (pgd++, addr = next, addr != end); + + /* + * We only change the mapping of the vmemmap virtual address range + * [@start + PAGE_SIZE, end), so we only need to flush the TLB which + * belongs to the range. + */ + flush_tlb_kernel_range(start + PAGE_SIZE, end); return 0; } -early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param); /* - * Previously discarded vmemmap pages will be allocated and remapping - * after this function returns zero. + * Free a vmemmap page. A vmemmap page can be allocated from the memblock + * allocator or buddy allocator. If the PG_reserved flag is set, it means + * that it allocated from the memblock allocator, just free it via the + * free_bootmem_page(). Otherwise, use __free_page(). */ -int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) +static inline void free_vmemmap_page(struct page *page) +{ + if (PageReserved(page)) + free_bootmem_page(page); + else + __free_page(page); +} + +/* Free a list of the vmemmap pages */ +static void free_vmemmap_page_list(struct list_head *list) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, list, lru) { + list_del(&page->lru); + free_vmemmap_page(page); + } +} + +static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk) +{ + /* + * Remap the tail pages as read-only to catch illegal write operation + * to the tail pages. + */ + pgprot_t pgprot = PAGE_KERNEL_RO; + pte_t entry = mk_pte(walk->reuse_page, pgprot); + struct page *page = pte_page(*pte); + + list_add_tail(&page->lru, walk->vmemmap_pages); + set_pte_at(&init_mm, addr, pte, entry); +} + +/* + * How many struct page structs need to be reset. When we reuse the head + * struct page, the special metadata (e.g. page->flags or page->mapping) + * cannot copy to the tail struct page structs. The invalid value will be + * checked in the free_tail_pages_check(). In order to avoid the message + * of "corrupted mapping in tail page". We need to reset at least 3 (one + * head struct page struct and two tail struct page structs) struct page + * structs. + */ +#define NR_RESET_STRUCT_PAGE 3 + +static inline void reset_struct_pages(struct page *start) +{ + int i; + struct page *from = start + NR_RESET_STRUCT_PAGE; + + for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) + memcpy(start + i, from, sizeof(*from)); +} + +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk) +{ + pgprot_t pgprot = PAGE_KERNEL; + struct page *page; + void *to; + + BUG_ON(pte_page(*pte) != walk->reuse_page); + + page = list_first_entry(walk->vmemmap_pages, struct page, lru); + list_del(&page->lru); + to = page_to_virt(page); + copy_page(to, (void *)walk->reuse_addr); + reset_struct_pages(to); + + set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); +} + +/** + * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) + * to the page which @reuse is mapped to, then free vmemmap + * which the range are mapped to. + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * + * Return: %0 on success, negative error code otherwise. + */ +static int vmemmap_remap_free(unsigned long start, unsigned long end, + unsigned long reuse) +{ + int ret; + LIST_HEAD(vmemmap_pages); + struct vmemmap_remap_walk walk = { + .remap_pte = vmemmap_remap_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + /* + * In order to make remapping routine most efficient for the huge pages, + * the routine of vmemmap page table walking has the following rules + * (see more details from the vmemmap_pte_range()): + * + * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) + * should be continuous. + * - The @reuse address is part of the range [@reuse, @end) that we are + * walking which is passed to vmemmap_remap_range(). + * - The @reuse address is the first in the complete range. + * + * So we need to make sure that @start and @reuse meet the above rules. + */ + BUG_ON(start - reuse != PAGE_SIZE); + + mmap_read_lock(&init_mm); + ret = vmemmap_remap_range(reuse, end, &walk); + if (ret && walk.nr_walked) { + end = reuse + walk.nr_walked * PAGE_SIZE; + /* + * vmemmap_pages contains pages from the previous + * vmemmap_remap_range call which failed. These + * are pages which were removed from the vmemmap. + * They will be restored in the following call. + */ + walk = (struct vmemmap_remap_walk) { + .remap_pte = vmemmap_restore_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + vmemmap_remap_range(reuse, end, &walk); + } + mmap_read_unlock(&init_mm); + + free_vmemmap_page_list(&vmemmap_pages); + + return ret; +} + +static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, + gfp_t gfp_mask, struct list_head *list) +{ + unsigned long nr_pages = (end - start) >> PAGE_SHIFT; + int nid = page_to_nid((struct page *)start); + struct page *page, *next; + + while (nr_pages--) { + page = alloc_pages_node(nid, gfp_mask, 0); + if (!page) + goto out; + list_add_tail(&page->lru, list); + } + + return 0; +out: + list_for_each_entry_safe(page, next, list, lru) + __free_pages(page, 0); + return -ENOMEM; +} + +/** + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) + * to the page which is from the @vmemmap_pages + * respectively. + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * @gfp_mask: GFP flag for allocating vmemmap pages. + * + * Return: %0 on success, negative error code otherwise. + */ +static int vmemmap_remap_alloc(unsigned long start, unsigned long end, + unsigned long reuse, gfp_t gfp_mask) +{ + LIST_HEAD(vmemmap_pages); + struct vmemmap_remap_walk walk = { + .remap_pte = vmemmap_restore_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + /* See the comment in the vmemmap_remap_free(). */ + BUG_ON(start - reuse != PAGE_SIZE); + + if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages)) + return -ENOMEM; + + mmap_read_lock(&init_mm); + vmemmap_remap_range(reuse, end, &walk); + mmap_read_unlock(&init_mm); + + return 0; +} + +DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); +EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); + +static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); +core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0); + +/** + * hugetlb_vmemmap_restore - restore previously optimized (by + * hugetlb_vmemmap_optimize()) vmemmap pages which + * will be reallocated and remapped. + * @h: struct hstate. + * @head: the head page whose vmemmap pages will be restored. + * + * Return: %0 if @head's vmemmap pages have been reallocated and remapped, + * negative error code otherwise. + */ +int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) { int ret; - unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; if (!HPageVmemmapOptimized(head)) return 0; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); - vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); + vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* - * The pages which the vmemmap virtual address range [@vmemmap_addr, + * The pages which the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end) are mapped to are freed to the buddy allocator, and * the range is mapped to the page which @vmemmap_reuse is mapped to. * When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. */ - ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, + ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); if (!ret) { ClearHPageVmemmapOptimized(head); @@ -97,11 +456,14 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) return ret; } -static unsigned int vmemmap_optimizable_pages(struct hstate *h, - struct page *head) +/* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ +static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head) { - if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) - return 0; + if (!READ_ONCE(vmemmap_optimize_enabled)) + return false; + + if (!hugetlb_vmemmap_optimizable(h)) + return false; if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { pmd_t *pmdp, pmd; @@ -144,118 +506,73 @@ static unsigned int vmemmap_optimizable_pages(struct hstate *h, * +-------------------------------------------+ */ if (PageVmemmapSelfHosted(vmemmap_page)) - return 0; + return false; } - return hugetlb_optimize_vmemmap_pages(h); + return true; } -void hugetlb_vmemmap_free(struct hstate *h, struct page *head) +/** + * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages. + * @h: struct hstate. + * @head: the head page whose vmemmap pages will be optimized. + * + * This function only tries to optimize @head's vmemmap pages and does not + * guarantee that the optimization will succeed after it returns. The caller + * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages + * have been optimized. + */ +void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) { - unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; - vmemmap_pages = vmemmap_optimizable_pages(h, head); - if (!vmemmap_pages) + if (!vmemmap_should_optimize(h, head)) return; static_branch_inc(&hugetlb_optimize_vmemmap_key); - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); + vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* - * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end) + * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end) * to the page which @vmemmap_reuse is mapped to, then free the pages - * which the range [@vmemmap_addr, @vmemmap_end] is mapped to. + * which the range [@vmemmap_start, @vmemmap_end] is mapped to. */ - if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse)) static_branch_dec(&hugetlb_optimize_vmemmap_key); else SetHPageVmemmapOptimized(head); } -void __init hugetlb_vmemmap_init(struct hstate *h) -{ - unsigned int nr_pages = pages_per_huge_page(h); - unsigned int vmemmap_pages; - - /* - * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct - * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, - * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page. - */ - BUILD_BUG_ON(__NR_USED_SUBPAGE >= - RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - - if (!is_power_of_2(sizeof(struct page))) { - pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); - static_branch_disable(&hugetlb_optimize_vmemmap_key); - return; - } - - vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; - /* - * The head page is not to be freed to buddy allocator, the other tail - * pages will map to the head page, so they can be freed. - * - * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true - * on some architectures (e.g. aarch64). See Documentation/arm64/ - * hugetlbpage.rst for more details. - */ - if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR)) - h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; - - pr_info("can optimize %d vmemmap pages for %s\n", - h->optimize_vmemmap_pages, h->name); -} - -#ifdef CONFIG_PROC_SYSCTL -static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, - loff_t *ppos) -{ - int ret; - enum vmemmap_optimize_mode mode; - static DEFINE_MUTEX(sysctl_mutex); - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&sysctl_mutex); - mode = vmemmap_optimize_mode; - table->data = &mode; - ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (write && !ret) - vmemmap_optimize_mode_switch(mode); - mutex_unlock(&sysctl_mutex); - - return ret; -} - static struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", - .maxlen = sizeof(enum vmemmap_optimize_mode), + .data = &vmemmap_optimize_enabled, + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = hugetlb_optimize_vmemmap_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .proc_handler = proc_dobool, }, { } }; -static __init int hugetlb_vmemmap_sysctls_init(void) +static int __init hugetlb_vmemmap_init(void) { - /* - * If "struct page" crosses page boundaries, the vmemmap pages cannot - * be optimized. - */ - if (is_power_of_2(sizeof(struct page))) - register_sysctl_init("vm", hugetlb_vmemmap_sysctls); - + /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ + BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE); + + if (IS_ENABLED(CONFIG_PROC_SYSCTL)) { + const struct hstate *h; + + for_each_hstate(h) { + if (hugetlb_vmemmap_optimizable(h)) { + register_sysctl_init("vm", hugetlb_vmemmap_sysctls); + break; + } + } + } return 0; } -late_initcall(hugetlb_vmemmap_sysctls_init); -#endif /* CONFIG_PROC_SYSCTL */ +late_initcall(hugetlb_vmemmap_init); diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 109b0a53b6fe..25bd0e002431 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Optimize vmemmap pages associated with HugeTLB + * HugeTLB Vmemmap Optimization (HVO) * - * Copyright (c) 2020, Bytedance. All rights reserved. + * Copyright (c) 2020, ByteDance. All rights reserved. * * Author: Muchun Song <songmuchun@bytedance.com> */ @@ -11,35 +11,50 @@ #include <linux/hugetlb.h> #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP -int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); -void hugetlb_vmemmap_free(struct hstate *h, struct page *head); -void hugetlb_vmemmap_init(struct hstate *h); +int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); +void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); /* - * How many vmemmap pages associated with a HugeTLB page that can be - * optimized and freed to the buddy allocator. + * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See + * Documentation/vm/vmemmap_dedup.rst. */ -static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) +#define HUGETLB_VMEMMAP_RESERVE_SIZE PAGE_SIZE + +static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) { - return h->optimize_vmemmap_pages; + return pages_per_huge_page(h) * sizeof(struct page); +} + +/* + * Return how many vmemmap size associated with a HugeTLB page that can be + * optimized and can be freed to the buddy allocator. + */ +static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) +{ + int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE; + + if (!is_power_of_2(sizeof(struct page))) + return 0; + return size > 0 ? size : 0; } #else -static inline int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) +static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) { return 0; } -static inline void hugetlb_vmemmap_free(struct hstate *h, struct page *head) +static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) { } -static inline void hugetlb_vmemmap_init(struct hstate *h) +static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) { + return 0; } +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ -static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) +static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h) { - return 0; + return hugetlb_vmemmap_optimizable_size(h) != 0; } -#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ #endif /* _LINUX_HUGETLB_VMEMMAP_H */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9a7a228ad04a..14439806b5ef 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -74,7 +74,13 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; -static bool __page_handle_poison(struct page *page) +/* + * Return values: + * 1: the page is dissolved (if needed) and taken off from buddy, + * 0: the page is dissolved (if needed) and not taken off from buddy, + * < 0: failed to dissolve. + */ +static int __page_handle_poison(struct page *page) { int ret; @@ -84,7 +90,7 @@ static bool __page_handle_poison(struct page *page) ret = take_page_off_buddy(page); zone_pcp_enable(page_zone(page)); - return ret > 0; + return ret; } static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release) @@ -94,7 +100,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo * Doing this check for free pages is also fine since dissolve_free_huge_page * returns 0 for non-hugetlb pages as well. */ - if (!__page_handle_poison(page)) + if (__page_handle_poison(page) <= 0) /* * We could fail to take off the target page from buddy * for example due to racy page allocation, but that's @@ -762,7 +768,6 @@ static const char * const action_page_types[] = { [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", [MF_MSG_HUGE] = "huge page", [MF_MSG_FREE_HUGE] = "free huge page", - [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page", [MF_MSG_UNMAP_FAILED] = "unmapping failed page", [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page", @@ -1078,7 +1083,6 @@ static int me_huge_page(struct page_state *ps, struct page *p) res = truncate_error_page(hpage, page_to_pfn(p), mapping); unlock_page(hpage); } else { - res = MF_FAILED; unlock_page(hpage); /* * migration entry prevents later access on error hugepage, @@ -1086,9 +1090,11 @@ static int me_huge_page(struct page_state *ps, struct page *p) * subpages. */ put_page(hpage); - if (__page_handle_poison(p)) { + if (__page_handle_poison(p) >= 0) { page_ref_inc(p); res = MF_RECOVERED; + } else { + res = MF_FAILED; } } @@ -1662,6 +1668,113 @@ unlock: EXPORT_SYMBOL_GPL(mf_dax_kill_procs); #endif /* CONFIG_FS_DAX */ +#ifdef CONFIG_HUGETLB_PAGE +/* + * Struct raw_hwp_page represents information about "raw error page", + * constructing singly linked list originated from ->private field of + * SUBPAGE_INDEX_HWPOISON-th tail page. + */ +struct raw_hwp_page { + struct llist_node node; + struct page *page; +}; + +static inline struc |
