From 61ef8ddaa35e341508d8fa891c33029ed5f75779 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 22 Apr 2025 09:18:15 +0100 Subject: mm/vmalloc: Warn on improper use of vunmap_range() A call to vmalloc_huge() may cause memory blocks to be mapped at pmd or pud level. But it is possible to subsequently call vunmap_range() on a sub-range of the mapped memory, which partially overlaps a pmd or pud. In this case, vmalloc unmaps the entire pmd or pud so that the no-overlapping portion is also unmapped. Clearly that would have a bad outcome, but it's not something that any callers do today as far as I can tell. So I guess it's just expected that callers will not do this. However, it would be useful to know if this happened in future; let's add a warning to cover the eventuality. Reviewed-by: Anshuman Khandual Reviewed-by: Catalin Marinas Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Ryan Roberts Tested-by: Luiz Capitulino Link: https://lore.kernel.org/r/20250422081822.1836315-8-ryan.roberts@arm.com Signed-off-by: Will Deacon --- mm/vmalloc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3ed720a787ec..d60d3a29d149 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -374,8 +374,10 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, if (cleared || pmd_bad(*pmd)) *mask |= PGTBL_PMD_MODIFIED; - if (cleared) + if (cleared) { + WARN_ON(next - addr < PMD_SIZE); continue; + } if (pmd_none_or_clear_bad(pmd)) continue; vunmap_pte_range(pmd, addr, next, mask); @@ -399,8 +401,10 @@ static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, if (cleared || pud_bad(*pud)) *mask |= PGTBL_PUD_MODIFIED; - if (cleared) + if (cleared) { + WARN_ON(next - addr < PUD_SIZE); continue; + } if (pud_none_or_clear_bad(pud)) continue; vunmap_pmd_range(pud, addr, next, mask); -- cgit v1.2.3 From 2fba13371fe80b4d0d533a502e460ce0e936d024 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 22 Apr 2025 09:18:16 +0100 Subject: mm/vmalloc: Gracefully unmap huge ptes Commit f7ee1f13d606 ("mm/vmalloc: enable mapping of huge pages at pte level in vmap") added its support by reusing the set_huge_pte_at() API, which is otherwise only used for user mappings. But when unmapping those huge ptes, it continued to call ptep_get_and_clear(), which is a layering violation. To date, the only arch to implement this support is powerpc and it all happens to work ok for it. But arm64's implementation of ptep_get_and_clear() can not be safely used to clear a previous set_huge_pte_at(). So let's introduce a new arch opt-in function, arch_vmap_pte_range_unmap_size(), which can provide the size of a (present) pte. Then we can call huge_ptep_get_and_clear() to tear it down properly. Note that if vunmap_range() is called with a range that starts in the middle of a huge pte-mapped page, we must unmap the entire huge page so the behaviour is consistent with pmd and pud block mappings. In this case emit a warning just like we do for pmd/pud mappings. Reviewed-by: Anshuman Khandual Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Catalin Marinas Signed-off-by: Ryan Roberts Tested-by: Luiz Capitulino Link: https://lore.kernel.org/r/20250422081822.1836315-9-ryan.roberts@arm.com Signed-off-by: Will Deacon --- mm/vmalloc.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d60d3a29d149..fe2e2cc8da94 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -350,12 +350,26 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pte_t *pte; + pte_t ptent; + unsigned long size = PAGE_SIZE; pte = pte_offset_kernel(pmd, addr); do { - pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); +#ifdef CONFIG_HUGETLB_PAGE + size = arch_vmap_pte_range_unmap_size(addr, pte); + if (size != PAGE_SIZE) { + if (WARN_ON(!IS_ALIGNED(addr, size))) { + addr = ALIGN_DOWN(addr, size); + pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT)); + } + ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size); + if (WARN_ON(end - addr < size)) + size = end - addr; + } else +#endif + ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); - } while (pte++, addr += PAGE_SIZE, addr != end); + } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end); *mask |= PGTBL_PTE_MODIFIED; } -- cgit v1.2.3 From 44562c71e2cfc9eed39bfcd98af63d5da8b1b5bf Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 22 Apr 2025 09:18:18 +0100 Subject: mm/vmalloc: Enter lazy mmu mode while manipulating vmalloc ptes Wrap vmalloc's pte table manipulation loops with arch_enter_lazy_mmu_mode() / arch_leave_lazy_mmu_mode(). This provides the arch code with the opportunity to optimize the pte manipulations. Note that vmap_pfn() already uses lazy mmu mode since it delegates to apply_to_page_range() which enters lazy mmu mode for both user and kernel mappings. These hooks will shortly be used by arm64 to improve vmalloc performance. Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Signed-off-by: Ryan Roberts Tested-by: Luiz Capitulino Link: https://lore.kernel.org/r/20250422081822.1836315-11-ryan.roberts@arm.com Signed-off-by: Will Deacon --- mm/vmalloc.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fe2e2cc8da94..24430160b37f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -104,6 +104,9 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; + + arch_enter_lazy_mmu_mode(); + do { if (unlikely(!pte_none(ptep_get(pte)))) { if (pfn_valid(pfn)) { @@ -127,6 +130,8 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte += PFN_DOWN(size), addr += size, addr != end); + + arch_leave_lazy_mmu_mode(); *mask |= PGTBL_PTE_MODIFIED; return 0; } @@ -354,6 +359,8 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long size = PAGE_SIZE; pte = pte_offset_kernel(pmd, addr); + arch_enter_lazy_mmu_mode(); + do { #ifdef CONFIG_HUGETLB_PAGE size = arch_vmap_pte_range_unmap_size(addr, pte); @@ -370,6 +377,8 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end); + + arch_leave_lazy_mmu_mode(); *mask |= PGTBL_PTE_MODIFIED; } @@ -515,6 +524,9 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; + + arch_enter_lazy_mmu_mode(); + do { struct page *page = pages[*nr]; @@ -528,6 +540,8 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); + + arch_leave_lazy_mmu_mode(); *mask |= PGTBL_PTE_MODIFIED; return 0; } -- cgit v1.2.3