summaryrefslogtreecommitdiff
path: root/mm/huge_memory.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-11-23 09:58:07 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-11-23 09:58:07 -0800
commit5c00ff742bf5caf85f60e1c73999f99376fb865d (patch)
treefa484e83c27af79f1c0511e7e0673507461c9379 /mm/huge_memory.c
parent228a1157fb9fec47eb135b51c0202b574e079ebf (diff)
parent2532e6c74a67e65b95f310946e0c0e0a41b3a34b (diff)
downloadlinux-5c00ff742bf5caf85f60e1c73999f99376fb865d.tar.gz
linux-5c00ff742bf5caf85f60e1c73999f99376fb865d.tar.bz2
linux-5c00ff742bf5caf85f60e1c73999f99376fb865d.zip
Merge tag 'mm-stable-2024-11-18-19-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: - The series "zram: optimal post-processing target selection" from Sergey Senozhatsky improves zram's post-processing selection algorithm. This leads to improved memory savings. - Wei Yang has gone to town on the mapletree code, contributing several series which clean up the implementation: - "refine mas_mab_cp()" - "Reduce the space to be cleared for maple_big_node" - "maple_tree: simplify mas_push_node()" - "Following cleanup after introduce mas_wr_store_type()" - "refine storing null" - The series "selftests/mm: hugetlb_fault_after_madv improvements" from David Hildenbrand fixes this selftest for s390. - The series "introduce pte_offset_map_{ro|rw}_nolock()" from Qi Zheng implements some rationaizations and cleanups in the page mapping code. - The series "mm: optimize shadow entries removal" from Shakeel Butt optimizes the file truncation code by speeding up the handling of shadow entries. - The series "Remove PageKsm()" from Matthew Wilcox completes the migration of this flag over to being a folio-based flag. - The series "Unify hugetlb into arch_get_unmapped_area functions" from Oscar Salvador implements a bunch of consolidations and cleanups in the hugetlb code. - The series "Do not shatter hugezeropage on wp-fault" from Dev Jain takes away the wp-fault time practice of turning a huge zero page into small pages. Instead we replace the whole thing with a THP. More consistent cleaner and potentiall saves a large number of pagefaults. - The series "percpu: Add a test case and fix for clang" from Andy Shevchenko enhances and fixes the kernel's built in percpu test code. - The series "mm/mremap: Remove extra vma tree walk" from Liam Howlett optimizes mremap() by avoiding doing things which we didn't need to do. - The series "Improve the tmpfs large folio read performance" from Baolin Wang teaches tmpfs to copy data into userspace at the folio size rather than as individual pages. A 20% speedup was observed. - The series "mm/damon/vaddr: Fix issue in damon_va_evenly_split_region()" fro Zheng Yejian fixes DAMON splitting. - The series "memcg-v1: fully deprecate charge moving" from Shakeel Butt removes the long-deprecated memcgv2 charge moving feature. - The series "fix error handling in mmap_region() and refactor" from Lorenzo Stoakes cleanup up some of the mmap() error handling and addresses some potential performance issues. - The series "x86/module: use large ROX pages for text allocations" from Mike Rapoport teaches x86 to use large pages for read-only-execute module text. - The series "page allocation tag compression" from Suren Baghdasaryan is followon maintenance work for the new page allocation profiling feature. - The series "page->index removals in mm" from Matthew Wilcox remove most references to page->index in mm/. A slow march towards shrinking struct page. - The series "damon/{self,kunit}tests: minor fixups for DAMON debugfs interface tests" from Andrew Paniakin performs maintenance work for DAMON's self testing code. - The series "mm: zswap swap-out of large folios" from Kanchana Sridhar improves zswap's batching of compression and decompression. It is a step along the way towards using Intel IAA hardware acceleration for this zswap operation. - The series "kasan: migrate the last module test to kunit" from Sabyrzhan Tasbolatov completes the migration of the KASAN built-in tests over to the KUnit framework. - The series "implement lightweight guard pages" from Lorenzo Stoakes permits userapace to place fault-generating guard pages within a single VMA, rather than requiring that multiple VMAs be created for this. Improved efficiencies for userspace memory allocators are expected. - The series "memcg: tracepoint for flushing stats" from JP Kobryn uses tracepoints to provide increased visibility into memcg stats flushing activity. - The series "zram: IDLE flag handling fixes" from Sergey Senozhatsky fixes a zram buglet which potentially affected performance. - The series "mm: add more kernel parameters to control mTHP" from MaĆ­ra Canal enhances our ability to control/configuremultisize THP from the kernel boot command line. - The series "kasan: few improvements on kunit tests" from Sabyrzhan Tasbolatov has a couple of fixups for the KASAN KUnit tests. - The series "mm/list_lru: Split list_lru lock into per-cgroup scope" from Kairui Song optimizes list_lru memory utilization when lockdep is enabled. * tag 'mm-stable-2024-11-18-19-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (215 commits) cma: enforce non-zero pageblock_order during cma_init_reserved_mem() mm/kfence: add a new kunit test test_use_after_free_read_nofault() zram: fix NULL pointer in comp_algorithm_show() memcg/hugetlb: add hugeTLB counters to memcg vmstat: call fold_vm_zone_numa_events() before show per zone NUMA event mm: mmap_lock: check trace_mmap_lock_$type_enabled() instead of regcount zram: ZRAM_DEF_COMP should depend on ZRAM MAINTAINERS/MEMORY MANAGEMENT: add document files for mm Docs/mm/damon: recommend academic papers to read and/or cite mm: define general function pXd_init() kmemleak: iommu/iova: fix transient kmemleak false positive mm/list_lru: simplify the list_lru walk callback function mm/list_lru: split the lock to per-cgroup scope mm/list_lru: simplify reparenting and initial allocation mm/list_lru: code clean up for reparenting mm/list_lru: don't export list_lru_add mm/list_lru: don't pass unnecessary key parameters kasan: add kunit tests for kmalloc_track_caller, kmalloc_node_track_caller kasan: change kasan_atomics kunit test as KUNIT_CASE_SLOW kasan: use EXPORT_SYMBOL_IF_KUNIT to export symbols ...
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--mm/huge_memory.c227
1 files changed, 150 insertions, 77 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5734d5d5060f..ee335d96fc39 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -8,7 +8,6 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
@@ -84,6 +83,21 @@ unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
static bool anon_orders_configured __initdata;
+static inline bool file_thp_enabled(struct vm_area_struct *vma)
+{
+ struct inode *inode;
+
+ if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
+ return false;
+
+ if (!vma->vm_file)
+ return false;
+
+ inode = file_inode(vma->vm_file);
+
+ return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
+}
+
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
unsigned long vm_flags,
unsigned long tva_flags,
@@ -601,6 +615,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
+DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
#ifdef CONFIG_SHMEM
@@ -619,6 +635,8 @@ static struct attribute *anon_stats_attrs[] = {
&anon_fault_fallback_attr.attr,
&anon_fault_fallback_charge_attr.attr,
#ifndef CONFIG_SHMEM
+ &zswpout_attr.attr,
+ &swpin_attr.attr,
&swpout_attr.attr,
&swpout_fallback_attr.attr,
#endif
@@ -649,6 +667,8 @@ static struct attribute_group file_stats_attr_grp = {
static struct attribute *any_stats_attrs[] = {
#ifdef CONFIG_SHMEM
+ &zswpout_attr.attr,
+ &swpin_attr.attr,
&swpout_attr.attr,
&swpout_fallback_attr.attr,
#endif
@@ -938,26 +958,6 @@ out:
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
-static inline int get_order_from_str(const char *size_str)
-{
- unsigned long size;
- char *endptr;
- int order;
-
- size = memparse(size_str, &endptr);
-
- if (!is_power_of_2(size))
- goto err;
- order = get_order(size);
- if (BIT(order) & ~THP_ORDERS_ALL_ANON)
- goto err;
-
- return order;
-err:
- pr_err("invalid size %s in thp_anon boot parameter\n", size_str);
- return -EINVAL;
-}
-
static char str_dup[PAGE_SIZE] __initdata;
static int __init setup_thp_anon(char *str)
{
@@ -969,7 +969,7 @@ static int __init setup_thp_anon(char *str)
if (!str || strlen(str) + 1 > PAGE_SIZE)
goto err;
- strcpy(str_dup, str);
+ strscpy(str_dup, str);
always = huge_anon_orders_always;
madvise = huge_anon_orders_madvise;
@@ -987,10 +987,22 @@ static int __init setup_thp_anon(char *str)
start_size = strsep(&subtoken, "-");
end_size = subtoken;
- start = get_order_from_str(start_size);
- end = get_order_from_str(end_size);
+ start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON);
+ end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON);
} else {
- start = end = get_order_from_str(subtoken);
+ start_size = end_size = subtoken;
+ start = end = get_order_from_str(subtoken,
+ THP_ORDERS_ALL_ANON);
+ }
+
+ if (start == -EINVAL) {
+ pr_err("invalid size %s in thp_anon boot parameter\n", start_size);
+ goto err;
+ }
+
+ if (end == -EINVAL) {
+ pr_err("invalid size %s in thp_anon boot parameter\n", end_size);
+ goto err;
}
if (start < 0 || end < 0 || start > end)
@@ -1137,47 +1149,87 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
-static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
- struct page *page, gfp_t gfp)
+static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
+ unsigned long addr)
{
- struct vm_area_struct *vma = vmf->vma;
- struct folio *folio = page_folio(page);
- pgtable_t pgtable;
- unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- vm_fault_t ret = 0;
+ gfp_t gfp = vma_thp_gfp_mask(vma);
+ const int order = HPAGE_PMD_ORDER;
+ struct folio *folio;
- VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK);
+ if (unlikely(!folio)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+ return NULL;
+ }
+
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
folio_put(folio);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
- return VM_FAULT_FALLBACK;
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+ return NULL;
}
folio_throttle_swaprate(folio, gfp);
- pgtable = pte_alloc_one(vma->vm_mm);
- if (unlikely(!pgtable)) {
- ret = VM_FAULT_OOM;
- goto release;
- }
-
- folio_zero_user(folio, vmf->address);
+ /*
+ * When a folio is not zeroed during allocation (__GFP_ZERO not used),
+ * folio_zero_user() is used to make sure that the page corresponding
+ * to the faulting address will be hot in the cache after zeroing.
+ */
+ if (!alloc_zeroed())
+ folio_zero_user(folio, addr);
/*
* The memory barrier inside __folio_mark_uptodate makes sure that
* folio_zero_user writes become visible before the set_pmd_at()
* write.
*/
__folio_mark_uptodate(folio);
+ return folio;
+}
+
+static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
+ struct vm_area_struct *vma, unsigned long haddr)
+{
+ pmd_t entry;
+
+ entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
+ folio_add_lru_vma(folio, vma);
+ set_pmd_at(vma->vm_mm, haddr, pmd, entry);
+ update_mmu_cache_pmd(vma, haddr, pmd);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ count_vm_event(THP_FAULT_ALLOC);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+}
+
+static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
+{
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ struct folio *folio;
+ pgtable_t pgtable;
+ vm_fault_t ret = 0;
+
+ folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
+ if (unlikely(!folio))
+ return VM_FAULT_FALLBACK;
+
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (unlikely(!pgtable)) {
+ ret = VM_FAULT_OOM;
+ goto release;
+ }
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_none(*vmf->pmd))) {
goto unlock_release;
} else {
- pmd_t entry;
-
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto unlock_release;
@@ -1191,21 +1243,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
return ret;
}
-
- entry = mk_huge_pmd(page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
- folio_add_lru_vma(folio, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
- set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
- update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
mm_inc_nr_ptes(vma->vm_mm);
deferred_split_folio(folio, false);
spin_unlock(vmf->ptl);
- count_vm_event(THP_FAULT_ALLOC);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
- count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}
return 0;
@@ -1272,8 +1314,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- gfp_t gfp;
- struct folio *folio;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
vm_fault_t ret;
@@ -1324,14 +1364,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
}
return ret;
}
- gfp = vma_thp_gfp_mask(vma);
- folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
- if (unlikely(!folio)) {
- count_vm_event(THP_FAULT_FALLBACK);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
- return VM_FAULT_FALLBACK;
- }
- return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
+
+ return __do_huge_pmd_anonymous_page(vmf);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1763,6 +1797,38 @@ unlock:
spin_unlock(vmf->ptl);
}
+static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
+{
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ struct mmu_notifier_range range;
+ struct folio *folio;
+ vm_fault_t ret = 0;
+
+ folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
+ if (unlikely(!folio))
+ return VM_FAULT_FALLBACK;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
+ goto release;
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto release;
+ (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
+ map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
+ goto unlock;
+release:
+ folio_put(folio);
+unlock:
+ spin_unlock(vmf->ptl);
+ mmu_notifier_invalidate_range_end(&range);
+ return ret;
+}
+
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
@@ -1775,8 +1841,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
- if (is_huge_zero_pmd(orig_pmd))
+ if (is_huge_zero_pmd(orig_pmd)) {
+ vm_fault_t ret = do_huge_zero_wp_pmd(vmf);
+
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+
+ /* Fallback to splitting PMD if THP cannot be allocated */
goto fallback;
+ }
spin_lock(vmf->ptl);
@@ -3124,8 +3197,8 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
/* ->mapping in first and second tail page is replaced by other uses */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
page_tail);
- page_tail->mapping = head->mapping;
- page_tail->index = head->index + tail;
+ new_folio->mapping = folio->mapping;
+ new_folio->index = folio->index + tail;
/*
* page->private should not be set in tail pages. Fix up and warn once
@@ -3201,11 +3274,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
ClearPageHasHWPoisoned(head);
for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
+ struct folio *tail;
__split_huge_page_tail(folio, i, lruvec, list, new_order);
+ tail = page_folio(head + i);
/* Some pages can be beyond EOF: drop them from page cache */
- if (head[i].index >= end) {
- struct folio *tail = page_folio(head + i);
-
+ if (tail->index >= end) {
if (shmem_mapping(folio->mapping))
nr_dropped++;
else if (folio_test_clear_dirty(tail))
@@ -3213,12 +3286,12 @@ static void __split_huge_page(struct page *page, struct list_head *list,
inode_to_wb(folio->mapping->host));
__filemap_remove_folio(tail, NULL);
folio_put(tail);
- } else if (!PageAnon(page)) {
- __xa_store(&folio->mapping->i_pages, head[i].index,
- head + i, 0);
+ } else if (!folio_test_anon(folio)) {
+ __xa_store(&folio->mapping->i_pages, tail->index,
+ tail, 0);
} else if (swap_cache) {
__xa_store(&swap_cache->i_pages, offset + i,
- head + i, 0);
+ tail, 0);
}
}
@@ -4096,7 +4169,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
tok = strsep(&buf, ",");
if (tok) {
- strcpy(file_path, tok);
+ strscpy(file_path, tok);
} else {
ret = -EINVAL;
goto out;