summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-08-05 16:02:07 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2025-08-05 16:02:07 +0300
commitda23ea194db94257123f1534d487f3cdc9b5626d (patch)
treea6c069c9f5b55ebd81771e4f7871e174b1e034c7 /mm
parent7e161a991ea71e6ec526abc8f40c6852ebe3d946 (diff)
parenta2152fef29020e740ba0276930f3a24440012505 (diff)
downloadlinux-da23ea194db94257123f1534d487f3cdc9b5626d.tar.gz
linux-da23ea194db94257123f1534d487f3cdc9b5626d.tar.bz2
linux-da23ea194db94257123f1534d487f3cdc9b5626d.zip
Merge tag 'mm-stable-2025-08-03-12-35' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull more MM updates from Andrew Morton: "Significant patch series in this pull request: - "mseal cleanups" (Lorenzo Stoakes) Some mseal cleaning with no intended functional change. - "Optimizations for khugepaged" (David Hildenbrand) Improve khugepaged throughput by batching PTE operations for large folios. This gain is mainly for arm64. - "x86: enable EXECMEM_ROX_CACHE for ftrace and kprobes" (Mike Rapoport) A bugfix, additional debug code and cleanups to the execmem code. - "mm/shmem, swap: bugfix and improvement of mTHP swap in" (Kairui Song) Bugfixes, cleanups and performance improvements to the mTHP swapin code" * tag 'mm-stable-2025-08-03-12-35' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (38 commits) mm: mempool: fix crash in mempool_free() for zero-minimum pools mm: correct type for vmalloc vm_flags fields mm/shmem, swap: fix major fault counting mm/shmem, swap: rework swap entry and index calculation for large swapin mm/shmem, swap: simplify swapin path and result handling mm/shmem, swap: never use swap cache and readahead for SWP_SYNCHRONOUS_IO mm/shmem, swap: tidy up swap entry splitting mm/shmem, swap: tidy up THP swapin checks mm/shmem, swap: avoid redundant Xarray lookup during swapin x86/ftrace: enable EXECMEM_ROX_CACHE for ftrace allocations x86/kprobes: enable EXECMEM_ROX_CACHE for kprobes allocations execmem: drop writable parameter from execmem_fill_trapping_insns() execmem: add fallback for failures in vmalloc(VM_ALLOW_HUGE_VMAP) execmem: move execmem_force_rw() and execmem_restore_rox() before use execmem: rework execmem_cache_free() execmem: introduce execmem_alloc_rw() execmem: drop unused execmem_update_copy() mm: fix a UAF when vma->mm is freed after vma->vm_refcnt got dropped mm/rmap: add anon_vma lifetime debug check mm: remove mm/io-mapping.c ...
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig4
-rw-r--r--mm/Makefile1
-rw-r--r--mm/damon/vaddr.c4
-rw-r--r--mm/execmem.c206
-rw-r--r--mm/internal.h2
-rw-r--r--mm/io-mapping.c30
-rw-r--r--mm/kasan/common.c25
-rw-r--r--mm/khugepaged.c58
-rw-r--r--mm/madvise.c71
-rw-r--r--mm/memory-failure.c12
-rw-r--r--mm/mempool.c24
-rw-r--r--mm/mincore.c3
-rw-r--r--mm/mmap_lock.c10
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/mremap.c4
-rw-r--r--mm/mseal.c166
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/rmap.c2
-rw-r--r--mm/shmem.c279
-rw-r--r--mm/vma.c4
-rw-r--r--mm/vma.h27
21 files changed, 518 insertions, 418 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d5d4eca947a6..e443fe8cd6cf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1242,10 +1242,6 @@ config KMAP_LOCAL
config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY
bool
-# struct io_mapping based helper. Selected by drivers that need them
-config IO_MAPPING
- bool
-
config MEMFD_CREATE
bool "Enable memfd_create() system call" if EXPERT
diff --git a/mm/Makefile b/mm/Makefile
index 1a7a11d4933d..ef54aa615d9d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -141,7 +141,6 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
-obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 94af19c4dfed..87e825349bdf 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -711,6 +711,10 @@ static void damos_va_migrate_dests_add(struct folio *folio,
target -= dests->weight_arr[i];
}
+ /* If the folio is already in the right node, don't do anything */
+ if (folio_nid(folio) == dests->node_id_arr[i])
+ return;
+
isolate:
if (!folio_isolate_lru(folio))
return;
diff --git a/mm/execmem.c b/mm/execmem.c
index 627e6cf64f4f..0822305413ec 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -26,7 +26,7 @@ static struct execmem_info default_execmem_info __ro_after_init;
#ifdef CONFIG_MMU
static void *execmem_vmalloc(struct execmem_range *range, size_t size,
- pgprot_t pgprot, vm_flags_t vm_flags)
+ pgprot_t pgprot, unsigned long vm_flags)
{
bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
@@ -82,7 +82,7 @@ struct vm_struct *execmem_vmap(size_t size)
}
#else
static void *execmem_vmalloc(struct execmem_range *range, size_t size,
- pgprot_t pgprot, vm_flags_t vm_flags)
+ pgprot_t pgprot, unsigned long vm_flags)
{
return vmalloc(size);
}
@@ -93,8 +93,15 @@ struct execmem_cache {
struct mutex mutex;
struct maple_tree busy_areas;
struct maple_tree free_areas;
+ unsigned int pending_free_cnt; /* protected by mutex */
};
+/* delay to schedule asynchronous free if fast path free fails */
+#define FREE_DELAY (msecs_to_jiffies(10))
+
+/* mark entries in busy_areas that should be freed asynchronously */
+#define PENDING_FREE_MASK (1 << (PAGE_SHIFT - 1))
+
static struct execmem_cache execmem_cache = {
.mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
.busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
@@ -130,6 +137,27 @@ err_restore:
return err;
}
+static int execmem_force_rw(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+ int ret;
+
+ ret = set_memory_nx(addr, nr);
+ if (ret)
+ return ret;
+
+ return set_memory_rw(addr, nr);
+}
+
+int execmem_restore_rox(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+
+ return set_memory_rox(addr, nr);
+}
+
static void execmem_cache_clean(struct work_struct *work)
{
struct maple_tree *free_areas = &execmem_cache.free_areas;
@@ -155,20 +183,17 @@ static void execmem_cache_clean(struct work_struct *work)
static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
-static int execmem_cache_add(void *ptr, size_t size)
+static int execmem_cache_add_locked(void *ptr, size_t size, gfp_t gfp_mask)
{
struct maple_tree *free_areas = &execmem_cache.free_areas;
- struct mutex *mutex = &execmem_cache.mutex;
unsigned long addr = (unsigned long)ptr;
MA_STATE(mas, free_areas, addr - 1, addr + 1);
unsigned long lower, upper;
void *area = NULL;
- int err;
lower = addr;
upper = addr + size - 1;
- mutex_lock(mutex);
area = mas_walk(&mas);
if (area && mas.last == addr - 1)
lower = mas.index;
@@ -178,12 +203,14 @@ static int execmem_cache_add(void *ptr, size_t size)
upper = mas.last;
mas_set_range(&mas, lower, upper);
- err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL);
- mutex_unlock(mutex);
- if (err)
- return err;
+ return mas_store_gfp(&mas, (void *)lower, gfp_mask);
+}
- return 0;
+static int execmem_cache_add(void *ptr, size_t size, gfp_t gfp_mask)
+{
+ guard(mutex)(&execmem_cache.mutex);
+
+ return execmem_cache_add_locked(ptr, size, gfp_mask);
}
static bool within_range(struct execmem_range *range, struct ma_state *mas,
@@ -256,7 +283,7 @@ out_unlock:
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
- vm_flags_t vm_flags = VM_ALLOW_HUGE_VMAP;
+ unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
struct vm_struct *vm;
size_t alloc_size;
int err = -ENOMEM;
@@ -264,6 +291,11 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
alloc_size = round_up(size, PMD_SIZE);
p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+ if (!p) {
+ alloc_size = size;
+ p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+ }
+
if (!p)
return err;
@@ -272,13 +304,13 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
goto err_free_mem;
/* fill memory with instructions that will trap */
- execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
+ execmem_fill_trapping_insns(p, alloc_size);
err = set_memory_rox((unsigned long)p, vm->nr_pages);
if (err)
goto err_free_mem;
- err = execmem_cache_add(p, alloc_size);
+ err = execmem_cache_add(p, alloc_size, GFP_KERNEL);
if (err)
goto err_reset_direct_map;
@@ -307,57 +339,117 @@ static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
return __execmem_cache_alloc(range, size);
}
-static bool execmem_cache_free(void *ptr)
+static inline bool is_pending_free(void *ptr)
{
- struct maple_tree *busy_areas = &execmem_cache.busy_areas;
- struct mutex *mutex = &execmem_cache.mutex;
- unsigned long addr = (unsigned long)ptr;
- MA_STATE(mas, busy_areas, addr, addr);
- size_t size;
- void *area;
+ return ((unsigned long)ptr & PENDING_FREE_MASK);
+}
- mutex_lock(mutex);
- area = mas_walk(&mas);
- if (!area) {
- mutex_unlock(mutex);
- return false;
- }
- size = mas_range_len(&mas);
+static inline void *pending_free_set(void *ptr)
+{
+ return (void *)((unsigned long)ptr | PENDING_FREE_MASK);
+}
- mas_store_gfp(&mas, NULL, GFP_KERNEL);
- mutex_unlock(mutex);
+static inline void *pending_free_clear(void *ptr)
+{
+ return (void *)((unsigned long)ptr & ~PENDING_FREE_MASK);
+}
- execmem_fill_trapping_insns(ptr, size, /* writable = */ false);
+static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask)
+{
+ size_t size = mas_range_len(mas);
+ int err;
- execmem_cache_add(ptr, size);
+ err = execmem_force_rw(ptr, size);
+ if (err)
+ return err;
- schedule_work(&execmem_cache_clean_work);
+ execmem_fill_trapping_insns(ptr, size);
+ execmem_restore_rox(ptr, size);
- return true;
+ err = execmem_cache_add_locked(ptr, size, gfp_mask);
+ if (err)
+ return err;
+
+ mas_store_gfp(mas, NULL, gfp_mask);
+ return 0;
}
-int execmem_make_temp_rw(void *ptr, size_t size)
+static void execmem_cache_free_slow(struct work_struct *work);
+static DECLARE_DELAYED_WORK(execmem_cache_free_work, execmem_cache_free_slow);
+
+static void execmem_cache_free_slow(struct work_struct *work)
{
- unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
- unsigned long addr = (unsigned long)ptr;
- int ret;
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+ MA_STATE(mas, busy_areas, 0, ULONG_MAX);
+ void *area;
- ret = set_memory_nx(addr, nr);
- if (ret)
- return ret;
+ guard(mutex)(&execmem_cache.mutex);
- return set_memory_rw(addr, nr);
+ if (!execmem_cache.pending_free_cnt)
+ return;
+
+ mas_for_each(&mas, area, ULONG_MAX) {
+ if (!is_pending_free(area))
+ continue;
+
+ area = pending_free_clear(area);
+ if (__execmem_cache_free(&mas, area, GFP_KERNEL))
+ continue;
+
+ execmem_cache.pending_free_cnt--;
+ }
+
+ if (execmem_cache.pending_free_cnt)
+ schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
+ else
+ schedule_work(&execmem_cache_clean_work);
}
-int execmem_restore_rox(void *ptr, size_t size)
+static bool execmem_cache_free(void *ptr)
{
- unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
unsigned long addr = (unsigned long)ptr;
+ MA_STATE(mas, busy_areas, addr, addr);
+ void *area;
+ int err;
- return set_memory_rox(addr, nr);
+ guard(mutex)(&execmem_cache.mutex);
+
+ area = mas_walk(&mas);
+ if (!area)
+ return false;
+
+ err = __execmem_cache_free(&mas, area, GFP_KERNEL | __GFP_NORETRY);
+ if (err) {
+ /*
+ * mas points to exact slot we've got the area from, nothing
+ * else can modify the tree because of the mutex, so there
+ * won't be any allocations in mas_store_gfp() and it will just
+ * change the pointer.
+ */
+ area = pending_free_set(area);
+ mas_store_gfp(&mas, area, GFP_KERNEL);
+ execmem_cache.pending_free_cnt++;
+ schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
+ return true;
+ }
+
+ schedule_work(&execmem_cache_clean_work);
+
+ return true;
}
#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
+/*
+ * when ROX cache is not used the permissions defined by architectures for
+ * execmem ranges that are updated before use (e.g. EXECMEM_MODULE_TEXT) must
+ * be writable anyway
+ */
+static inline int execmem_force_rw(void *ptr, size_t size)
+{
+ return 0;
+}
+
static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
return NULL;
@@ -373,9 +465,9 @@ void *execmem_alloc(enum execmem_type type, size_t size)
{
struct execmem_range *range = &execmem_info->ranges[type];
bool use_cache = range->flags & EXECMEM_ROX_CACHE;
- vm_flags_t vm_flags = VM_FLUSH_RESET_PERMS;
+ unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
pgprot_t pgprot = range->pgprot;
- void *p;
+ void *p = NULL;
size = PAGE_ALIGN(size);
@@ -387,6 +479,21 @@ void *execmem_alloc(enum execmem_type type, size_t size)
return kasan_reset_tag(p);
}
+void *execmem_alloc_rw(enum execmem_type type, size_t size)
+{
+ void *p __free(execmem) = execmem_alloc(type, size);
+ int err;
+
+ if (!p)
+ return NULL;
+
+ err = execmem_force_rw(p, size);
+ if (err)
+ return NULL;
+
+ return no_free_ptr(p);
+}
+
void execmem_free(void *ptr)
{
/*
@@ -399,11 +506,6 @@ void execmem_free(void *ptr)
vfree(ptr);
}
-void *execmem_update_copy(void *dst, const void *src, size_t size)
-{
- return text_poke_copy(dst, src, size);
-}
-
bool execmem_is_rox(enum execmem_type type)
{
return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE);
diff --git a/mm/internal.h b/mm/internal.h
index 1da16d550a45..45b725c3dc03 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1391,7 +1391,7 @@ int migrate_device_coherent_folio(struct folio *folio);
struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long shift,
- vm_flags_t vm_flags, unsigned long start,
+ unsigned long vm_flags, unsigned long start,
unsigned long end, int node, gfp_t gfp_mask,
const void *caller);
diff --git a/mm/io-mapping.c b/mm/io-mapping.c
deleted file mode 100644
index d3586e95c12c..000000000000
--- a/mm/io-mapping.c
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/mm.h>
-#include <linux/io-mapping.h>
-
-/**
- * io_mapping_map_user - remap an I/O mapping to userspace
- * @iomap: the source io_mapping
- * @vma: user vma to map to
- * @addr: target user address to start at
- * @pfn: physical address of kernel memory
- * @size: size of map area
- *
- * Note: this is only safe if the mm semaphore is held when called.
- */
-int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
- unsigned long addr, unsigned long pfn, unsigned long size)
-{
- vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
-
- if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
- return -EINVAL;
-
- pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
- (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK));
-
- /* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */
- return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot);
-}
-EXPORT_SYMBOL_GPL(io_mapping_map_user);
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index ed4873e18c75..9142964ab9c9 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -230,16 +230,12 @@ static bool check_slab_allocation(struct kmem_cache *cache, void *object,
}
static inline void poison_slab_object(struct kmem_cache *cache, void *object,
- bool init, bool still_accessible)
+ bool init)
{
void *tagged_object = object;
object = kasan_reset_tag(object);
- /* RCU slabs could be legally used after free within the RCU period. */
- if (unlikely(still_accessible))
- return;
-
kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
KASAN_SLAB_FREE, init);
@@ -261,7 +257,22 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
if (!kasan_arch_is_ready() || is_kfence_address(object))
return false;
- poison_slab_object(cache, object, init, still_accessible);
+ /*
+ * If this point is reached with an object that must still be
+ * accessible under RCU, we can't poison it; in that case, also skip the
+ * quarantine. This should mostly only happen when CONFIG_SLUB_RCU_DEBUG
+ * has been disabled manually.
+ *
+ * Putting the object on the quarantine wouldn't help catch UAFs (since
+ * we can't poison it here), and it would mask bugs caused by
+ * SLAB_TYPESAFE_BY_RCU users not being careful enough about object
+ * reuse; so overall, putting the object into the quarantine here would
+ * be counterproductive.
+ */
+ if (still_accessible)
+ return false;
+
+ poison_slab_object(cache, object, init);
/*
* If the object is put into quarantine, do not let slab put the object
@@ -519,7 +530,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
if (check_slab_allocation(slab->slab_cache, ptr, ip))
return false;
- poison_slab_object(slab->slab_cache, ptr, false, false);
+ poison_slab_object(slab->slab_cache, ptr, false);
return true;
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a55fb1dcd224..374a6a5193a7 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -700,12 +700,15 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
spinlock_t *ptl,
struct list_head *compound_pagelist)
{
+ unsigned long end = address + HPAGE_PMD_SIZE;
struct folio *src, *tmp;
- pte_t *_pte;
pte_t pteval;
+ pte_t *_pte;
+ unsigned int nr_ptes;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, address += PAGE_SIZE) {
+ for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
+ address += nr_ptes * PAGE_SIZE) {
+ nr_ptes = 1;
pteval = ptep_get(_pte);
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
@@ -722,18 +725,26 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
struct page *src_page = pte_page(pteval);
src = page_folio(src_page);
- if (!folio_test_large(src))
+
+ if (folio_test_large(src)) {
+ unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT;
+
+ nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes);
+ } else {
release_pte_folio(src);
+ }
+
/*
* ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats
* inside folio_remove_rmap_pte().
*/
spin_lock(ptl);
- ptep_clear(vma->vm_mm, address, _pte);
- folio_remove_rmap_pte(src, src_page, vma);
+ clear_ptes(vma->vm_mm, address, _pte, nr_ptes);
+ folio_remove_rmap_ptes(src, src_page, nr_ptes, vma);
spin_unlock(ptl);
- free_folio_and_swap_cache(src);
+ free_swap_cache(src);
+ folio_put_refs(src, nr_ptes);
}
}
@@ -1492,15 +1503,17 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool install_pmd)
{
+ int nr_mapped_ptes = 0, result = SCAN_FAIL;
+ unsigned int nr_batch_ptes;
struct mmu_notifier_range range;
bool notified = false;
unsigned long haddr = addr & HPAGE_PMD_MASK;
+ unsigned long end = haddr + HPAGE_PMD_SIZE;
struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct folio *folio;
pte_t *start_pte, *pte;
pmd_t *pmd, pgt_pmd;
spinlock_t *pml = NULL, *ptl;
- int nr_ptes = 0, result = SCAN_FAIL;
int i;
mmap_assert_locked(mm);
@@ -1614,11 +1627,15 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort;
/* step 2: clear page table and adjust rmap */
- for (i = 0, addr = haddr, pte = start_pte;
- i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR;
+ i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE,
+ pte += nr_batch_ptes) {
+ unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT;
struct page *page;
pte_t ptent = ptep_get(pte);
+ nr_batch_ptes = 1;
+
if (pte_none(ptent))
continue;
/*
@@ -1632,26 +1649,29 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort;
}
page = vm_normal_page(vma, addr, ptent);
+
if (folio_page(folio, i) != page)
goto abort;
+ nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes);
+
/*
* Must clear entry, or a racing truncate may re-remove it.
* TLB flush can be left until pmdp_collapse_flush() does it.
* PTE dirty? Shmem page is already dirty; file is read-only.
*/
- ptep_clear(mm, addr, pte);
- folio_remove_rmap_pte(folio, page, vma);
- nr_ptes++;
+ clear_ptes(mm, addr, pte, nr_batch_ptes);
+ folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma);
+ nr_mapped_ptes += nr_batch_ptes;
}
if (!pml)
spin_unlock(ptl);
/* step 3: set proper refcount and mm_counters. */
- if (nr_ptes) {
- folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
+ if (nr_mapped_ptes) {
+ folio_ref_sub(folio, nr_mapped_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
}
/* step 4: remove empty page table */
@@ -1684,10 +1704,10 @@ maybe_install_pmd:
: SCAN_SUCCEED;
goto drop_folio;
abort:
- if (nr_ptes) {
+ if (nr_mapped_ptes) {
flush_tlb_mm(mm);
- folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
+ folio_ref_sub(folio, nr_mapped_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
}
unlock:
if (start_pte)
diff --git a/mm/madvise.c b/mm/madvise.c
index bb80fc5ea08f..35ed4ab0d7c5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -19,6 +19,7 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
+#include <linux/mmu_context.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
@@ -1256,6 +1257,74 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
&guard_remove_walk_ops, NULL);
}
+#ifdef CONFIG_64BIT
+/* Does the madvise operation result in discarding of mapped data? */
+static bool is_discard(int behavior)
+{
+ switch (behavior) {
+ case MADV_FREE:
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ case MADV_REMOVE:
+ case MADV_DONTFORK:
+ case MADV_WIPEONFORK:
+ case MADV_GUARD_INSTALL:
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We are restricted from madvise()'ing mseal()'d VMAs only in very particular
+ * circumstances - discarding of data from read-only anonymous SEALED mappings.
+ *
+ * This is because users cannot trivially discard data from these VMAs, and may
+ * only do so via an appropriate madvise() call.
+ */
+static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
+{
+ struct vm_area_struct *vma = madv_behavior->vma;
+
+ /* If the VMA isn't sealed we're good. */
+ if (!vma_is_sealed(vma))
+ return true;
+
+ /* For a sealed VMA, we only care about discard operations. */
+ if (!is_discard(madv_behavior->behavior))
+ return true;
+
+ /*
+ * We explicitly permit all file-backed mappings, whether MAP_SHARED or
+ * MAP_PRIVATE.
+ *
+ * The latter causes some complications. Because now, one can mmap()
+ * read/write a MAP_PRIVATE mapping, write to it, then mprotect()
+ * read-only, mseal() and a discard will be permitted.
+ *
+ * However, in order to avoid issues with potential use of madvise(...,
+ * MADV_DONTNEED) of mseal()'d .text mappings we, for the time being,
+ * permit this.
+ */
+ if (!vma_is_anonymous(vma))
+ return true;
+
+ /* If the user could write to the mapping anyway, then this is fine. */
+ if ((vma->vm_flags & VM_WRITE) &&
+ arch_vma_access_permitted(vma, /* write= */ true,
+ /* execute= */ false, /* foreign= */ false))
+ return true;
+
+ /* Otherwise, we are not permitted to perform this operation. */
+ return false;
+}
+#else
+static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
+{
+ return true;
+}
+#endif
+
/*
* Apply an madvise behavior to a region of a vma. madvise_update_vma
* will handle splitting a vm area into separate areas, each area with its own
@@ -1269,7 +1338,7 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
struct madvise_behavior_range *range = &madv_behavior->range;
int error;
- if (unlikely(!can_modify_vma_madv(madv_behavior->vma, behavior)))
+ if (unlikely(!can_madvise_modify(madv_behavior)))
return -EPERM;
switch (behavior) {
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3047b9ac667e..e2e685b971bb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -837,11 +837,17 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
struct mm_walk *walk)
{
struct hwpoison_walk *hwp = walk->private;
- pte_t pte = huge_ptep_get(walk->mm, addr, ptep);
struct hstate *h = hstate_vma(walk->vma);
+ spinlock_t *ptl;
+ pte_t pte;
+ int ret;
- return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
- hwp->pfn, &hwp->tk);
+ ptl = huge_pte_lock(h, walk->mm, ptep);
+ pte = huge_ptep_get(walk->mm, addr, ptep);
+ ret = check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
+ hwp->pfn, &hwp->tk);
+ spin_unlock(ptl);
+ return ret;
}
#else
#define hwpoison_hugetlb_range NULL
diff --git a/mm/mempool.c b/mm/mempool.c
index 204a216b6418..1c38e873e546 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -136,7 +136,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
static __always_inline void add_element(mempool_t *pool, void *element)
{
- BUG_ON(pool->curr_nr >= pool->min_nr);
+ BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr);
poison_element(pool, element);
if (kasan_poison_element(pool, element))
pool->elements[pool->curr_nr++] = element;
@@ -202,16 +202,20 @@ int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
pool->alloc = alloc_fn;
pool->free = free_fn;
init_waitqueue_head(&pool->wait);
-
- pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
+ /*
+ * max() used here to ensure storage for at least 1 element to support
+ * zero minimum pool
+ */
+ pool->elements = kmalloc_array_node(max(1, min_nr), sizeof(void *),
gfp_mask, node_id);
if (!pool->elements)
return -ENOMEM;
/*
- * First pre-allocate the guaranteed number of buffers.
+ * First pre-allocate the guaranteed number of buffers,
+ * also pre-allocate 1 element for zero minimum pool.
*/
- while (pool->curr_nr < pool->min_nr) {
+ while (pool->curr_nr < max(1, pool->min_nr)) {
void *element;
element = pool->alloc(gfp_mask, pool->pool_data);
@@ -555,20 +559,12 @@ void mempool_free(void *element, mempool_t *pool)
* wake-up path of previous test. This explicit check ensures the
* allocation of element when both min_nr and curr_nr are 0, and
* any active waiters are properly awakened.
- *
- * Inline the same logic as previous test, add_element() cannot be
- * directly used here since it has BUG_ON to deny if min_nr equals
- * curr_nr, so here picked rest of add_element() to use without
- * BUG_ON check.
*/
if (unlikely(pool->min_nr == 0 &&
READ_ONCE(pool->curr_nr) == 0)) {
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr == 0)) {
- /* Inline the logic of add_element() */
- poison_element(pool, element);
- if (kasan_poison_element(pool, element))
- pool->elements[pool->curr_nr++] = element;
+ add_element(pool, element);
spin_unlock_irqrestore(&pool->lock, flags);
if (wq_has_sleeper(&pool->wait))
wake_up(&pool->wait);
diff --git a/mm/mincore.c b/mm/mincore.c
index 42d6c9c8da86..10dabefc3acc 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -29,7 +29,9 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
#ifdef CONFIG_HUGETLB_PAGE
unsigned char present;
unsigned char *vec = walk->private;
+ spinlock_t *ptl;
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
/*
* Hugepages under user process are always in RAM and never
* swapped out, but theoretically it needs to be checked.
@@ -38,6 +40,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
for (; addr != end; vec++, addr += PAGE_SIZE)
*vec = present;
walk->private = vec;
+ spin_unlock(ptl);
#else
BUG();
#endif
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 729fb7d0dd59..b006cec8e6fe 100644
--- a/mm/mmap_lock.c
+++ b/