author    Linus Torvalds <torvalds@linux-foundation.org>  2023-04-19 17:55:45 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2023-04-19 17:55:45 -0700
commit    cb0856346a60fe3eb837ba5e73588a41f81ac05f (patch)
tree      0445b08a97f1a3d9791095cfad1e021ba6b228bc
parent    23990b1affd2dc8f5e59048d4d4bef05f6e1c544 (diff)
parent    ef832747a82dfbc22a3702219cc716f449b24e4a (diff)
Merge tag 'mm-hotfixes-stable-2023-04-19-16-36' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull misc fixes from Andrew Morton:
 "22 hotfixes. 19 are cc:stable and the remainder address issues which
  were introduced during this merge cycle, or aren't considered suitable
  for -stable backporting. 19 are for MM and the remainder are for other
  subsystems"

* tag 'mm-hotfixes-stable-2023-04-19-16-36' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (22 commits)
  nilfs2: initialize unused bytes in segment summary blocks
  mm: page_alloc: skip regions with hugetlbfs pages when allocating 1G pages
  mm/mmap: regression fix for unmapped_area{_topdown}
  maple_tree: fix mas_empty_area() search
  maple_tree: make maple state reusable after mas_empty_area_rev()
  mm: kmsan: handle alloc failures in kmsan_ioremap_page_range()
  mm: kmsan: handle alloc failures in kmsan_vmap_pages_range_noflush()
  tools/Makefile: do missed s/vm/mm/
  mm: fix memory leak on mm_init error handling
  mm/page_alloc: fix potential deadlock on zonelist_update_seq seqlock
  kernel/sys.c: fix and improve control flow in __sys_setres[ug]id()
  Revert "userfaultfd: don't fail on unrecognized features"
  writeback, cgroup: fix null-ptr-deref write in bdi_split_work_to_wbs
  maple_tree: fix a potential memory leak, OOB access, or other unpredictable bug
  tools/mm/page_owner_sort.c: fix TGID output when cull=tg is used
  mailmap: update jtoppins' entry to reference correct email
  mm/mempolicy: fix use-after-free of VMA iterator
  mm/huge_memory.c: warn with pr_warn_ratelimited instead of VM_WARN_ON_ONCE_FOLIO
  mm/mprotect: fix do_mprotect_pkey() return on error
  mm/khugepaged: check again on anon uffd-wp during isolation
  ...
-rw-r--r--  .mailmap                    |   2
-rw-r--r--  fs/fs-writeback.c           |  17
-rw-r--r--  fs/nilfs2/segment.c         |  20
-rw-r--r--  fs/userfaultfd.c            |   6
-rw-r--r--  include/linux/kmsan.h       |  39
-rw-r--r--  kernel/fork.c               |   1
-rw-r--r--  kernel/sys.c                |  69
-rw-r--r--  lib/maple_tree.c            |  66
-rw-r--r--  mm/backing-dev.c            |  12
-rw-r--r--  mm/huge_memory.c            |  19
-rw-r--r--  mm/khugepaged.c             |   4
-rw-r--r--  mm/kmsan/hooks.c            |  55
-rw-r--r--  mm/kmsan/shadow.c           |  27
-rw-r--r--  mm/mempolicy.c              | 104
-rw-r--r--  mm/mmap.c                   |  48
-rw-r--r--  mm/mprotect.c               |   2
-rw-r--r--  mm/page_alloc.c             |  19
-rw-r--r--  mm/swap.c                   |   2
-rw-r--r--  mm/vmalloc.c                |  10
-rw-r--r--  tools/Makefile              |  14
-rw-r--r--  tools/mm/page_owner_sort.c  |   2
21 files changed, 351 insertions, 187 deletions
diff --git a/.mailmap b/.mailmap
index e42486317d18..86d0760c15eb 100644
--- a/.mailmap
+++ b/.mailmap
@@ -232,6 +232,8 @@ Johan Hovold <johan@kernel.org> <johan@hovoldconsulting.com>
John Crispin <john@phrozen.org> <blogic@openwrt.org>
John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
John Stultz <johnstul@us.ibm.com>
+<jon.toppins+linux@gmail.com> <jtoppins@cumulusnetworks.com>
+<jon.toppins+linux@gmail.com> <jtoppins@redhat.com>
Jordan Crouse <jordan@cosmicpenguin.net> <jcrouse@codeaurora.org>
<josh@joshtriplett.org> <josh@freedesktop.org>
<josh@joshtriplett.org> <josh@kernel.org>
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 195dc23e0d83..1db3e3c24b43 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -978,6 +978,16 @@ restart:
continue;
}
+ /*
+ * If wb_tryget fails, the wb has been shutdown, skip it.
+ *
+ * Pin @wb so that it stays on @bdi->wb_list. This allows
+ * continuing iteration from @wb after dropping and
+ * regrabbing rcu read lock.
+ */
+ if (!wb_tryget(wb))
+ continue;
+
/* alloc failed, execute synchronously using on-stack fallback */
work = &fallback_work;
*work = *base_work;
@@ -986,13 +996,6 @@ restart:
work->done = &fallback_work_done;
wb_queue_work(wb, work);
-
- /*
- * Pin @wb so that it stays on @bdi->wb_list. This allows
- * continuing iteration from @wb after dropping and
- * regrabbing rcu read lock.
- */
- wb_get(wb);
last_wb = wb;
rcu_read_unlock();
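Note: the hunk above is the bdi_split_work_to_wbs null-ptr-deref fix. The writeback structure is now pinned with wb_tryget() before the fallback work is queued, and wbs whose refcount has already dropped to zero are simply skipped. A minimal sketch of the same tryget-or-skip idiom, with hypothetical names (struct item, item_tryget(), item_put(), process()) standing in for the writeback types:

/*
 * Sketch only, not part of the patch: take the reference while still under
 * rcu_read_lock(); a failed tryget means the object is already being torn
 * down, so it must be skipped rather than used.
 */
struct item *found = NULL;

rcu_read_lock();
list_for_each_entry_rcu(item, &item_list, node) {
	if (!item_tryget(item))		/* already shutting down */
		continue;
	found = item;			/* pinned, safe to use after unlock */
	break;
}
rcu_read_unlock();

if (found) {
	process(found);			/* may sleep now */
	item_put(found);
}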
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 6ad41390fa74..228659612c0d 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -430,6 +430,23 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
return 0;
}
+/**
+ * nilfs_segctor_zeropad_segsum - zero pad the rest of the segment summary area
+ * @sci: segment constructor object
+ *
+ * nilfs_segctor_zeropad_segsum() zero-fills unallocated space at the end of
+ * the current segment summary block.
+ */
+static void nilfs_segctor_zeropad_segsum(struct nilfs_sc_info *sci)
+{
+ struct nilfs_segsum_pointer *ssp;
+
+ ssp = sci->sc_blk_cnt > 0 ? &sci->sc_binfo_ptr : &sci->sc_finfo_ptr;
+ if (ssp->offset < ssp->bh->b_size)
+ memset(ssp->bh->b_data + ssp->offset, 0,
+ ssp->bh->b_size - ssp->offset);
+}
+
static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
{
sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
@@ -438,6 +455,7 @@ static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
* The current segment is filled up
* (internal code)
*/
+ nilfs_segctor_zeropad_segsum(sci);
sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg);
return nilfs_segctor_reset_segment_buffer(sci);
}
@@ -542,6 +560,7 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
goto retry;
}
if (unlikely(required)) {
+ nilfs_segctor_zeropad_segsum(sci);
err = nilfs_segbuf_extend_segsum(segbuf);
if (unlikely(err))
goto failed;
@@ -1533,6 +1552,7 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
sci->sc_stage = prev_stage;
}
+ nilfs_segctor_zeropad_segsum(sci);
nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile);
return 0;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 44d1ee429eb0..40f9e1a2ebdd 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1955,8 +1955,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = -EFAULT;
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out;
- /* Ignore unsupported features (userspace built against newer kernel) */
- features = uffdio_api.features & UFFD_API_FEATURES;
+ features = uffdio_api.features;
+ ret = -EINVAL;
+ if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
+ goto err_out;
ret = -EPERM;
if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
goto err_out;
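Note: with the revert above, UFFDIO_API once again fails with EINVAL when userspace requests feature bits the running kernel does not recognize, instead of silently masking them. A minimal userspace probe, not part of the patch, that requests no optional features and reads back what the kernel supports:

/*
 * Userspace sketch: open a userfaultfd and do the UFFDIO_API handshake
 * with features = 0, then inspect the feature bits this kernel reports
 * before requesting any optional ones.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = { .api = UFFD_API, .features = 0 };

	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) < 0) {
		/* EINVAL now also covers unknown feature bits */
		perror("UFFDIO_API");
		return 1;
	}
	printf("supported features: 0x%llx\n",
	       (unsigned long long)api.features);
	return 0;
}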
diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h
index e38ae3c34618..30b17647ce3c 100644
--- a/include/linux/kmsan.h
+++ b/include/linux/kmsan.h
@@ -134,11 +134,12 @@ void kmsan_kfree_large(const void *ptr);
* @page_shift: page_shift passed to vmap_range_noflush().
*
* KMSAN maps shadow and origin pages of @pages into contiguous ranges in
- * vmalloc metadata address range.
+ * vmalloc metadata address range. Returns 0 on success, callers must check
+ * for non-zero return value.
*/
-void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
- pgprot_t prot, struct page **pages,
- unsigned int page_shift);
+int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
+ pgprot_t prot, struct page **pages,
+ unsigned int page_shift);
/**
* kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap.
@@ -159,11 +160,12 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end);
* @page_shift: page_shift argument passed to vmap_range_noflush().
*
* KMSAN creates new metadata pages for the physical pages mapped into the
- * virtual memory.
+ * virtual memory. Returns 0 on success, callers must check for non-zero return
+ * value.
*/
-void kmsan_ioremap_page_range(unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot,
- unsigned int page_shift);
+int kmsan_ioremap_page_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int page_shift);
/**
* kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call.
@@ -281,12 +283,13 @@ static inline void kmsan_kfree_large(const void *ptr)
{
}
-static inline void kmsan_vmap_pages_range_noflush(unsigned long start,
- unsigned long end,
- pgprot_t prot,
- struct page **pages,
- unsigned int page_shift)
+static inline int kmsan_vmap_pages_range_noflush(unsigned long start,
+ unsigned long end,
+ pgprot_t prot,
+ struct page **pages,
+ unsigned int page_shift)
{
+ return 0;
}
static inline void kmsan_vunmap_range_noflush(unsigned long start,
@@ -294,12 +297,12 @@ static inline void kmsan_vunmap_range_noflush(unsigned long start,
{
}
-static inline void kmsan_ioremap_page_range(unsigned long start,
- unsigned long end,
- phys_addr_t phys_addr,
- pgprot_t prot,
- unsigned int page_shift)
+static inline int kmsan_ioremap_page_range(unsigned long start,
+ unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int page_shift)
{
+ return 0;
}
static inline void kmsan_iounmap_page_range(unsigned long start,
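Note: both hooks now report metadata allocation failures, so their callers in mm/vmalloc.c (those hunks are part of this series but not shown here) have to propagate the error instead of assuming success. A hypothetical caller-side check might look like:

	/* Hypothetical caller sketch, not the actual mm/vmalloc.c hunk. */
	int ret;

	ret = kmsan_vmap_pages_range_noflush(start, end, prot, pages, page_shift);
	if (ret)
		return ret;	/* typically -ENOMEM when shadow/origin allocation fails */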
diff --git a/kernel/fork.c b/kernel/fork.c
index 0c92f224c68c..ea332319dffe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1174,6 +1174,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
fail_pcpu:
while (i > 0)
percpu_counter_destroy(&mm->rss_stat[--i]);
+ destroy_context(mm);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
diff --git a/kernel/sys.c b/kernel/sys.c
index 495cd87d9bf4..351de7916302 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -664,6 +664,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
struct cred *new;
int retval;
kuid_t kruid, keuid, ksuid;
+ bool ruid_new, euid_new, suid_new;
kruid = make_kuid(ns, ruid);
keuid = make_kuid(ns, euid);
@@ -678,25 +679,29 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
if ((suid != (uid_t) -1) && !uid_valid(ksuid))
return -EINVAL;
+ old = current_cred();
+
+ /* check for no-op */
+ if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) &&
+ (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) &&
+ uid_eq(keuid, old->fsuid))) &&
+ (suid == (uid_t) -1 || uid_eq(ksuid, old->suid)))
+ return 0;
+
+ ruid_new = ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
+ !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid);
+ euid_new = euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
+ !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid);
+ suid_new = suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
+ !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid);
+ if ((ruid_new || euid_new || suid_new) &&
+ !ns_capable_setid(old->user_ns, CAP_SETUID))
+ return -EPERM;
+
new = prepare_creds();
if (!new)
return -ENOMEM;
- old = current_cred();
-
- retval = -EPERM;
- if (!ns_capable_setid(old->user_ns, CAP_SETUID)) {
- if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
- !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
- goto error;
- if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
- !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
- goto error;
- if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
- !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
- goto error;
- }
-
if (ruid != (uid_t) -1) {
new->uid = kruid;
if (!uid_eq(kruid, old->uid)) {
@@ -761,6 +766,7 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
struct cred *new;
int retval;
kgid_t krgid, kegid, ksgid;
+ bool rgid_new, egid_new, sgid_new;
krgid = make_kgid(ns, rgid);
kegid = make_kgid(ns, egid);
@@ -773,23 +779,28 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
return -EINVAL;
+ old = current_cred();
+
+ /* check for no-op */
+ if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) &&
+ (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) &&
+ gid_eq(kegid, old->fsgid))) &&
+ (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid)))
+ return 0;
+
+ rgid_new = rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
+ !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid);
+ egid_new = egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
+ !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid);
+ sgid_new = sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
+ !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid);
+ if ((rgid_new || egid_new || sgid_new) &&
+ !ns_capable_setid(old->user_ns, CAP_SETGID))
+ return -EPERM;
+
new = prepare_creds();
if (!new)
return -ENOMEM;
- old = current_cred();
-
- retval = -EPERM;
- if (!ns_capable_setid(old->user_ns, CAP_SETGID)) {
- if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
- !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
- goto error;
- if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
- !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
- goto error;
- if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
- !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
- goto error;
- }
if (rgid != (gid_t) -1)
new->gid = krgid;
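Note: the reordering above validates the requested ids, detects pure no-op calls, and performs the CAP_SETUID/CAP_SETGID check before prepare_creds(), so redundant or denied requests no longer allocate a credential structure. A small userspace illustration of the two fast paths, using the glibc wrappers and not part of the patch:

/*
 * Illustration only: setresuid(-1, -1, -1) is a no-op and succeeds
 * immediately, while an unprivileged attempt to take a new uid fails
 * with EPERM before any credentials are prepared.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	if (setresuid(-1, -1, -1) == 0)
		puts("no-op setresuid: ok");

	if (setresuid(0, 0, 0) < 0 && errno == EPERM)
		puts("unprivileged uid change: EPERM");
	return 0;
}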
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index db60edb55f2f..1281a40d5735 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -1303,26 +1303,21 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
node = mas->alloc;
node->request_count = 0;
while (requested) {
- max_req = MAPLE_ALLOC_SLOTS;
- if (node->node_count) {
- unsigned int offset = node->node_count;
-
- slots = (void **)&node->slot[offset];
- max_req -= offset;
- } else {
- slots = (void **)&node->slot;
- }
-
+ max_req = MAPLE_ALLOC_SLOTS - node->node_count;
+ slots = (void **)&node->slot[node->node_count];
max_req = min(requested, max_req);
count = mt_alloc_bulk(gfp, max_req, slots);
if (!count)
goto nomem_bulk;
+ if (node->node_count == 0) {
+ node->slot[0]->node_count = 0;
+ node->slot[0]->request_count = 0;
+ }
+
node->node_count += count;
allocated += count;
node = node->slot[0];
- node->node_count = 0;
- node->request_count = 0;
requested -= count;
}
mas->alloc->total = allocated;
@@ -4970,7 +4965,8 @@ not_found:
* Return: True if found in a leaf, false otherwise.
*
*/
-static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
+static bool mas_rev_awalk(struct ma_state *mas, unsigned long size,
+ unsigned long *gap_min, unsigned long *gap_max)
{
enum maple_type type = mte_node_type(mas->node);
struct maple_node *node = mas_mn(mas);
@@ -5035,8 +5031,8 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
if (unlikely(ma_is_leaf(type))) {
mas->offset = offset;
- mas->min = min;
- mas->max = min + gap - 1;
+ *gap_min = min;
+ *gap_max = min + gap - 1;
return true;
}
@@ -5060,10 +5056,10 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size)
{
enum maple_type type = mte_node_type(mas->node);
unsigned long pivot, min, gap = 0;
- unsigned char offset;
- unsigned long *gaps;
- unsigned long *pivots = ma_pivots(mas_mn(mas), type);
- void __rcu **slots = ma_slots(mas_mn(mas), type);
+ unsigned char offset, data_end;
+ unsigned long *gaps, *pivots;
+ void __rcu **slots;
+ struct maple_node *node;
bool found = false;
if (ma_is_dense(type)) {
@@ -5071,13 +5067,15 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size)
return true;
}
- gaps = ma_gaps(mte_to_node(mas->node), type);
+ node = mas_mn(mas);
+ pivots = ma_pivots(node, type);
+ slots = ma_slots(node, type);
+ gaps = ma_gaps(node, type);
offset = mas->offset;
min = mas_safe_min(mas, pivots, offset);
- for (; offset < mt_slots[type]; offset++) {
- pivot = mas_safe_pivot(mas, pivots, offset, type);
- if (offset && !pivot)
- break;
+ data_end = ma_data_end(node, type, pivots, mas->max);
+ for (; offset <= data_end; offset++) {
+ pivot = mas_logical_pivot(mas, pivots, offset, type);
/* Not within lower bounds */
if (mas->index > pivot)
@@ -5312,6 +5310,9 @@ int mas_empty_area(struct ma_state *mas, unsigned long min,
unsigned long *pivots;
enum maple_type mt;
+ if (min >= max)
+ return -EINVAL;
+
if (mas_is_start(mas))
mas_start(mas);
else if (mas->offset >= 2)
@@ -5366,6 +5367,9 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
{
struct maple_enode *last = mas->node;
+ if (min >= max)
+ return -EINVAL;
+
if (mas_is_start(mas)) {
mas_start(mas);
mas->offset = mas_data_end(mas);
@@ -5385,7 +5389,7 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
mas->index = min;
mas->last = max;
- while (!mas_rev_awalk(mas, size)) {
+ while (!mas_rev_awalk(mas, size, &min, &max)) {
if (last == mas->node) {
if (!mas_rewind_node(mas))
return -EBUSY;
@@ -5400,17 +5404,9 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
if (unlikely(mas->offset == MAPLE_NODE_SLOTS))
return -EBUSY;
- /*
- * mas_rev_awalk() has set mas->min and mas->max to the gap values. If
- * the maximum is outside the window we are searching, then use the last
- * location in the search.
- * mas->max and mas->min is the range of the gap.
- * mas->index and mas->last are currently set to the search range.
- */
-
/* Trim the upper limit to the max. */
- if (mas->max <= mas->last)
- mas->last = mas->max;
+ if (max <= mas->last)
+ mas->last = max;
mas->index = mas->last - size + 1;
return 0;
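Note: two behavioural points in the maple_tree hunks above: an empty or inverted window (min >= max) is now rejected with -EINVAL up front, and mas_rev_awalk() reports the gap through dedicated min/max locals instead of overwriting mas->min and mas->max, so the maple state stays usable after mas_empty_area_rev(). A hedged usage sketch, loosely modelled on the mmap gap search (low_limit, high_limit and length are hypothetical caller values):

	/* Sketch only: reverse search for a free range of @length units. */
	MA_STATE(mas, &mm->mm_mt, 0, 0);

	if (mas_empty_area_rev(&mas, low_limit, high_limit - 1, length))
		return -ENOMEM;		/* no gap, or an invalid window (-EINVAL) */

	/*
	 * mas.index .. mas.last now spans the highest free range of @length
	 * units inside the window, and @mas can keep being used afterwards.
	 */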
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index a53b9360b72e..30d2d0386fdb 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -507,6 +507,15 @@ static LIST_HEAD(offline_cgwbs);
static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);
+static void cgwb_free_rcu(struct rcu_head *rcu_head)
+{
+ struct bdi_writeback *wb = container_of(rcu_head,
+ struct bdi_writeback, rcu);
+
+ percpu_ref_exit(&wb->refcnt);
+ kfree(wb);
+}
+
static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
@@ -529,11 +538,10 @@ static void cgwb_release_workfn(struct work_struct *work)
list_del(&wb->offline_node);
spin_unlock_irq(&cgwb_lock);
- percpu_ref_exit(&wb->refcnt);
wb_exit(wb);
bdi_put(bdi);
WARN_ON_ONCE(!list_empty(&wb->b_attached));
- kfree_rcu(wb, rcu);
+ call_rcu(&wb->rcu, cgwb_free_rcu);
}
static void cgwb_release(struct percpu_ref *refcnt)
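Note: the hunk above replaces kfree_rcu() with call_rcu() and a dedicated callback because percpu_ref_exit() must also run after the RCU grace period, not just the final kfree(); RCU-protected walkers may still call wb_tryget() on the wb until the grace period ends. A hedged sketch of the general pattern, with a hypothetical struct foo:

/*
 * Sketch only: defer teardown work, not just freeing, past an RCU grace
 * period. kfree_rcu() can only free the object, while call_rcu() lets a
 * custom callback release other resources first.
 */
struct foo {
	struct percpu_ref ref;
	struct rcu_head rcu;
};

static void foo_free_rcu(struct rcu_head *head)
{
	struct foo *f = container_of(head, struct foo, rcu);

	percpu_ref_exit(&f->ref);	/* release the percpu counters ... */
	kfree(f);			/* ... and only then free the object */
}

	/* instead of kfree_rcu(f, rcu): */
	call_rcu(&f->rcu, foo_free_rcu);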
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 032fb0ef9cd1..3fae2d2496ab 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1838,10 +1838,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (is_swap_pmd(*pmd)) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
struct page *page = pfn_swap_entry_to_page(entry);
+ pmd_t newpmd;
VM_BUG_ON(!is_pmd_migration_entry(*pmd));
if (is_writable_migration_entry(entry)) {
- pmd_t newpmd;
/*
* A protection check is difficult so
* just be safe and disable write
@@ -1855,8 +1855,16 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
newpmd = pmd_swp_mksoft_dirty(newpmd);
if (pmd_swp_uffd_wp(*pmd))
newpmd = pmd_swp_mkuffd_wp(newpmd);
- set_pmd_at(mm, addr, pmd, newpmd);
+ } else {
+ newpmd = *pmd;
}
+
+ if (uffd_wp)
+ newpmd = pmd_swp_mkuffd_wp(newpmd);
+ else if (uffd_wp_resolve)
+ newpmd = pmd_swp_clear_uffd_wp(newpmd);
+ if (!pmd_same(*pmd, newpmd))
+ set_pmd_at(mm, addr, pmd, newpmd);
goto unlock;
}
#endif
@@ -2657,9 +2665,10 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
is_hzp = is_huge_zero_page(&folio->page);
- VM_WARN_ON_ONCE_FOLIO(is_hzp, folio);
- if (is_hzp)
+ if (is_hzp) {
+ pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
return -EBUSY;
+ }
if (folio_test_writeback(folio))
return -EBUSY;
@@ -3251,6 +3260,8 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
pmdswp = swp_entry_to_pmd(entry);
if (pmd_soft_dirty(pmdval))
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
+ if (pmd_uffd_wp(pmdval))
+ pmdswp = pmd_swp_mkuffd_wp(pmdswp);
set_pmd_at(mm, address, pvmw->pmd, pmdswp);
page_remove_rmap(page, vma, true);
put_page(page);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 92e6f56a932d..0ec69b96b497 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -572,6 +572,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_PTE_NON_PRESENT;
goto out;
}
+ if (pte_uffd_wp(pteval)) {
+ result = SCAN_PTE_UFFD_WP;
+ goto out;
+ }
page = vm_normal_page(vma, address, pteval);
if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 3807502766a3..ec0da72e65aa 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -148,35 +148,74 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end)
* into the virtual memory. If those physical pages already had shadow/origin,
* those are ignored.
*/
-void kmsan_ioremap_page_range(unsigned long start, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot,
- unsigned int page_shift)
+int kmsan_ioremap_page_range(unsigned long start, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int page_shift)
{
gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO;
struct page *shadow, *origin;
unsigned long off = 0;
- int nr;
+ int nr, err = 0, clean = 0, mapped;
if (!kmsan_enabled || kmsan_in_runtime())
- return;
+ return 0;
nr = (end - start) / PAGE_SIZE;
kmsan_enter_runtime();
- for (int i = 0; i < nr; i++, off += PAGE_SIZE) {
+ for (int i = 0; i < nr; i++, off += PAGE_SIZE, clean = i) {
shadow = alloc_pages(gfp_mask, 1);
origin = alloc_pages(gfp_mask, 1);
- __vmap_pages_range_noflush(
+ if (!shadow || !origin) {
+ err = -ENOMEM;
+ goto ret;
+ }
+ mapped = __vmap_pages_range_noflush(
vmalloc_shadow(start + off),
vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow,
PAGE_SHIFT);
- __vmap_pages_range_noflush(
+ if (mapped) {
+ err = mapped;
+ goto ret;
+ }
+ shadow = NULL;
+ mapped = __vmap_pages_range_noflush(
vmalloc_origin(start + off),
vmalloc_origin(start + off + PAGE_SIZE), prot, &origin,
PAGE_SHIFT);
+ if (mapped) {
+ __vunmap_range_noflush(
+ vmalloc_shadow(start + off),
+ vmalloc_shadow(start + off + PAGE_SIZE));
+ err = mapped;
+ goto ret;
+ }
+ origin = NULL;
+ }
+ /* Page mapping loop finished normally, nothing to clean up. */
+ clean = 0;
+
+ret:
+ if (clean > 0) {
+ /*
+ * Something went wrong. Clean up shadow/origin pages allocated
+ * on the last loop iteration, then delete mappings created
+ * during the previous iterations.
+ */
+ if (shadow)
+ __free_pages(shadow, 1);
+ if (origin)
+ __free_pages(origin, 1);
+ __vunmap_range_noflush(
+ vmalloc_shadow(start),
+ vmalloc_shadow(start + clean * PAGE_SIZE));
+ __vunmap_range_noflush(
+ vmalloc_origin(start),
+ vmalloc_origin(start + clean * PAGE_SIZE));
}
flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end));
flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end));
kmsan_leave_runtime();
+ return err;
}
void kmsan_iounmap_page_range(unsigned long start, unsigned long end)
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index a787c04e9583..b8bb95eea5e3 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -216,27 +216,29 @@ void kmsan_free_page(struct page *page, unsigned int order)
kmsan_leave_runtime();
}
-void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
- pgprot_t prot, struct page **pages,
- unsigned int page_shift)
+int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
+ pgprot_t prot, struct page **pages,
+ unsigned int page_shift)
{
unsigned long shadow_start, origin_start, shadow_end, origin_end;
struct page **s_pages, **o_pages;
- int nr, mapped;
+ int nr, mapped, err = 0;
if (!kmsan_enabled)
- return;
+ return 0;
shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW);
shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW);
if (!shadow_start)
- return;
+ return 0;
nr = (end - start) / PAGE_SIZE;
s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL);
o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL);
- if (!s_pages || !o_pages)
+ if (!s_pages || !o_pages) {
+ err = -ENOMEM;
goto ret;
+ }
for (int i = 0; i < nr; i++) {
s_pages[i] = shadow_page_for(pages[i]);
o_pages[i] = origin_page_for(pages[i]);
@@ -249,10 +251,16 @@ void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
kmsan_enter_runtime();
mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot,
s_pages, page_shift);
- KMSAN_WARN_ON(mapped);
+ if (mapped) {
+ err = mapped;
+ goto ret;
+ }
mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot,
o_pages, page_shift);
- KMSAN_WARN_ON(mapped);
+ if (mapped) {
+ err = mapped;
+ goto ret;
+ }
kmsan_leave_runtime();
flush_tlb_kernel_range(shadow_start, shadow_end);
flush_tlb_kernel_range(origin_start, origin_end);
@@ -262,6 +270,7 @@ void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
ret:
kfree(s_pages);
kfree(o_pages);
+ return err;
}
/* Allocate metadata for pages allocated at boot time. */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a256a241fd1d..2068b594dc88 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -790,61 +790,50 @@ static int vma_replace_policy(struct vm_area_struct *vma,
return err;
}
-/* Step 2: apply policy to a range and do splits. */
-static int mbind_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, struct mempolicy *new_pol)
+/* Split or merge the VMA (if required) and apply the new policy */
+static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ struct vm_area_struct **prev, unsigned long start,
+ unsigned long end, struct mempolicy *new_pol)
{
- VMA_ITERATOR(vmi, mm, start);
- struct vm_area_struct *prev;
- struct vm_area_struct *vma;
- int err = 0;
+ struct vm_area_struct *merged;
+ unsigned long vmstart, vmend;
pgoff_t pgoff;
+ int err;
- prev = vma_prev(&vmi);
- vma = vma_find(&vmi, end);
- if (WARN_ON(!vma))
+ vmend = min(end, vma->vm_end);
+ if (start > vma->vm_start) {
+ *prev = vma;
+ vmstart = start;
+ } else {
+ vmstart = vma->vm_start;
+ }
+
+ if (mpol_equal(vma_policy(vma), new_pol))
return 0;
- if (start > vma->vm_start)
- prev = vma;
-
- do {
- unsigned long vmstart = max(start, vma->vm_start);
- unsigned long vmend = min(end, vma->vm_end);
-
- if (mpol_equal(vma_policy(vma), new_pol))
- goto next;
-
- pgoff = vma->vm_pgoff +
- ((vmstart - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- new_pol, vma->vm_userfaultfd_ctx,
- anon_vma_name(vma));
- if (prev) {
- vma = prev;
- goto replace;
- }
- if (vma->vm_start != vmstart) {
- err = split_vma(&vmi, vma, vmstart, 1);
- if (err)
- goto out;
- }
- if (vma->vm_end != vmend) {
- err = split_vma(&vmi, vma, vmend, 0);
- if (err)
- goto out;
- }
-replace:
- err = vma_replace_policy(vma, new_pol);
+ pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT);
+ merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags,
+ vma->anon_vma, vma->vm_file, pgoff, new_pol,
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ if (merged) {
+ *prev = merged;
+ return vma_replace_policy(merged, new_pol);
+ }
+
+ if (vma->vm_start != vmstart) {
+ err = split_vma(vmi, vma, vmstart, 1);
if (err)
- goto out;
-next:
- prev = vma;
- } for_each_vma_range(vmi, vma, end);
+ return err;
+ }
-out:
- return err;
+ if (vma->vm_end != vmend) {
+ err = split_vma(vmi, vma, vmend, 0);
+ if (err)
+ return err;
+ }
+
+ *prev = vma;
+ return vma_replace_policy(vma, new_pol);
}
/* Set the process memory policy */
@@ -1259,6 +1248,8 @@ static long do_mbind(unsigned long start, unsigned long len,
nodemask_t *nmask, unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
+ struct vma_iterator vmi;
struct mempolicy *new;
unsigned long end;
int err;
@@ -1328,7 +1319,13 @@ static long do_mbind(unsigned long start, unsigned long len,
goto up_out;
}
- err = mbind_range(mm, start, end, new);
+ vma_iter_init(&vmi, mm, start);
+ prev = vma_prev(&vmi);
+ for_each_vma_range(vmi, vma, end) {
+ err = mbind_range(&vmi, vma, &prev, start, end, new);
+ if (err)
+ break;
+ }
if (!err) {
int nr_failed = 0;