| author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-03 10:01:57 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-03 10:01:57 -0800 |
| commit | 0c272a1d33965627653f4fafd6eab55d0d50f21f (patch) | |
| tree | 326686bcebc172580079739d7def13b6739bf60f | |
| parent | 66a87fff1a87c260452f5a57123891ca5258c449 (diff) | |
| parent | ac86f547ca1002aec2ef66b9e64d03f45bbbfbb9 (diff) | |
Merge tag 'mm-hotfixes-stable-2023-02-02-19-24-2' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull misc fixes from Andrew Morton:
"25 hotfixes, mainly for MM. 13 are cc:stable"
* tag 'mm-hotfixes-stable-2023-02-02-19-24-2' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (26 commits)
mm: memcg: fix NULL pointer in mem_cgroup_track_foreign_dirty_slowpath()
Kconfig.debug: fix the help description in SCHED_DEBUG
mm/swapfile: add cond_resched() in get_swap_pages()
mm: use stack_depot_early_init for kmemleak
Squashfs: fix handling and sanity checking of xattr_ids count
sh: define RUNTIME_DISCARD_EXIT
highmem: round down the address passed to kunmap_flush_on_unmap()
migrate: hugetlb: check for hugetlb shared PMD in node migration
mm: hugetlb: proc: check for hugetlb shared PMD in /proc/PID/smaps
mm/MADV_COLLAPSE: catch !none !huge !bad pmd lookups
Revert "mm: kmemleak: alloc gray object for reserved region with direct map"
freevxfs: Kconfig: fix spelling
maple_tree: should get pivots boundary by type
.mailmap: update e-mail address for Eugen Hristev
mm, mremap: fix mremap() expanding for vma's with vm_ops->close()
squashfs: harden sanity check in squashfs_read_xattr_id_table
ia64: fix build error due to switch case label appearing next to declaration
mm: multi-gen LRU: fix crash during cgroup migration
Revert "mm: add nodes= arg to memory.reclaim"
zsmalloc: fix a race with deferred_handles storing
...
31 files changed, 430 insertions, 152 deletions
diff --git a/.mailmap b/.mailmap
--- a/.mailmap
+++ b/.mailmap
@@ -130,6 +130,7 @@ Domen Puncer <domen@coderock.org>
 Douglas Gilbert <dougg@torque.net>
 Ed L. Cashin <ecashin@coraid.com>
 Erik Kaneda <erik.kaneda@intel.com> <erik.schmauss@intel.com>
+Eugen Hristev <eugen.hristev@collabora.com> <eugen.hristev@microchip.com>
 Evgeniy Polyakov <johnpol@2ka.mipt.ru>
 Ezequiel Garcia <ezequiel@vanguardiasur.com.ar> <ezequiel@collabora.com>
 Felipe W Damasio <felipewd@terra.com.br>
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index c8ae7c897f14..74cec76be9f2 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1245,13 +1245,17 @@ PAGE_SIZE multiple when read back.
     This is a simple interface to trigger memory reclaim in the
     target cgroup.
 
-    This file accepts a string which contains the number of bytes to
-    reclaim.
+    This file accepts a single key, the number of bytes to reclaim.
+    No nested keys are currently supported.
 
     Example::
 
       echo "1G" > memory.reclaim
 
+    The interface can be later extended with nested keys to
+    configure the reclaim behavior. For example, specify the
+    type of memory to reclaim from (anon, file, ..).
+
     Please note that the kernel can over or under reclaim from
     the target cgroup. If less bytes are reclaimed than the
     specified amount, -EAGAIN is returned.
@@ -1263,13 +1267,6 @@ PAGE_SIZE multiple when read back.
     This means that the networking layer will not adapt based on
     reclaim induced by memory.reclaim.
 
-    This file also allows the user to specify the nodes to reclaim from,
-    via the 'nodes=' key, for example::
-
-      echo "1G nodes=0,1" > memory.reclaim
-
-    The above instructs the kernel to reclaim memory from nodes 0,1.
-
   memory.peak
     A read-only single value file which exists on non-root cgroups.
diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c
index f6a502e8f02c..6e948d015332 100644
--- a/arch/ia64/kernel/sys_ia64.c
+++ b/arch/ia64/kernel/sys_ia64.c
@@ -170,6 +170,9 @@ ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, u
 asmlinkage long
 ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *tp)
 {
+    struct timespec64 rtn_tp;
+    s64 tick_ns;
+
     /*
      * ia64's clock_gettime() syscall is implemented as a vdso call
      * fsys_clock_gettime(). Currently it handles only
@@ -185,8 +188,8 @@ ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *
     switch (which_clock) {
     case CLOCK_REALTIME:
     case CLOCK_MONOTONIC:
-        s64 tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq);
-        struct timespec64 rtn_tp = ns_to_timespec64(tick_ns);
+        tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq);
+        rtn_tp = ns_to_timespec64(tick_ns);
         return put_timespec64(&rtn_tp, tp);
     }
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index 3161b9ccd2a5..b6276a3521d7 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -4,6 +4,7 @@
  * Written by Niibe Yutaka and Paul Mundt
  */
 OUTPUT_ARCH(sh)
+#define RUNTIME_DISCARD_EXIT
 #include <asm/thread_info.h>
 #include <asm/cache.h>
 #include <asm/vmlinux.lds.h>
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index f08b25195ae7..d1a68b6d03b3 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -26,7 +26,6 @@
 #include <linux/serial_core.h>
 #include <linux/sysfs.h>
 #include <linux/random.h>
-#include <linux/kmemleak.h>
 
 #include <asm/setup.h>  /* for COMMAND_LINE_SIZE */
 #include <asm/page.h>
@@ -525,12 +524,9 @@ static int __init __reserved_mem_reserve_reg(unsigned long node,
         size = dt_mem_next_cell(dt_root_size_cells, &prop);
 
         if (size &&
-            early_init_dt_reserve_memory(base, size, nomap) == 0) {
+            early_init_dt_reserve_memory(base, size, nomap) == 0)
             pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n",
                 uname, &base, (unsigned long)(size / SZ_1M));
-            if (!nomap)
-                kmemleak_alloc_phys(base, size, 0);
-        } else
+        else
             pr_err("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n",
                    uname, &base, (unsigned long)(size / SZ_1M));
diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig
index c05c71d57291..0e2fc08f7de4 100644
--- a/fs/freevxfs/Kconfig
+++ b/fs/freevxfs/Kconfig
@@ -8,7 +8,7 @@ config VXFS_FS
       of SCO UnixWare (and possibly others) and optionally available
       for Sunsoft Solaris, HP-UX and many other operating systems. However
       these particular OS implementations of vxfs may differ in on-disk
-      data endianess and/or superblock offset. The vxfs module has been
+      data endianness and/or superblock offset. The vxfs module has been
       tested with SCO UnixWare and HP-UX B.10.20 (pa-risc 1.1 arch.)
       Currently only readonly access is supported and VxFX versions
       2, 3 and 4. Tests were performed with HP-UX VxFS version 3.
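The cgroup-v2.rst hunk above narrows memory.reclaim back to a single size key (the 'nodes=' parsing is removed from mm/memcontrol.c further down in this merge). A minimal userspace sketch of driving that interface; the cgroup path, the helper name and the error handling are illustrative assumptions, not part of the patch:

```c
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Ask the kernel to proactively reclaim `amount` (e.g. "1G") from one cgroup.
 * Hypothetical helper for illustration; returns 0 or a negative errno. */
static int cgroup_reclaim(const char *cgroup_dir, const char *amount)
{
	char path[4096];
	int fd, ret = 0;

	snprintf(path, sizeof(path), "%s/memory.reclaim", cgroup_dir);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -errno;
	/* Per the documentation, the kernel may reclaim more or less than
	 * requested; -EAGAIN means less than the requested amount was freed. */
	if (write(fd, amount, strlen(amount)) < 0)
		ret = -errno;
	close(fd);
	return ret;
}

int main(void)
{
	/* The cgroup path below is a placeholder. */
	int err = cgroup_reclaim("/sys/fs/cgroup/example", "1G");

	if (err)
		fprintf(stderr, "memory.reclaim: %s\n", strerror(-err));
	return 0;
}
```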
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e35a0398db63..af1c49ae11b1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -745,9 +745,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
             page = pfn_swap_entry_to_page(swpent);
     }
     if (page) {
-        int mapcount = page_mapcount(page);
-
-        if (mapcount >= 2)
+        if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte))
             mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
         else
             mss->private_hugetlb += huge_page_size(hstate_vma(vma));
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index b3fdc8212c5f..95f8e8901768 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -183,7 +183,7 @@ static inline int squashfs_block_size(__le32 raw)
 #define SQUASHFS_ID_BLOCK_BYTES(A)    (SQUASHFS_ID_BLOCKS(A) *\
                     sizeof(u64))
 /* xattr id lookup table defines */
-#define SQUASHFS_XATTR_BYTES(A)        ((A) * sizeof(struct squashfs_xattr_id))
+#define SQUASHFS_XATTR_BYTES(A)        (((u64) (A)) * sizeof(struct squashfs_xattr_id))
 
 #define SQUASHFS_XATTR_BLOCK(A)        (SQUASHFS_XATTR_BYTES(A) / \
                     SQUASHFS_METADATA_SIZE)
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 659082e9e51d..72f6f4b37863 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -63,7 +63,7 @@ struct squashfs_sb_info {
     long long                bytes_used;
     unsigned int             inodes;
     unsigned int             fragments;
-    int                      xattr_ids;
+    unsigned int             xattr_ids;
     unsigned int             ids;
     bool                     panic_on_errors;
     const struct squashfs_decompressor_thread_ops *thread_ops;
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index d8a270d3ac4c..f1a463d8bfa0 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -10,12 +10,12 @@
 
 #ifdef CONFIG_SQUASHFS_XATTR
 extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
-        u64 *, int *);
+        u64 *, unsigned int *);
 extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
         unsigned int *, unsigned long long *);
 #else
 static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
-        u64 start, u64 *xattr_table_start, int *xattr_ids)
+        u64 start, u64 *xattr_table_start, unsigned int *xattr_ids)
 {
     struct squashfs_xattr_id_table *id_table;
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index 087cab8c78f4..b88d19e9581e 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -56,7 +56,7 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
  * Read uncompressed xattr id lookup table indexes from disk into memory
  */
 __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start,
-        u64 *xattr_table_start, int *xattr_ids)
+        u64 *xattr_table_start, unsigned int *xattr_ids)
 {
     struct squashfs_sb_info *msblk = sb->s_fs_info;
     unsigned int len, indexes;
@@ -76,7 +76,7 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start,
 
     /* Sanity check values */
 
     /* there is always at least one xattr id */
-    if (*xattr_ids == 0)
+    if (*xattr_ids <= 0)
         return ERR_PTR(-EINVAL);
 
     len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h
index 034b1106d022..e098f38422af 100644
--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -200,7 +200,7 @@ static inline void *kmap_local_pfn(unsigned long pfn)
 static inline void __kunmap_local(const void *addr)
 {
 #ifdef ARCH_HAS_FLUSH_ON_KUNMAP
-    kunmap_flush_on_unmap(addr);
+    kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE));
 #endif
 }
 
@@ -227,7 +227,7 @@ static inline void *kmap_atomic_pfn(unsigned long pfn)
 static inline void __kunmap_atomic(const void *addr)
 {
 #ifdef ARCH_HAS_FLUSH_ON_KUNMAP
-    kunmap_flush_on_unmap(addr);
+    kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE));
 #endif
     pagefault_enable();
     if (IS_ENABLED(CONFIG_PREEMPT_RT))
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 551834cd5299..db194e2ba69f 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/hugetlb_inline.h>
 #include <linux/cgroup.h>
+#include <linux/page_ref.h>
 #include <linux/list.h>
 #include <linux/kref.h>
 #include <linux/pgtable.h>
@@ -1187,6 +1188,18 @@ static inline __init void hugetlb_cma_reserve(int order)
 }
 #endif
 
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+static inline bool hugetlb_pmd_shared(pte_t *pte)
+{
+    return page_count(virt_to_page(pte)) > 1;
+}
+#else
+static inline bool hugetlb_pmd_shared(pte_t *pte)
+{
+    return false;
+}
+#endif
+
 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
 
 #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d3c8203cab6c..85dc9b88ea37 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1666,10 +1666,13 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
 static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
                           struct bdi_writeback *wb)
 {
+    struct mem_cgroup *memcg;
+
     if (mem_cgroup_disabled())
         return;
 
-    if (unlikely(&folio_memcg(folio)->css != wb->memcg_css))
+    memcg = folio_memcg(folio);
+    if (unlikely(memcg && &memcg->css != wb->memcg_css))
         mem_cgroup_track_foreign_dirty_slowpath(folio, wb);
 }
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2787b84eaf12..0ceed49516ad 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -418,8 +418,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                           unsigned long nr_pages,
                           gfp_t gfp_mask,
-                          unsigned int reclaim_options,
-                          nodemask_t *nodemask);
+                          unsigned int reclaim_options);
 extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
                         gfp_t gfp_mask, bool noswap,
                         pg_data_t *pgdat,
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 61a9425a311f..02ee440f7be3 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -754,6 +754,7 @@ config DEBUG_KMEMLEAK
     select KALLSYMS
     select CRC32
     select STACKDEPOT
+    select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF
     help
       Say Y here if you want to enable the memory leak
       detector. The memory allocation/freeing is traced in a way
@@ -1207,7 +1208,7 @@ config SCHED_DEBUG
     depends on DEBUG_KERNEL && PROC_FS
     default y
     help
-      If you say Y here, the /proc/sched_debug file will be provided
+      If you say Y here, the /sys/kernel/debug/sched file will be provided
       that can help debug the scheduler. The runtime overhead of this
       option is minimal.
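The squashfs hunks above widen xattr_ids to unsigned int and make SQUASHFS_XATTR_BYTES() multiply in 64 bits. A standalone sketch of the arithmetic being guarded against; the struct below is a same-sized stand-in, not the kernel's squashfs_xattr_id definition:

```c
#include <stdint.h>
#include <stdio.h>

/* 16-byte stand-in with the same size as struct squashfs_xattr_id. */
struct xattr_id {
	uint64_t xattr;
	uint32_t count;
	uint32_t size;
};

int main(void)
{
	/* An id count this large can only come from a corrupted image. */
	unsigned int ids = 0x10000000;

	/* On a 32-bit kernel size_t is 32 bits, so the old macro
	 * ((A) * sizeof(struct squashfs_xattr_id)) wraps around: */
	uint32_t wrapped = (uint32_t)ids * (uint32_t)sizeof(struct xattr_id);

	/* Casting the count to u64 first, as the patch does, keeps the
	 * high bits of the product and lets the sanity checks catch it: */
	uint64_t widened = (uint64_t)ids * sizeof(struct xattr_id);

	printf("32-bit product: 0x%08x\n", wrapped);            /* 0x00000000 */
	printf("64-bit product: 0x%llx\n", (unsigned long long)widened);
	return 0;
}
```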
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 26e2045d3cda..5a976393c9ae 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -670,12 +670,13 @@ static inline unsigned long mte_pivot(const struct maple_enode *mn,
         unsigned char piv)
 {
     struct maple_node *node = mte_to_node(mn);
+    enum maple_type type = mte_node_type(mn);
 
-    if (piv >= mt_pivots[piv]) {
+    if (piv >= mt_pivots[type]) {
         WARN_ON(1);
         return 0;
     }
-    switch (mte_node_type(mn)) {
+    switch (type) {
     case maple_arange_64:
         return node->ma64.pivot[piv];
     case maple_range_64:
@@ -4887,7 +4888,7 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
     unsigned long *pivots, *gaps;
     void __rcu **slots;
     unsigned long gap = 0;
-    unsigned long max, min, index;
+    unsigned long max, min;
     unsigned char offset;
 
     if (unlikely(mas_is_err(mas)))
@@ -4909,8 +4910,7 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
     min = mas_safe_min(mas, pivots, --offset);
     max = mas_safe_pivot(mas, pivots, offset, type);
 
-    index = mas->index;
-    while (index <= max) {
+    while (mas->index <= max) {
         gap = 0;
         if (gaps)
             gap = gaps[offset];
@@ -4941,10 +4941,8 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
         min = mas_safe_min(mas, pivots, offset);
     }
 
-    if (unlikely(index > max)) {
-        mas_set_err(mas, -EBUSY);
-        return false;
-    }
+    if (unlikely((mas->index > max) || (size - 1 > max - mas->index)))
+        goto no_space;
 
     if (unlikely(ma_is_leaf(type))) {
         mas->offset = offset;
@@ -4961,9 +4959,11 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
         return false;
 
 ascend:
-    if (mte_is_root(mas->node))
-        mas_set_err(mas, -EBUSY);
+    if (!mte_is_root(mas->node))
+        return false;
 
+no_space:
+    mas_set_err(mas, -EBUSY);
     return false;
 }
diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c
index 497fc93ccf9e..ec847bf4dcb4 100644
--- a/lib/test_maple_tree.c
+++ b/lib/test_maple_tree.c
@@ -2517,6 +2517,91 @@ static noinline void check_bnode_min_spanning(struct maple_tree *mt)
     mt_set_non_kernel(0);
 }
 
+static noinline void check_empty_area_window(struct maple_tree *mt)
+{
+    unsigned long i, nr_entries = 20;
+    MA_STATE(mas, mt, 0, 0);
+
+    for (i = 1; i <= nr_entries; i++)
+        mtree_store_range(mt, i*10, i*10 + 9,
+                  xa_mk_value(i), GFP_KERNEL);
+
+    /* Create another hole besides the one at 0 */
+    mtree_store_range(mt, 160, 169, NULL, GFP_KERNEL);
+
+    /* Check lower bounds that don't fit */
+    rcu_read_lock();
+    MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 10) != -EBUSY);
+
+    mas_reset(&mas);
+    MT_BUG_ON(mt, mas_empty_area_rev(&mas, 6, 90, 5) != -EBUSY);
+
+    /* Check lower bound that does fit */
+    mas_reset(&mas);
+    MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 5) != 0);
+    MT_BUG_ON(mt, mas.index != 5);
+    MT_BUG_ON(mt, mas.last != 9);
+    rcu_read_unlock();
+
+    /* Check one gap that doesn't fit and one that does */
+    rcu_read_lock();
+    mas_reset(&mas);
+    MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 217, 9) != 0);
+    MT_BUG_ON(mt, mas.index != 161);
+    MT_BUG_ON(mt, mas.last != 169);
+
+    /* Check one gap that does fit above the min */
+    mas_reset(&mas);
+    MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 3) != 0);
+    MT_BUG_ON(mt, mas.index != 216);
+    MT_BUG_ON(mt, mas.last != 218);
+
+    /* Check size that doesn't fit any gap */
+    mas_reset(&mas);
+    MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 16) != -EBUSY);
+
+    /*
+     * Check size that doesn't fit the lower end of the window but
+     * does fit the gap
+     */
+    mas_reset(&mas);
+    MT_BUG_ON(mt, mas_empty_area_rev(&mas, 167, 200, 4) != -EBUSY);
+
+    /*
+     * Check size that doesn't fit the upper end of the window but
+     * does fit the gap
+     */
+    mas_reset(&mas);
+    MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 162, 4) != -EBUSY);
+
+    /* Check mas_empty_area forward */
+    mas_reset(&mas);
+    MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 9) != 0);
+    MT_BUG_ON(mt, mas.index != 0);
+    MT_BUG_ON(mt, mas.last != 8);
+
+    mas_reset(&mas);
+    MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 4) != 0);
+    MT_BUG_ON(mt, mas.index != 0);
+    MT_BUG_ON(mt, mas.last != 3);
+
+    mas_reset(&mas);
+    MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 11) != -EBUSY);
+
+    mas_reset(&mas);
+    MT_BUG_ON(mt, mas_empty_area(&mas, 5, 100, 6) != -EBUSY);
+
+    mas_reset(&mas);
+    MT_BUG_ON(mt, mas_empty_area(&mas, 0, 8, 10) != -EBUSY);
+
+    mas_reset(&mas);
+    mas_empty_area(&mas, 100, 165, 3);
+
+    mas_reset(&mas);
+    MT_BUG_ON(mt, mas_empty_area(&mas, 100, 163, 6) != -EBUSY);
+    rcu_read_unlock();
+}
+
 static DEFINE_MTREE(tree);
 static int maple_tree_seed(void)
 {
@@ -2765,6 +2850,10 @@ static int maple_tree_seed(void)
     check_bnode_min_spanning(&tree);
     mtree_destroy(&tree);
 
+    mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
+    check_empty_area_window(&tree);
+    mtree_destroy(&tree);
+
 #if defined(BENCH)
 skip:
 #endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7fcdb98c9e68..bdbfeb6fb393 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5051,6 +5051,9 @@ again:
                 entry = huge_pte_clear_uffd_wp(entry);
             set_huge_pte_at(dst, addr, dst_pte, entry);
         } else if (unlikely(is_pte_marker(entry))) {
+            /* No swap on hugetlb */
+            WARN_ON_ONCE(
+                is_swapin_error_entry(pte_to_swp_entry(entry)));
             /*
              * We copy the pte marker only if the dst vma has
              * uffd-wp enabled.
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 79be13133322..90acfea40c13 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -847,6 +847,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
     return SCAN_SUCCEED;
 }
 
+/*
+ * See pmd_trans_unstable() for how the result may change out from
+ * underneath us, even if we hold mmap_lock in read.
+ */
 static int find_pmd_or_thp_or_none(struct mm_struct *mm,
                    unsigned long address,
                    pmd_t **pmd)
@@ -865,8 +869,12 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
 #endif
     if (pmd_none(pmde))
         return SCAN_PMD_NONE;
+    if (!pmd_present(pmde))
+        return SCAN_PMD_NULL;
     if (pmd_trans_huge(pmde))
         return SCAN_PMD_MAPPED;
+    if (pmd_devmap(pmde))
+        return SCAN_PMD_NULL;
     if (pmd_bad(pmde))
         return SCAN_PMD_NULL;
     return SCAN_SUCCEED;
@@ -1642,7 +1650,7 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
          * has higher cost too. It would also probably require locking
          * the anon_vma.
          */
-        if (vma->anon_vma) {
+        if (READ_ONCE(vma->anon_vma)) {
             result = SCAN_PAGE_ANON;
             goto next;
         }
@@ -1671,6 +1679,18 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
         if ((cc->is_khugepaged || is_target) &&
             mmap_write_trylock(mm)) {
             /*
+             * Re-check whether we have an ->anon_vma, because
+             * collapse_and_free_pmd() requires that either no
+             * ->anon_vma exists or the anon_vma is locked.
+             * We already checked ->anon_vma above, but that check
+             * is racy because ->anon_vma can be populated under the
+             * mmap lock in read mode.
+             */
+            if (vma->anon_vma) {
+                result = SCAN_PAGE_ANON;
+                goto unlock_next;
+            }
+            /*
              * When a vma is registered with uffd-wp, we can't
              * recycle the pmd pgtable because there can be pte
              * markers installed. Skip it only, so the rest mm/vma
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 92f670edbf51..55dc8b8b0616 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -2070,8 +2070,10 @@ static int __init kmemleak_boot_config(char *str)
         return -EINVAL;
     if (strcmp(str, "off") == 0)
         kmemleak_disable();
-    else if (strcmp(str, "on") == 0)
+    else if (strcmp(str, "on") == 0) {
         kmemleak_skip_disable = 1;
+        stack_depot_want_early_init();
+    }
     else
         return -EINVAL;
     return 0;
@@ -2093,7 +2095,6 @@ void __init kmemleak_init(void)
     if (kmemleak_error)
         return;
 
-    stack_depot_init();
 
     jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
     jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ab457f0394ab..73afff8062f9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -63,7 +63,6 @@
 #include <linux/resume_user_mode.h>
 #include <linux/psi.h>
 #include <linux/seq_buf.h>
-#include <linux/parser.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -2393,8 +2392,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
         psi_memstall_enter(&pflags);
         nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
                                  gfp_mask,
-                                 MEMCG_RECLAIM_MAY_SWAP,
-                                 NULL);
+                                 MEMCG_RECLAIM_MAY_SWAP);
         psi_memstall_leave(&pflags);
     } while ((memcg = parent_mem_cgroup(memcg)) &&
          !mem_cgroup_is_root(memcg));
@@ -2685,8 +2683,7 @@ retry:
     psi_memstall_enter(&pflags);
     nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
                             gfp_mask, reclaim_options,
-                            NULL);
+                            gfp_mask, reclaim_options);
     psi_memstall_leave(&pflags);
 
     if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -3506,8 +3503,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
         }
 
         if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
-                    memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP,
-                    NULL)) {
+                    memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
             ret = -EBUSY;
             break;
         }
@@ -3618,8 +3614,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
             return -EINTR;
 
         if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
-                          MEMCG_RECLAIM_MAY_SWAP,
-                          NULL))
+                          MEMCG_RECLAIM_MAY_SWAP))
             nr_retries--;
     }
@@ -6429,8 +6424,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
         }
 
         reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
-                    GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
-                    NULL);
+                    GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
 
         if (!reclaimed && !nr_retries--)
             break;
@@ -6479,8 +6473,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
         if (nr_reclaims) {
             if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
-                    GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
-                    NULL))
+                    GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
                 nr_reclaims--;
             continue;
         }
@@ -6603,54 +6596,21 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
     return nbytes;
 }
 
-enum {
-    MEMORY_RECLAIM_NODES = 0,
-    MEMORY_RECLAIM_NULL,
-};
-
-static const match_table_t if_tokens = {
-    { MEMORY_RECLAIM_NODES, "nodes=%s" },
-    { MEMORY_RECLAIM_NULL, NULL },
-};
-
 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
                   size_t nbytes, loff_t off)
 {
     struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
     unsigned int nr_retries = MAX_RECLAIM_RETRIES;
     unsigned long nr_to_reclaim, nr_reclaimed = 0;
-    unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
-                       MEMCG_RECLAIM_PROACTIVE;
-    char *old_buf, *start;
-    substring_t args[MAX_OPT_ARGS];
-    int token;
-    char value[256];
-    nodemask_t nodemask = NODE_MASK_ALL;
-
-    buf = strstrip(buf);
-
-    old_buf = buf;
-    nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
-    if (buf == old_buf)
-        return -EINVAL;
+    unsigned int reclaim_options;
+    int err;
 
     buf = strstrip(buf);
+    err = page_counter_memparse(buf, "", &nr_to_reclaim);
+    if (err)
+        return err;
 
-    while ((start = strsep(&buf, " ")) != NULL) {
-        if (!strlen(start))
-            continue;
-        token = match_token(start, if_tokens, args);
-        match_strlcpy(value, args, sizeof(value));
-        switch (token) {
-        case MEMORY_RECLAIM_NODES:
-            if (nodelist_parse(value, nodemask) < 0)
-                return -EINVAL;
-            break;
-        default:
-            return -EINVAL;
-        }
-    }
-
+    reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
     while (nr_reclaimed < nr_to_reclaim) {
         unsigned long reclaimed;
@@ -6667,8 +6627,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
         reclaimed = try_to_free_mem_cgroup_pages(memcg,
                     nr_to_reclaim - nr_reclaimed,
-                    GFP_KERNEL, reclaim_options,
-                    &nodemask);
+                    GFP_KERNEL, reclaim_options);
 
         if (!reclaimed && !nr_retries--)
             return -EAGAIN;
diff --git a/mm/memory.c b/mm/memory.c
index aad226daf41b..3e836fecd035 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -828,12 +828,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
             return -EBUSY;
         return -ENOENT;
     } else if (is_pte_marker_entry(entry)) {
-        /*
-         * We're copying the pgtabl
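The mm/khugepaged.c change above re-tests vma->anon_vma after mmap_write_trylock() succeeds, because the earlier READ_ONCE() check is racy while only the read side of the lock protects the walk. A userspace analogue of that check-then-recheck pattern, with invented names and a pthread rwlock standing in for the mmap lock:

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t mm_lock = PTHREAD_RWLOCK_INITIALIZER;
static void *anon_state;	/* may become non-NULL at any time under the read lock */

static bool try_retract(void)
{
	bool busy;

	/* Cheap, racy check first (mirrors the READ_ONCE(vma->anon_vma) test):
	 * other threads can still populate anon_state after we look at it. */
	pthread_rwlock_rdlock(&mm_lock);
	busy = (anon_state != NULL);
	pthread_rwlock_unlock(&mm_lock);
	if (busy)
		return false;

	/* Take the lock exclusively, as the patch does with mmap_write_trylock(). */
	if (pthread_rwlock_trywrlock(&mm_lock) != 0)
		return false;

	/* Re-check under the exclusive lock: the state may have changed
	 * between the racy check and acquiring the write lock. */
	if (anon_state != NULL) {
		pthread_rwlock_unlock(&mm_lock);
		return false;
	}

	/* ...the destructive work is only safe here... */
	pthread_rwlock_unlock(&mm_lock);
	return true;
}

int main(void)
{
	printf("retracted: %s\n", try_retract() ? "yes" : "no");
	return 0;
}
```

The shape is the usual one: an optimistic check to skip work cheaply, then a mandatory re-check once the exclusive lock makes the answer stable.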
