From 5bf6f3c595d38e3e92130373ae4c0799da5026ee Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 9 Jul 2024 11:25:10 +0100 Subject: MAINTAINERS: mailmap: update James Clark's email address My new address is james.clark@linaro.org Link: https://lkml.kernel.org/r/20240709102512.31212-2-james.clark@linaro.org Signed-off-by: James Clark Cc: Bjorn Andersson Cc: Conor Dooley Cc: David S. Miller Cc: Geliang Tang Cc: Hao Zhang Cc: Jakub Kicinski Cc: Jiri Kosina Cc: Kees Cook Cc: Krzysztof Kozlowski Cc: Mao Jinlong Cc: Matthieu Baerts Cc: Matt Ranostay Cc: Mike Leach Cc: Oleksij Rempel Cc: Rob Herring (Arm) Cc: Suzuki K Poulose Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index cbdf5d63c381..e51d76df75c2 100644 --- a/.mailmap +++ b/.mailmap @@ -260,6 +260,7 @@ Jaegeuk Kim Jakub Kicinski James Bottomley James Bottomley +James Clark James E Wilson James Hogan James Hogan diff --git a/MAINTAINERS b/MAINTAINERS index c0a3d9e93689..5bbcfea5e3fa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2196,7 +2196,7 @@ N: digicolor ARM/CORESIGHT FRAMEWORK AND DRIVERS M: Suzuki K Poulose R: Mike Leach -R: James Clark +R: James Clark L: coresight@lists.linaro.org (moderated for non-subscribers) L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained @@ -17894,7 +17894,7 @@ F: tools/perf/ PERFORMANCE EVENTS TOOLING ARM64 R: John Garry R: Will Deacon -R: James Clark +R: James Clark R: Mike Leach R: Leo Yan L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -- cgit v1.2.3 From 34e526f6182e12b71f6076d43760dc6e0ae175a3 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 9 Jul 2024 11:25:11 +0100 Subject: dt-bindings: arm: update James Clark's email address My new address is james.clark@linaro.org Link: https://lkml.kernel.org/r/20240709102512.31212-3-james.clark@linaro.org Signed-off-by: James Clark Cc: Bjorn Andersson Cc: Conor Dooley Cc: David S. 
Miller Cc: Geliang Tang Cc: Hao Zhang Cc: Jakub Kicinski Cc: Jiri Kosina Cc: Kees Cook Cc: Krzysztof Kozlowski Cc: Mao Jinlong Cc: Matthieu Baerts Cc: Matt Ranostay Cc: Mike Leach Cc: Oleksij Rempel Cc: Rob Herring (Arm) Cc: Suzuki K Poulose Signed-off-by: Andrew Morton --- Documentation/devicetree/bindings/arm/arm,coresight-dummy-sink.yaml | 2 +- Documentation/devicetree/bindings/arm/arm,coresight-dummy-source.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-dummy-sink.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-dummy-sink.yaml index c960c8e0a9a5..08b89b62c505 100644 --- a/Documentation/devicetree/bindings/arm/arm,coresight-dummy-sink.yaml +++ b/Documentation/devicetree/bindings/arm/arm,coresight-dummy-sink.yaml @@ -30,7 +30,7 @@ description: | maintainers: - Mike Leach - Suzuki K Poulose - - James Clark + - James Clark - Mao Jinlong - Hao Zhang diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-dummy-source.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-dummy-source.yaml index 6745b4cc8f1c..d50a60368e27 100644 --- a/Documentation/devicetree/bindings/arm/arm,coresight-dummy-source.yaml +++ b/Documentation/devicetree/bindings/arm/arm,coresight-dummy-source.yaml @@ -29,7 +29,7 @@ description: | maintainers: - Mike Leach - Suzuki K Poulose - - James Clark + - James Clark - Mao Jinlong - Hao Zhang -- cgit v1.2.3
From 4cd7ba16a0afb36550eed7690e73d3e7a743fa96 Mon Sep 17 00:00:00 2001 From: Ram Tummala Date: Tue, 9 Jul 2024 18:45:39 -0700 Subject: mm: fix old/young bit handling in the faulting path

Commit 3bd786f76de2 ("mm: convert do_set_pte() to set_pte_range()") replaced do_set_pte() with set_pte_range(), and that introduced a regression in the following faulting path of non-anonymous vmas, which caused the PTE for the faulting address to be marked as old instead of young.

  handle_pte_fault()
    do_pte_missing()
      do_fault()
        do_read_fault() || do_cow_fault() || do_shared_fault()
          finish_fault()
            set_pte_range()

The polarity of the prefault calculation is inverted, which leads to prefault being incorrectly set for the faulting address. The following check will then incorrectly mark the PTE old rather than young; on some architectures this causes a double fault to mark it young when the access is retried.

	if (prefault && arch_wants_old_prefaulted_pte())
		entry = pte_mkold(entry);

On a subsequent fault on the same address, the faulting path sees a non-NULL vmf->pte and, instead of reaching the do_pte_missing() path, the PTE is correctly marked young in handle_pte_fault() itself. Due to this bug, a performance degradation in the fault handling path is observed because of the unnecessary double faulting.
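To make the polarity concrete, a small user-space sketch follows (illustration only, not kernel code; in_range() is restated here with the same meaning as the kernel helper, and the addresses and page count are made up): the faulting address always lies inside the range that set_pte_range() is installing, so in_range() is true for it and prefault must be its negation.

#include <assert.h>
#include <stdbool.h>

#define PAGE_SIZE 4096UL

/* Same meaning as the kernel's in_range(): start <= val < start + len */
static bool in_range(unsigned long val, unsigned long start, unsigned long len)
{
	return val >= start && val - start < len;
}

int main(void)
{
	unsigned long addr = 0x7f0000000000UL;	/* hypothetical start of the PTE batch */
	unsigned long nr = 16;			/* hypothetical number of pages mapped in one call */
	unsigned long fault_addr = addr + 3 * PAGE_SIZE;	/* hypothetical faulting address */

	/* Correct polarity: the faulting address itself is never a prefault. */
	bool prefault = !in_range(fault_addr, addr, nr * PAGE_SIZE);
	assert(!prefault);

	/* The buggy polarity flags it as a prefault, so the PTE gets marked old. */
	bool buggy_prefault = in_range(fault_addr, addr, nr * PAGE_SIZE);
	assert(buggy_prefault);

	return 0;
}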
Link: https://lkml.kernel.org/r/20240710014539.746200-1-rtummala@nvidia.com Fixes: 3bd786f76de2 ("mm: convert do_set_pte() to set_pte_range()") Signed-off-by: Ram Tummala Reviewed-by: Yin Fengwei Cc: Alistair Popple Cc: Matthew Wilcox (Oracle) Cc: Yin Fengwei Cc: Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 1ff7b6f51ec1..34f8402d2046 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4780,7 +4780,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; - bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE); + bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE); pte_t entry; flush_icache_pages(vma, page, nr); -- cgit v1.2.3 From d9592025000b3cf26c742f3505da7b83aedc26d5 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 12 Jul 2024 08:58:55 -0700 Subject: mm: huge_memory: use !CONFIG_64BIT to relax huge page alignment on 32 bit machines Yves-Alexis Perez reported commit 4ef9ad19e176 ("mm: huge_memory: don't force huge page alignment on 32 bit") didn't work for x86_32 [1]. It is because x86_32 uses CONFIG_X86_32 instead of CONFIG_32BIT. !CONFIG_64BIT should cover all 32 bit machines. [1] https://lore.kernel.org/linux-mm/CAHbLzkr1LwH3pcTgM+aGQ31ip2bKqiqEQ8=FQB+t2c3dhNKNHA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20240712155855.1130330-1-yang@os.amperecomputing.com Fixes: 4ef9ad19e176 ("mm: huge_memory: don't force huge page alignment on 32 bit") Signed-off-by: Yang Shi Reported-by: Yves-Alexis Perez Tested-by: Yves-Alexis Perez Acked-by: David Hildenbrand Cc: Ben Hutchings Cc: Christoph Lameter Cc: Jiri Slaby Cc: Matthew Wilcox (Oracle) Cc: Rik van Riel Cc: Salvatore Bonaccorso Cc: Suren Baghdasaryan Cc: [6.8+] Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f9696c94e211..04b72912035d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -877,7 +877,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, loff_t off_align = round_up(off, size); unsigned long len_pad, ret, off_sub; - if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall()) + if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) return 0; if (off_end <= off_align || (off_end - off_align) < size) -- cgit v1.2.3 From d659b715e94ac039803d7601505d3473393fc0be Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 15 Jul 2024 10:04:23 +1000 Subject: mm/huge_memory: avoid PMD-size page cache if needed xarray can't support arbitrary page cache size. the largest and supported page cache size is defined as MAX_PAGECACHE_ORDER by commit 099d90642a71 ("mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray"). However, it's possible to have 512MB page cache in the huge memory's collapsing path on ARM64 system whose base page size is 64KB. 512MB page cache is breaking the limitation and a warning is raised when the xarray entry is split as shown in the following example. 
[root@dhcp-10-26-1-207 ~]# cat /proc/1/smaps | grep KernelPageSize
KernelPageSize:       64 kB
[root@dhcp-10-26-1-207 ~]# cat /tmp/test.c
 :
int main(int argc, char **argv)
{
	const char *filename = TEST_XFS_FILENAME;
	int fd = 0;
	void *buf = (void *)-1, *p;
	int pgsize = getpagesize();
	int ret = 0;

	if (pgsize != 0x10000) {
		fprintf(stdout, "System with 64KB base page size is required!\n");
		return -EPERM;
	}

	system("echo 0 > /sys/devices/virtual/bdi/253:0/read_ahead_kb");
	system("echo 1 > /proc/sys/vm/drop_caches");

	/* Open the xfs file */
	fd = open(filename, O_RDONLY);
	assert(fd > 0);

	/* Create VMA */
	buf = mmap(NULL, TEST_MEM_SIZE, PROT_READ, MAP_SHARED, fd, 0);
	assert(buf != (void *)-1);
	fprintf(stdout, "mapped buffer at 0x%p\n", buf);

	/* Populate VMA */
	ret = madvise(buf, TEST_MEM_SIZE, MADV_NOHUGEPAGE);
	assert(ret == 0);
	ret = madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_READ);
	assert(ret == 0);

	/* Collapse VMA */
	ret = madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE);
	assert(ret == 0);
	ret = madvise(buf, TEST_MEM_SIZE, MADV_COLLAPSE);
	if (ret) {
		fprintf(stdout, "Error %d to madvise(MADV_COLLAPSE)\n", errno);
		goto out;
	}

	/* Split xarray entry. Write permission is needed */
	munmap(buf, TEST_MEM_SIZE);
	buf = (void *)-1;
	close(fd);
	fd = open(filename, O_RDWR);
	assert(fd > 0);
	fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
		  TEST_MEM_SIZE - pgsize, pgsize);
out:
	if (buf != (void *)-1)
		munmap(buf, TEST_MEM_SIZE);
	if (fd > 0)
		close(fd);

	return ret;
}
[root@dhcp-10-26-1-207 ~]# gcc /tmp/test.c -o /tmp/test
[root@dhcp-10-26-1-207 ~]# /tmp/test
------------[ cut here ]------------
WARNING: CPU: 25 PID: 7560 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128
Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib      \
nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct        \
nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4        \
ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm fuse     \
xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 virtio_net    \
sha1_ce net_failover virtio_blk virtio_console failover dimlib virtio_mmio
CPU: 25 PID: 7560 Comm: test Kdump: loaded Not tainted 6.10.0-rc7-gavin+ #9
Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024
pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
pc : xas_split_alloc+0xf8/0x128
lr : split_huge_page_to_list_to_order+0x1c4/0x780
sp : ffff8000ac32f660
x29: ffff8000ac32f660 x28: ffff0000e0969eb0 x27: ffff8000ac32f6c0
x26: 0000000000000c40 x25: ffff0000e0969eb0 x24: 000000000000000d
x23: ffff8000ac32f6c0 x22: ffffffdfc0700000 x21: 0000000000000000
x20: 0000000000000000 x19: ffffffdfc0700000 x18: 0000000000000000
x17: 0000000000000000 x16: ffffd5f3708ffc70 x15: 0000000000000000
x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
x11: ffffffffffffffc0 x10: 0000000000000040 x9 : ffffd5f3708e692c
x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff0000e0969eb8
x5 : ffffd5f37289e378 x4 : 0000000000000000 x3 : 0000000000000c40
x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000
Call trace:
 xas_split_alloc+0xf8/0x128
 split_huge_page_to_list_to_order+0x1c4/0x780
 truncate_inode_partial_folio+0xdc/0x160
 truncate_inode_pages_range+0x1b4/0x4a8
 truncate_pagecache_range+0x84/0xa0
 xfs_flush_unmap_range+0x70/0x90 [xfs]
 xfs_file_fallocate+0xfc/0x4d8 [xfs]
 vfs_fallocate+0x124/0x2f0
 ksys_fallocate+0x4c/0xa0
 __arm64_sys_fallocate+0x24/0x38
 invoke_syscall.constprop.0+0x7c/0xd8
 do_el0_svc+0xb4/0xd0
 el0_svc+0x44/0x1d8
 el0t_64_sync_handler+0x134/0x150
 el0t_64_sync+0x17c/0x180
Fix it by correcting the supported page cache orders, different sets for DAX and other files. With it corrected, 512MB page cache becomes disallowed on all non-DAX files on ARM64 system where the base page size is 64KB. After this patch is applied, the test program fails with error -EINVAL returned from __thp_vma_allowable_orders() and the madvise() system call to collapse the page caches. Link: https://lkml.kernel.org/r/20240715000423.316491-1-gshan@redhat.com Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache") Signed-off-by: Gavin Shan Acked-by: David Hildenbrand Reviewed-by: Ryan Roberts Acked-by: Zi Yan Cc: Baolin Wang Cc: Barry Song Cc: Don Dutile Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Ryan Roberts Cc: William Kucharski Cc: [5.17+] Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 12 +++++++++--- mm/huge_memory.c | 12 ++++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index cff002be83eb..e25d9ebfdf89 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -74,14 +74,20 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define THP_ORDERS_ALL_ANON ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1))) /* - * Mask of all large folio orders supported for file THP. + * Mask of all large folio orders supported for file THP. Folios in a DAX + * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to + * it. */ -#define THP_ORDERS_ALL_FILE (BIT(PMD_ORDER) | BIT(PUD_ORDER)) +#define THP_ORDERS_ALL_FILE_DAX \ + (BIT(PMD_ORDER) | BIT(PUD_ORDER)) +#define THP_ORDERS_ALL_FILE_DEFAULT \ + ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0)) /* * Mask of all large folio orders supported for THP. */ -#define THP_ORDERS_ALL (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE) +#define THP_ORDERS_ALL \ + (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT) #define TVA_SMAPS (1 << 0) /* Will be used for procfs */ #define TVA_IN_PF (1 << 1) /* Page fault handler */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 04b72912035d..f4be468e06a4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -89,9 +89,17 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, bool smaps = tva_flags & TVA_SMAPS; bool in_pf = tva_flags & TVA_IN_PF; bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; + unsigned long supported_orders; + /* Check the intersection of requested and supported orders. */ - orders &= vma_is_anonymous(vma) ? - THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE; + if (vma_is_anonymous(vma)) + supported_orders = THP_ORDERS_ALL_ANON; + else if (vma_is_dax(vma)) + supported_orders = THP_ORDERS_ALL_FILE_DAX; + else + supported_orders = THP_ORDERS_ALL_FILE_DEFAULT; + + orders &= supported_orders; if (!orders) return 0; -- cgit v1.2.3 From bf6acd5d16057d7accbbb1bf7dc6d8c56eeb4ecc Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Wed, 17 Jul 2024 17:20:16 +0100 Subject: decompress_bunzip2: fix rare decompression failure The decompression code parses a huffman tree and counts the number of symbols for a given bit length. In rare cases, there may be >= 256 symbols with a given bit length, causing the unsigned char to overflow. This causes a decompression failure later when the code tries and fails to find the bit length for a given symbol. Since the maximum number of symbols is 258, use unsigned short instead. 
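A standalone sketch of the overflow (illustration only, not the decompressor code): counting 258 symbols of one bit length in an unsigned char wraps around to 2, while an unsigned short keeps the true count, so the later bit-length lookup keyed on that count can fail with the narrower type.

#include <stdio.h>

#define MAX_SYMBOLS 258	/* upper bound quoted in the patch description */

int main(void)
{
	unsigned char  narrow = 0;	/* old counter type: wraps at 256 */
	unsigned short wide = 0;	/* new counter type: large enough */

	for (int i = 0; i < MAX_SYMBOLS; i++) {
		narrow++;
		wide++;
	}

	printf("unsigned char count:  %u\n", (unsigned)narrow);	/* prints 2 */
	printf("unsigned short count: %u\n", (unsigned)wide);	/* prints 258 */
	return 0;
}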
Link: https://lkml.kernel.org/r/20240717162016.1514077-1-ross.lagerwall@citrix.com Fixes: bc22c17e12c1 ("bzip2/lzma: library support for gzip, bzip2 and lzma decompression") Signed-off-by: Ross Lagerwall Cc: Alain Knaff Cc: "H. Peter Anvin" Cc: Signed-off-by: Andrew Morton --- lib/decompress_bunzip2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c index 3518e7394eca..ca736166f100 100644 --- a/lib/decompress_bunzip2.c +++ b/lib/decompress_bunzip2.c @@ -232,7 +232,8 @@ static int INIT get_next_block(struct bunzip_data *bd) RUNB) */ symCount = symTotal+2; for (j = 0; j < groupCount; j++) { - unsigned char length[MAX_SYMBOLS], temp[MAX_HUFCODE_BITS+1]; + unsigned char length[MAX_SYMBOLS]; + unsigned short temp[MAX_HUFCODE_BITS+1]; int minLen, maxLen, pp; /* Read Huffman code lengths for each symbol. They're stored in a way similar to mtf; record a starting -- cgit v1.2.3 From b3bebe44306e23827397d0d774d206e3fa374041 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 17 Jul 2024 14:28:44 -0700 Subject: alloc_tag: outline and export free_reserved_page() Outline and export free_reserved_page() because modules use it and it in turn uses page_ext_{get|put} which should not be exported. The same result could be obtained by outlining {get|put}_page_tag_ref() but that would have higher performance impact as these functions are used in more performance critical paths. Link: https://lkml.kernel.org/r/20240717212844.2749975-1-surenb@google.com Fixes: dcfe378c81f7 ("lib: introduce support for page allocation tagging") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407080044.DWMC9N9I-lkp@intel.com/ Suggested-by: Christoph Hellwig Suggested-by: Vlastimil Babka Acked-by: Vlastimil Babka Cc: Kees Cook Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Cc: [6.10] Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 +--------------- mm/page_alloc.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 4563b560ced7..c4b238a20b76 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3137,21 +3137,7 @@ extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid); /* Free the reserved page into the buddy system, so it gets managed. 
*/ -static inline void free_reserved_page(struct page *page) -{ - if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); - - if (ref) { - set_codetag_empty(ref); - put_page_tag_ref(ref); - } - } - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - adjust_managed_page_count(page, 1); -} +void free_reserved_page(struct page *page); #define free_highmem_page(page) free_reserved_page(page) static inline void mark_page_reserved(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8337926b89d4..7ac8d61148fe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5815,6 +5815,23 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char return pages; } +void free_reserved_page(struct page *page) +{ + if (mem_alloc_profiling_enabled()) { + union codetag_ref *ref = get_page_tag_ref(page); + + if (ref) { + set_codetag_empty(ref); + put_page_tag_ref(ref); + } + } + ClearPageReserved(page); + init_page_count(page); + __free_page(page); + adjust_managed_page_count(page, 1); +} +EXPORT_SYMBOL(free_reserved_page); + static int page_alloc_cpu_dead(unsigned int cpu) { struct zone *zone; -- cgit v1.2.3 From f59adcf5933271ab7247c4c8938c67be8905b725 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 23 Jul 2024 17:12:44 +0000 Subject: mm: memcg: add cacheline padding after lruvec in mem_cgroup_per_node Oliver Sand reported a performance regression caused by commit 98c9daf5ae6b ("mm: memcg: guard memcg1-specific members of struct mem_cgroup_per_node"), which puts some fields of the mem_cgroup_per_node structure under the CONFIG_MEMCG_V1 config option. Apparently it causes a false cache sharing between lruvec and lru_zone_size members of the structure. Fix it by adding an explicit padding after the lruvec member. Even though the padding is not required with CONFIG_MEMCG_V1 set, it seems like the introduced memory overhead is not significant enough to warrant another divergence in the mem_cgroup_per_node layout, so the padding is added unconditionally. Link: https://lkml.kernel.org/r/20240723171244.747521-1-roman.gushchin@linux.dev Fixes: 98c9daf5ae6b ("mm: memcg: guard memcg1-specific members of struct mem_cgroup_per_node") Signed-off-by: Roman Gushchin Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202407121335.31a10cb6-oliver.sang@intel.com Tested-by: Oliver Sang Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7e2eb091049a..0e5bf25d324f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -109,6 +109,7 @@ struct mem_cgroup_per_node { /* Fields which get updated often at the end. */ struct lruvec lruvec; + CACHELINE_PADDING(_pad2_); unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; }; -- cgit v1.2.3 From 66eca1021a42856d6af2a9802c99e160278aed91 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 23 Jul 2024 14:44:28 +0800 Subject: mm/page_alloc: fix pcp->count race between drain_pages_zone() vs __rmqueue_pcplist() It's expected that no page should be left in pcp_list after calling zone_pcp_disable() in offline_pages(). Previously, it's observed that offline_pages() gets stuck [1] due to some pages remaining in pcp_list. 
Cause: There is a race condition between drain_pages_zone() and __rmqueue_pcplist() involving the pcp->count variable. See the scenario below:

        CPU0                              CPU1
    ----------------                  ---------------
                                      spin_lock(&pcp->lock);
                                      __rmqueue_pcplist() {
zone_pcp_disable() {
                                        /* list is empty */
                                        if (list_empty(list)) {
                                          /* add pages to pcp_list */
                                          alloced = rmqueue_bulk()
  mutex_lock(&pcp_batch_high_lock)
  ...
  __drain_all_pages() {
    drain_pages_zone() {
      /* read pcp->count, it's 0 here */
      count = READ_ONCE(pcp->count)
      /* 0 means nothing to drain */
      /* update pcp->count */
                                          pcp->count += alloced << order;
      ...
      ...
                                      spin_unlock(&pcp->lock);

In this case, even after zone_pcp_disable() returns, some pages are still left in pcp_list. Because these pages are neither movable nor isolated, offline_pages() gets stuck as a result.

Solution: Expand the scope of the pcp->lock to also protect pcp->count in drain_pages_zone(), to ensure no pages are left in the pcp list after zone_pcp_disable().

[1] https://lore.kernel.org/linux-mm/6a07125f-e720-404c-b2f9-e55f3f166e85@fujitsu.com/

Link: https://lkml.kernel.org/r/20240723064428.1179519-1-lizhijian@fujitsu.com Fixes: 4b23a68f9536 ("mm/page_alloc: protect PCP lists with a spinlock") Signed-off-by: Li Zhijian Reported-by: Yao Xingtao Reviewed-by: Vlastimil Babka Cc: David Hildenbrand Cc: Signed-off-by: Andrew Morton
---
 mm/page_alloc.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7ac8d61148fe..28f80daf5c04 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2343,16 +2343,20 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
 	struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-	int count = READ_ONCE(pcp->count);
-
-	while (count) {
-		int to_drain = min(count, pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX);
-		count -= to_drain;
+	int count;
 
+	do {
 		spin_lock(&pcp->lock);
-		free_pcppages_bulk(zone, to_drain, pcp, 0);
+		count = pcp->count;
+		if (count) {
+			int to_drain = min(count,
+				pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX);
+
+			free_pcppages_bulk(zone, to_drain, pcp, 0);
+			count -= to_drain;
+		}
 		spin_unlock(&pcp->lock);
-	}
+	} while (count);
 }
 
 /*
-- cgit v1.2.3
From f556acc2facdd240de2c7416cd1da48a1dae70ec Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Thu, 18 Jul 2024 10:55:04 +0530 Subject: selftests/mm: skip test for non-LPA2 and non-LVA systems

After my improvement of the test in e4a4ba415419 ("selftests/mm: va_high_addr_switch: dynamically initialize testcases to enable LPA2 testing"), the test begins to fail on 4k and 16k pages on non-LPA2 systems. To reduce noise in the CI systems, let us skip the test when higher address space is not implemented.
Link: https://lkml.kernel.org/r/20240718052504.356517-1-dev.jain@arm.com Fixes: e4a4ba415419 ("selftests/mm: va_high_addr_switch: dynamically initialize testcases to enable LPA2 testing") Signed-off-by: Dev Jain Reviewed-by: Ryan Roberts Cc: Anshuman Khandual Cc: Mark Brown Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c index fa7eabfaf841..896b3f73fc53 100644 --- a/tools/testing/selftests/mm/va_high_addr_switch.c +++ b/tools/testing/selftests/mm/va_high_addr_switch.c @@ -293,6 +293,20 @@ static int run_test(struct testcase *test, int count) return ret; } +#ifdef __aarch64__ +/* Check if userspace VA > 48 bits */ +static int high_address_present(void) +{ + void *ptr = mmap((void *)(1UL << 50), 1, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (ptr == MAP_FAILED) + return 0; + + munmap(ptr, 1); + return 1; +} +#endif + static int supported_arch(void) { #if defined(__powerpc64__) @@ -300,7 +314,7 @@ static int supported_arch(void) #elif defined(__x86_64__) return 1; #elif defined(__aarch64__) - return 1; + return high_address_present(); #else return 0; #endif -- cgit v1.2.3 From 4811f7af6090e8f5a398fbdd766f903ef6c0d787 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 25 Jul 2024 14:20:07 +0900 Subject: nilfs2: handle inconsistent state in nilfs_btnode_create_block() Syzbot reported that a buffer state inconsistency was detected in nilfs_btnode_create_block(), triggering a kernel bug. It is not appropriate to treat this inconsistency as a bug; it can occur if the argument block address (the buffer index of the newly created block) is a virtual block number and has been reallocated due to corruption of the bitmap used to manage its allocation state. So, modify nilfs_btnode_create_block() and its callers to treat it as a possible filesystem error, rather than triggering a kernel bug. Link: https://lkml.kernel.org/r/20240725052007.4562-1-konishi.ryusuke@gmail.com Fixes: a60be987d45d ("nilfs2: B-tree node cache") Signed-off-by: Ryusuke Konishi Reported-by: syzbot+89cc4f2324ed37988b60@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=89cc4f2324ed37988b60 Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 25 ++++++++++++++++++++----- fs/nilfs2/btree.c | 4 ++-- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 0131d83b912d..c034080c334b 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -51,12 +51,21 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node)); if (unlikely(!bh)) - return NULL; + return ERR_PTR(-ENOMEM); if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) || buffer_dirty(bh))) { - brelse(bh); - BUG(); + /* + * The block buffer at the specified new address was already + * in use. This can happen if it is a virtual block number + * and has been reallocated due to corruption of the bitmap + * used to manage its allocation state (if not, the buffer + * clearing of an abandoned b-tree node is missing somewhere). 
+ */ + nilfs_error(inode->i_sb, + "state inconsistency probably due to duplicate use of b-tree node block address %llu (ino=%lu)", + (unsigned long long)blocknr, inode->i_ino); + goto failed; } memset(bh->b_data, 0, i_blocksize(inode)); bh->b_bdev = inode->i_sb->s_bdev; @@ -67,6 +76,12 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) folio_unlock(bh->b_folio); folio_put(bh->b_folio); return bh; + +failed: + folio_unlock(bh->b_folio); + folio_put(bh->b_folio); + brelse(bh); + return ERR_PTR(-EIO); } int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, @@ -217,8 +232,8 @@ retry: } nbh = nilfs_btnode_create_block(btnc, newkey); - if (!nbh) - return -ENOMEM; + if (IS_ERR(nbh)) + return PTR_ERR(nbh); BUG_ON(nbh == obh); ctxt->newbh = nbh; diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index a139970e4804..862bdf23120e 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -63,8 +63,8 @@ static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree, struct buffer_head *bh; bh = nilfs_btnode_create_block(btnc, ptr); - if (!bh) - return -ENOMEM; + if (IS_ERR(bh)) + return PTR_ERR(bh); set_buffer_nilfs_volatile(bh); *bhp = bh; -- cgit v1.2.3
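For readers unfamiliar with the error-return convention this patch adopts, here is a hedged, self-contained sketch of the ERR_PTR()/IS_ERR()/PTR_ERR() pattern (a generic illustration with made-up function names and user-space restatements of the helpers, not nilfs2 code): encoding an errno value in the returned pointer lets the caller distinguish an out-of-memory failure from a detected filesystem inconsistency without an extra output parameter.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* User-space restatements of the kernel helpers, for illustration only. */
#define MAX_ERRNO	4095
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical allocator that can fail in two distinct ways. */
static void *create_block(unsigned long blocknr)
{
	void *buf = malloc(64);

	if (!buf)
		return ERR_PTR(-ENOMEM);	/* out of memory */
	if (blocknr == 0) {			/* pretend this block address is already in use */
		free(buf);
		return ERR_PTR(-EIO);		/* metadata inconsistency */
	}
	return buf;
}

int main(void)
{
	void *bh = create_block(0);

	if (IS_ERR(bh)) {
		printf("create_block failed: %ld\n", PTR_ERR(bh));
		return 1;
	}
	free(bh);
	return 0;
}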