diff options
95 files changed, 2661 insertions, 1506 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 007ba86aef78..6d13f2de6d69 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1288,7 +1288,12 @@ PAGE_SIZE multiple when read back. inactive_anon, active_anon, inactive_file, active_file, unevictable Amount of memory, swap-backed and filesystem-backed, on the internal memory management lists used by the - page reclaim algorithm + page reclaim algorithm. + + As these represent internal list state (eg. shmem pages are on anon + memory management lists), inactive_foo + active_foo may not be equal to + the value for the foo counter, since the foo counter is type-based, not + list-based. slab_reclaimable Part of "slab" that might be reclaimed, such as diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 525296121d89..e4d66e7c50de 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -218,3 +218,66 @@ brk handler is used to print bug reports. A potential expansion of this mode is a hardware tag-based mode, which would use hardware memory tagging support instead of compiler instrumentation and manual shadow memory manipulation. + +What memory accesses are sanitised by KASAN? +-------------------------------------------- + +The kernel maps memory in a number of different parts of the address +space. This poses something of a problem for KASAN, which requires +that all addresses accessed by instrumented code have a valid shadow +region. + +The range of kernel virtual addresses is large: there is not enough +real memory to support a real shadow region for every address that +could be accessed by the kernel. + +By default +~~~~~~~~~~ + +By default, architectures only map real memory over the shadow region +for the linear mapping (and potentially other small areas). For all +other areas - such as vmalloc and vmemmap space - a single read-only +page is mapped over the shadow area. This read-only shadow page +declares all memory accesses as permitted. + +This presents a problem for modules: they do not live in the linear +mapping, but in a dedicated module space. By hooking in to the module +allocator, KASAN can temporarily map real shadow memory to cover +them. This allows detection of invalid accesses to module globals, for +example. + +This also creates an incompatibility with ``VMAP_STACK``: if the stack +lives in vmalloc space, it will be shadowed by the read-only page, and +the kernel will fault when trying to set up the shadow data for stack +variables. + +CONFIG_KASAN_VMALLOC +~~~~~~~~~~~~~~~~~~~~ + +With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the +cost of greater memory usage. Currently this is only supported on x86. + +This works by hooking into vmalloc and vmap, and dynamically +allocating real shadow memory to back the mappings. + +Most mappings in vmalloc space are small, requiring less than a full +page of shadow space. Allocating a full shadow page per mapping would +therefore be wasteful. Furthermore, to ensure that different mappings +use different shadow pages, mappings would have to be aligned to +``KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE``. + +Instead, we share backing space across multiple mappings. We allocate +a backing page when a mapping in vmalloc space uses a particular page +of the shadow region. This page can be shared by other vmalloc +mappings later on. + +We hook in to the vmap infrastructure to lazily clean up unused shadow +memory. + +To avoid the difficulties around swapping mappings around, we expect +that the part of the shadow region that covers the vmalloc space will +not be covered by the early shadow page, but will be left +unmapped. This will require changes in arch-specific code. + +This allows ``VMAP_STACK`` support on x86, and can simplify support of +architectures that do not have a fixed module region. diff --git a/arch/Kconfig b/arch/Kconfig index da75bab22eee..7b861fe3f900 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -836,16 +836,17 @@ config HAVE_ARCH_VMAP_STACK config VMAP_STACK default y bool "Use a virtually-mapped stack" - depends on HAVE_ARCH_VMAP_STACK && !KASAN + depends on HAVE_ARCH_VMAP_STACK + depends on !KASAN || KASAN_VMALLOC ---help--- Enable this if you want the use virtually-mapped kernel stacks with guard pages. This causes kernel stack overflows to be caught immediately rather than causing difficult-to-diagnose corruption. - This is presently incompatible with KASAN because KASAN expects - the stack to map directly to the KASAN shadow map using a formula - that is incorrect if the stack is in vmalloc space. + To use this with KASAN, the architecture must support backing + virtual mappings with real shadow memory, and KASAN_VMALLOC must + be enabled. config ARCH_OPTIONAL_KERNEL_RWX def_bool n diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 7addd0301c51..b917b596f7fb 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -33,7 +33,6 @@ #define _ASM_ARC_PGTABLE_H #include <linux/bits.h> -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopmd.h> #include <asm/page.h> #include <asm/mmu.h> /* to propagate CONFIG_ARC_MMU_VER <n> */ diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 3861543b66a0..fb86bc3e9b35 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -30,6 +30,7 @@ noinline static int handle_kernel_vaddr_fault(unsigned long address) * with the 'reference' page table. */ pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -39,8 +40,13 @@ noinline static int handle_kernel_vaddr_fault(unsigned long address) if (!pgd_present(*pgd_k)) goto bad_area; - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + goto bad_area; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) goto bad_area; diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index a4856bfaedf3..fc8849e4f72e 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -111,12 +111,14 @@ EXPORT_SYMBOL(__kunmap_atomic); static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr) { pgd_t *pgd_k; + p4d_t *p4d_k; pud_t *pud_k; pmd_t *pmd_k; pte_t *pte_k; pgd_k = pgd_offset_k(kvaddr); - pud_k = pud_offset(pgd_k, kvaddr); + p4d_k = p4d_offset(pgd_k, kvaddr); + pud_k = pud_offset(p4d_k, kvaddr); pmd_k = pmd_offset(pud_k, kvaddr); pte_k = (pte_t *)memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h index a069dfcac9a9..4e697bc2f4cd 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h @@ -70,9 +70,6 @@ static inline int get_hugepd_cache_index(int index) /* should not reach */ } -#else /* !CONFIG_HUGETLB_PAGE */ -static inline int pmd_huge(pmd_t pmd) { return 0; } -static inline int pud_huge(pud_t pud) { return 0; } #endif /* CONFIG_HUGETLB_PAGE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h index e3d4dd4ae2fa..34d1018896b3 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -59,9 +59,6 @@ static inline int get_hugepd_cache_index(int index) BUG(); } -#else /* !CONFIG_HUGETLB_PAGE */ -static inline int pmd_huge(pmd_t pmd) { return 0; } -static inline int pud_huge(pud_t pud) { return 0; } #endif /* CONFIG_HUGETLB_PAGE */ static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 6ee17d09649c..974109bb85db 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -13,6 +13,7 @@ #include <linux/memblock.h> #include <linux/of_fdt.h> #include <linux/mm.h> +#include <linux/hugetlb.h> #include <linux/string_helpers.h> #include <linux/stop_machine.h> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0cb1756223be..5e8949953660 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -134,6 +134,7 @@ config X86 select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if X86_64 + select HAVE_ARCH_KASAN_VMALLOC if X86_64 select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 296da58f3013..cf5bc37c90ac 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -245,6 +245,49 @@ static void __init kasan_map_early_shadow(pgd_t *pgd) } while (pgd++, addr = next, addr != end); } +static void __init kasan_shallow_populate_p4ds(pgd_t *pgd, + unsigned long addr, + unsigned long end) +{ + p4d_t *p4d; + unsigned long next; + void *p; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + + if (p4d_none(*p4d)) { + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true); + p4d_populate(&init_mm, p4d, p); + } + } while (p4d++, addr = next, addr != end); +} + +static void __init kasan_shallow_populate_pgds(void *start, void *end) +{ + unsigned long addr, next; + pgd_t *pgd; + void *p; + + addr = (unsigned long)start; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, (unsigned long)end); + + if (pgd_none(*pgd)) { + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true); + pgd_populate(&init_mm, pgd, p); + } + + /* + * we need to populate p4ds to be synced when running in + * four level mode - see sync_global_pgds_l4() + */ + kasan_shallow_populate_p4ds(pgd, addr, next); + } while (pgd++, addr = next, addr != (unsigned long)end); +} + #ifdef CONFIG_KASAN_INLINE static int kasan_die_handler(struct notifier_block *self, unsigned long val, @@ -354,6 +397,24 @@ void __init kasan_init(void) kasan_populate_early_shadow( kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), + kasan_mem_to_shadow((void *)VMALLOC_START)); + + /* + * If we're in full vmalloc mode, don't back vmalloc space with early + * shadow pages. Instead, prepopulate pgds/p4ds so they are synced to + * the global table and we can populate the lower levels on demand. + */ + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_shallow_populate_pgds( + kasan_mem_to_shadow((void *)VMALLOC_START), + kasan_mem_to_shadow((void *)VMALLOC_END)); + else + kasan_populate_early_shadow( + kasan_mem_to_shadow((void *)VMALLOC_START), + kasan_mem_to_shadow((void *)VMALLOC_END)); + + kasan_populate_early_shadow( + kasan_mem_to_shadow((void *)VMALLOC_END + 1), shadow_cpu_entry_begin); kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 84c4e1f72cbd..799b43191dea 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -19,15 +19,12 @@ #include <linux/memory.h> #include <linux/memory_hotplug.h> #include <linux/mm.h> -#include <linux/mutex.h> #include <linux/stat.h> #include <linux/slab.h> #include <linux/atomic.h> #include <linux/uaccess.h> -static DEFINE_MUTEX(mem_sysfs_mutex); - #define MEMORY_CLASS_NAME "memory" #define to_memory_block(dev) container_of(dev, struct memory_block, dev) @@ -538,12 +535,7 @@ static ssize_t soft_offline_page_store(struct device *dev, if (kstrtoull(buf, 0, &pfn) < 0) return -EINVAL; pfn >>= PAGE_SHIFT; - if (!pfn_valid(pfn)) - return -ENXIO; - /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */ - if (!pfn_to_online_page(pfn)) - return -EIO; - ret = soft_offline_page(pfn_to_page(pfn), 0); + ret = soft_offline_page(pfn, 0); return ret == 0 ? count : ret; } @@ -705,6 +697,8 @@ static void unregister_memory(struct memory_block *memory) * Create memory block devices for the given memory area. Start and size * have to be aligned to memory block granularity. Memory block devices * will be initial |
