diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-06-16 09:40:28 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-06-16 09:40:28 -0700 |
| commit | 70585216fe7730d9fb5453d3e2804e149d0fe201 (patch) | |
| tree | fa5ceaaad40a2a40b5b343d525a6ba18f1fbe1ab | |
| parent | 6b00bc639f1f2beeff3595e1bab9faaa51d23b01 (diff) | |
| parent | ccbd6283a9b640c8d5c2b44db318fd72a63338ff (diff) | |
| download | linux-70585216fe7730d9fb5453d3e2804e149d0fe201.tar.gz linux-70585216fe7730d9fb5453d3e2804e149d0fe201.tar.bz2 linux-70585216fe7730d9fb5453d3e2804e149d0fe201.zip | |
Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton:
"18 patches.
Subsystems affected by this patch series: mm (memory-failure, swap,
slub, hugetlb, memory-failure, slub, thp, sparsemem), and coredump"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
mm/sparse: fix check_usemap_section_nr warnings
mm: thp: replace DEBUG_VM BUG with VM_WARN when unmap fails for split
mm/thp: unmap_mapping_page() to fix THP truncate_cleanup_page()
mm/thp: fix page_address_in_vma() on file THP tails
mm/thp: fix vma_address() if virtual address below file offset
mm/thp: try_to_unmap() use TTU_SYNC for safe splitting
mm/thp: make is_huge_zero_pmd() safe and quicker
mm/thp: fix __split_huge_pmd_locked() on shmem migration entry
mm, thp: use head page in __migration_entry_wait()
mm/slub.c: include swab.h
crash_core, vmcoreinfo: append 'SECTION_SIZE_BITS' to vmcoreinfo
mm/memory-failure: make sure wait for page writeback in memory_failure
mm/hugetlb: expand restore_reserve_on_error functionality
mm/slub: actually fix freelist pointer vs redzoning
mm/slub: fix redzoning for small allocations
mm/slub: clarify verification reporting
mm/swap: fix pte_same_as_swp() not removing uffd-wp bit when compare
mm,hwpoison: fix race with hugetlb page allocation
| -rw-r--r-- | Documentation/vm/slub.rst | 10 | ||||
| -rw-r--r-- | fs/hugetlbfs/inode.c | 1 | ||||
| -rw-r--r-- | include/linux/huge_mm.h | 8 | ||||
| -rw-r--r-- | include/linux/hugetlb.h | 8 | ||||
| -rw-r--r-- | include/linux/mm.h | 3 | ||||
| -rw-r--r-- | include/linux/rmap.h | 1 | ||||
| -rw-r--r-- | include/linux/swapops.h | 15 | ||||
| -rw-r--r-- | kernel/crash_core.c | 1 | ||||
| -rw-r--r-- | mm/huge_memory.c | 56 | ||||
| -rw-r--r-- | mm/hugetlb.c | 135 | ||||
| -rw-r--r-- | mm/internal.h | 53 | ||||
| -rw-r--r-- | mm/memory-failure.c | 36 | ||||
| -rw-r--r-- | mm/memory.c | 41 | ||||
| -rw-r--r-- | mm/migrate.c | 1 | ||||
| -rw-r--r-- | mm/page_vma_mapped.c | 27 | ||||
| -rw-r--r-- | mm/pgtable-generic.c | 5 | ||||
| -rw-r--r-- | mm/rmap.c | 39 | ||||
| -rw-r--r-- | mm/slab_common.c | 3 | ||||
| -rw-r--r-- | mm/slub.c | 37 | ||||
| -rw-r--r-- | mm/sparse.c | 13 | ||||
| -rw-r--r-- | mm/swapfile.c | 2 | ||||
| -rw-r--r-- | mm/truncate.c | 43 |
22 files changed, 386 insertions, 152 deletions
diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst index 03f294a638bd..d3028554b1e9 100644 --- a/Documentation/vm/slub.rst +++ b/Documentation/vm/slub.rst @@ -181,7 +181,7 @@ SLUB Debug output Here is a sample of slub debug output:: ==================================================================== - BUG kmalloc-8: Redzone overwritten + BUG kmalloc-8: Right Redzone overwritten -------------------------------------------------------------------- INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc @@ -189,10 +189,10 @@ Here is a sample of slub debug output:: INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58 INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554 - Bytes b4 0xc90f6d10: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ - Object 0xc90f6d20: 31 30 31 39 2e 30 30 35 1019.005 - Redzone 0xc90f6d28: 00 cc cc cc . - Padding 0xc90f6d50: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ + Bytes b4 (0xc90f6d10): 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ + Object (0xc90f6d20): 31 30 31 39 2e 30 30 35 1019.005 + Redzone (0xc90f6d28): 00 cc cc cc . + Padding (0xc90f6d50): 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ [<c010523d>] dump_trace+0x63/0x1eb [<c01053df>] show_trace_log_lvl+0x1a/0x2f diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 55efd3dd04f6..30dee68458c7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -735,6 +735,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, __SetPageUptodate(page); error = huge_add_to_page_cache(page, mapping, index); if (unlikely(error)) { + restore_reserve_on_error(h, &pseudo_vma, addr, page); put_page(page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9626fda5efce..2a8ebe6c222e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -286,6 +286,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *huge_zero_page; +extern unsigned long huge_zero_pfn; static inline bool is_huge_zero_page(struct page *page) { @@ -294,7 +295,7 @@ static inline bool is_huge_zero_page(struct page *page) static inline bool is_huge_zero_pmd(pmd_t pmd) { - return is_huge_zero_page(pmd_page(pmd)); + return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd); } static inline bool is_huge_zero_pud(pud_t pud) @@ -440,6 +441,11 @@ static inline bool is_huge_zero_page(struct page *page) return false; } +static inline bool is_huge_zero_pmd(pmd_t pmd) +{ + return false; +} + static inline bool is_huge_zero_pud(pud_t pud) { return false; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b92f25ccef58..6504346a1947 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -149,6 +149,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool isolate_huge_page(struct page *page, struct list_head *list); +int get_hwpoison_huge_page(struct page *page, bool *hugetlb); void putback_active_hugepage(struct page *page); void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); @@ -339,6 +340,11 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list) return false; } +static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +{ + return 0; +} + static inline void putback_active_hugepage(struct page *page) { } @@ -604,6 +610,8 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); +void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, + unsigned long address, struct page *page); /* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h); diff --git a/include/linux/mm.h b/include/linux/mm.h index c274f75efcf9..8ae31622deef 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1719,6 +1719,7 @@ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ + struct page *single_page; /* Locked page to be unmapped */ }; struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, @@ -1766,6 +1767,7 @@ extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); +void unmap_mapping_page(struct page *page); void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, @@ -1786,6 +1788,7 @@ static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address, BUG(); return -EFAULT; } +static inline void unmap_mapping_page(struct page *page) { } static inline void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { } static inline void unmap_mapping_range(struct address_space *mapping, diff --git a/include/linux/rmap.h b/include/linux/rmap.h index def5c62c93b3..8d04e7deedc6 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -91,6 +91,7 @@ enum ttu_flags { TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ + TTU_SYNC = 0x10, /* avoid racy checks with PVMW_SYNC */ TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */ TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will diff --git a/include/linux/swapops.h b/include/linux/swapops.h index d9b7c9132c2f..6430a94c6981 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -23,6 +23,16 @@ #define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT) #define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1) +/* Clear all flags but only keep swp_entry_t related information */ +static inline pte_t pte_swp_clear_flags(pte_t pte) +{ + if (pte_swp_soft_dirty(pte)) + pte = pte_swp_clear_soft_dirty(pte); + if (pte_swp_uffd_wp(pte)) + pte = pte_swp_clear_uffd_wp(pte); + return pte; +} + /* * Store a type+offset into a swp_entry_t in an arch-independent format */ @@ -66,10 +76,7 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte) { swp_entry_t arch_entry; - if (pte_swp_soft_dirty(pte)) - pte = pte_swp_clear_soft_dirty(pte); - if (pte_swp_uffd_wp(pte)) - pte = pte_swp_clear_uffd_wp(pte); + pte = pte_swp_clear_flags(pte); arch_entry = __pte_to_swp_entry(pte); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 825284baaf46..684a6061a13a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -464,6 +464,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); + VMCOREINFO_NUMBER(SECTION_SIZE_BITS); VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); #endif VMCOREINFO_STRUCT_SIZE(page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 63ed6b25deaa..6d2a0119fc58 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -62,6 +62,7 @@ static struct shrinker deferred_split_shrinker; static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; +unsigned long huge_zero_pfn __read_mostly = ~0UL; bool transparent_hugepage_enabled(struct vm_area_struct *vma) { @@ -98,6 +99,7 @@ retry: __free_pages(zero_page, compound_order(zero_page)); goto retry; } + WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page)); /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); @@ -147,6 +149,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { struct page *zero_page = xchg(&huge_zero_page, NULL); BUG_ON(zero_page == NULL); + WRITE_ONCE(huge_zero_pfn, ~0UL); __free_pages(zero_page, compound_order(zero_page)); return HPAGE_PMD_NR; } @@ -2044,7 +2047,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, count_vm_event(THP_SPLIT_PMD); if (!vma_is_anonymous(vma)) { - _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); /* * We are going to unmap this huge page. So * just go ahead and zap it @@ -2053,16 +2056,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, zap_deposited_table(mm, pmd); if (vma_is_special_huge(vma)) return; - page = pmd_page(_pmd); - if (!PageDirty(page) && pmd_dirty(_pmd)) - set_page_dirty(page); - if (!PageReferenced(page) && pmd_young(_pmd)) - SetPageReferenced(page); - page_remove_rmap(page, true); - put_page(page); + if (unlikely(is_pmd_migration_entry(old_pmd))) { + swp_entry_t entry; + + entry = pmd_to_swp_entry(old_pmd); + page = migration_entry_to_page(entry); + } else { + page = pmd_page(old_pmd); + if (!PageDirty(page) && pmd_dirty(old_pmd)) + set_page_dirty(page); + if (!PageReferenced(page) && pmd_young(old_pmd)) + SetPageReferenced(page); + page_remove_rmap(page, true); + put_page(page); + } add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); return; - } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) { + } + + if (is_huge_zero_pmd(*pmd)) { /* * FIXME: Do we want to invalidate secondary mmu by calling * mmu_notifier_invalidate_range() see comments below inside @@ -2338,17 +2350,17 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void unmap_page(struct page *page) { - enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; - bool unmap_success; VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) ttu_flags |= TTU_SPLIT_FREEZE; - unmap_success = try_to_unmap(page, ttu_flags); - VM_BUG_ON_PAGE(!unmap_success, page); + try_to_unmap(page, ttu_flags); + + VM_WARN_ON_ONCE_PAGE(page_mapped(page), page); } static void remap_page(struct page *page, unsigned int nr) @@ -2659,7 +2671,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) struct deferred_split *ds_queue = get_deferred_split_queue(head); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; - int count, mapcount, extra_pins, ret; + int extra_pins, ret; pgoff_t end; VM_BUG_ON_PAGE(is_huge_zero_page(head), head); @@ -2718,7 +2730,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } unmap_page(head); - VM_BUG_ON_PAGE(compound_mapcount(head), head); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); @@ -2736,9 +2747,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); - count = page_count(head); - mapcount = total_mapcount(head); - if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { + if (page_ref_freeze(head, 1 + extra_pins)) { if (!list_empty(page_deferred_list(head))) { ds_queue->split_queue_len--; list_del(page_deferred_list(head)); @@ -2758,16 +2767,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) __split_huge_page(page, list, end); ret = 0; } else { - if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { - pr_alert("total_mapcount: %u, page_count(): %u\n", - mapcount, count); - if (PageTail(page)) - dump_page(head, NULL); - dump_page(page, "total_mapcount(head) > 0"); - BUG(); - } spin_unlock(&ds_queue->split_queue_lock); -fail: if (mapping) +fail: + if (mapping) xa_unlock(&mapping->i_pages); local_irq_enable(); remap_page(head, thp_nr_pages(head)); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5560b50876fb..e0a5f9cbbece 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2121,12 +2121,18 @@ out: * be restored when a newly allocated huge page must be freed. It is * to be called after calling vma_needs_reservation to determine if a * reservation exists. + * + * vma_del_reservation is used in error paths where an entry in the reserve + * map was created during huge page allocation and must be removed. It is to + * be called after calling vma_needs_reservation to determine if a reservation + * exists. */ enum vma_resv_mode { VMA_NEEDS_RESV, VMA_COMMIT_RESV, VMA_END_RESV, VMA_ADD_RESV, + VMA_DEL_RESV, }; static long __vma_reservation_common(struct hstate *h, struct vm_area_struct *vma, unsigned long addr, @@ -2170,11 +2176,21 @@ static long __vma_reservation_common(struct hstate *h, ret = region_del(resv, idx, idx + 1); } break; + case VMA_DEL_RESV: + if (vma->vm_flags & VM_MAYSHARE) { + region_abort(resv, idx, idx + 1, 1); + ret = region_del(resv, idx, idx + 1); + } else { + ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); + /* region_add calls of range 1 should never fail. */ + VM_BUG_ON(ret < 0); + } + break; default: BUG(); } - if (vma->vm_flags & VM_MAYSHARE) + if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV) return ret; /* * We know private mapping must have HPAGE_RESV_OWNER set. @@ -2222,25 +2238,39 @@ static long vma_add_reservation(struct hstate *h, return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); } +static long vma_del_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV); +} + /* - * This routine is called to restore a reservation on error paths. In the - * specific error paths, a huge page was allocated (via alloc_huge_page) - * and is about to be freed. If a reservation for the page existed, - * alloc_huge_page would have consumed the reservation and set - * HPageRestoreReserve in the newly allocated page. When the page is freed - * via free_huge_page, the global reservation count will be incremented if - * HPageRestoreReserve is set. However, free_huge_page can not adjust the - * reserve map. Adjust the reserve map here to be consistent with global - * reserve count adjustments to be made by free_huge_page. + * This routine is called to restore reservation information on error paths. + * It should ONLY be called for pages allocated via alloc_huge_page(), and + * the hugetlb mutex should remain held when calling this routine. + * + * It handles two specific cases: + * 1) A reservation was in place and the page consumed the reservation. + * HPageRestoreReserve is set in the page. + * 2) No reservation was in place for the page, so HPageRestoreReserve is + * not set. However, alloc_huge_page always updates the reserve map. + * + * In case 1, free_huge_page later in the error path will increment the + * global reserve count. But, free_huge_page does not have enough context + * to adjust the reservation map. This case deals primarily with private + * mappings. Adjust the reserve map here to be consistent with global + * reserve count adjustments to be made by free_huge_page. Make sure the + * reserve map indicates there is a reservation present. + * + * In case 2, simply undo reserve map modifications done by alloc_huge_page. */ -static void restore_reserve_on_error(struct hstate *h, - struct vm_area_struct *vma, unsigned long address, - struct page *page) +void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, + unsigned long address, struct page *page) { - if (unlikely(HPageRestoreReserve(page))) { - long rc = vma_needs_reservation(h, vma, address); + long rc = vma_needs_reservation(h, vma, address); - if (unlikely(rc < 0)) { + if (HPageRestoreReserve(page)) { + if (unlikely(rc < 0)) /* * Rare out of memory condition in reserve map * manipulation. Clear HPageRestoreReserve so that @@ -2253,16 +2283,57 @@ static void restore_reserve_on_error(struct hstate *h, * accounting of reserve counts. */ ClearHPageRestoreReserve(page); - } else if (rc) { - rc = vma_add_reservation(h, vma, address); - if (unlikely(rc < 0)) + else if (rc) + (void)vma_add_reservation(h, vma, address); + else + vma_end_reservation(h, vma, address); + } else { + if (!rc) { + /* + * This indicates there is an entry in the reserve map + * added by alloc_huge_page. We know it was added + * before the alloc_huge_page call, otherwise + * HPageRestoreReserve would be set on the page. + * Remove the entry so that a subsequent allocation + * does not consume a reservation. + */ + rc = vma_del_reservation(h, vma, address); + if (rc < 0) + /* + * VERY rare out of memory condition. Since + * we can not delete the entry, set + * HPageRestoreReserve so that the reserve + * count will be incremented when the page + * is freed. This reserve will be consumed + * on a subsequent allocation. + */ + SetHPageRestoreReserve(page); + } else if (rc < 0) { + /* + * Rare out of memory condition from + * vma_needs_reservation call. Memory allocation is + * only attempted if a new entry is needed. Therefore, + * this implies there is not an entry in the + * reserve map. + * + * For shared mappings, no entry in the map indicates + * no reservation. We are done. + */ + if (!(vma->vm_flags & VM_MAYSHARE)) /* - * See above comment about rare out of - * memory condition. + * For private mappings, no entry indicates + * a reservation is present. Since we can + * not add an entry, set SetHPageRestoreReserve + * on the page so reserve count will be + * incremented when freed. This reserve will + * be consumed on a subsequent allocation. */ - ClearHPageRestoreReserve(page); + SetHPageRestoreReserve(page); } else - vma_end_reservation(h, vma, address); + /* + * No reservation present, do nothing + */ + vma_end_reservation(h, vma, address); } } @@ -4037,6 +4108,8 @@ again: spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { + restore_reserve_on_error(h, vma, addr, + new); put_page(new); /* dst_entry won't change as in child */ goto again; @@ -5006,6 +5079,7 @@ out_release_unlock: if (vm_shared || is_continue) unlock_page(page); out_release_nounlock: + restore_reserve_on_error(h, dst_vma, dst_addr, page); put_page(page); goto out; } @@ -5857,6 +5931,21 @@ unlock: return ret; } +int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +{ + int ret = 0; + + *hugetlb = false; + spin_lock_irq(&hugetlb_lock); + if (PageHeadHuge(page)) { + *hugetlb = true; + if (HPageFreed(page) || HPageMigratable(page)) + ret = get_page_unless_zero(page); + } + spin_unlock_irq(&hugetlb_lock); + return ret; +} + void putback_active_hugepage(struct page *page) { spin_lock_irq(&hugetlb_lock); diff --git a/mm/internal.h b/mm/internal.h index 2f1182948aa6..e8fdb531f887 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -384,27 +384,52 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /* - * At what user virtual address is page expected in @vma? + * At what user virtual address is page expected in vma? + * Returns -EFAULT if all of the page is outside the range of vma. + * If page is a compound head, the entire compound page is considered. */ static inline unsigned long -__vma_address(struct page *page, struct vm_area_struct *vma) +vma_address(struct page *page, struct vm_area_struct *vma) { - pgoff_t pgoff = page_to_pgoff(page); - return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + pgoff_t pgoff; + unsigned long address; + + VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ + pgoff = page_to_pgoff(page); + if (pgoff >= vma->vm_pgoff) { + address = vma->vm_start + + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + /* Check for address beyond vma (or wrapped through 0?) */ + if (address < vma->vm_start || address >= vma->vm_end) + address = -EFAULT; + } else if (PageHead(page) && + pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) { + /* Test above avoids possibility of wrap to 0 on 32-bit */ + address = vma->vm_start; + } else { + address = -EFAULT; + } + return address; } +/* + * Then at what user virtual address will none of the page be found in vma? + * Assumes that vma_address() already returned a good starting address. + * If page is a compound head, the entire compound page is considered. + */ static inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) +vma_address_end(struct page *page, struct vm_area_struct *vma) { - unsigned long start, end; - - start = __vma_address(page, vma); - end = start + thp_size(page) - PAGE_SIZE; - - /* page should be within @vma mapping range */ - VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma); - - return max(start, vma->vm_start); + pgoff_t pgoff; + unsigned long address; + + VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ + pgoff = page_to_pgoff(page) + compound_nr(page); + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + /* Check for address beyond vma (or wrapped through 0?) */ + if (address < vma->vm_start || address > vma->vm_end) + address = vma->vm_end; + return address; } static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 85ad98c00fd9..0143d32bc666 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -949,6 +949,17 @@ static int page_action(struct page_state *ps, struct page *p, return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; } +/* + * Return true if a page type of a given page is supported by hwpoison + * mechanism (while handling could fail), otherwise false. This function + * does not return true for hugetlb or device memory pages, so it's assumed + * to be called only in the context where we never have such pages. + */ +static inline bool HWPoisonHandlable(struct page *page) +{ + return PageLRU(page) || __PageMovable(page); +} + /** * __get_hwpoison_page() - Get refcount for memory error handling: * @page: raw error page (hit by memory error) @@ -959,8 +970,22 @@ static int page_action(struct page_state *ps, struct page *p, static int __get_hwpoison_page(struct page *page) { struct page *head = compound_head(page); + int ret = 0; + bool hugetlb = false; + + ret = get_hwpoison_huge_page(head, &hugetlb); + if (hugetlb) + return ret; - if (!PageHuge(head) && PageTransHuge(head)) { + /* + * This check prevents from calling get_hwpoison_unless_zero() + * for any unsupported type of page in order to reduce the risk of + * unexpected races caused by taking a page refcount. + */ + if (!HWPoisonHandlable(head)) + return 0; + + if (PageTransHuge(head)) { /* * Non anonymous thp exists only in allocation/free time. We * can't handle such a case correctly, so let's give it up. @@ -1017,7 +1042,7 @@ try_again: ret = -EIO; } } else { - if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) { + if (PageHuge(p) || HWPoisonHandlable(p)) { ret = 1; } else { /* @@ -1527,7 +1552,12 @@ try_again: return 0; } - if (!PageTransTail(p) && !PageLRU(p)) + /* + * __munlock_pagevec may clear a writeback page's LRU flag without + * page_lock. We need wait writeback completion for this page or it + * may trigger vfs BUG while evict inode. + */ + if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p)) goto identify_page_state; /* diff --git a/mm/memory.c b/mm/memory.c index f3ffab9b9e39..486f4a2874e7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1361,7 +1361,18 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ + } else if (details && details->single_page && + PageTransCompound(details->single_page) && + next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { + spinlock_t *ptl = pmd_lock(tlb->mm, pmd); + /* + * Take and drop THP pmd lock so that we cannot return + * prematurely, while zap_huge_pmd() has cleared *pmd, + * but not yet decremented compound_mapcount(). + */ + spin_unlock(ptl); } + /* * Here there can be other concurrent MADV_DONTNEED or * trans huge page faults running, and if the pmd is @@ -3237,6 +3248,36 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, } /** + * unmap_mapping_page() - Unmap single page from processes. + * @page: The locked page to be unmapped. + * + * Unmap this page from any userspace process which still has it mmaped. + * Typically, for efficiency, the range of nearby pages has already been + * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once + * truncation or invalidation holds the lock on a page, it may find that + * the page has been remapped again: and then uses unmap_mapping_page() + * to unmap it finally. + */ +void unmap_mapping_page(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct zap_details details = { }; + + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageTail(page)); + + details.check_mapping = mapping; + details.first_index = page->index; + details.last_index = page->index + thp_nr_pages(page) - 1; + details.single_page = page; + + i_mmap_lock_write(mapping); + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) + unmap_mapping_range_tree(&mapping->i_mmap, &details); + i_mmap_unlock_write(mapping); +} + +/** * unmap_mapping_pages() - Unmap pages from processes. * @mapping: The address space containing pages to be unmapped. * @start: Index of first page to be unmapped. diff --git a/mm/migrate.c b/mm/migrate.c index b234c3f3acb7..41ff2c9896c4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -295,6 +295,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, goto out; page = migration_entry_to_page(entry); + page = compound_head(page); /* * Once page cache replacement of page migration started, page_count diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 2cf01d933f13..e37bd43904af 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -212,23 +212,34 @@ restart: pvmw->ptl = NULL; } } else if (!pmd_present(pmde)) { + /* + * If PVMW_SYNC, take and drop THP pmd lock so that we + * cannot return prematurely, while zap_huge_pmd() has + * cleared *pmd but not decremented compound_mapcount(). + */ + if ((pvmw->flags & PVMW_SYNC) && + PageTransCompound(pvmw->page)) { + spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); + + spin_unlock(ptl); + } return false; } if (!map_pte(pvmw)) goto next_pte; while (1) { + unsigned long end; + if (check_pte(pvmw)) return true; next_pte: /* Seek to next pte only makes sense for THP */ if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) return not_found(pvmw); + end = vma_address_end(pvmw->page, pvmw->vma); do { pvmw->address += PAGE_SIZE; - if (pvmw->address >= pvmw->vma->vm_end || - pvmw->address >= - __vma_address(pvmw->page, pvmw->vma) + - thp_size(pvmw-&g |
