diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-05-05 13:50:15 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-05-05 13:50:15 -0700 |
| commit | 8404c9fbc84b741f66cff7d4934a25dd2c344452 (patch) | |
| tree | ad9b31db8b954b89a0984760a57aec7526caa1b5 /mm/hugetlb.c | |
| parent | a79cdfba68a13b731004f0aafe1155a83830d472 (diff) | |
| parent | 36f0b35d0894576fe63268ede80d9f5aa140be09 (diff) | |
| download | linux-8404c9fbc84b741f66cff7d4934a25dd2c344452.tar.gz linux-8404c9fbc84b741f66cff7d4934a25dd2c344452.tar.bz2 linux-8404c9fbc84b741f66cff7d4934a25dd2c344452.zip | |
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
"The remainder of the main mm/ queue.
143 patches.
Subsystems affected by this patch series (all mm): pagecache, hugetlb,
userfaultfd, vmscan, compaction, migration, cma, ksm, vmstat, mmap,
kconfig, util, memory-hotplug, zswap, zsmalloc, highmem, cleanups, and
kfence"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (143 commits)
kfence: use power-efficient work queue to run delayed work
kfence: maximize allocation wait timeout duration
kfence: await for allocation using wait_event
kfence: zero guard page after out-of-bounds access
mm/process_vm_access.c: remove duplicate include
mm/mempool: minor coding style tweaks
mm/highmem.c: fix coding style issue
btrfs: use memzero_page() instead of open coded kmap pattern
iov_iter: lift memzero_page() to highmem.h
mm/zsmalloc: use BUG_ON instead of if condition followed by BUG.
mm/zswap.c: switch from strlcpy to strscpy
arm64/Kconfig: introduce ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
x86/Kconfig: introduce ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
mm,memory_hotplug: add kernel boot option to enable memmap_on_memory
acpi,memhotplug: enable MHP_MEMMAP_ON_MEMORY when supported
mm,memory_hotplug: allocate memmap from the added memory range
mm,memory_hotplug: factor out adjusting present pages into adjust_present_page_count()
mm,memory_hotplug: relax fully spanned sections check
drivers/base/memory: introduce memory_block_{online,offline}
mm/memory_hotplug: remove broken locking of zone PCP structures during hot remove
...
Diffstat (limited to 'mm/hugetlb.c')
| -rw-r--r-- | mm/hugetlb.c | 777 |
1 files changed, 517 insertions, 260 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6c72433bec1e..629aa4c2259c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -39,7 +39,6 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/node.h> -#include <linux/userfaultfd_k.h> #include <linux/page_owner.h> #include "internal.h" @@ -94,9 +93,10 @@ static inline bool subpool_is_free(struct hugepage_subpool *spool) return true; } -static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) +static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, + unsigned long irq_flags) { - spin_unlock(&spool->lock); + spin_unlock_irqrestore(&spool->lock, irq_flags); /* If no pages are used, and no other handles to the subpool * remain, give up any reservations based on minimum size and @@ -135,10 +135,12 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, void hugepage_put_subpool(struct hugepage_subpool *spool) { - spin_lock(&spool->lock); + unsigned long flags; + + spin_lock_irqsave(&spool->lock, flags); BUG_ON(!spool->count); spool->count--; - unlock_or_release_subpool(spool); + unlock_or_release_subpool(spool, flags); } /* @@ -157,7 +159,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, if (!spool) return ret; - spin_lock(&spool->lock); + spin_lock_irq(&spool->lock); if (spool->max_hpages != -1) { /* maximum size accounting */ if ((spool->used_hpages + delta) <= spool->max_hpages) @@ -184,7 +186,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, } unlock_ret: - spin_unlock(&spool->lock); + spin_unlock_irq(&spool->lock); return ret; } @@ -198,11 +200,12 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; + unsigned long flags; if (!spool) return delta; - spin_lock(&spool->lock); + spin_lock_irqsave(&spool->lock, flags); if (spool->max_hpages != -1) /* maximum size accounting */ spool->used_hpages -= delta; @@ -223,7 +226,7 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, * If hugetlbfs_put_super couldn't free spool due to an outstanding * quota reference, free it now. */ - unlock_or_release_subpool(spool); + unlock_or_release_subpool(spool, flags); return ret; } @@ -553,7 +556,6 @@ retry: resv->adds_in_progress -= in_regions_needed; spin_unlock(&resv->lock); - VM_BUG_ON(add < 0); return add; } @@ -743,13 +745,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; + bool reserved = false; rsv_adjust = hugepage_subpool_get_pages(spool, 1); - if (rsv_adjust) { + if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode); - hugetlb_acct_memory(h, 1); + if (!hugetlb_acct_memory(h, 1)) + reserved = true; + } else if (!rsv_adjust) { + reserved = true; } + + if (!reserved) + pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); } /* @@ -1059,6 +1068,8 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); + + lockdep_assert_held(&hugetlb_lock); list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; @@ -1068,10 +1079,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) { struct page *page; - bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA); + bool pin = !!(current->flags & PF_MEMALLOC_PIN); + lockdep_assert_held(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { - if (nocma && is_migrate_cma_page(page)) + if (pin && !is_pinnable_page(page)) continue; if (PageHWPoison(page)) @@ -1205,7 +1217,7 @@ static int hstate_next_node_to_alloc(struct hstate *h, } /* - * helper for free_pool_huge_page() - return the previously saved + * helper for remove_pool_huge_page() - return the previously saved * node ["this node"] from which to free a huge page. Advance the * next node id whether or not we find a free huge page to free so * that the next attempt to free addresses the next node. @@ -1273,7 +1285,7 @@ static void free_gigantic_page(struct page *page, unsigned int order) static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { - unsigned long nr_pages = 1UL << huge_page_order(h); + unsigned long nr_pages = pages_per_huge_page(h); if (nid == NUMA_NO_NODE) nid = numa_mem_id(); @@ -1327,6 +1339,42 @@ static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } #endif +/* + * Remove hugetlb page from lists, and update dtor so that page appears + * as just a compound page. A reference is held on the page. + * + * Must be called with hugetlb lock held. + */ +static void remove_hugetlb_page(struct hstate *h, struct page *page, + bool adjust_surplus) +{ + int nid = page_to_nid(page); + + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); + + lockdep_assert_held(&hugetlb_lock); + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + return; + + list_del(&page->lru); + + if (HPageFreed(page)) { + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + } + if (adjust_surplus) { + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; + } + + set_page_refcounted(page); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + + h->nr_huge_pages--; + h->nr_huge_pages_node[nid]--; +} + static void update_and_free_page(struct hstate *h, struct page *page) { int i; @@ -1335,8 +1383,6 @@ static void update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - h->nr_huge_pages--; - h->nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < pages_per_huge_page(h); i++, subpage = mem_map_next(subpage, page, i)) { subpage->flags &= ~(1 << PG_locked | 1 << PG_error | @@ -1344,24 +1390,24 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_active | 1 << PG_private | 1 << PG_writeback); } - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); - set_page_refcounted(page); if (hstate_is_gigantic(h)) { - /* - * Temporarily drop the hugetlb_lock, because - * we might block in free_gigantic_page(). - */ - spin_unlock(&hugetlb_lock); destroy_compound_gigantic_page(page, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); - spin_lock(&hugetlb_lock); } else { __free_pages(page, huge_page_order(h)); } } +static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) +{ + struct page *page, *t_page; + + list_for_each_entry_safe(page, t_page, list, lru) { + update_and_free_page(h, page); + cond_resched(); + } +} + struct hstate *size_to_hstate(unsigned long size) { struct hstate *h; @@ -1373,7 +1419,7 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -static void __free_huge_page(struct page *page) +void free_huge_page(struct page *page) { /* * Can't pass hstate in here because it is called from the @@ -1383,6 +1429,7 @@ static void __free_huge_page(struct page *page) int nid = page_to_nid(page); struct hugepage_subpool *spool = hugetlb_page_subpool(page); bool restore_reserve; + unsigned long flags; VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(page_mapcount(page), page); @@ -1411,7 +1458,7 @@ static void __free_huge_page(struct page *page) restore_reserve = true; } - spin_lock(&hugetlb_lock); + spin_lock_irqsave(&hugetlb_lock, flags); ClearHPageMigratable(page); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); @@ -1421,82 +1468,46 @@ static void __free_huge_page(struct page *page) h->resv_huge_pages++; if (HPageTemporary(page)) { - list_del(&page->lru); - ClearHPageTemporary(page); + remove_hugetlb_page(h, page, false); + spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_page(h, page); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ - list_del(&page->lru); + remove_hugetlb_page(h, page, true); + spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_page(h, page); - h->surplus_huge_pages--; - h->surplus_huge_pages_node[nid]--; } else { arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); + spin_unlock_irqrestore(&hugetlb_lock, flags); } - spin_unlock(&hugetlb_lock); } /* - * As free_huge_page() can be called from a non-task context, we have - * to defer the actual freeing in a workqueue to prevent potential - * hugetlb_lock deadlock. - * - * free_hpage_workfn() locklessly retrieves the linked list of pages to - * be freed and frees them one-by-one. As the page->mapping pointer is - * going to be cleared in __free_huge_page() anyway, it is reused as the - * llist_node structure of a lockless linked list of huge pages to be freed. + * Must be called with the hugetlb lock held */ -static LLIST_HEAD(hpage_freelist); - -static void free_hpage_workfn(struct work_struct *work) +static void __prep_account_new_huge_page(struct hstate *h, int nid) { - struct llist_node *node; - struct page *page; - - node = llist_del_all(&hpage_freelist); - - while (node) { - page = container_of((struct address_space **)node, - struct page, mapping); - node = node->next; - __free_huge_page(page); - } -} -static DECLARE_WORK(free_hpage_work, free_hpage_workfn); - -void free_huge_page(struct page *page) -{ - /* - * Defer freeing if in non-task context to avoid hugetlb_lock deadlock. - */ - if (!in_task()) { - /* - * Only call schedule_work() if hpage_freelist is previously - * empty. Otherwise, schedule_work() had been called but the - * workfn hasn't retrieved the list yet. - */ - if (llist_add((struct llist_node *)&page->mapping, - &hpage_freelist)) - schedule_work(&free_hpage_work); - return; - } - - __free_huge_page(page); + lockdep_assert_held(&hugetlb_lock); + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; } -static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +static void __prep_new_huge_page(struct page *page) { INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); set_hugetlb_cgroup(page, NULL); set_hugetlb_cgroup_rsvd(page, NULL); - spin_lock(&hugetlb_lock); - h->nr_huge_pages++; - h->nr_huge_pages_node[nid]++; - ClearHPageFreed(page); - spin_unlock(&hugetlb_lock); +} + +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +{ + __prep_new_huge_page(page); + spin_lock_irq(&hugetlb_lock); + __prep_account_new_huge_page(h, nid); + spin_unlock_irq(&hugetlb_lock); } static void prep_compound_gigantic_page(struct page *page, unsigned int order) @@ -1693,17 +1704,20 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, } /* - * Free huge page from pool from next node to free. - * Attempt to keep persistent huge pages more or less - * balanced over allowed nodes. + * Remove huge page from pool from next node to free. Attempt to keep + * persistent huge pages more or less balanced over allowed nodes. + * This routine only 'removes' the hugetlb page. The caller must make + * an additional call to free the page to low level allocators. * Called with hugetlb_lock locked. */ -static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, - bool acct_surplus) +static struct page *remove_pool_huge_page(struct hstate *h, + nodemask_t *nodes_allowed, + bool acct_surplus) { int nr_nodes, node; - int ret = 0; + struct page *page = NULL; + lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine @@ -1711,23 +1725,14 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, */ if ((!acct_surplus || h->surplus_huge_pages_node[node]) && !list_empty(&h->hugepage_freelists[node])) { - struct page *page = - list_entry(h->hugepage_freelists[node].next, + page = list_entry(h->hugepage_freelists[node].next, struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[node]--; - if (acct_surplus) { - h->surplus_huge_pages--; - h->surplus_huge_pages_node[node]--; - } - update_and_free_page(h, page); - ret = 1; + remove_hugetlb_page(h, page, acct_surplus); break; } } - return ret; + return page; } /* @@ -1749,7 +1754,7 @@ retry: if (!PageHuge(page)) return 0; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!PageHuge(page)) { rc = 0; goto out; @@ -1758,7 +1763,6 @@ retry: if (!page_count(page)) { struct page *head = compound_head(page); struct hstate *h = page_hstate(head); - int nid = page_to_nid(head); if (h->free_huge_pages - h->resv_huge_pages == 0) goto out; @@ -1767,7 +1771,7 @@ retry: * when it is dissolved. */ if (unlikely(!HPageFreed(head))) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); cond_resched(); /* @@ -1789,15 +1793,14 @@ retry: SetPageHWPoison(page); ClearPageHWPoison(head); } - list_del(&head->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; + remove_hugetlb_page(h, page, false); h->max_huge_pages--; + spin_unlock_irq(&hugetlb_lock); update_and_free_page(h, head); - rc = 0; + return 0; } out: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return rc; } @@ -1839,16 +1842,16 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, if (hstate_is_gigantic(h)) return NULL; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) goto out_unlock; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * We could have raced with the pool size change. * Double check that and simply deallocate the new page @@ -1858,7 +1861,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { SetHPageTemporary(page); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); put_page(page); return NULL; } else { @@ -1867,7 +1870,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, } out_unlock: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return page; } @@ -1917,17 +1920,17 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (h->free_huge_pages - h->resv_huge_pages > 0) { struct page *page; page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); if (page) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return page; } } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); } @@ -1964,6 +1967,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) long needed, allocated; bool alloc_ok = true; + lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { h->resv_huge_pages += delta; @@ -1975,7 +1979,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) ret = -ENOMEM; retry: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); @@ -1992,7 +1996,7 @@ retry: * After retaking hugetlb_lock, we need to recalculate 'needed' * because either resv_huge_pages or free_huge_pages may have changed. */ - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - (h->free_huge_pages + allocated); if (needed > 0) { @@ -2032,12 +2036,12 @@ retry: enqueue_huge_page(h, page); } free: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) put_page(page); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); return ret; } @@ -2049,17 +2053,17 @@ free: * to the associated reservation map. * 2) Free any unused surplus pages that may have been allocated to satisfy * the reservation. As many as unused_resv_pages may be freed. - * - * Called with hugetlb_lock held. However, the lock could be dropped (and - * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, - * we must make sure nobody else can claim pages we are in the process of - * freeing. Do this by ensuring resv_huge_page always is greater than the - * number of huge pages we plan to free when dropping the lock. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; + struct page *page; + LIST_HEAD(page_list); + + lockdep_assert_held(&hugetlb_lock); + /* Uncommit the reservation */ + h->resv_huge_pages -= unused_resv_pages; /* Cannot return gigantic pages currently */ if (hstate_is_gigantic(h)) @@ -2076,24 +2080,21 @@ static void return_unused_surplus_pages(struct hstate *h, * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. This occurs * when the nodes with surplus pages have no free pages. - * free_pool_huge_page() will balance the freed pages across the + * remove_pool_huge_page() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. - * - * Note that we decrement resv_huge_pages as we free the pages. If - * we drop the lock, resv_huge_pages will still be sufficiently large - * to cover subsequent pages we may free. */ while (nr_pages--) { - h->resv_huge_pages--; - unused_resv_pages--; - if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) + page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1); + if (!page) goto out; - cond_resched_lock(&hugetlb_lock); + + list_add(&page->lru, &page_list); } out: - /* Fully uncommit the reservation */ - h->resv_huge_pages -= unused_resv_pages; + spin_unlock_irq(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock_irq(&hugetlb_lock); } @@ -2175,27 +2176,26 @@ static long __vma_reservation_common(struct hstate *h, if (vma->vm_flags & VM_MAYSHARE) return ret; - else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) { - /* - * In most cases, reserves always exist for private mappings. - * However, a file associated with mapping could have been - * hole punched or truncated after reserves were consumed. - * As subsequent fault on such a range will not use reserves. - * Subtle - The reserve map for private mappings has the - * opposite meaning than that of shared mappings. If NO - * entry is in the reserve map, it means a reservation exists. - * If an entry exists in the reserve map, it means the - * reservation has already been consumed. As a result, the - * return value of this routine is the opposite of the - * value returned from reserve map manipulation routines above. - */ - if (ret) - return 0; - else - return 1; - } - else - return ret < 0 ? ret : 0; + /* + * We know private mapping must have HPAGE_RESV_OWNER set. + * + * In most cases, reserves always exist for private mappings. + * However, a file associated with mapping could have been + * hole punched or truncated after reserves were consumed. + * As subsequent fault on such a range will not use reserves. + * Subtle - The reserve map for private mappings has the + * opposite meaning than that of shared mappings. If NO + * entry is in the reserve map, it means a reservation exists. + * If an entry exists in the reserve map, it means the + * reservation has already been consumed. As a result, the + * return value of this routine is the opposite of the + * value returned from reserve map manipulation routines above. + */ + if (ret > 0) + return 0; + if (ret == 0) + return 1; + return ret; } static long vma_needs_reservation(struct hstate *h, @@ -2266,6 +2266,134 @@ static void restore_reserve_on_error(struct hstate *h, } } +/* + * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one + * @h: struct hstate old page belongs to + * @old_page: Old page to dissolve + * @list: List to isolate the page in case we need to + * Returns 0 on success, otherwise negated error. + */ +static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, + struct list_head *list) +{ + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + int nid = page_to_nid(old_page); + struct page *new_page; + int ret = 0; + + /* + * Before dissolving the page, we need to allocate a new one for the + * pool to remain stable. Using alloc_buddy_huge_page() allows us to + * not having to deal with prep_new_huge_page() and avoids dealing of any + * counters. This simplifies and let us do the whole thing under the + * lock. + */ + new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); + if (!new_page) + return -ENOMEM; + +retry: + spin_lock_irq(&hugetlb_lock); + if (!PageHuge(old_page)) { + /* + * Freed from under us. Drop new_page too. + */ + goto free_new; + } else if (page_count(old_page)) { + /* + * Someone has grabbed the page, try to isolate it here. + * Fail with -EBUSY if not possible. + */ + spin_unlock_irq(&hugetlb_lock); + if (!isolate_huge_page(old_page, list)) + ret = -EBUSY; + spin_lock_irq(&hugetlb_lock); + goto free_new; + } else if (!HPageFreed(old_page)) { + /* + * Page's refcount is 0 but it has not been enqueued in the + * freelist yet. Race window is small, so we can succeed here if + * we retry. + */ + spin_unlock_irq(&hugetlb_lock); + cond_resched(); + goto retry; + } else { + /* + * Ok, old_page is still a genuine free hugepage. Remove it from + * the freelist and decrease the counters. These will be + * incremented again when calling __prep_account_new_huge_page() + * and enqueue_huge_page() for new_page. The counters will remain + * stable since this happens under the lock. + */ + remove_hugetlb_page(h, old_page, false); + + /* + * new_page needs to be initialized with the standard hugetlb + * state. This is normally done by prep_new_huge_page() but + * that takes hugetlb_lock which is already held so we need to + * open code it here. + * Reference count trick is needed because allocator gives us + * referenced page but the pool requires pages with 0 refcount. + */ + __prep_new_huge_page(new_page); + __prep_account_new_huge_page(h, nid); + page_ref_dec(new_page); + enqueue_huge_page(h, new_page); + + /* + * Pages have been replaced, we can safely free the old one. + */ + spin_unlock_irq(&hugetlb_lock); + update_and_free_page(h, old_page); + } + + return ret; + +free_new: + spin_unlock_irq(&hugetlb_lock); + __free_pages(new_page, huge_page_order(h)); + + return ret; +} + +int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) +{ + struct hstate *h; + struct page *head; + int ret = -EBUSY; + + /* + * The page might have been dissolved from under our feet, so make sure + * to carefully check the state under the lock. + * Return success when racing as if we dissolved the page ourselves. + */ + spin_lock_irq(&hugetlb_lock); + if (PageHuge(page)) { + head = compound_head(page); + h = page_hstate(head); + } else { + spin_unlock_irq(&hugetlb_lock); + return 0; + } + spin_unlock_irq(&hugetlb_lock); + + /* + * Fence off gigantic pages as there is a cyclic dependency between + * alloc_contig_range and them. Return -ENOMEM as this has the effect + * of bailing out right away without further retrying. + */ + if (hstate_is_gigantic(h)) + return -ENOMEM; + + if (page_count(head) && isolate_huge_page(head, list)) + ret = 0; + else if (!page_count(head)) + ret = alloc_and_dissolve_huge_page(h, head, list); + + return ret; +} + struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { @@ -2316,7 +2444,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, /* If this allocation is not consuming a reservation, charge it now. */ - deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma); + deferred_reserve = map_chg || avoid_reserve; if (deferred_reserve) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); @@ -2328,7 +2456,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, if (ret) goto out_uncharge_cgroup_reservation; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * glb_chg is passed to indicate whether or not a page must be taken * from the global free pool (global change). gbl_chg == 0 indicates @@ -2336,7 +2464,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, */ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; @@ -2344,7 +2472,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, SetHPageRestoreReserve(page); h->resv_huge_pages--; } - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); list_add(&page->lru, &h->hugepage_activelist); /* Fall through */ } @@ -2357,7 +2485,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, h_cg, page); } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); hugetlb_set_page_subpool(page, spool); @@ -2547,24 +2675,32 @@ static void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { int i; + LIST_HEAD(page_list); + lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h)) return; + /* + * Collect pages to be freed on a list, and free after dropping lock + */ for_each_node_mask(i, *nodes_allowed) { struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(page, next, freel, lru) { if (count >= h->nr_huge_pages) - return; + goto out; if (PageHighMem(page)) continue; - list_del(&page->lru); - update_and_free_page(h, page); - h->free_huge_pages--; - h->free_huge_pages_node[page_to_nid(page)]--; + remove_hugetlb_page(h, page, false); + list_add(&page->lru, &page_list); } } + +out: + spin_unlock_irq(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock_irq(&hugetlb_lock); } #else static inline void try_to_free_low(struct hstate *h, unsigned long count, @@ -2583,6 +2719,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, { int nr_nodes, node; + lockdep_assert_held(&hugetlb_lock); VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { @@ -2610,6 +2747,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long min_count, ret; + struct page *page; + LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); /* @@ -2622,7 +2761,12 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, else return -ENOMEM; - spin_lock(&hugetlb_lock); + /* + * resize_lock mutex prevents concurrent adjustments to number of + * pages in hstate via the proc/sysfs interfaces. + */ + mutex_lock(&h->resize_lock); + spin_lock_irq(&hugetlb_lock); /* * Check for a node specific request. @@ -2653,7 +2797,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, */ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); + mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return -EINVAL; } @@ -2682,14 +2827,14 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * page, free_huge_page will handle it by freeing the page * and reducing the surplus. */ - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); /* yield cpu to avoid soft lockup */ cond_resched(); ret = alloc_pool_huge_page(h, nodes_allowed, node_alloc_noretry); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!ret) goto out; @@ -2716,18 +2861,30 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); try_to_free_low(h, min_count, nodes_allowed); + + /* + * Collect pages to be removed on list without dropping lock + */ while (min_count < persistent_huge_pages(h)) { - if (!free_pool_huge_page(h, nodes_allowed, 0)) + page = remove_pool_huge_page(h, nodes_allowed, 0); + if (!page) break; - cond_resched_lock(&hugetlb_lock); + + list_add(&page->lru, &page_list); } + /* free the pages after dropping lock */ + spin_unlock_irq(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock_irq(&hugetlb_lock); + while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; } out: h->max_huge_pages = persistent_huge_pages(h); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); + mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); @@ -2882,9 +3039,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, if (err) return err; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = input; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return count; } @@ -3215,6 +3372,7 @@ void __init hugetlb_add_hstate(unsigned int order) BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order == 0); h = &hstates[hugetlb_max_hstate++]; + mutex_init(&h->resize_lock); h->order = order; h->mask = ~(huge_page_size(h) - 1); for (i = 0; i < MAX_NUMNODES; ++i) @@ -3267,10 +3425,10 @@ static int __init hugepages_setup(char *s) /* * Global state is always initialized later in hugetlb_init. - * But we need to allocate >= MAX_ORDER hstates here early to still + * But we need to allocate gigantic hstates here early to still * use the bootmem allocator. */ - if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) + if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate)) hugetlb_hstate_alloc_pages(parsed_hstate); last_mhp = mhp; @@ -3470,9 +3628,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, goto out; if (write) { - spin_lock(&hugetlb_loc |
