summaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c1251
1 files changed, 867 insertions, 384 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cfc72873961d..0817d88383d5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -72,7 +72,6 @@
#include <linux/padata.h>
#include <linux/khugepaged.h>
#include <linux/buffer_head.h>
-
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -108,9 +107,38 @@ typedef int __bitwise fpi_t;
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+/*
+ * Don't poison memory with KASAN (only for the tag-based modes).
+ * During boot, all non-reserved memblock memory is exposed to page_alloc.
+ * Poisoning all that memory lengthens boot time, especially on systems with
+ * large amount of RAM. This flag is used to skip that poisoning.
+ * This is only done for the tag-based KASAN modes, as those are able to
+ * detect memory corruptions with the memory tags assigned by default.
+ * All memory allocated normally after boot gets poisoned as usual.
+ */
+#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
-#define MIN_PERCPU_PAGELIST_FRACTION (8)
+#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
+
+struct pagesets {
+ local_lock_t lock;
+#if defined(CONFIG_DEBUG_INFO_BTF) && \
+ !defined(CONFIG_DEBUG_LOCK_ALLOC) && \
+ !defined(CONFIG_PAHOLE_HAS_ZEROSIZE_PERCPU_SUPPORT)
+ /*
+ * pahole 1.21 and earlier gets confused by zero-sized per-CPU
+ * variables and produces invalid BTF. Ensure that
+ * sizeof(struct pagesets) != 0 for older versions of pahole.
+ */
+ char __pahole_hack;
+ #warning "pahole too old to support zero-sized struct pagesets"
+#endif
+};
+static DEFINE_PER_CPU(struct pagesets, pagesets) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
@@ -165,12 +193,12 @@ EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;
-int percpu_pagelist_fraction;
+int percpu_pagelist_high_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
-DEFINE_STATIC_KEY_FALSE(init_on_alloc);
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);
-DEFINE_STATIC_KEY_FALSE(init_on_free);
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
EXPORT_SYMBOL(init_on_free);
static bool _init_on_alloc_enabled_early __read_mostly
@@ -321,20 +349,7 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
-#ifdef CONFIG_DISCONTIGMEM
-/*
- * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
- * are not on separate NUMA nodes. Functionally this works but with
- * watermark_boost_factor, it can reclaim prematurely as the ranges can be
- * quite small. By default, do not boost watermarks on discontigmem as in
- * many cases very high-order allocations like THP are likely to be
- * unsupported and the premature reclaim offsets the advantage of long-term
- * fragmentation avoidance.
- */
-int watermark_boost_factor __read_mostly;
-#else
int watermark_boost_factor __read_mostly = 15000;
-#endif
int watermark_scale_factor = 10;
static unsigned long nr_kernel_pages __initdata;
@@ -372,7 +387,7 @@ int page_group_by_mobility_disabled __read_mostly;
static DEFINE_STATIC_KEY_TRUE(deferred_pages);
/*
- * Calling kasan_free_pages() only after deferred memory initialization
+ * Calling kasan_poison_pages() only after deferred memory initialization
* has completed. Poisoning pages during deferred memory init will greatly
* lengthen the process and cause problem in large memory systems as the
* deferred pages initialization is done with interrupt disabled.
@@ -384,10 +399,12 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages);
* on-demand allocation and then freed again before the deferred pages
* initialization is done, but this is not likely to happen.
*/
-static inline void kasan_free_nondeferred_pages(struct page *page, int order)
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
- if (!static_branch_unlikely(&deferred_pages))
- kasan_free_pages(page, order);
+ return static_branch_unlikely(&deferred_pages) ||
+ (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+ PageSkipKASanPoison(page);
}
/* Returns true if the struct page for the pfn is uninitialised */
@@ -438,7 +455,12 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
#else
-#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
+{
+ return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+ PageSkipKASanPoison(page);
+}
static inline bool early_page_uninitialised(unsigned long pfn)
{
@@ -452,7 +474,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
#endif
/* Return a pointer to the bitmap storing bits affecting a block of pages */
-static inline unsigned long *get_pageblock_bitmap(struct page *page,
+static inline unsigned long *get_pageblock_bitmap(const struct page *page,
unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
@@ -462,7 +484,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page,
#endif /* CONFIG_SPARSEMEM */
}
-static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
+static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
pfn &= (PAGES_PER_SECTION-1);
@@ -473,7 +495,7 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
}
static __always_inline
-unsigned long __get_pfnblock_flags_mask(struct page *page,
+unsigned long __get_pfnblock_flags_mask(const struct page *page,
unsigned long pfn,
unsigned long mask)
{
@@ -498,13 +520,14 @@ unsigned long __get_pfnblock_flags_mask(struct page *page,
*
* Return: pageblock_bits flags
*/
-unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
- unsigned long mask)
+unsigned long get_pfnblock_flags_mask(const struct page *page,
+ unsigned long pfn, unsigned long mask)
{
return __get_pfnblock_flags_mask(page, pfn, mask);
}
-static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
+static __always_inline int get_pfnblock_migratetype(const struct page *page,
+ unsigned long pfn)
{
return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}
@@ -636,8 +659,7 @@ static void bad_page(struct page *page, const char *reason)
pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
- __dump_page(page, reason);
- dump_page_owner(page);
+ dump_page(page, reason);
print_modules();
dump_stack();
@@ -647,6 +669,57 @@ out:
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
+static inline unsigned int order_to_pindex(int migratetype, int order)
+{
+ int base = order;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (order > PAGE_ALLOC_COSTLY_ORDER) {
+ VM_BUG_ON(order != pageblock_order);
+ base = PAGE_ALLOC_COSTLY_ORDER + 1;
+ }
+#else
+ VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
+#endif
+
+ return (MIGRATE_PCPTYPES * base) + migratetype;
+}
+
+static inline int pindex_to_order(unsigned int pindex)
+{
+ int order = pindex / MIGRATE_PCPTYPES;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (order > PAGE_ALLOC_COSTLY_ORDER) {
+ order = pageblock_order;
+ VM_BUG_ON(order != pageblock_order);
+ }
+#else
+ VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
+#endif
+
+ return order;
+}
+
+static inline bool pcp_allowed_order(unsigned int order)
+{
+ if (order <= PAGE_ALLOC_COSTLY_ORDER)
+ return true;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (order == pageblock_order)
+ return true;
+#endif
+ return false;
+}
+
+static inline void free_the_page(struct page *page, unsigned int order)
+{
+ if (pcp_allowed_order(order)) /* Via pcp? */
+ free_unref_page(page, order);
+ else
+ __free_pages_ok(page, order, FPI_NONE);
+}
+
/*
* Higher-order pages are called "compound pages". They are structured thusly:
*
@@ -665,7 +738,7 @@ out:
void free_compound_page(struct page *page)
{
mem_cgroup_uncharge(page);
- __free_pages_ok(page, compound_order(page), FPI_NONE);
+ free_the_page(page, compound_order(page));
}
void prep_compound_page(struct page *page, unsigned int order)
@@ -764,32 +837,36 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
*/
void init_mem_debugging_and_hardening(void)
{
+ bool page_poisoning_requested = false;
+
+#ifdef CONFIG_PAGE_POISONING
+ /*
+ * Page poisoning is debug page alloc for some arches. If
+ * either of those options are enabled, enable poisoning.
+ */
+ if (page_poisoning_enabled() ||
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+ debug_pagealloc_enabled())) {
+ static_branch_enable(&_page_poisoning_enabled);
+ page_poisoning_requested = true;
+ }
+#endif
+
if (_init_on_alloc_enabled_early) {
- if (page_poisoning_enabled())
+ if (page_poisoning_requested)
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
"will take precedence over init_on_alloc\n");
else
static_branch_enable(&init_on_alloc);
}
if (_init_on_free_enabled_early) {
- if (page_poisoning_enabled())
+ if (page_poisoning_requested)
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
"will take precedence over init_on_free\n");
else
static_branch_enable(&init_on_free);
}
-#ifdef CONFIG_PAGE_POISONING
- /*
- * Page poisoning is debug page alloc for some arches. If
- * either of those options are enabled, enable poisoning.
- */
- if (page_poisoning_enabled() ||
- (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
- debug_pagealloc_enabled()))
- static_branch_enable(&_page_poisoning_enabled);
-#endif
-
#ifdef CONFIG_DEBUG_PAGEALLOC
if (!debug_pagealloc_enabled())
return;
@@ -867,7 +944,7 @@ compaction_capture(struct capture_control *capc, struct page *page,
return false;
/*
- * Do not let lower order allocations polluate a movable pageblock.
+ * Do not let lower order allocations pollute a movable pageblock.
* This might let an unmovable request use a reclaimable pageblock
* and vice-versa but no more than normal fallback logic which can
* have trouble finding a high-order free page.
@@ -1103,7 +1180,7 @@ static inline bool page_expected_state(struct page *page,
if (unlikely((unsigned long)page->mapping |
page_ref_count(page) |
#ifdef CONFIG_MEMCG
- (unsigned long)page_memcg(page) |
+ page->memcg_data |
#endif
(page->flags & check_flags)))
return false;
@@ -1128,7 +1205,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
- if (unlikely(page_memcg(page)))
+ if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
return bad_reason;
@@ -1200,10 +1277,16 @@ out:
return ret;
}
-static void kernel_init_free_pages(struct page *page, int numpages)
+static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
{
int i;
+ if (zero_tags) {
+ for (i = 0; i < numpages; i++)
+ tag_clear_highpage(page + i);
+ return;
+ }
+
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
for (i = 0; i < numpages; i++) {
@@ -1216,9 +1299,10 @@ static void kernel_init_free_pages(struct page *page, int numpages)
}
static __always_inline bool free_pages_prepare(struct page *page,
- unsigned int order, bool check_free)
+ unsigned int order, bool check_free, fpi_t fpi_flags)
{
int bad = 0;
+ bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1276,16 +1360,28 @@ static __always_inline bool free_pages_prepare(struct page *page,
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
- if (want_init_on_free())
- kernel_init_free_pages(page, 1 << order);
kernel_poison_pages(page, 1 << order);
/*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_free_pages and kernel_init_free_pages must be
+ * kept together to avoid discrepancies in behavior.
+ *
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
- kasan_free_nondeferred_pages(page, order);
+ if (kasan_has_integrated_init()) {
+ if (!skip_kasan_poison)
+ kasan_free_pages(page, order);
+ } else {
+ bool init = want_init_on_free();
+
+ if (init)
+ kernel_init_free_pages(page, 1 << order, false);
+ if (!skip_kasan_poison)
+ kasan_poison_pages(page, order, init);
+ }
/*
* arch_free_page() can make the page's contents inaccessible. s390
@@ -1305,9 +1401,9 @@ static __always_inline bool free_pages_prepare(struct page *page,
* to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
* moved from pcp lists to free lists.
*/
-static bool free_pcp_prepare(struct page *page)
+static bool free_pcp_prepare(struct page *page, unsigned int order)
{
- return free_pages_prepare(page, 0, true);
+ return free_pages_prepare(page, order, true, FPI_NONE);
}
static bool bulkfree_pcp_prepare(struct page *page)
@@ -1324,12 +1420,12 @@ static bool bulkfree_pcp_prepare(struct page *page)
* debug_pagealloc enabled, they are checked also immediately when being freed
* to the pcp lists.
*/
-static bool free_pcp_prepare(struct page *page)
+static bool free_pcp_prepare(struct page *page, unsigned int order)
{
if (debug_pagealloc_enabled_static())
- return free_pages_prepare(page, 0, true);
+ return free_pages_prepare(page, order, true, FPI_NONE);
else
- return free_pages_prepare(page, 0, false);
+ return free_pages_prepare(page, order, false, FPI_NONE);
}
static bool bulkfree_pcp_prepare(struct page *page)
@@ -1361,8 +1457,10 @@ static inline void prefetch_buddy(struct page *page)
static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp)
{
- int migratetype = 0;
+ int pindex = 0;
int batch_free = 0;
+ int nr_freed = 0;
+ unsigned int order;
int prefetch_nr = READ_ONCE(pcp->batch);
bool isolated_pageblocks;
struct page *page, *tmp;
@@ -1373,7 +1471,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
* below while (list_empty(list)) loop.
*/
count = min(pcp->count, count);
- while (count) {
+ while (count > 0) {
struct list_head *list;
/*
@@ -1385,24 +1483,31 @@ static void free_pcppages_bulk(struct zone *zone, int count,
*/
do {
batch_free++;
- if (++migratetype == MIGRATE_PCPTYPES)
- migratetype = 0;
- list = &pcp->lists[migratetype];
+ if (++pindex == NR_PCP_LISTS)
+ pindex = 0;
+ list = &pcp->lists[pindex];
} while (list_empty(list));
/* This is the only non-empty list. Free them all. */
- if (batch_free == MIGRATE_PCPTYPES)
+ if (batch_free == NR_PCP_LISTS)
batch_free = count;
+ order = pindex_to_order(pindex);
+ BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH));
do {
page = list_last_entry(list, struct page, lru);
/* must delete to avoid corrupting pcp list */
list_del(&page->lru);
- pcp->count--;
+ nr_freed += 1 << order;
+ count -= 1 << order;
if (bulkfree_pcp_prepare(page))
continue;
+ /* Encode order with the migratetype */
+ page->index <<= NR_PCP_ORDER_WIDTH;
+ page->index |= order;
+
list_add_tail(&page->lru, &head);
/*
@@ -1418,9 +1523,14 @@ static void free_pcppages_bulk(struct zone *zone, int count,
prefetch_buddy(page);
prefetch_nr--;
}
- } while (--count && --batch_free && !list_empty(list));
+ } while (count > 0 && --batch_free && !list_empty(list));
}
+ pcp->count -= nr_freed;
+ /*
+ * local_lock_irq held so equivalent to spin_lock_irqsave for
+ * both PREEMPT_RT and non-PREEMPT_RT configurations.
+ */
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
@@ -1430,14 +1540,19 @@ static void free_pcppages_bulk(struct zone *zone, int count,
*/
list_for_each_entry_safe(page, tmp, &head, lru) {
int mt = get_pcppage_migratetype(page);
+
+ /* mt has been encoded with the order (see above) */
+ order = mt & NR_PCP_ORDER_MASK;
+ mt >>= NR_PCP_ORDER_WIDTH;
+
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
/* Pageblock could have been isolated meanwhile */
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
- __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
- trace_mm_page_pcpu_drain(page, 0, mt);
+ __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
+ trace_mm_page_pcpu_drain(page, order, mt);
}
spin_unlock(&zone->lock);
}
@@ -1447,13 +1562,15 @@ static void free_one_page(struct zone *zone,
unsigned int order,
int migratetype, fpi_t fpi_flags)
{
- spin_lock(&zone->lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&zone->lock, flags);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
- spin_unlock(&zone->lock);
+ spin_unlock_irqrestore(&zone->lock, flags);
}
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1536,16 +1653,22 @@ static void __free_pages_ok(struct page *page, unsigned int order,
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
+ struct zone *zone = page_zone(page);
- if (!free_pages_prepare(page, order, true))
+ if (!free_pages_prepare(page, order, true, fpi_flags))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
- local_irq_save(flags);
+
+ spin_lock_irqsave(&zone->lock, flags);
+ if (unlikely(has_isolate_pageblock(zone) ||
+ is_migrate_isolate(migratetype))) {
+ migratetype = get_pfnblock_migratetype(page, pfn);
+ }
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
__count_vm_events(PGFREE, 1 << order);
- free_one_page(page_zone(page), page, pfn, order, migratetype,
- fpi_flags);
- local_irq_restore(flags);
}
void __free_pages_core(struct page *page, unsigned int order)
@@ -1574,10 +1697,10 @@ void __free_pages_core(struct page *page, unsigned int order)
* Bypass PCP and place fresh pages right to the tail, primarily
* relevant for memory onlining.
*/
- __free_pages_ok(page, order, FPI_TO_TAIL);
+ __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
}
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
/*
* During memory init memblocks map pfns to nids. The search is expensive and
@@ -1627,7 +1750,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
return nid;
}
-#endif /* CONFIG_NEED_MULTIPLE_NODES */
+#endif /* CONFIG_NUMA */
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
@@ -2115,14 +2238,6 @@ void __init page_alloc_init_late(void)
wait_for_completion(&pgdat_init_all_done_comp);
/*
- * The number of managed pages has changed due to the initialisation
- * so the pcpu batch and high limits needs to be updated or the limits
- * will be artificially small.
- */
- for_each_populated_zone(zone)
- zone_pcp_update(zone);
-
- /*
* We initialized the rest of the deferred pages. Permanently disable
* on-demand struct page initialization.
*/
@@ -2297,12 +2412,31 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
arch_alloc_page(page, order);
debug_pagealloc_map_pages(page, 1 << order);
- kasan_alloc_pages(page, order);
+
+ /*
+ * Page unpoisoning must happen before memory initialization.
+ * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
+ * allocations and the page unpoisoning code will complain.
+ */
kernel_unpoison_pages(page, 1 << order);
- set_page_owner(page, order, gfp_flags);
- if (!want_init_on_free() && want_init_on_alloc(gfp_flags))
- kernel_init_free_pages(page, 1 << order);
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_alloc_pages and kernel_init_free_pages must be
+ * kept together to avoid discrepancies in behavior.
+ */
+ if (kasan_has_integrated_init()) {
+ kasan_alloc_pages(page, order, gfp_flags);
+ } else {
+ bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
+
+ kasan_unpoison_pages(page, order, init);
+ if (init)
+ kernel_init_free_pages(page, 1 << order,
+ gfp_flags & __GFP_ZEROTAGS);
+ }
+
+ set_page_owner(page, order, gfp_flags);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
@@ -2386,19 +2520,21 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
* boundary. If alignment is required, use move_freepages_block()
*/
static int move_freepages(struct zone *zone,
- struct page *start_page, struct page *end_page,
+ unsigned long start_pfn, unsigned long end_pfn,
int migratetype, int *num_movable)
{
struct page *page;
+ unsigned long pfn;
unsigned int order;
int pages_moved = 0;
- for (page = start_page; page <= end_page;) {
- if (!pfn_valid_within(page_to_pfn(page))) {
- page++;
+ for (pfn = start_pfn; pfn <= end_pfn;) {
+ if (!pfn_valid_within(pfn)) {
+ pfn++;
continue;
}
+ page = pfn_to_page(pfn);
if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
@@ -2408,8 +2544,7 @@ static int move_freepages(struct zone *zone,
if (num_movable &&
(PageLRU(page) || __PageMovable(page)))
(*num_movable)++;
-
- page++;
+ pfn++;
continue;
}
@@ -2419,7 +2554,7 @@ static int move_freepages(struct zone *zone,
order = buddy_order(page);
move_to_free_list(page, zone, order, migratetype);
- page += 1 << order;
+ pfn += 1 << order;
pages_moved += 1 << order;
}
@@ -2429,25 +2564,22 @@ static int move_freepages(struct zone *zone,
int move_freepages_block(struct zone *zone, struct page *page,
int migratetype, int *num_movable)
{
- unsigned long start_pfn, end_pfn;
- struct page *start_page, *end_page;
+ unsigned long start_pfn, end_pfn, pfn;
if (num_movable)
*num_movable = 0;
- start_pfn = page_to_pfn(page);
- start_pfn = start_pfn & ~(pageblock_nr_pages-1);
- start_page = pfn_to_page(start_pfn);
- end_page = start_page + pageblock_nr_pages - 1;
+ pfn = page_to_pfn(page);
+ start_pfn = pfn & ~(pageblock_nr_pages - 1);
end_pfn = start_pfn + pageblock_nr_pages - 1;
/* Do not cross zone boundaries */
if (!zone_spans_pfn(zone, start_pfn))
- start_page = page;
+ start_pfn = pfn;
if (!zone_spans_pfn(zone, end_pfn))
return 0;
- return move_freepages(zone, start_page, end_page, migratetype,
+ return move_freepages(zone, start_pfn, end_pfn, migratetype,
num_movable);
}
@@ -2731,7 +2863,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
/*
* In page freeing path, migratetype change is racy so
* we can counter several free pages in a pageblock
- * in this loop althoug we changed the pageblock type
+ * in this loop although we changed the pageblock type
* from highatomic to ac->migratetype. So we should
* adjust the count once.
*/
@@ -2908,8 +3040,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
- int i, alloced = 0;
+ int i, allocated = 0;
+ /*
+ * local_lock_irq held so equivalent to spin_lock_irqsave for
+ * both PREEMPT_RT and non-PREEMPT_RT configurations.
+ */
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
@@ -2931,7 +3067,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* pages are ordered properly.
*/
list_add_tail(&page->lru, list);
- alloced++;
+ allocated++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
@@ -2940,12 +3076,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
/*
* i pages were removed from the buddy list even if some leak due
* to check_pcp_refill failing so adjust NR_FREE_PAGES based
- * on i. Do not confuse with 'alloced' which is the number of
+ * on i. Do not confuse with 'allocated' which is the number of
* pages added to the pcp list.
*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
- return alloced;
+ return allocated;
}
#ifdef CONFIG_NUMA
@@ -2962,12 +3098,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
unsigned long flags;
int to_drain, batch;
- local_irq_save(flags);
+ local_lock_irqsave(&pagesets.lock, flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
free_pcppages_bulk(zone, to_drain, pcp);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&pagesets.lock, flags);
}
#endif
@@ -2981,16 +3117,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
unsigned long flags;
- struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
- local_irq_save(flags);
- pset = per_cpu_ptr(zone->pageset, cpu);
+ local_lock_irqsave(&pagesets.lock, flags);
- pcp = &pset->pcp;
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)
free_pcppages_bulk(zone, pcp->count, pcp);
- local_irq_restore(flags);
+
+ local_unlock_irqrestore(&pagesets.lock, flags);
}
/*
@@ -3035,7 +3170,7 @@ static void drain_local_pages_wq(struct work_struct *work)
* drain_all_pages doesn't use proper cpu hotplug protection so
* we can race with cpu offline when the WQ can move this from
* a cpu pinned worker to an unbound one. We can operate on a different
- * cpu which is allright but we also have to make sure to not move to
+ * cpu which is alright but we also have to make sure to not move to
* a different one.
*/
preempt_disable();
@@ -3088,7 +3223,7 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
* disables preemption as part of its processing
*/
for_each_online_cpu(cpu) {
- struct per_cpu_pageset *pcp;
+ struct per_cpu_pages *pcp;
struct zone *z;
bool has_pcps = false;
@@ -3099,13 +3234,13 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
*/
has_pcps = true;
} else if (zone) {
- pcp = per_cpu_ptr(zone->pageset, cpu);
- if (pcp->pcp.count)
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ if (pcp->count)
has_pcps = true;
} else {
for_each_populated_zone(z) {
- pcp = per_cpu_ptr(z->pageset, cpu);
- if (pcp->pcp.count) {
+ pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
+ if (pcp->count) {
has_pcps = true;
break;
}
@@ -3198,11 +3333,12 @@ void mark_free_pages(struct zone *zone)
}
#endif /* CONFIG_PM */
-static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
+static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
+ unsigned int order)
{
int migratetype;
- if (!free_pcp_prepare(page))
+ if (!free_pcp_prepare(page, order))
return false;
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -3210,52 +3346,99 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
return true;
}
-static void free_unref_page_commit(struct page *page, unsigned long pfn)
+static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch)
+{
+ int min_nr_free, max_nr_free;
+
+ /* Check for PCP disabled or boot pageset */
+ if (unlikely(high < batch))
+ return 1;
+
+ /* Leave at least pcp->batch pages on the list */
+ min_nr_free = batch;
+ max_nr_free = high - batch;
+
+ /*
+ * Double the number of pages freed each time there is subsequent
+ * freeing of pages without any allocation.
+ */
+ batch <<= pcp->free_factor;
+ if (batch < max_nr_free)
+ pcp->free_factor++;
+ batch = clamp(batch, min_nr_free, max_nr_free);
+
+ return batch;
+}
+
+static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
+{
+ int high = READ_ONCE(pcp->high);
+
+ if (unlikely(!high))
+ return 0;
+
+ if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
+ return high;
+
+ /*
+ * If reclaim is active, limit the number of pages that can be
+ * stored on pcp lists
+ */
+ return min(READ_ONCE(pcp->batch) << 2, high);
+}
+
+static void free_unref_page_commit(struct page *page, unsigned long pfn,
+ int migratetype, unsigned int order)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
- int migratetype;
+ int high;
+ int pindex;
- migratetype = get_pcppage_migratetype(page);
__count_vm_event(PGFREE);
+ pcp = this_cpu_ptr(zone->per_cpu_pageset);
+ pindex = order_to_pindex(migratetype, order);
+ list_add(&page->lru, &pcp->lists[pindex]);
+ pcp->count += 1 << order;
+ high = nr_pcp_high(pcp, zone);
+ if (pcp->count >= high) {
+ int batch = READ_ONCE(pcp->batch);
+
+ free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp);
+ }
+}
+
+/*
+ * Free a pcp page
+ */
+void free_unref_page(struct page *page, unsigned int order)
+{
+ unsigned long flags;
+ unsigned long pfn = page_to_pfn(page);
+ int migratetype;
+
+ if (!free_unref_page_prepare(page, pfn, order))
+ return;
/*
* We only track unmovable, reclaimable and movable on pcp lists.
- * Free ISOLATE pages back to the allocator because they are being
+ * Place ISOLATE pages on the isolated list because they are being
* offlined but treat HIGHATOMIC as movable pages so we can get those
* areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
- if (migratetype >= MIGRATE_PCPTYPES) {
+ migratetype = get_pcppage_migratetype(page);
+ if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(zone, page, pfn, 0, migratetype,
- FPI_NONE);
+ free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
return;
}
migratetype = MIGRATE_MOVABLE;
}
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
- list_add(&page->lru, &pcp->lists[migratetype]);
- pcp->count++;
- if (pcp->count >= READ_ONCE(pcp->high))
- free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp);
-}
-
-/*
- * Free a 0-order page
- */
-void free_unref_page(struct page *page)
-{
- unsigned long flags;
- unsigned long pfn = page_to_pfn(page);
-
- if (!free_unref_page_prepare(page, pfn))
- return;
-
- local_irq_save(flags);
- free_unref_page_commit(page, pfn);
- local_irq_restore(flags);
+ local_lock_irqsave(&pagesets.lock, flags);
+ free_unref_page_commit(page, pfn, migratetype, order);
+ local_unlock_irqrestore(&pagesets.lock, flags);
}
/*
@@ -3266,34 +3449,56 @@ void free_unref_page_list(struct list_head *list)
struct page *page, *next;
unsigned long flags, pfn;
int batch_count = 0;
+ int migratetype;
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
pfn = page_to_pfn(page);
- if (!free_unref_page_prepare(page, pfn))
+ if (!free_unref_page_prepare(page, pfn, 0))
list_del(&page->lru);
+
+ /*
+ * Free isolated pages directly to the allocator, see
+ * comment in free_unref_page.
+ */
+ migratetype = get_pcppage_migratetype(page);
+ if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
+ if (unlikely(is_migrate_isolate(migratetype))) {
+ list_del(&page->lru);
+ free_one_page(page_zone(page), page, pfn, 0,
+ migratetype, FPI_NONE);
+ continue;
+ }
+
+ /*
+ * Non-isolated types over MIGRATE_PCPTYPES get added
+ * to the MIGRATE_MOVABLE pcp list.
+ */
+ set_pcppage_migratetype(page, MIGRATE_MOVABLE);
+ }
+
set_page_private(page, pfn);
}
- local_irq_save(flags);
+ local_lock_irqsave(&pagesets.lock, flags);
list_for_each_entry_safe(page, next, list, lru) {
- unsigned long pfn = page_private(page);
-
+ pfn = page_private(page);
set_page_private(page, 0);
+ migratetype = get_pcppage_migratetype(page);
trace_mm_page_free_batched(page);
- free_unref_page_commit(page, pfn);
+ free_unref_page_commit(page, pfn, migratetype, 0);
/*
* Guard against excessive IRQ disabled times when we get
* a large list of pages to free.
*/
if (++batch_count == SWAP_CLUSTER_MAX) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&pagesets.lock, flags);
batch_count = 0;
- local_irq_save(flags);
+ local_lock_irqsave(&pagesets.lock, flags);
}
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&pagesets.lock, flags);
}
/*
@@ -3392,7 +3597,8 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt)
*
* Must be called with interrupts disabled.
*/
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+ long nr_account)
{
#ifdef CONFIG_NUMA
enum numa_stat_item local_stat = NUMA_LOCAL;
@@ -3405,17 +3611,19 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
local_stat = NUMA_OTHER;
if (zone_to_nid(z) == zone_to_nid(preferred_zone))
- __inc_numa_state(z, NUMA_HIT);
+ __count_numa_events(z, NUMA_HIT, nr_account);
else {
- __inc_numa_state(z, NUMA_MISS);
- __inc_numa_state(preferred_zone, NUMA_FOREIGN);
+ __count_numa_events(z, NUMA_MISS, nr_account);
+ __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
}
- __inc_numa_state(z, local_stat);
+ __count_numa_events(z, local_stat, nr_account);
#endif
}
/* Remove page from the per-cpu list, caller must protect the list */
-static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+static inline
+struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
+ int migratetype,
unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
@@ -3