diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 1251 |
1 files changed, 867 insertions, 384 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cfc72873961d..0817d88383d5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -72,7 +72,6 @@ #include <linux/padata.h> #include <linux/khugepaged.h> #include <linux/buffer_head.h> - #include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -108,9 +107,38 @@ typedef int __bitwise fpi_t; */ #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) +/* + * Don't poison memory with KASAN (only for the tag-based modes). + * During boot, all non-reserved memblock memory is exposed to page_alloc. + * Poisoning all that memory lengthens boot time, especially on systems with + * large amount of RAM. This flag is used to skip that poisoning. + * This is only done for the tag-based KASAN modes, as those are able to + * detect memory corruptions with the memory tags assigned by default. + * All memory allocated normally after boot gets poisoned as usual. + */ +#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); -#define MIN_PERCPU_PAGELIST_FRACTION (8) +#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) + +struct pagesets { + local_lock_t lock; +#if defined(CONFIG_DEBUG_INFO_BTF) && \ + !defined(CONFIG_DEBUG_LOCK_ALLOC) && \ + !defined(CONFIG_PAHOLE_HAS_ZEROSIZE_PERCPU_SUPPORT) + /* + * pahole 1.21 and earlier gets confused by zero-sized per-CPU + * variables and produces invalid BTF. Ensure that + * sizeof(struct pagesets) != 0 for older versions of pahole. + */ + char __pahole_hack; + #warning "pahole too old to support zero-sized struct pagesets" +#endif +}; +static DEFINE_PER_CPU(struct pagesets, pagesets) = { + .lock = INIT_LOCAL_LOCK(lock), +}; #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); @@ -165,12 +193,12 @@ EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; -int percpu_pagelist_fraction; +int percpu_pagelist_high_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -DEFINE_STATIC_KEY_FALSE(init_on_alloc); +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); EXPORT_SYMBOL(init_on_alloc); -DEFINE_STATIC_KEY_FALSE(init_on_free); +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); EXPORT_SYMBOL(init_on_free); static bool _init_on_alloc_enabled_early __read_mostly @@ -321,20 +349,7 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; -#ifdef CONFIG_DISCONTIGMEM -/* - * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges - * are not on separate NUMA nodes. Functionally this works but with - * watermark_boost_factor, it can reclaim prematurely as the ranges can be - * quite small. By default, do not boost watermarks on discontigmem as in - * many cases very high-order allocations like THP are likely to be - * unsupported and the premature reclaim offsets the advantage of long-term - * fragmentation avoidance. - */ -int watermark_boost_factor __read_mostly; -#else int watermark_boost_factor __read_mostly = 15000; -#endif int watermark_scale_factor = 10; static unsigned long nr_kernel_pages __initdata; @@ -372,7 +387,7 @@ int page_group_by_mobility_disabled __read_mostly; static DEFINE_STATIC_KEY_TRUE(deferred_pages); /* - * Calling kasan_free_pages() only after deferred memory initialization + * Calling kasan_poison_pages() only after deferred memory initialization * has completed. Poisoning pages during deferred memory init will greatly * lengthen the process and cause problem in large memory systems as the * deferred pages initialization is done with interrupt disabled. @@ -384,10 +399,12 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages); * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen. */ -static inline void kasan_free_nondeferred_pages(struct page *page, int order) +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) { - if (!static_branch_unlikely(&deferred_pages)) - kasan_free_pages(page, order); + return static_branch_unlikely(&deferred_pages) || + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -438,7 +455,12 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } #else -#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o) +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +{ + return (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); +} static inline bool early_page_uninitialised(unsigned long pfn) { @@ -452,7 +474,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) #endif /* Return a pointer to the bitmap storing bits affecting a block of pages */ -static inline unsigned long *get_pageblock_bitmap(struct page *page, +static inline unsigned long *get_pageblock_bitmap(const struct page *page, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM @@ -462,7 +484,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page, #endif /* CONFIG_SPARSEMEM */ } -static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) +static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM pfn &= (PAGES_PER_SECTION-1); @@ -473,7 +495,7 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) } static __always_inline -unsigned long __get_pfnblock_flags_mask(struct page *page, +unsigned long __get_pfnblock_flags_mask(const struct page *page, unsigned long pfn, unsigned long mask) { @@ -498,13 +520,14 @@ unsigned long __get_pfnblock_flags_mask(struct page *page, * * Return: pageblock_bits flags */ -unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, - unsigned long mask) +unsigned long get_pfnblock_flags_mask(const struct page *page, + unsigned long pfn, unsigned long mask) { return __get_pfnblock_flags_mask(page, pfn, mask); } -static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) +static __always_inline int get_pfnblock_migratetype(const struct page *page, + unsigned long pfn) { return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); } @@ -636,8 +659,7 @@ static void bad_page(struct page *page, const char *reason) pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); - __dump_page(page, reason); - dump_page_owner(page); + dump_page(page, reason); print_modules(); dump_stack(); @@ -647,6 +669,57 @@ out: add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } +static inline unsigned int order_to_pindex(int migratetype, int order) +{ + int base = order; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order > PAGE_ALLOC_COSTLY_ORDER) { + VM_BUG_ON(order != pageblock_order); + base = PAGE_ALLOC_COSTLY_ORDER + 1; + } +#else + VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); +#endif + + return (MIGRATE_PCPTYPES * base) + migratetype; +} + +static inline int pindex_to_order(unsigned int pindex) +{ + int order = pindex / MIGRATE_PCPTYPES; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order > PAGE_ALLOC_COSTLY_ORDER) { + order = pageblock_order; + VM_BUG_ON(order != pageblock_order); + } +#else + VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); +#endif + + return order; +} + +static inline bool pcp_allowed_order(unsigned int order) +{ + if (order <= PAGE_ALLOC_COSTLY_ORDER) + return true; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order == pageblock_order) + return true; +#endif + return false; +} + +static inline void free_the_page(struct page *page, unsigned int order) +{ + if (pcp_allowed_order(order)) /* Via pcp? */ + free_unref_page(page, order); + else + __free_pages_ok(page, order, FPI_NONE); +} + /* * Higher-order pages are called "compound pages". They are structured thusly: * @@ -665,7 +738,7 @@ out: void free_compound_page(struct page *page) { mem_cgroup_uncharge(page); - __free_pages_ok(page, compound_order(page), FPI_NONE); + free_the_page(page, compound_order(page)); } void prep_compound_page(struct page *page, unsigned int order) @@ -764,32 +837,36 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, */ void init_mem_debugging_and_hardening(void) { + bool page_poisoning_requested = false; + +#ifdef CONFIG_PAGE_POISONING + /* + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. + */ + if (page_poisoning_enabled() || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())) { + static_branch_enable(&_page_poisoning_enabled); + page_poisoning_requested = true; + } +#endif + if (_init_on_alloc_enabled_early) { - if (page_poisoning_enabled()) + if (page_poisoning_requested) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " "will take precedence over init_on_alloc\n"); else static_branch_enable(&init_on_alloc); } if (_init_on_free_enabled_early) { - if (page_poisoning_enabled()) + if (page_poisoning_requested) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " "will take precedence over init_on_free\n"); else static_branch_enable(&init_on_free); } -#ifdef CONFIG_PAGE_POISONING - /* - * Page poisoning is debug page alloc for some arches. If - * either of those options are enabled, enable poisoning. - */ - if (page_poisoning_enabled() || - (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && - debug_pagealloc_enabled())) - static_branch_enable(&_page_poisoning_enabled); -#endif - #ifdef CONFIG_DEBUG_PAGEALLOC if (!debug_pagealloc_enabled()) return; @@ -867,7 +944,7 @@ compaction_capture(struct capture_control *capc, struct page *page, return false; /* - * Do not let lower order allocations polluate a movable pageblock. + * Do not let lower order allocations pollute a movable pageblock. * This might let an unmovable request use a reclaimable pageblock * and vice-versa but no more than normal fallback logic which can * have trouble finding a high-order free page. @@ -1103,7 +1180,7 @@ static inline bool page_expected_state(struct page *page, if (unlikely((unsigned long)page->mapping | page_ref_count(page) | #ifdef CONFIG_MEMCG - (unsigned long)page_memcg(page) | + page->memcg_data | #endif (page->flags & check_flags))) return false; @@ -1128,7 +1205,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG - if (unlikely(page_memcg(page))) + if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif return bad_reason; @@ -1200,10 +1277,16 @@ out: return ret; } -static void kernel_init_free_pages(struct page *page, int numpages) +static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) { int i; + if (zero_tags) { + for (i = 0; i < numpages; i++) + tag_clear_highpage(page + i); + return; + } + /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); for (i = 0; i < numpages; i++) { @@ -1216,9 +1299,10 @@ static void kernel_init_free_pages(struct page *page, int numpages) } static __always_inline bool free_pages_prepare(struct page *page, - unsigned int order, bool check_free) + unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; + bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1276,16 +1360,28 @@ static __always_inline bool free_pages_prepare(struct page *page, debug_check_no_obj_freed(page_address(page), PAGE_SIZE << order); } - if (want_init_on_free()) - kernel_init_free_pages(page, 1 << order); kernel_poison_pages(page, 1 << order); /* + * As memory initialization might be integrated into KASAN, + * kasan_free_pages and kernel_init_free_pages must be + * kept together to avoid discrepancies in behavior. + * * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - kasan_free_nondeferred_pages(page, order); + if (kasan_has_integrated_init()) { + if (!skip_kasan_poison) + kasan_free_pages(page, order); + } else { + bool init = want_init_on_free(); + + if (init) + kernel_init_free_pages(page, 1 << order, false); + if (!skip_kasan_poison) + kasan_poison_pages(page, order, init); + } /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -1305,9 +1401,9 @@ static __always_inline bool free_pages_prepare(struct page *page, * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when * moved from pcp lists to free lists. */ -static bool free_pcp_prepare(struct page *page) +static bool free_pcp_prepare(struct page *page, unsigned int order) { - return free_pages_prepare(page, 0, true); + return free_pages_prepare(page, order, true, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1324,12 +1420,12 @@ static bool bulkfree_pcp_prepare(struct page *page) * debug_pagealloc enabled, they are checked also immediately when being freed * to the pcp lists. */ -static bool free_pcp_prepare(struct page *page) +static bool free_pcp_prepare(struct page *page, unsigned int order) { if (debug_pagealloc_enabled_static()) - return free_pages_prepare(page, 0, true); + return free_pages_prepare(page, order, true, FPI_NONE); else - return free_pages_prepare(page, 0, false); + return free_pages_prepare(page, order, false, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1361,8 +1457,10 @@ static inline void prefetch_buddy(struct page *page) static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp) { - int migratetype = 0; + int pindex = 0; int batch_free = 0; + int nr_freed = 0; + unsigned int order; int prefetch_nr = READ_ONCE(pcp->batch); bool isolated_pageblocks; struct page *page, *tmp; @@ -1373,7 +1471,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, * below while (list_empty(list)) loop. */ count = min(pcp->count, count); - while (count) { + while (count > 0) { struct list_head *list; /* @@ -1385,24 +1483,31 @@ static void free_pcppages_bulk(struct zone *zone, int count, */ do { batch_free++; - if (++migratetype == MIGRATE_PCPTYPES) - migratetype = 0; - list = &pcp->lists[migratetype]; + if (++pindex == NR_PCP_LISTS) + pindex = 0; + list = &pcp->lists[pindex]; } while (list_empty(list)); /* This is the only non-empty list. Free them all. */ - if (batch_free == MIGRATE_PCPTYPES) + if (batch_free == NR_PCP_LISTS) batch_free = count; + order = pindex_to_order(pindex); + BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH)); do { page = list_last_entry(list, struct page, lru); /* must delete to avoid corrupting pcp list */ list_del(&page->lru); - pcp->count--; + nr_freed += 1 << order; + count -= 1 << order; if (bulkfree_pcp_prepare(page)) continue; + /* Encode order with the migratetype */ + page->index <<= NR_PCP_ORDER_WIDTH; + page->index |= order; + list_add_tail(&page->lru, &head); /* @@ -1418,9 +1523,14 @@ static void free_pcppages_bulk(struct zone *zone, int count, prefetch_buddy(page); prefetch_nr--; } - } while (--count && --batch_free && !list_empty(list)); + } while (count > 0 && --batch_free && !list_empty(list)); } + pcp->count -= nr_freed; + /* + * local_lock_irq held so equivalent to spin_lock_irqsave for + * both PREEMPT_RT and non-PREEMPT_RT configurations. + */ spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); @@ -1430,14 +1540,19 @@ static void free_pcppages_bulk(struct zone *zone, int count, */ list_for_each_entry_safe(page, tmp, &head, lru) { int mt = get_pcppage_migratetype(page); + + /* mt has been encoded with the order (see above) */ + order = mt & NR_PCP_ORDER_MASK; + mt >>= NR_PCP_ORDER_WIDTH; + /* MIGRATE_ISOLATE page should not go to pcplists */ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); /* Pageblock could have been isolated meanwhile */ if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); - __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); - trace_mm_page_pcpu_drain(page, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); + trace_mm_page_pcpu_drain(page, order, mt); } spin_unlock(&zone->lock); } @@ -1447,13 +1562,15 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype, fpi_t fpi_flags) { - spin_lock(&zone->lock); + unsigned long flags; + + spin_lock_irqsave(&zone->lock, flags); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); } __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void __meminit __init_single_page(struct page *page, unsigned long pfn, @@ -1536,16 +1653,22 @@ static void __free_pages_ok(struct page *page, unsigned int order, unsigned long flags; int migratetype; unsigned long pfn = page_to_pfn(page); + struct zone *zone = page_zone(page); - if (!free_pages_prepare(page, order, true)) + if (!free_pages_prepare(page, order, true, fpi_flags)) return; migratetype = get_pfnblock_migratetype(page, pfn); - local_irq_save(flags); + + spin_lock_irqsave(&zone->lock, flags); + if (unlikely(has_isolate_pageblock(zone) || + is_migrate_isolate(migratetype))) { + migratetype = get_pfnblock_migratetype(page, pfn); + } + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); + spin_unlock_irqrestore(&zone->lock, flags); + __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, pfn, order, migratetype, - fpi_flags); - local_irq_restore(flags); } void __free_pages_core(struct page *page, unsigned int order) @@ -1574,10 +1697,10 @@ void __free_pages_core(struct page *page, unsigned int order) * Bypass PCP and place fresh pages right to the tail, primarily * relevant for memory onlining. */ - __free_pages_ok(page, order, FPI_TO_TAIL); + __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); } -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA /* * During memory init memblocks map pfns to nids. The search is expensive and @@ -1627,7 +1750,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) return nid; } -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) @@ -2115,14 +2238,6 @@ void __init page_alloc_init_late(void) wait_for_completion(&pgdat_init_all_done_comp); /* - * The number of managed pages has changed due to the initialisation - * so the pcpu batch and high limits needs to be updated or the limits - * will be artificially small. - */ - for_each_populated_zone(zone) - zone_pcp_update(zone); - - /* * We initialized the rest of the deferred pages. Permanently disable * on-demand struct page initialization. */ @@ -2297,12 +2412,31 @@ inline void post_alloc_hook(struct page *page, unsigned int order, arch_alloc_page(page, order); debug_pagealloc_map_pages(page, 1 << order); - kasan_alloc_pages(page, order); + + /* + * Page unpoisoning must happen before memory initialization. + * Otherwise, the poison pattern will be overwritten for __GFP_ZERO + * allocations and the page unpoisoning code will complain. + */ kernel_unpoison_pages(page, 1 << order); - set_page_owner(page, order, gfp_flags); - if (!want_init_on_free() && want_init_on_alloc(gfp_flags)) - kernel_init_free_pages(page, 1 << order); + /* + * As memory initialization might be integrated into KASAN, + * kasan_alloc_pages and kernel_init_free_pages must be + * kept together to avoid discrepancies in behavior. + */ + if (kasan_has_integrated_init()) { + kasan_alloc_pages(page, order, gfp_flags); + } else { + bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + + kasan_unpoison_pages(page, order, init); + if (init) + kernel_init_free_pages(page, 1 << order, + gfp_flags & __GFP_ZEROTAGS); + } + + set_page_owner(page, order, gfp_flags); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, @@ -2386,19 +2520,21 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * boundary. If alignment is required, use move_freepages_block() */ static int move_freepages(struct zone *zone, - struct page *start_page, struct page *end_page, + unsigned long start_pfn, unsigned long end_pfn, int migratetype, int *num_movable) { struct page *page; + unsigned long pfn; unsigned int order; int pages_moved = 0; - for (page = start_page; page <= end_page;) { - if (!pfn_valid_within(page_to_pfn(page))) { - page++; + for (pfn = start_pfn; pfn <= end_pfn;) { + if (!pfn_valid_within(pfn)) { + pfn++; continue; } + page = pfn_to_page(pfn); if (!PageBuddy(page)) { /* * We assume that pages that could be isolated for @@ -2408,8 +2544,7 @@ static int move_freepages(struct zone *zone, if (num_movable && (PageLRU(page) || __PageMovable(page))) (*num_movable)++; - - page++; + pfn++; continue; } @@ -2419,7 +2554,7 @@ static int move_freepages(struct zone *zone, order = buddy_order(page); move_to_free_list(page, zone, order, migratetype); - page += 1 << order; + pfn += 1 << order; pages_moved += 1 << order; } @@ -2429,25 +2564,22 @@ static int move_freepages(struct zone *zone, int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable) { - unsigned long start_pfn, end_pfn; - struct page *start_page, *end_page; + unsigned long start_pfn, end_pfn, pfn; if (num_movable) *num_movable = 0; - start_pfn = page_to_pfn(page); - start_pfn = start_pfn & ~(pageblock_nr_pages-1); - start_page = pfn_to_page(start_pfn); - end_page = start_page + pageblock_nr_pages - 1; + pfn = page_to_pfn(page); + start_pfn = pfn & ~(pageblock_nr_pages - 1); end_pfn = start_pfn + pageblock_nr_pages - 1; /* Do not cross zone boundaries */ if (!zone_spans_pfn(zone, start_pfn)) - start_page = page; + start_pfn = pfn; if (!zone_spans_pfn(zone, end_pfn)) return 0; - return move_freepages(zone, start_page, end_page, migratetype, + return move_freepages(zone, start_pfn, end_pfn, migratetype, num_movable); } @@ -2731,7 +2863,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, /* * In page freeing path, migratetype change is racy so * we can counter several free pages in a pageblock - * in this loop althoug we changed the pageblock type + * in this loop although we changed the pageblock type * from highatomic to ac->migratetype. So we should * adjust the count once. */ @@ -2908,8 +3040,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { - int i, alloced = 0; + int i, allocated = 0; + /* + * local_lock_irq held so equivalent to spin_lock_irqsave for + * both PREEMPT_RT and non-PREEMPT_RT configurations. + */ spin_lock(&zone->lock); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, @@ -2931,7 +3067,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages are ordered properly. */ list_add_tail(&page->lru, list); - alloced++; + allocated++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); @@ -2940,12 +3076,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, /* * i pages were removed from the buddy list even if some leak due * to check_pcp_refill failing so adjust NR_FREE_PAGES based - * on i. Do not confuse with 'alloced' which is the number of + * on i. Do not confuse with 'allocated' which is the number of * pages added to the pcp list. */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); - return alloced; + return allocated; } #ifdef CONFIG_NUMA @@ -2962,12 +3098,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) unsigned long flags; int to_drain, batch; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) free_pcppages_bulk(zone, to_drain, pcp); - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); } #endif @@ -2981,16 +3117,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) static void drain_pages_zone(unsigned int cpu, struct zone *zone) { unsigned long flags; - struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - local_irq_save(flags); - pset = per_cpu_ptr(zone->pageset, cpu); + local_lock_irqsave(&pagesets.lock, flags); - pcp = &pset->pcp; + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); if (pcp->count) free_pcppages_bulk(zone, pcp->count, pcp); - local_irq_restore(flags); + + local_unlock_irqrestore(&pagesets.lock, flags); } /* @@ -3035,7 +3170,7 @@ static void drain_local_pages_wq(struct work_struct *work) * drain_all_pages doesn't use proper cpu hotplug protection so * we can race with cpu offline when the WQ can move this from * a cpu pinned worker to an unbound one. We can operate on a different - * cpu which is allright but we also have to make sure to not move to + * cpu which is alright but we also have to make sure to not move to * a different one. */ preempt_disable(); @@ -3088,7 +3223,7 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) * disables preemption as part of its processing */ for_each_online_cpu(cpu) { - struct per_cpu_pageset *pcp; + struct per_cpu_pages *pcp; struct zone *z; bool has_pcps = false; @@ -3099,13 +3234,13 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) */ has_pcps = true; } else if (zone) { - pcp = per_cpu_ptr(zone->pageset, cpu); - if (pcp->pcp.count) + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + if (pcp->count) has_pcps = true; } else { for_each_populated_zone(z) { - pcp = per_cpu_ptr(z->pageset, cpu); - if (pcp->pcp.count) { + pcp = per_cpu_ptr(z->per_cpu_pageset, cpu); + if (pcp->count) { has_pcps = true; break; } @@ -3198,11 +3333,12 @@ void mark_free_pages(struct zone *zone) } #endif /* CONFIG_PM */ -static bool free_unref_page_prepare(struct page *page, unsigned long pfn) +static bool free_unref_page_prepare(struct page *page, unsigned long pfn, + unsigned int order) { int migratetype; - if (!free_pcp_prepare(page)) + if (!free_pcp_prepare(page, order)) return false; migratetype = get_pfnblock_migratetype(page, pfn); @@ -3210,52 +3346,99 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn) return true; } -static void free_unref_page_commit(struct page *page, unsigned long pfn) +static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch) +{ + int min_nr_free, max_nr_free; + + /* Check for PCP disabled or boot pageset */ + if (unlikely(high < batch)) + return 1; + + /* Leave at least pcp->batch pages on the list */ + min_nr_free = batch; + max_nr_free = high - batch; + + /* + * Double the number of pages freed each time there is subsequent + * freeing of pages without any allocation. + */ + batch <<= pcp->free_factor; + if (batch < max_nr_free) + pcp->free_factor++; + batch = clamp(batch, min_nr_free, max_nr_free); + + return batch; +} + +static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) +{ + int high = READ_ONCE(pcp->high); + + if (unlikely(!high)) + return 0; + + if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) + return high; + + /* + * If reclaim is active, limit the number of pages that can be + * stored on pcp lists + */ + return min(READ_ONCE(pcp->batch) << 2, high); +} + +static void free_unref_page_commit(struct page *page, unsigned long pfn, + int migratetype, unsigned int order) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; - int migratetype; + int high; + int pindex; - migratetype = get_pcppage_migratetype(page); __count_vm_event(PGFREE); + pcp = this_cpu_ptr(zone->per_cpu_pageset); + pindex = order_to_pindex(migratetype, order); + list_add(&page->lru, &pcp->lists[pindex]); + pcp->count += 1 << order; + high = nr_pcp_high(pcp, zone); + if (pcp->count >= high) { + int batch = READ_ONCE(pcp->batch); + + free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp); + } +} + +/* + * Free a pcp page + */ +void free_unref_page(struct page *page, unsigned int order) +{ + unsigned long flags; + unsigned long pfn = page_to_pfn(page); + int migratetype; + + if (!free_unref_page_prepare(page, pfn, order)) + return; /* * We only track unmovable, reclaimable and movable on pcp lists. - * Free ISOLATE pages back to the allocator because they are being + * Place ISOLATE pages on the isolated list because they are being * offlined but treat HIGHATOMIC as movable pages so we can get those * areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ - if (migratetype >= MIGRATE_PCPTYPES) { + migratetype = get_pcppage_migratetype(page); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, pfn, 0, migratetype, - FPI_NONE); + free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); return; } migratetype = MIGRATE_MOVABLE; } - pcp = &this_cpu_ptr(zone->pageset)->pcp; - list_add(&page->lru, &pcp->lists[migratetype]); - pcp->count++; - if (pcp->count >= READ_ONCE(pcp->high)) - free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp); -} - -/* - * Free a 0-order page - */ -void free_unref_page(struct page *page) -{ - unsigned long flags; - unsigned long pfn = page_to_pfn(page); - - if (!free_unref_page_prepare(page, pfn)) - return; - - local_irq_save(flags); - free_unref_page_commit(page, pfn); - local_irq_restore(flags); + local_lock_irqsave(&pagesets.lock, flags); + free_unref_page_commit(page, pfn, migratetype, order); + local_unlock_irqrestore(&pagesets.lock, flags); } /* @@ -3266,34 +3449,56 @@ void free_unref_page_list(struct list_head *list) struct page *page, *next; unsigned long flags, pfn; int batch_count = 0; + int migratetype; /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); - if (!free_unref_page_prepare(page, pfn)) + if (!free_unref_page_prepare(page, pfn, 0)) list_del(&page->lru); + + /* + * Free isolated pages directly to the allocator, see + * comment in free_unref_page. + */ + migratetype = get_pcppage_migratetype(page); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { + if (unlikely(is_migrate_isolate(migratetype))) { + list_del(&page->lru); + free_one_page(page_zone(page), page, pfn, 0, + migratetype, FPI_NONE); + continue; + } + + /* + * Non-isolated types over MIGRATE_PCPTYPES get added + * to the MIGRATE_MOVABLE pcp list. + */ + set_pcppage_migratetype(page, MIGRATE_MOVABLE); + } + set_page_private(page, pfn); } - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { - unsigned long pfn = page_private(page); - + pfn = page_private(page); set_page_private(page, 0); + migratetype = get_pcppage_migratetype(page); trace_mm_page_free_batched(page); - free_unref_page_commit(page, pfn); + free_unref_page_commit(page, pfn, migratetype, 0); /* * Guard against excessive IRQ disabled times when we get * a large list of pages to free. */ if (++batch_count == SWAP_CLUSTER_MAX) { - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); batch_count = 0; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); } } - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); } /* @@ -3392,7 +3597,8 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt) * * Must be called with interrupts disabled. */ -static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, + long nr_account) { #ifdef CONFIG_NUMA enum numa_stat_item local_stat = NUMA_LOCAL; @@ -3405,17 +3611,19 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) local_stat = NUMA_OTHER; if (zone_to_nid(z) == zone_to_nid(preferred_zone)) - __inc_numa_state(z, NUMA_HIT); + __count_numa_events(z, NUMA_HIT, nr_account); else { - __inc_numa_state(z, NUMA_MISS); - __inc_numa_state(preferred_zone, NUMA_FOREIGN); + __count_numa_events(z, NUMA_MISS, nr_account); + __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account); } - __inc_numa_state(z, local_stat); + __count_numa_events(z, local_stat, nr_account); #endif } /* Remove page from the per-cpu list, caller must protect the list */ -static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, +static inline +struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, + int migratetype, unsigned int alloc_flags, struct per_cpu_pages *pcp, struct list_head *list) @@ -3 |