diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-10-07 21:38:00 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-10-07 21:38:00 -0700 |
| commit | b66484cd74706fa8681d051840fe4b18a3da40ff (patch) | |
| tree | e8215e7c25661d25f84abc4b98140c2062d6d5de /mm | |
| parent | c913fc4146ba7c280e074558d0a461e5c6f07c8a (diff) | |
| parent | 05fd007e46296afb24d15c7d589d535e5a5b9d5c (diff) | |
| download | linux-b66484cd74706fa8681d051840fe4b18a3da40ff.tar.gz linux-b66484cd74706fa8681d051840fe4b18a3da40ff.tar.bz2 linux-b66484cd74706fa8681d051840fe4b18a3da40ff.zip | |
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
- fsnotify updates
- ocfs2 updates
- all of MM
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (127 commits)
console: don't prefer first registered if DT specifies stdout-path
cred: simpler, 1D supplementary groups
CREDITS: update Pavel's information, add GPG key, remove snail mail address
mailmap: add Johan Hovold
.gitattributes: set git diff driver for C source code files
uprobes: remove function declarations from arch/{mips,s390}
spelling.txt: "modeled" is spelt correctly
nmi_backtrace: generate one-line reports for idle cpus
arch/tile: adopt the new nmi_backtrace framework
nmi_backtrace: do a local dump_stack() instead of a self-NMI
nmi_backtrace: add more trigger_*_cpu_backtrace() methods
min/max: remove sparse warnings when they're nested
Documentation/filesystems/proc.txt: add more description for maps/smaps
mm, proc: fix region lost in /proc/self/smaps
proc: fix timerslack_ns CAP_SYS_NICE check when adjusting self
proc: add LSM hook checks to /proc/<tid>/timerslack_ns
proc: relax /proc/<tid>/timerslack_ns capability requirements
meminfo: break apart a very long seq_printf with #ifdefs
seq/proc: modify seq_put_decimal_[u]ll to take a const char *, not char
proc: faster /proc/*/status
...
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/bootmem.c | 14 | ||||
| -rw-r--r-- | mm/compaction.c | 205 | ||||
| -rw-r--r-- | mm/debug.c | 5 | ||||
| -rw-r--r-- | mm/filemap.c | 8 | ||||
| -rw-r--r-- | mm/huge_memory.c | 81 | ||||
| -rw-r--r-- | mm/hugetlb.c | 53 | ||||
| -rw-r--r-- | mm/internal.h | 3 | ||||
| -rw-r--r-- | mm/ksm.c | 7 | ||||
| -rw-r--r-- | mm/memblock.c | 5 | ||||
| -rw-r--r-- | mm/memcontrol.c | 154 | ||||
| -rw-r--r-- | mm/memory.c | 21 | ||||
| -rw-r--r-- | mm/memory_hotplug.c | 4 | ||||
| -rw-r--r-- | mm/mempolicy.c | 2 | ||||
| -rw-r--r-- | mm/migrate.c | 2 | ||||
| -rw-r--r-- | mm/mincore.c | 5 | ||||
| -rw-r--r-- | mm/mlock.c | 52 | ||||
| -rw-r--r-- | mm/mmap.c | 238 | ||||
| -rw-r--r-- | mm/mprotect.c | 3 | ||||
| -rw-r--r-- | mm/nobootmem.c | 20 | ||||
| -rw-r--r-- | mm/oom_kill.c | 381 | ||||
| -rw-r--r-- | mm/page-writeback.c | 34 | ||||
| -rw-r--r-- | mm/page_alloc.c | 281 | ||||
| -rw-r--r-- | mm/page_ext.c | 45 | ||||
| -rw-r--r-- | mm/page_io.c | 7 | ||||
| -rw-r--r-- | mm/page_isolation.c | 2 | ||||
| -rw-r--r-- | mm/page_owner.c | 156 | ||||
| -rw-r--r-- | mm/shmem.c | 2 | ||||
| -rw-r--r-- | mm/swap.c | 4 | ||||
| -rw-r--r-- | mm/swap_state.c | 14 | ||||
| -rw-r--r-- | mm/swapfile.c | 137 | ||||
| -rw-r--r-- | mm/vmacache.c | 8 | ||||
| -rw-r--r-- | mm/vmalloc.c | 22 | ||||
| -rw-r--r-- | mm/vmscan.c | 53 | ||||
| -rw-r--r-- | mm/vmstat.c | 95 |
34 files changed, 1250 insertions, 873 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c index 0aa7dda52402..a869f84f44d3 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -11,15 +11,12 @@ #include <linux/init.h> #include <linux/pfn.h> #include <linux/slab.h> -#include <linux/bootmem.h> #include <linux/export.h> #include <linux/kmemleak.h> #include <linux/range.h> -#include <linux/memblock.h> #include <linux/bug.h> #include <linux/io.h> - -#include <asm/processor.h> +#include <linux/bootmem.h> #include "internal.h" @@ -712,7 +709,7 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, void *ptr; if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); again: /* do not panic in alloc_bootmem_bdata() */ @@ -738,9 +735,6 @@ again: void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); } @@ -812,10 +806,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, } -#ifndef ARCH_LOW_ADDRESS_LIMIT -#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL -#endif - /** * __alloc_bootmem_low - allocate low boot memory * @size: size of the request in bytes diff --git a/mm/compaction.c b/mm/compaction.c index 9affb2908304..0409a4ad6ea1 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -997,8 +997,12 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, #ifdef CONFIG_COMPACTION /* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct page *page) +static bool suitable_migration_target(struct compact_control *cc, + struct page *page) { + if (cc->ignore_block_suitable) + return true; + /* If the page is a large free page, then disallow migration */ if (PageBuddy(page)) { /* @@ -1083,7 +1087,7 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Check the block is suitable for migration */ - if (!suitable_migration_target(page)) + if (!suitable_migration_target(cc, page)) continue; /* If isolation recently failed, do not retry */ @@ -1316,7 +1320,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ return COMPACT_CONTINUE; /* Compaction run is not finished if the watermark is not met */ - watermark = low_wmark_pages(zone); + watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, cc->alloc_flags)) @@ -1329,13 +1333,13 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ /* Job done if page is free of the right migratetype */ if (!list_empty(&area->free_list[migratetype])) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; #ifdef CONFIG_CMA /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ if (migratetype == MIGRATE_MOVABLE && !list_empty(&area->free_list[MIGRATE_CMA])) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; #endif /* * Job done if allocation would steal freepages from @@ -1343,7 +1347,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ */ if (find_suitable_fallback(area, order, migratetype, true, &can_steal) != -1) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; } return COMPACT_NO_SUITABLE_PAGE; @@ -1367,7 +1371,7 @@ static enum compact_result compact_finished(struct zone *zone, * compaction_suitable: Is this suitable to run compaction on this zone now? * Returns * COMPACT_SKIPPED - If there are too few free pages for compaction - * COMPACT_PARTIAL - If the allocation would succeed without compaction + * COMPACT_SUCCESS - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ static enum compact_result __compaction_suitable(struct zone *zone, int order, @@ -1375,46 +1379,41 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, int classzone_idx, unsigned long wmark_target) { - int fragindex; unsigned long watermark; if (is_via_compact_memory(order)) return COMPACT_CONTINUE; - watermark = low_wmark_pages(zone); + watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; /* * If watermarks for high-order allocation are already met, there * should be no need for compaction at all. */ if (zone_watermark_ok(zone, order, watermark, classzone_idx, alloc_flags)) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; /* - * Watermarks for order-0 must be met for compaction. Note the 2UL. - * This is because during migration, copies of pages need to be - * allocated and for a short time, the footprint is higher + * Watermarks for order-0 must be met for compaction to be able to + * isolate free pages for migration targets. This means that the + * watermark and alloc_flags have to match, or be more pessimistic than + * the check in __isolate_free_page(). We don't use the direct + * compactor's alloc_flags, as they are not relevant for freepage + * isolation. We however do use the direct compactor's classzone_idx to + * skip over zones where lowmem reserves would prevent allocation even + * if compaction succeeds. + * For costly orders, we require low watermark instead of min for + * compaction to proceed to increase its chances. + * ALLOC_CMA is used, as pages in CMA pageblocks are considered + * suitable migration targets */ - watermark += (2UL << order); + watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? + low_wmark_pages(zone) : min_wmark_pages(zone); + watermark += compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, - alloc_flags, wmark_target)) + ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; - /* - * fragmentation index determines if allocation failures are due to - * low memory or external fragmentation - * - * index of -1000 would imply allocations might succeed depending on - * watermarks, but we already failed the high-order watermark check - * index towards 0 implies failure is due to lack of memory - * index towards 1000 implies failure is due to fragmentation - * - * Only compact if a failure would be due to fragmentation. - */ - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - return COMPACT_NOT_SUITABLE_ZONE; - return COMPACT_CONTINUE; } @@ -1423,9 +1422,32 @@ enum compact_result compaction_suitable(struct zone *zone, int order, int classzone_idx) { enum compact_result ret; + int fragindex; ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, zone_page_state(zone, NR_FREE_PAGES)); + /* + * fragmentation index determines if allocation failures are due to + * low memory or external fragmentation + * + * index of -1000 would imply allocations might succeed depending on + * watermarks, but we already failed the high-order watermark check + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. Also + * ignore fragindex for non-costly orders where the alternative to + * a successful reclaim/compaction is OOM. Fragindex and the + * vm.extfrag_threshold sysctl is meant as a heuristic to prevent + * excessive compaction for costly orders, but it should not be at the + * expense of system stability. + */ + if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) { + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + ret = COMPACT_NOT_SUITABLE_ZONE; + } + trace_mm_compaction_suitable(zone, order, ret); if (ret == COMPACT_NOT_SUITABLE_ZONE) ret = COMPACT_SKIPPED; @@ -1458,8 +1480,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, available += zone_page_state_snapshot(zone, NR_FREE_PAGES); compact_result = __compaction_suitable(zone, order, alloc_flags, ac_classzone_idx(ac), available); - if (compact_result != COMPACT_SKIPPED && - compact_result != COMPACT_NOT_SUITABLE_ZONE) + if (compact_result != COMPACT_SKIPPED) return true; } @@ -1477,7 +1498,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); /* Compaction is likely to fail */ - if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED) + if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) return ret; /* huh, compaction_suitable is returning something unexpected */ @@ -1492,23 +1513,29 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro /* * Setup to move all movable pages to the end of the zone. Used cached - * information on where the scanners should start but check that it - * is initialised by ensuring the values are within zone boundaries. + * information on where the scanners should start (unless we explicitly + * want to compact the whole zone), but check that it is initialised + * by ensuring the values are within zone boundaries. */ - cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; - cc->free_pfn = zone->compact_cached_free_pfn; - if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { - cc->free_pfn = pageblock_start_pfn(end_pfn - 1); - zone->compact_cached_free_pfn = cc->free_pfn; - } - if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { + if (cc->whole_zone) { cc->migrate_pfn = start_pfn; - zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; - zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; - } + cc->free_pfn = pageblock_start_pfn(end_pfn - 1); + } else { + cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; + cc->free_pfn = zone->compact_cached_free_pfn; + if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { + cc->free_pfn = pageblock_start_pfn(end_pfn - 1); + zone->compact_cached_free_pfn = cc->free_pfn; + } + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { + cc->migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; + } - if (cc->migrate_pfn == start_pfn) - cc->whole_zone = true; + if (cc->migrate_pfn == start_pfn) + cc->whole_zone = true; + } cc->last_migrated_pfn = 0; @@ -1638,6 +1665,9 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .alloc_flags = alloc_flags, .classzone_idx = classzone_idx, .direct_compaction = true, + .whole_zone = (prio == MIN_COMPACT_PRIORITY), + .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), + .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY) }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1683,7 +1713,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, ac->nodemask) { enum compact_result status; - if (compaction_deferred(zone, order)) { + if (prio > MIN_COMPACT_PRIORITY + && compaction_deferred(zone, order)) { rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); continue; } @@ -1692,9 +1723,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, alloc_flags, ac_classzone_idx(ac)); rc = max(status, rc); - /* If a normal allocation would succeed, stop compacting */ - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags)) { + /* The allocation should succeed, stop compacting */ + if (status == COMPACT_SUCCESS) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller @@ -1730,10 +1760,18 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, /* Compact all zones within a node */ -static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) +static void compact_node(int nid) { + pg_data_t *pgdat = NODE_DATA(nid); int zoneid; struct zone *zone; + struct compact_control cc = { + .order = -1, + .mode = MIGRATE_SYNC, + .ignore_skip_hint = true, + .whole_zone = true, + }; + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { @@ -1741,60 +1779,19 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) if (!populated_zone(zone)) continue; - cc->nr_freepages = 0; - cc->nr_migratepages = 0; - cc->zone = zone; - INIT_LIST_HEAD(&cc->freepages); - INIT_LIST_HEAD(&cc->migratepages); - - /* - * When called via /proc/sys/vm/compact_memory - * this makes sure we compact the whole zone regardless of - * cached scanner positions. - */ - if (is_via_compact_memory(cc->order)) - __reset_isolation_suitable(zone); - - if (is_via_compact_memory(cc->order) || - !compaction_deferred(zone, cc->order)) - compact_zone(zone, cc); - - VM_BUG_ON(!list_empty(&cc->freepages)); - VM_BUG_ON(!list_empty(&cc->migratepages)); + cc.nr_freepages = 0; + cc.nr_migratepages = 0; + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); - if (is_via_compact_memory(cc->order)) - continue; + compact_zone(zone, &cc); - if (zone_watermark_ok(zone, cc->order, - low_wmark_pages(zone), 0, 0)) - compaction_defer_reset(zone, cc->order, false); + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); } } -void compact_pgdat(pg_data_t *pgdat, int order) -{ - struct compact_control cc = { - .order = order, - .mode = MIGRATE_ASYNC, - }; - - if (!order) - return; - - __compact_pgdat(pgdat, &cc); -} - -static void compact_node(int nid) -{ - struct compact_control cc = { - .order = -1, - .mode = MIGRATE_SYNC, - .ignore_skip_hint = true, - }; - - __compact_pgdat(NODE_DATA(nid), &cc); -} - /* Compact all nodes in the system */ static void compact_nodes(void) { @@ -1900,8 +1897,6 @@ static void kcompactd_do_work(pg_data_t *pgdat) .ignore_skip_hint = true, }; - bool success = false; - trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, cc.classzone_idx); count_vm_event(KCOMPACTD_WAKE); @@ -1930,9 +1925,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) return; status = compact_zone(zone, &cc); - if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone), - cc.classzone_idx, 0)) { - success = true; + if (status == COMPACT_SUCCESS) { compaction_defer_reset(zone, cc.order, false); } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) { /* diff --git a/mm/debug.c b/mm/debug.c index 74c7cae4f683..9feb699c5d25 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -42,6 +42,11 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { + /* + * Avoid VM_BUG_ON() in page_mapcount(). + * page->_mapcount space in struct page is used by sl[aou]b pages to + * encode own info. + */ int mapcount = PageSlab(page) ? 0 : page_mapcount(page); pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", diff --git a/mm/filemap.c b/mm/filemap.c index 68f1813fbdc3..2f7b7783bd6b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1687,6 +1687,10 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, unsigned int prev_offset; int error = 0; + if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) + return -EINVAL; + iov_iter_truncate(iter, inode->i_sb->s_maxbytes); + index = *ppos >> PAGE_SHIFT; prev_index = ra->prev_pos >> PAGE_SHIFT; prev_offset = ra->prev_pos & (PAGE_SIZE-1); @@ -1721,7 +1725,9 @@ find_page: * wait_on_page_locked is used to avoid unnecessarily * serialisations and why it's safe. */ - wait_on_page_locked_killable(page); + error = wait_on_page_locked_killable(page); + if (unlikely(error)) + goto readpage_error; if (PageUptodate(page)) goto page_ok; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 283583fcb1e7..cdcd25cb30fe 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker; static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; -struct page *get_huge_zero_page(void) +static struct page *get_huge_zero_page(void) { struct page *zero_page; retry: @@ -86,7 +86,7 @@ retry: return READ_ONCE(huge_zero_page); } -void put_huge_zero_page(void) +static void put_huge_zero_page(void) { /* * Counter should never go to zero here. Only shrinker can put @@ -95,6 +95,26 @@ void put_huge_zero_page(void) BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); } +struct page *mm_get_huge_zero_page(struct mm_struct *mm) +{ + if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + return READ_ONCE(huge_zero_page); + + if (!get_huge_zero_page()) + return NULL; + + if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + put_huge_zero_page(); + + return READ_ONCE(huge_zero_page); +} + +void mm_put_huge_zero_page(struct mm_struct *mm) +{ + if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + put_huge_zero_page(); +} + static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -469,6 +489,49 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } +unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len, + loff_t off, unsigned long flags, unsigned long size) +{ + unsigned long addr; + loff_t off_end = off + len; + loff_t off_align = round_up(off, size); + unsigned long len_pad; + + if (off_end <= off_align || (off_end - off_align) < size) + return 0; + + len_pad = len + size; + if (len_pad < len || (off + len_pad) < off) + return 0; + + addr = current->mm->get_unmapped_area(filp, 0, len_pad, + off >> PAGE_SHIFT, flags); + if (IS_ERR_VALUE(addr)) + return 0; + + addr += (off - addr) & (size - 1); + return addr; +} + +unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + loff_t off = (loff_t)pgoff << PAGE_SHIFT; + + if (addr) + goto out; + if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD)) + goto out; + + addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE); + if (addr) + return addr; + + out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} +EXPORT_SYMBOL_GPL(thp_get_unmapped_area); + static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, gfp_t gfp) { @@ -601,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; - zero_page = get_huge_zero_page(); + zero_page = mm_get_huge_zero_page(vma->vm_mm); if (unlikely(!zero_page)) { pte_free(vma->vm_mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); @@ -623,10 +686,8 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) } } else spin_unlock(fe->ptl); - if (!set) { + if (!set) pte_free(vma->vm_mm, pgtable); - put_huge_zero_page(); - } return ret; } gfp = alloc_hugepage_direct_gfpmask(vma); @@ -780,7 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, * since we already have a zero page to copy. It just takes a * reference. */ - zero_page = get_huge_zero_page(); + zero_page = mm_get_huge_zero_page(dst_mm); set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, zero_page); ret = 0; @@ -1038,7 +1099,6 @@ alloc: update_mmu_cache_pmd(vma, fe->address, fe->pmd); if (!page) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - put_huge_zero_page(); } else { VM_BUG_ON_PAGE(!PageHead(page), page); page_remove_rmap(page, true); @@ -1499,7 +1559,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, } smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - put_huge_zero_page(); } static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, @@ -1522,8 +1581,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (!vma_is_anonymous(vma)) { _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); - if (is_huge_zero_pmd(_pmd)) - put_huge_zero_page(); if (vma_is_dax(vma)) return; page = pmd_page(_pmd); @@ -1563,7 +1620,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (soft_dirty) entry = pte_swp_mksoft_dirty(entry); } else { - entry = mk_pte(page + i, vma->vm_page_prot); + entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); entry = maybe_mkwrite(entry, vma); if (!write) entry = pte_wrprotect(entry); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 87e11d8ad536..ec49d9ef1eef 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -567,13 +567,13 @@ retry: * appear as a "reserved" entry instead of simply dangling with incorrect * counts. */ -void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve) +void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; rsv_adjust = hugepage_subpool_get_pages(spool, 1); - if (restore_reserve && rsv_adjust) { + if (rsv_adjust) { struct hstate *h = hstate_inode(inode); hugetlb_acct_memory(h, 1); @@ -1022,7 +1022,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \ +#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \ ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \ defined(CONFIG_CMA)) static void destroy_compound_gigantic_page(struct page *page, @@ -1437,38 +1437,61 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, /* * Dissolve a given free hugepage into free buddy pages. This function does - * nothing for in-use (including surplus) hugepages. + * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the + * number of free hugepages would be reduced below the number of reserved + * hugepages. */ -static void dissolve_free_huge_page(struct page *page) +static int dissolve_free_huge_page(struct page *page) { + int rc = 0; + spin_lock(&hugetlb_lock); if (PageHuge(page) && !page_count(page)) { - struct hstate *h = page_hstate(page); - int nid = page_to_nid(page); - list_del(&page->lru); + struct page *head = compound_head(page); + struct hstate *h = page_hstate(head); + int nid = page_to_nid(head); + if (h->free_huge_pages - h->resv_huge_pages == 0) { + rc = -EBUSY; + goto out; + } + list_del(&head->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; h->max_huge_pages--; - update_and_free_page(h, page); + update_and_free_page(h, head); } +out: spin_unlock(&hugetlb_lock); + return rc; } /* * Dissolve free hugepages in a given pfn range. Used by memory hotplug to * make specified memory blocks removable from the system. - * Note that start_pfn should aligned with (minimum) hugepage size. + * Note that this will dissolve a free gigantic hugepage completely, if any + * part of it lies within the given range. + * Also note that if dissolve_free_huge_page() returns with an error, all + * free hugepages that were dissolved before that error are lost. */ -void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) +int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; + struct page *page; + int rc = 0; if (!hugepages_supported()) - return; + return rc; + + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) { + page = pfn_to_page(pfn); + if (PageHuge(page) && !page_count(page)) { + rc = dissolve_free_huge_page(page); + if (rc) + break; + } + } - VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order)); - for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) - dissolve_free_huge_page(pfn_to_page(pfn)); + return rc; } /* diff --git a/mm/internal.h b/mm/internal.h index 1501304f87a4..537ac9951f5f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -178,8 +178,9 @@ struct compact_control { unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ - bool whole_zone; /* Whole zone has been scanned */ + bool whole_zone; /* Whole zone should/has been scanned */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ const unsigned int alloc_fla |
