summary | refs | log | tree | commit | diff
path: root/mm
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2016-10-07 21:38:00 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2016-10-07 21:38:00 -0700
commit b66484cd74706fa8681d051840fe4b18a3da40ff (patch)
tree e8215e7c25661d25f84abc4b98140c2062d6d5de /mm
parent c913fc4146ba7c280e074558d0a461e5c6f07c8a (diff)
parent 05fd007e46296afb24d15c7d589d535e5a5b9d5c (diff)
download linux-b66484cd74706fa8681d051840fe4b18a3da40ff.tar.gz
download linux-b66484cd74706fa8681d051840fe4b18a3da40ff.tar.bz2
download linux-b66484cd74706fa8681d051840fe4b18a3da40ff.zip
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:

 - fsnotify updates

 - ocfs2 updates

 - all of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (127 commits)
  console: don't prefer first registered if DT specifies stdout-path
  cred: simpler, 1D supplementary groups
  CREDITS: update Pavel's information, add GPG key, remove snail mail address
  mailmap: add Johan Hovold
  .gitattributes: set git diff driver for C source code files
  uprobes: remove function declarations from arch/{mips,s390}
  spelling.txt: "modeled" is spelt correctly
  nmi_backtrace: generate one-line reports for idle cpus
  arch/tile: adopt the new nmi_backtrace framework
  nmi_backtrace: do a local dump_stack() instead of a self-NMI
  nmi_backtrace: add more trigger_*_cpu_backtrace() methods
  min/max: remove sparse warnings when they're nested
  Documentation/filesystems/proc.txt: add more description for maps/smaps
  mm, proc: fix region lost in /proc/self/smaps
  proc: fix timerslack_ns CAP_SYS_NICE check when adjusting self
  proc: add LSM hook checks to /proc/<tid>/timerslack_ns
  proc: relax /proc/<tid>/timerslack_ns capability requirements
  meminfo: break apart a very long seq_printf with #ifdefs
  seq/proc: modify seq_put_decimal_[u]ll to take a const char *, not char
  proc: faster /proc/*/status
  ...
Diffstat (limited to 'mm')
-rw-r--r-- mm/bootmem.c 14
-rw-r--r-- mm/compaction.c 205
-rw-r--r-- mm/debug.c 5
-rw-r--r-- mm/filemap.c 8
-rw-r--r-- mm/huge_memory.c 81
-rw-r--r-- mm/hugetlb.c 53
-rw-r--r-- mm/internal.h 3
-rw-r--r-- mm/ksm.c 7
-rw-r--r-- mm/memblock.c 5
-rw-r--r-- mm/memcontrol.c 154
-rw-r--r-- mm/memory.c 21
-rw-r--r-- mm/memory_hotplug.c 4
-rw-r--r-- mm/mempolicy.c 2
-rw-r--r-- mm/migrate.c 2
-rw-r--r-- mm/mincore.c 5
-rw-r--r-- mm/mlock.c 52
-rw-r--r-- mm/mmap.c 238
-rw-r--r-- mm/mprotect.c 3
-rw-r--r-- mm/nobootmem.c 20
-rw-r--r-- mm/oom_kill.c 381
-rw-r--r-- mm/page-writeback.c 34
-rw-r--r-- mm/page_alloc.c 281
-rw-r--r-- mm/page_ext.c 45
-rw-r--r-- mm/page_io.c 7
-rw-r--r-- mm/page_isolation.c 2
-rw-r--r-- mm/page_owner.c 156
-rw-r--r-- mm/shmem.c 2
-rw-r--r-- mm/swap.c 4
-rw-r--r-- mm/swap_state.c 14
-rw-r--r-- mm/swapfile.c 137
-rw-r--r-- mm/vmacache.c 8
-rw-r--r-- mm/vmalloc.c 22
-rw-r--r-- mm/vmscan.c 53
-rw-r--r-- mm/vmstat.c 95
34 files changed, 1250 insertions, 873 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0aa7dda52402..a869f84f44d3 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -11,15 +11,12 @@
#include <linux/init.h>
#include <linux/pfn.h>
#include <linux/slab.h>
-#include <linux/bootmem.h>
#include <linux/export.h>
#include <linux/kmemleak.h>
#include <linux/range.h>
-#include <linux/memblock.h>
#include <linux/bug.h>
#include <linux/io.h>
-
-#include <asm/processor.h>
+#include <linux/bootmem.h>
#include "internal.h"
@@ -712,7 +709,7 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
void *ptr;
if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc(size, GFP_NOWAIT);
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
again:
/* do not panic in alloc_bootmem_bdata() */
@@ -738,9 +735,6 @@ again:
void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
}
@@ -812,10 +806,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
}
-#ifndef ARCH_LOW_ADDRESS_LIMIT
-#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
-#endif
-
/**
* __alloc_bootmem_low - allocate low boot memory
* @size: size of the request in bytes
diff --git a/mm/compaction.c b/mm/compaction.c
index 9affb2908304..0409a4ad6ea1 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -997,8 +997,12 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
#ifdef CONFIG_COMPACTION
/* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
+static bool suitable_migration_target(struct compact_control *cc,
+ struct page *page)
{
+ if (cc->ignore_block_suitable)
+ return true;
+
/* If the page is a large free page, then disallow migration */
if (PageBuddy(page)) {
/*
@@ -1083,7 +1087,7 @@ static void isolate_freepages(struct compact_control *cc)
continue;
/* Check the block is suitable for migration */
- if (!suitable_migration_target(page))
+ if (!suitable_migration_target(cc, page))
continue;
/* If isolation recently failed, do not retry */
@@ -1316,7 +1320,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
return COMPACT_CONTINUE;
/* Compaction run is not finished if the watermark is not met */
- watermark = low_wmark_pages(zone);
+ watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
cc->alloc_flags))
@@ -1329,13 +1333,13 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
/* Job done if page is free of the right migratetype */
if (!list_empty(&area->free_list[migratetype]))
- return COMPACT_PARTIAL;
+ return COMPACT_SUCCESS;
#ifdef CONFIG_CMA
/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
if (migratetype == MIGRATE_MOVABLE &&
!list_empty(&area->free_list[MIGRATE_CMA]))
- return COMPACT_PARTIAL;
+ return COMPACT_SUCCESS;
#endif
/*
* Job done if allocation would steal freepages from
@@ -1343,7 +1347,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
*/
if (find_suitable_fallback(area, order, migratetype,
true, &can_steal) != -1)
- return COMPACT_PARTIAL;
+ return COMPACT_SUCCESS;
}
return COMPACT_NO_SUITABLE_PAGE;
@@ -1367,7 +1371,7 @@ static enum compact_result compact_finished(struct zone *zone,
* compaction_suitable: Is this suitable to run compaction on this zone now?
* Returns
* COMPACT_SKIPPED - If there are too few free pages for compaction
- * COMPACT_PARTIAL - If the allocation would succeed without compaction
+ * COMPACT_SUCCESS - If the allocation would succeed without compaction
* COMPACT_CONTINUE - If compaction should run now
*/
static enum compact_result __compaction_suitable(struct zone *zone, int order,
@@ -1375,46 +1379,41 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
int classzone_idx,
unsigned long wmark_target)
{
- int fragindex;
unsigned long watermark;
if (is_via_compact_memory(order))
return COMPACT_CONTINUE;
- watermark = low_wmark_pages(zone);
+ watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
/*
* If watermarks for high-order allocation are already met, there
* should be no need for compaction at all.
*/
if (zone_watermark_ok(zone, order, watermark, classzone_idx,
alloc_flags))
- return COMPACT_PARTIAL;
+ return COMPACT_SUCCESS;
/*
- * Watermarks for order-0 must be met for compaction. Note the 2UL.
- * This is because during migration, copies of pages need to be
- * allocated and for a short time, the footprint is higher
+ * Watermarks for order-0 must be met for compaction to be able to
+ * isolate free pages for migration targets. This means that the
+ * watermark and alloc_flags have to match, or be more pessimistic than
+ * the check in __isolate_free_page(). We don't use the direct
+ * compactor's alloc_flags, as they are not relevant for freepage
+ * isolation. We however do use the direct compactor's classzone_idx to
+ * skip over zones where lowmem reserves would prevent allocation even
+ * if compaction succeeds.
+ * For costly orders, we require low watermark instead of min for
+ * compaction to proceed to increase its chances.
+ * ALLOC_CMA is used, as pages in CMA pageblocks are considered
+ * suitable migration targets
*/
- watermark += (2UL << order);
+ watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
+ low_wmark_pages(zone) : min_wmark_pages(zone);
+ watermark += compact_gap(order);
if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
- alloc_flags, wmark_target))
+ ALLOC_CMA, wmark_target))
return COMPACT_SKIPPED;
- /*
- * fragmentation index determines if allocation failures are due to
- * low memory or external fragmentation
- *
- * index of -1000 would imply allocations might succeed depending on
- * watermarks, but we already failed the high-order watermark check
- * index towards 0 implies failure is due to lack of memory
- * index towards 1000 implies failure is due to fragmentation
- *
- * Only compact if a failure would be due to fragmentation.
- */
- fragindex = fragmentation_index(zone, order);
- if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
- return COMPACT_NOT_SUITABLE_ZONE;
-
return COMPACT_CONTINUE;
}
@@ -1423,9 +1422,32 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
int classzone_idx)
{
enum compact_result ret;
+ int fragindex;
ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
zone_page_state(zone, NR_FREE_PAGES));
+ /*
+ * fragmentation index determines if allocation failures are due to
+ * low memory or external fragmentation
+ *
+ * index of -1000 would imply allocations might succeed depending on
+ * watermarks, but we already failed the high-order watermark check
+ * index towards 0 implies failure is due to lack of memory
+ * index towards 1000 implies failure is due to fragmentation
+ *
+ * Only compact if a failure would be due to fragmentation. Also
+ * ignore fragindex for non-costly orders where the alternative to
+ * a successful reclaim/compaction is OOM. Fragindex and the
+ * vm.extfrag_threshold sysctl is meant as a heuristic to prevent
+ * excessive compaction for costly orders, but it should not be at the
+ * expense of system stability.
+ */
+ if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
+ fragindex = fragmentation_index(zone, order);
+ if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+ ret = COMPACT_NOT_SUITABLE_ZONE;
+ }
+
trace_mm_compaction_suitable(zone, order, ret);
if (ret == COMPACT_NOT_SUITABLE_ZONE)
ret = COMPACT_SKIPPED;
@@ -1458,8 +1480,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
compact_result = __compaction_suitable(zone, order, alloc_flags,
ac_classzone_idx(ac), available);
- if (compact_result != COMPACT_SKIPPED &&
- compact_result != COMPACT_NOT_SUITABLE_ZONE)
+ if (compact_result != COMPACT_SKIPPED)
return true;
}
@@ -1477,7 +1498,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
/* Compaction is likely to fail */
- if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED)
+ if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
return ret;
/* huh, compaction_suitable is returning something unexpected */
@@ -1492,23 +1513,29 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
/*
* Setup to move all movable pages to the end of the zone. Used cached
- * information on where the scanners should start but check that it
- * is initialised by ensuring the values are within zone boundaries.
+ * information on where the scanners should start (unless we explicitly
+ * want to compact the whole zone), but check that it is initialised
+ * by ensuring the values are within zone boundaries.
*/
- cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
- cc->free_pfn = zone->compact_cached_free_pfn;
- if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
- cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
- zone->compact_cached_free_pfn = cc->free_pfn;
- }
- if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
+ if (cc->whole_zone) {
cc->migrate_pfn = start_pfn;
- zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
- zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
- }
+ cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
+ } else {
+ cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
+ cc->free_pfn = zone->compact_cached_free_pfn;
+ if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+ cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
+ zone->compact_cached_free_pfn = cc->free_pfn;
+ }
+ if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
+ cc->migrate_pfn = start_pfn;
+ zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+ zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+ }
- if (cc->migrate_pfn == start_pfn)
- cc->whole_zone = true;
+ if (cc->migrate_pfn == start_pfn)
+ cc->whole_zone = true;
+ }
cc->last_migrated_pfn = 0;
@@ -1638,6 +1665,9 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.alloc_flags = alloc_flags,
.classzone_idx = classzone_idx,
.direct_compaction = true,
+ .whole_zone = (prio == MIN_COMPACT_PRIORITY),
+ .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
+ .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1683,7 +1713,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
ac->nodemask) {
enum compact_result status;
- if (compaction_deferred(zone, order)) {
+ if (prio > MIN_COMPACT_PRIORITY
+ && compaction_deferred(zone, order)) {
rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
continue;
}
@@ -1692,9 +1723,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
alloc_flags, ac_classzone_idx(ac));
rc = max(status, rc);
- /* If a normal allocation would succeed, stop compacting */
- if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
- ac_classzone_idx(ac), alloc_flags)) {
+ /* The allocation should succeed, stop compacting */
+ if (status == COMPACT_SUCCESS) {
/*
* We think the allocation will succeed in this zone,
* but it is not certain, hence the false. The caller
@@ -1730,10 +1760,18 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
/* Compact all zones within a node */
-static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
+static void compact_node(int nid)
{
+ pg_data_t *pgdat = NODE_DATA(nid);
int zoneid;
struct zone *zone;
+ struct compact_control cc = {
+ .order = -1,
+ .mode = MIGRATE_SYNC,
+ .ignore_skip_hint = true,
+ .whole_zone = true,
+ };
+
for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
@@ -1741,60 +1779,19 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
if (!populated_zone(zone))
continue;
- cc->nr_freepages = 0;
- cc->nr_migratepages = 0;
- cc->zone = zone;
- INIT_LIST_HEAD(&cc->freepages);
- INIT_LIST_HEAD(&cc->migratepages);
-
- /*
- * When called via /proc/sys/vm/compact_memory
- * this makes sure we compact the whole zone regardless of
- * cached scanner positions.
- */
- if (is_via_compact_memory(cc->order))
- __reset_isolation_suitable(zone);
-
- if (is_via_compact_memory(cc->order) ||
- !compaction_deferred(zone, cc->order))
- compact_zone(zone, cc);
-
- VM_BUG_ON(!list_empty(&cc->freepages));
- VM_BUG_ON(!list_empty(&cc->migratepages));
+ cc.nr_freepages = 0;
+ cc.nr_migratepages = 0;
+ cc.zone = zone;
+ INIT_LIST_HEAD(&cc.freepages);
+ INIT_LIST_HEAD(&cc.migratepages);
- if (is_via_compact_memory(cc->order))
- continue;
+ compact_zone(zone, &cc);
- if (zone_watermark_ok(zone, cc->order,
- low_wmark_pages(zone), 0, 0))
- compaction_defer_reset(zone, cc->order, false);
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
}
}
-void compact_pgdat(pg_data_t *pgdat, int order)
-{
- struct compact_control cc = {
- .order = order,
- .mode = MIGRATE_ASYNC,
- };
-
- if (!order)
- return;
-
- __compact_pgdat(pgdat, &cc);
-}
-
-static void compact_node(int nid)
-{
- struct compact_control cc = {
- .order = -1,
- .mode = MIGRATE_SYNC,
- .ignore_skip_hint = true,
- };
-
- __compact_pgdat(NODE_DATA(nid), &cc);
-}
-
/* Compact all nodes in the system */
static void compact_nodes(void)
{
@@ -1900,8 +1897,6 @@ static void kcompactd_do_work(pg_data_t *pgdat)
.ignore_skip_hint = true,
};
- bool success = false;
-
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
cc.classzone_idx);
count_vm_event(KCOMPACTD_WAKE);
@@ -1930,9 +1925,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
return;
status = compact_zone(zone, &cc);
- if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
- cc.classzone_idx, 0)) {
- success = true;
+ if (status == COMPACT_SUCCESS) {
compaction_defer_reset(zone, cc.order, false);
} else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
/*
diff --git a/mm/debug.c b/mm/debug.c
index 74c7cae4f683..9feb699c5d25 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -42,6 +42,11 @@ const struct trace_print_flags vmaflag_names[] = {
void __dump_page(struct page *page, const char *reason)
{
+ /*
+ * Avoid VM_BUG_ON() in page_mapcount().
+ * page->_mapcount space in struct page is used by sl[aou]b pages to
+ * encode own info.
+ */
int mapcount = PageSlab(page) ? 0 : page_mapcount(page);
pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
diff --git a/mm/filemap.c b/mm/filemap.c
index 68f1813fbdc3..2f7b7783bd6b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1687,6 +1687,10 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
unsigned int prev_offset;
int error = 0;
+ if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
+ return -EINVAL;
+ iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
+
index = *ppos >> PAGE_SHIFT;
prev_index = ra->prev_pos >> PAGE_SHIFT;
prev_offset = ra->prev_pos & (PAGE_SIZE-1);
@@ -1721,7 +1725,9 @@ find_page:
* wait_on_page_locked is used to avoid unnecessarily
* serialisations and why it's safe.
*/
- wait_on_page_locked_killable(page);
+ error = wait_on_page_locked_killable(page);
+ if (unlikely(error))
+ goto readpage_error;
if (PageUptodate(page))
goto page_ok;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 283583fcb1e7..cdcd25cb30fe 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker;
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
-struct page *get_huge_zero_page(void)
+static struct page *get_huge_zero_page(void)
{
struct page *zero_page;
retry:
@@ -86,7 +86,7 @@ retry:
return READ_ONCE(huge_zero_page);
}
-void put_huge_zero_page(void)
+static void put_huge_zero_page(void)
{
/*
* Counter should never go to zero here. Only shrinker can put
@@ -95,6 +95,26 @@ void put_huge_zero_page(void)
BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
+struct page *mm_get_huge_zero_page(struct mm_struct *mm)
+{
+ if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ return READ_ONCE(huge_zero_page);
+
+ if (!get_huge_zero_page())
+ return NULL;
+
+ if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ put_huge_zero_page();
+
+ return READ_ONCE(huge_zero_page);
+}
+
+void mm_put_huge_zero_page(struct mm_struct *mm)
+{
+ if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ put_huge_zero_page();
+}
+
static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
struct shrink_control *sc)
{
@@ -469,6 +489,49 @@ void prep_transhuge_page(struct page *page)
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
+unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
+ loff_t off, unsigned long flags, unsigned long size)
+{
+ unsigned long addr;
+ loff_t off_end = off + len;
+ loff_t off_align = round_up(off, size);
+ unsigned long len_pad;
+
+ if (off_end <= off_align || (off_end - off_align) < size)
+ return 0;
+
+ len_pad = len + size;
+ if (len_pad < len || (off + len_pad) < off)
+ return 0;
+
+ addr = current->mm->get_unmapped_area(filp, 0, len_pad,
+ off >> PAGE_SHIFT, flags);
+ if (IS_ERR_VALUE(addr))
+ return 0;
+
+ addr += (off - addr) & (size - 1);
+ return addr;
+}
+
+unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ loff_t off = (loff_t)pgoff << PAGE_SHIFT;
+
+ if (addr)
+ goto out;
+ if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
+ goto out;
+
+ addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
+ if (addr)
+ return addr;
+
+ out:
+ return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
+
static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
gfp_t gfp)
{
@@ -601,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
pgtable = pte_alloc_one(vma->vm_mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
- zero_page = get_huge_zero_page();
+ zero_page = mm_get_huge_zero_page(vma->vm_mm);
if (unlikely(!zero_page)) {
pte_free(vma->vm_mm, pgtable);
count_vm_event(THP_FAULT_FALLBACK);
@@ -623,10 +686,8 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
}
} else
spin_unlock(fe->ptl);
- if (!set) {
+ if (!set)
pte_free(vma->vm_mm, pgtable);
- put_huge_zero_page();
- }
return ret;
}
gfp = alloc_hugepage_direct_gfpmask(vma);
@@ -780,7 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* since we already have a zero page to copy. It just takes a
* reference.
*/
- zero_page = get_huge_zero_page();
+ zero_page = mm_get_huge_zero_page(dst_mm);
set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
zero_page);
ret = 0;
@@ -1038,7 +1099,6 @@ alloc:
update_mmu_cache_pmd(vma, fe->address, fe->pmd);
if (!page) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- put_huge_zero_page();
} else {
VM_BUG_ON_PAGE(!PageHead(page), page);
page_remove_rmap(page, true);
@@ -1499,7 +1559,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
}
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
- put_huge_zero_page();
}
static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
@@ -1522,8 +1581,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (!vma_is_anonymous(vma)) {
_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
- if (is_huge_zero_pmd(_pmd))
- put_huge_zero_page();
if (vma_is_dax(vma))
return;
page = pmd_page(_pmd);
@@ -1563,7 +1620,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (soft_dirty)
entry = pte_swp_mksoft_dirty(entry);
} else {
- entry = mk_pte(page + i, vma->vm_page_prot);
+ entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
entry = maybe_mkwrite(entry, vma);
if (!write)
entry = pte_wrprotect(entry);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 87e11d8ad536..ec49d9ef1eef 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -567,13 +567,13 @@ retry:
* appear as a "reserved" entry instead of simply dangling with incorrect
* counts.
*/
-void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
+void hugetlb_fix_reserve_counts(struct inode *inode)
{
struct hugepage_subpool *spool = subpool_inode(inode);
long rsv_adjust;
rsv_adjust = hugepage_subpool_get_pages(spool, 1);
- if (restore_reserve && rsv_adjust) {
+ if (rsv_adjust) {
struct hstate *h = hstate_inode(inode);
hugetlb_acct_memory(h, 1);
@@ -1022,7 +1022,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
-#if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \
+#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \
((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \
defined(CONFIG_CMA))
static void destroy_compound_gigantic_page(struct page *page,
@@ -1437,38 +1437,61 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
/*
* Dissolve a given free hugepage into free buddy pages. This function does
- * nothing for in-use (including surplus) hugepages.
+ * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the
+ * number of free hugepages would be reduced below the number of reserved
+ * hugepages.
*/
-static void dissolve_free_huge_page(struct page *page)
+static int dissolve_free_huge_page(struct page *page)
{
+ int rc = 0;
+
spin_lock(&hugetlb_lock);
if (PageHuge(page) && !page_count(page)) {
- struct hstate *h = page_hstate(page);
- int nid = page_to_nid(page);
- list_del(&page->lru);
+ struct page *head = compound_head(page);
+ struct hstate *h = page_hstate(head);
+ int nid = page_to_nid(head);
+ if (h->free_huge_pages - h->resv_huge_pages == 0) {
+ rc = -EBUSY;
+ goto out;
+ }
+ list_del(&head->lru);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
h->max_huge_pages--;
- update_and_free_page(h, page);
+ update_and_free_page(h, head);
}
+out:
spin_unlock(&hugetlb_lock);
+ return rc;
}
/*
* Dissolve free hugepages in a given pfn range. Used by memory hotplug to
* make specified memory blocks removable from the system.
- * Note that start_pfn should aligned with (minimum) hugepage size.
+ * Note that this will dissolve a free gigantic hugepage completely, if any
+ * part of it lies within the given range.
+ * Also note that if dissolve_free_huge_page() returns with an error, all
+ * free hugepages that were dissolved before that error are lost.
*/
-void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
+int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
+ struct page *page;
+ int rc = 0;
if (!hugepages_supported())
- return;
+ return rc;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
+ page = pfn_to_page(pfn);
+ if (PageHuge(page) && !page_count(page)) {
+ rc = dissolve_free_huge_page(page);
+ if (rc)
+ break;
+ }
+ }
- VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
- for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
- dissolve_free_huge_page(pfn_to_page(pfn));
+ return rc;
}
/*
diff --git a/mm/internal.h b/mm/internal.h
index 1501304f87a4..537ac9951f5f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -178,8 +178,9 @@ struct compact_control {
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
+ bool ignore_block_suitable; /* Scan blocks considered unsuitable */
bool direct_compaction; /* False from kcompactd or /proc/... */
- bool whole_zone; /* Whole zone has been scanned */
+ bool whole_zone; /* Whole zone should/has been scanned */
int order; /* order a direct compactor needs */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
const unsigned int alloc_fla