summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-02-04 07:24:48 +0000
committerLinus Torvalds <torvalds@linux-foundation.org>2020-02-04 07:24:48 +0000
commitcc12071ff39060fc2e47c58b43e249fe0d0061ee (patch)
tree085789dec88100d2ac12d9f2c990c00b909d7ee2 /mm
parent9717c1cea16e3eae81ca226f4c3670bb799b61ad (diff)
parentf3cc4e1d44a813a0685f2e558b78ace3db559722 (diff)
downloadlinux-cc12071ff39060fc2e47c58b43e249fe0d0061ee.tar.gz
linux-cc12071ff39060fc2e47c58b43e249fe0d0061ee.tar.bz2
linux-cc12071ff39060fc2e47c58b43e249fe0d0061ee.zip
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton: "The rest of MM and the rest of everything else: hotfixes, ipc, misc, procfs, lib, cleanups, arm" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (67 commits) ARM: dma-api: fix max_pfn off-by-one error in __dma_supported() treewide: remove redundant IS_ERR() before error code check include/linux/cpumask.h: don't calculate length of the input string lib: new testcases for bitmap_parse{_user} lib: rework bitmap_parse() lib: make bitmap_parse_user a wrapper on bitmap_parse lib: add test for bitmap_parse() bitops: more BITS_TO_* macros lib/string: add strnchrnul() proc: convert everything to "struct proc_ops" proc: decouple proc from VFS with "struct proc_ops" asm-generic/tlb: provide MMU_GATHER_TABLE_FREE asm-generic/tlb: rename HAVE_MMU_GATHER_NO_GATHER asm-generic/tlb: rename HAVE_MMU_GATHER_PAGE_SIZE asm-generic/tlb: rename HAVE_RCU_TABLE_FREE asm-generic/tlb: add missing CONFIG symbol asm-gemeric/tlb: remove stray function declarations asm-generic/tlb: avoid potential double flush mm/mmu_gather: invalidate TLB correctly on batch allocation failure and flush powerpc/mmu_gather: enable RCU_TABLE_FREE even for !SMP case ...
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig.debug21
-rw-r--r--mm/Makefile1
-rw-r--r--mm/gup.c2
-rw-r--r--mm/hmm.c62
-rw-r--r--mm/memory_hotplug.c104
-rw-r--r--mm/memremap.c2
-rw-r--r--mm/migrate.c5
-rw-r--r--mm/mincore.c1
-rw-r--r--mm/mmu_gather.c134
-rw-r--r--mm/page_alloc.c71
-rw-r--r--mm/pagewalk.c163
-rw-r--r--mm/ptdump.c139
-rw-r--r--mm/slab_common.c37
-rw-r--r--mm/sparse.c10
-rw-r--r--mm/swapfile.c14
15 files changed, 514 insertions, 252 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 327b3ebf23bf..0271b22e063f 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -117,3 +117,24 @@ config DEBUG_RODATA_TEST
depends on STRICT_KERNEL_RWX
---help---
This option enables a testcase for the setting rodata read-only.
+
+config GENERIC_PTDUMP
+ bool
+
+config PTDUMP_CORE
+ bool
+
+config PTDUMP_DEBUGFS
+ bool "Export kernel pagetable layout to userspace via debugfs"
+ depends on DEBUG_KERNEL
+ depends on DEBUG_FS
+ depends on GENERIC_PTDUMP
+ select PTDUMP_CORE
+ help
+ Say Y here if you want to show the kernel pagetable layout in a
+ debugfs file. This information is only useful for kernel developers
+ who are working in architecture specific areas of the kernel.
+ It is probably not a good idea to enable this feature in a production
+ kernel.
+
+ If in doubt, say N.
diff --git a/mm/Makefile b/mm/Makefile
index 32f08e22e824..272e66039e70 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -109,3 +109,4 @@ obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
+obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
diff --git a/mm/gup.c b/mm/gup.c
index e13f4d211475..1b521e0ac1de 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1792,7 +1792,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* Before activating this code, please be aware that the following assumptions
* are currently made:
*
- * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
+ * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
* free pages containing page tables or TLB flushing requires IPI broadcast.
*
* *) ptes can be read atomically by the architecture.
diff --git a/mm/hmm.c b/mm/hmm.c
index d379cb6496ae..72e5a6d9a417 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -186,7 +186,7 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
}
static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+ __always_unused int depth, struct mm_walk *walk)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
@@ -380,7 +380,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
again:
pmd = READ_ONCE(*pmdp);
if (pmd_none(pmd))
- return hmm_vma_walk_hole(start, end, walk);
+ return hmm_vma_walk_hole(start, end, -1, walk);
if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
bool fault, write_fault;
@@ -474,23 +474,32 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- unsigned long addr = start, next;
- pmd_t *pmdp;
+ unsigned long addr = start;
pud_t pud;
- int ret;
+ int ret = 0;
+ spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
+
+ if (!ptl)
+ return 0;
+
+ /* Normally we don't want to split the huge page */
+ walk->action = ACTION_CONTINUE;
-again:
pud = READ_ONCE(*pudp);
- if (pud_none(pud))
- return hmm_vma_walk_hole(start, end, walk);
+ if (pud_none(pud)) {
+ ret = hmm_vma_walk_hole(start, end, -1, walk);
+ goto out_unlock;
+ }
if (pud_huge(pud) && pud_devmap(pud)) {
unsigned long i, npages, pfn;
uint64_t *pfns, cpu_flags;
bool fault, write_fault;
- if (!pud_present(pud))
- return hmm_vma_walk_hole(start, end, walk);
+ if (!pud_present(pud)) {
+ ret = hmm_vma_walk_hole(start, end, -1, walk);
+ goto out_unlock;
+ }
i = (addr - range->start) >> PAGE_SHIFT;
npages = (end - addr) >> PAGE_SHIFT;
@@ -499,16 +508,20 @@ again:
cpu_flags = pud_to_hmm_pfn_flags(range, pud);
hmm_range_need_fault(hmm_vma_walk, pfns, npages,
cpu_flags, &fault, &write_fault);
- if (fault || write_fault)
- return hmm_vma_walk_hole_(addr, end, fault,
- write_fault, walk);
+ if (fault || write_fault) {
+ ret = hmm_vma_walk_hole_(addr, end, fault,
+ write_fault, walk);
+ goto out_unlock;
+ }
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
for (i = 0; i < npages; ++i, ++pfn) {
hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
hmm_vma_walk->pgmap);
- if (unlikely(!hmm_vma_walk->pgmap))
- return -EBUSY;
+ if (unlikely(!hmm_vma_walk->pgmap)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
cpu_flags;
}
@@ -517,22 +530,15 @@ again:
hmm_vma_walk->pgmap = NULL;
}
hmm_vma_walk->last = end;
- return 0;
+ goto out_unlock;
}
- split_huge_pud(walk->vma, pudp, addr);
- if (pud_none(*pudp))
- goto again;
+ /* Ask for the PUD to be split */
+ walk->action = ACTION_SUBTREE;
- pmdp = pmd_offset(pudp, addr);
- do {
- next = pmd_addr_end(addr, end);
- ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
- if (ret)
- return ret;
- } while (pmdp++, addr = next, addr != end);
-
- return 0;
+out_unlock:
+ spin_unlock(ptl);
+ return ret;
}
#else
#define hmm_vma_walk_pud NULL
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 36d80915ddc2..0a54ffac8c68 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -355,7 +355,7 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
if (unlikely(pfn_to_nid(start_pfn) != nid))
continue;
- if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+ if (zone != page_zone(pfn_to_page(start_pfn)))
continue;
return start_pfn;
@@ -380,7 +380,7 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
if (unlikely(pfn_to_nid(pfn) != nid))
continue;
- if (zone && zone != page_zone(pfn_to_page(pfn)))
+ if (zone != page_zone(pfn_to_page(pfn)))
continue;
return pfn;
@@ -392,14 +392,11 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long zone_start_pfn = zone->zone_start_pfn;
- unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
- unsigned long zone_end_pfn = z;
unsigned long pfn;
int nid = zone_to_nid(zone);
zone_span_writelock(zone);
- if (zone_start_pfn == start_pfn) {
+ if (zone->zone_start_pfn == start_pfn) {
/*
* If the section is smallest section in the zone, it need
* shrink zone->zone_start_pfn and zone->zone_spanned_pages.
@@ -407,50 +404,30 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
* for shrinking zone.
*/
pfn = find_smallest_section_pfn(nid, zone, end_pfn,
- zone_end_pfn);
+ zone_end_pfn(zone));
if (pfn) {
+ zone->spanned_pages = zone_end_pfn(zone) - pfn;
zone->zone_start_pfn = pfn;
- zone->spanned_pages = zone_end_pfn - pfn;
+ } else {
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
}
- } else if (zone_end_pfn == end_pfn) {
+ } else if (zone_end_pfn(zone) == end_pfn) {
/*
* If the section is biggest section in the zone, it need
* shrink zone->spanned_pages.
* In this case, we find second biggest valid mem_section for
* shrinking zone.
*/
- pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+ pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
start_pfn);
if (pfn)
- zone->spanned_pages = pfn - zone_start_pfn + 1;
- }
-
- /*
- * The section is not biggest or smallest mem_section in the zone, it
- * only creates a hole in the zone. So in this case, we need not
- * change the zone. But perhaps, the zone has only hole data. Thus
- * it check the zone has only hole or not.
- */
- pfn = zone_start_pfn;
- for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_to_online_page(pfn)))
- continue;
-
- if (page_zone(pfn_to_page(pfn)) != zone)
- continue;
-
- /* Skip range to be removed */
- if (pfn >= start_pfn && pfn < end_pfn)
- continue;
-
- /* If we find valid section, we have nothing to do */
- zone_span_writeunlock(zone);
- return;
+ zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
+ else {
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
+ }
}
-
- /* The zone has no valid section */
- zone->zone_start_pfn = 0;
- zone->spanned_pages = 0;
zone_span_writeunlock(zone);
}
@@ -490,6 +467,9 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long flags;
+ /* Poison struct pages because they are now uninitialized again. */
+ page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
+
#ifdef CONFIG_ZONE_DEVICE
/*
* Zone shrinking code cannot properly deal with ZONE_DEVICE. So
@@ -536,25 +516,20 @@ static void __remove_section(unsigned long pfn, unsigned long nr_pages,
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
struct vmem_altmap *altmap)
{
+ const unsigned long end_pfn = pfn + nr_pages;
+ unsigned long cur_nr_pages;
unsigned long map_offset = 0;
- unsigned long nr, start_sec, end_sec;
map_offset = vmem_altmap_offset(altmap);
if (check_pfn_span(pfn, nr_pages, "remove"))
return;
- start_sec = pfn_to_section_nr(pfn);
- end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
- for (nr = start_sec; nr <= end_sec; nr++) {
- unsigned long pfns;
-
+ for (; pfn < end_pfn; pfn += cur_nr_pages) {
cond_resched();
- pfns = min(nr_pages, PAGES_PER_SECTION
- - (pfn & ~PAGE_SECTION_MASK));
- __remove_section(pfn, pfns, map_offset, altmap);
- pfn += pfns;
- nr_pages -= pfns;
+ /* Select all remaining pages up to the next section boundary */
+ cur_nr_pages = min(end_pfn - pfn, -(pfn | PAGE_SECTION_MASK));
+ __remove_section(pfn, cur_nr_pages, map_offset, altmap);
map_offset = 0;
}
}
@@ -1197,14 +1172,13 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
}
/*
- * Confirm all pages in a range [start, end) belong to the same zone.
- * When true, return its valid [start, end).
+ * Confirm all pages in a range [start, end) belong to the same zone (skipping
+ * memory holes). When true, return the zone.
*/
-int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
- unsigned long *valid_start, unsigned long *valid_end)
+struct zone *test_pages_in_a_zone(unsigned long start_pfn,
+ unsigned long end_pfn)
{
unsigned long pfn, sec_end_pfn;
- unsigned long start, end;
struct zone *zone = NULL;
struct page *page;
int i;
@@ -1225,24 +1199,15 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
continue;
/* Check if we got outside of the zone */
if (zone && !zone_spans_pfn(zone, pfn + i))
- return 0;
+ return NULL;
page = pfn_to_page(pfn + i);
if (zone && page_zone(page) != zone)
- return 0;
- if (!zone)
- start = pfn + i;
+ return NULL;
zone = page_zone(page);
- end = pfn + MAX_ORDER_NR_PAGES;
}
}
- if (zone) {
- *valid_start = start;
- *valid_end = min(end, end_pfn);
- return 1;
- } else {
- return 0;
- }
+ return zone;
}
/*
@@ -1487,7 +1452,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
unsigned long offlined_pages = 0;
int ret, node, nr_isolate_pageblock;
unsigned long flags;
- unsigned long valid_start, valid_end;
struct zone *zone;
struct memory_notify arg;
char *reason;
@@ -1512,14 +1476,12 @@ static int __ref __offline_pages(unsigned long start_pfn,
/* This makes hotplug much easier...and readable.
we assume this for now. .*/
- if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
- &valid_end)) {
+ zone = test_pages_in_a_zone(start_pfn, end_pfn);
+ if (!zone) {
ret = -EINVAL;
reason = "multizone range";
goto failed_removal;
}
-
- zone = page_zone(pfn_to_page(valid_start));
node = zone_to_nid(zone);
/* set above range as isolated */
diff --git a/mm/memremap.c b/mm/memremap.c
index 4c723d2049d5..09b5b7adc773 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -120,6 +120,8 @@ void memunmap_pages(struct dev_pagemap *pgmap)
nid = page_to_nid(first_page);
mem_hotplug_begin();
+ remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)));
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
__remove_pages(PHYS_PFN(res->start),
PHYS_PFN(resource_size(res)), NULL);
diff --git a/mm/migrate.c b/mm/migrate.c
index edf42ed90030..b1092876e537 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2151,6 +2151,7 @@ out_unlock:
#ifdef CONFIG_DEVICE_PRIVATE
static int migrate_vma_collect_hole(unsigned long start,
unsigned long end,
+ __always_unused int depth,
struct mm_walk *walk)
{
struct migrate_vma *migrate = walk->private;
@@ -2195,7 +2196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
again:
if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end, walk);
+ return migrate_vma_collect_hole(start, end, -1, walk);
if (pmd_trans_huge(*pmdp)) {
struct page *page;
@@ -2228,7 +2229,7 @@ again:
return migrate_vma_collect_skip(start, end,
walk);
if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end,
+ return migrate_vma_collect_hole(start, end, -1,
walk);
}
}
diff --git a/mm/mincore.c b/mm/mincore.c
index 49b6fa2f6aa1..0e6dd9948f1a 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -112,6 +112,7 @@ static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
}
static int mincore_unmapped_range(unsigned long addr, unsigned long end,
+ __always_unused int depth,
struct mm_walk *walk)
{
walk->private += __mincore_unmapped_range(addr, end,
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 7d70e5c78f97..a3538cb2bcbe 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -11,7 +11,7 @@
#include <asm/pgalloc.h>
#include <asm/tlb.h>
-#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
static bool tlb_next_batch(struct mmu_gather *tlb)
{
@@ -69,7 +69,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
VM_BUG_ON(!tlb->end);
-#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
+#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
VM_WARN_ON(tlb->page_size != page_size);
#endif
@@ -89,58 +89,108 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
return false;
}
-#endif /* HAVE_MMU_GATHER_NO_GATHER */
+#endif /* MMU_GATHER_NO_GATHER */
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+#ifdef CONFIG_MMU_GATHER_TABLE_FREE
-/*
- * See the comment near struct mmu_table_batch.
- */
+static void __tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+ int i;
+
+ for (i = 0; i < batch->nr; i++)
+ __tlb_remove_table(batch->tables[i]);
+
+ free_page((unsigned long)batch);
+}
+
+#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
/*
- * If we want tlb_remove_table() to imply TLB invalidates.
+ * Semi RCU freeing of the page directories.
+ *
+ * This is needed by some architectures to implement software pagetable walkers.
+ *
+ * gup_fast() and other software pagetable walkers do a lockless page-table
+ * walk and therefore needs some synchronization with the freeing of the page
+ * directories. The chosen means to accomplish that is by disabling IRQs over
+ * the walk.
+ *
+ * Architectures that use IPIs to flush TLBs will then automagically DTRT,
+ * since we unlink the page, flush TLBs, free the page. Since the disabling of
+ * IRQs delays the completion of the TLB flush we can never observe an already
+ * freed page.
+ *
+ * Architectures that do not have this (PPC) need to delay the freeing by some
+ * other means, this is that means.
+ *
+ * What we do is batch the freed directory pages (tables) and RCU free them.
+ * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
+ * holds off grace periods.
+ *
+ * However, in order to batch these pages we need to allocate storage, this
+ * allocation is deep inside the MM code and can thus easily fail on memory
+ * pressure. To guarantee progress we fall back to single table freeing, see
+ * the implementation of tlb_remove_table_one().
+ *
*/
-static inline void tlb_table_invalidate(struct mmu_gather *tlb)
-{
-#ifndef CONFIG_HAVE_RCU_TABLE_NO_INVALIDATE
- /*
- * Invalidate page-table caches used by hardware walkers. Then we still
- * need to RCU-sched wait while freeing the pages because software
- * walkers can still be in-flight.
- */
- tlb_flush_mmu_tlbonly(tlb);
-#endif
-}
static void tlb_remove_table_smp_sync(void *arg)
{
/* Simply deliver the interrupt */
}
-static void tlb_remove_table_one(void *table)
+static void tlb_remove_table_sync_one(void)
{
/*
* This isn't an RCU grace period and hence the page-tables cannot be
* assumed to be actually RCU-freed.
*
* It is however sufficient for software page-table walkers that rely on
- * IRQ disabling. See the comment near struct mmu_table_batch.
+ * IRQ disabling.
*/
smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
- __tlb_remove_table(table);
}
static void tlb_remove_table_rcu(struct rcu_head *head)
{
- struct mmu_table_batch *batch;
- int i;
+ __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
+}
+
+static void tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+ call_rcu(&batch->rcu, tlb_remove_table_rcu);
+}
- batch = container_of(head, struct mmu_table_batch, rcu);
+#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
- for (i = 0; i < batch->nr; i++)
- __tlb_remove_table(batch->tables[i]);
+static void tlb_remove_table_sync_one(void) { }
- free_page((unsigned long)batch);
+static void tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+ __tlb_remove_table_free(batch);
+}
+
+#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
+
+/*
+ * If we want tlb_remove_table() to imply TLB invalidates.
+ */
+static inline void tlb_table_invalidate(struct mmu_gather *tlb)
+{
+ if (tlb_needs_table_invalidate()) {
+ /*
+ * Invalidate page-table caches used by hardware walkers. Then
+ * we still need to RCU-sched wait while freeing the pages
+ * because software walkers can still be in-flight.
+ */
+ tlb_flush_mmu_tlbonly(tlb);
+ }
+}
+
+static void tlb_remove_table_one(void *table)
+{
+ tlb_remove_table_sync_one();
+ __tlb_remove_table(table);
}
static void tlb_table_flush(struct mmu_gather *tlb)
@@ -149,7 +199,7 @@ static void tlb_table_flush(struct mmu_gather *tlb)
if (*batch) {
tlb_table_invalidate(tlb);
- call_rcu(&(*batch)->rcu, tlb_remove_table_rcu);
+ tlb_remove_table_free(*batch);
*batch = NULL;
}
}
@@ -173,14 +223,22 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
tlb_table_flush(tlb);
}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+static inline void tlb_table_init(struct mmu_gather *tlb)
+{
+ tlb->batch = NULL;
+}
+
+#else /* !CONFIG_MMU_GATHER_TABLE_FREE */
+
+static inline void tlb_table_flush(struct mmu_gather *tlb) { }
+static inline void tlb_table_init(struct mmu_gather *tlb) { }
+
+#endif /* CONFIG_MMU_GATHER_TABLE_FREE */
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb_table_flush(tlb);
-#endif
-#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
tlb_batch_pages_flush(tlb);
#endif
}
@@ -211,7 +269,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
/* Is it from 0 to ~0? */
tlb->fullmm = !(start | (end+1));
-#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
tlb->need_flush_all = 0;
tlb->local.next = NULL;
tlb->local.nr = 0;
@@ -220,10 +278,8 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
tlb->batch_count = 0;
#endif
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
- tlb->batch = NULL;
-#endif
-#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
+ tlb_table_init(tlb);
+#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
tlb->page_size = 0;
#endif
@@ -271,7 +327,7 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
tlb_flush_mmu(tlb);
-#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
tlb_batch_list_free(tlb);
#endif
dec_tlb_flush_pending(tlb->mm);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 15e908ad933b..3c4eb750a199 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5852,18 +5852,11 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
/* Skip PFNs that belong to non-present sections */
static inline __meminit unsigned long next_pfn(unsigned long pfn)
{
- unsigned long section_nr;
+ const unsigned long section_nr = pfn_to_section_nr(++pfn);
- section_nr = pfn_to_section_nr(++pfn);
if (present_section_nr(section_nr))
return pfn;
-
- while (++section_nr <= __highest_present_section_nr) {
- if (present_section_nr(section_nr))
- return section_nr_to_pfn(section_nr);
- }
-
- return -1;
+ return section_nr_to_pfn(next_present_section_nr(section_nr));
}
#else
static inline __meminit unsigned long next_pfn(unsigned long pfn)
@@ -5905,18 +5898,20 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
}
#endif
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ for (pfn = start_pfn; pfn < end_pfn; ) {
/*
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
if (!early_pfn_valid(pfn)) {
- pfn = next_pfn(pfn) - 1;
+ pfn = next_pfn(pfn);
continue;
}
- if (!early_pfn_in_nid(pfn, nid))
+ if (!early_pfn_in_nid(pfn, nid)) {
+ pfn++;
continue;
+ }
if (overlap_memmap_init(zone, &pfn))
continue;
if (defer_init(nid, pfn, end_pfn))
@@ -5944,16 +5939,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
+ pfn++;
}
}
#ifdef CONFIG_ZONE_DEVICE
void __ref memmap_init_zone_device(struct zone *zone,
unsigned long start_pfn,
- unsigned long size,
+ unsigned long nr_pages,
struct dev_pagemap *pgmap)
{
- unsigned long pfn, end_pfn = start_pfn + size;
+ unsigned long pfn, end_pfn = start_pfn + nr_pages;
struct pglist_data *pgdat = zone->zone_pgdat;
struct vmem_altmap *altmap = pgmap_altmap(pgmap);
unsigned long zone_idx = zone_idx(zone);
@@ -5970,7 +5966,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
*/
if (altmap) {
start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
- size = end_pfn - start_pfn;
+ nr_pages = end_pfn - start_pfn;
}
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
@@ -6017,7 +6013,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
}
pr_info("%s initialised %lu pages in %ums\n", __func__,
- size, jiffies_to_msecs(jiffies - start));
+ nr_pages, jiffies_to_msecs(jiffies - start));
}
#endif
@@ -6916,10 +6912,10 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
/*
- * Zero all valid struct pages in range [spfn, epfn), return number of struct
- * pages zeroed
+ * Initialize all valid struct pages in the range [spfn, epfn) and mark them
+ * PageReserved(). Return the number of struct pages that were initialized.
*/
-static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
+static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn)
{
unsigned long pfn;
u64 pgcnt = 0;
@@ -6930,7 +6926,13 @@ static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
+ pageblock_nr_pages - 1;
continue;
}
- mm_zero_struct_page(pfn_to_page(pfn));
+ /*
+ * Use a fake node/zone (0) for now. Some of these pages
+ * (in memblock.reserved but not in memblock.memory) will
+ * get re-initialized via reserve_bootmem_region() later.
+ */
+ __init_single_page(pfn_to_page(pfn), pfn, 0, 0);
+ __SetPageReserved(pfn_to_page(pfn));
pgcnt++;
}
@@ -6942,14 +6944,15 @@ static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
* initialized by going through __init_single_page(). But, there are some
* struct pages which are reserved in memblock allocator and their fields
* may be accessed (for example page_to_pfn() on some configuration accesses
- * flags). We must explicitly zero those struct pages.
+ * flags). We must explicitly initialize those struct pages.
*
* This function also addresses a similar issue where struct pages are left
* uninitialized because the physical address range is not covered by
* memblock.memory or memblock.reserved. That could happen when memblock
- * layout is manually configured via memmap=.
+ * layout is manually configured via memmap=, or when the highest physical
+ * address (max_pfn) does not end on a section boundary.
*/
-void __init zero_resv_unavail(void)
+static void __init init_unavailable_mem(void)
{
phys_addr_t start, end;
u64 i, pgcnt;
@@ -6962,10 +6965,20 @@ void __init zero_resv_unavail(void)
for_each_mem_range(i, &memblock.memory, NULL,
NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
if (next < start)
- pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
+ pgcnt += init_unavailable_range(PFN_DOWN(next),
+ PFN_UP(start));
next = end;
}
- pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
+
+ /*
+ * Early sections always have a fully populated memmap for the whole
+ * section - see pfn_valid(). If the last section has holes at the
+ * end and that section is marked "online", the memmap will be
+ * considered initialized. Make sure that memmap has a well defined
+ * state.
+ */
+ pgcnt += init_unavailable_range(PFN_DOWN(next),
+ round_up(max_pfn, PAGES_PER_SECTION));
/*
* Struct pages that do not have backing memory. This could be because
@@ -6974,6 +6987,10 @@ void __init zero_resv_unavail(void)
if (pgcnt)
pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
}
+#else
+static inline void __init init_unavailable_mem(void)
+{
+}
#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -7403,7 +7420,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
- zero_resv_unavail();
+ init_unavailable_mem();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid, NULL,
@@ -7598,7 +7615,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
void __init free_area_init(unsigned long *zones_size)
{
- zero_resv_unavail();
+ init_unavailable_mem();
free_area_init_node(0, zones_size,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index ea0b9e606ad1..928df1638c30 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -4,26 +4,57 @@
#include <linux/sched.h>
#include <linux/hugetlb.h>
-static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+/*
+ * We want to know the real level where a entry is located ignoring any
+ * folding of levels which may be happening. For example if p4d is folded then
+ * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
+ */
+static int real_depth(int depth)
+{
+ if (depth == 3 && PTRS_PER_PMD == 1)
+ depth = 2;
+ if (depth == 2 && PTRS_PER_PUD == 1)
+ depth = 1;
+ if (depth == 1 && PTRS_PER_P4D == 1)
+ depth = 0;
+ return depth;
+}
+
+static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
- pte_t *pte;
- int err = 0;
const struct mm_walk_ops *ops = walk->ops;
- spinlock_t *ptl;
+ int err = 0;
- pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (;;) {
err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
if (err)
break;
- addr += PAGE_SIZE;
- if (addr == end)
+ if (addr >= end - PAGE_SIZE)
break;
+ addr += PAGE_SIZE;
pte++;
}
+ return err;
+}
+
+static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pte_t *pte;
+ int err = 0;
+ spinlock_t *ptl;
+
+ if (walk->no_vma) {
+ pte = pte_offset_map(pmd, addr);
+ err = walk_pte_range_inner(pte, addr, end, walk);
+ pte_unmap(pte);
+ } else {
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ err = walk_pte_range_inner(pte, addr, end, walk);
+ pte_unmap_unlock(pte, ptl);
+ }
- pte_unmap_unlock(pte, ptl);
return err;
}
@@ -34,18 +65,22 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
unsigned