summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/Locking10
-rw-r--r--fs/userfaultfd.c22
-rw-r--r--include/linux/huge_mm.h20
-rw-r--r--include/linux/mm.h34
-rw-r--r--include/linux/userfaultfd_k.h8
-rw-r--r--mm/filemap.c28
-rw-r--r--mm/huge_memory.c280
-rw-r--r--mm/internal.h4
-rw-r--r--mm/memory.c582
-rw-r--r--mm/nommu.c3
10 files changed, 475 insertions, 516 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index dda6e3f8e203..5a7386e38e2d 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -548,13 +548,13 @@ subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
locked. The VM will unlock the page.
->map_pages() is called when VM asks to map easy accessible pages.
-Filesystem should find and map pages associated with offsets from "pgoff"
-till "max_pgoff". ->map_pages() is called with page table locked and must
+Filesystem should find and map pages associated with offsets from "start_pgoff"
+till "end_pgoff". ->map_pages() is called with page table locked and must
not block. If it's not possible to reach a page without blocking,
filesystem should skip it. Filesystem should use do_set_pte() to setup
-page table entry. Pointer to entry associated with offset "pgoff" is
-passed in "pte" field in vm_fault structure. Pointers to entries for other
-offsets should be calculated relative to "pte".
+page table entry. Pointer to entry associated with the page is passed in
+"pte" field in fault_env structure. Pointers to entries for other offsets
+should be calculated relative to "pte".
->page_mkwrite() is called when a previously read-only pte is
about to become writeable. The filesystem again must ensure that there are
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 2d97952e341a..85959d8324df 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -257,10 +257,9 @@ out:
* fatal_signal_pending()s, and the mmap_sem must be released before
* returning it.
*/
-int handle_userfault(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags, unsigned long reason)
+int handle_userfault(struct fault_env *fe, unsigned long reason)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mm_struct *mm = fe->vma->vm_mm;
struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue uwq;
int ret;
@@ -269,7 +268,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
ret = VM_FAULT_SIGBUS;
- ctx = vma->vm_userfaultfd_ctx.ctx;
+ ctx = fe->vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
goto out;
@@ -302,17 +301,17 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
* without first stopping userland access to the memory. For
* VM_UFFD_MISSING userfaults this is enough for now.
*/
- if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) {
+ if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) {
/*
* Validate the invariant that nowait must allow retry
* to be sure not to return SIGBUS erroneously on
* nowait invocations.
*/
- BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT);
+ BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
if (printk_ratelimit()) {
printk(KERN_WARNING
- "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags);
+ "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags);
dump_stack();
}
#endif
@@ -324,7 +323,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
* and wait.
*/
ret = VM_FAULT_RETRY;
- if (flags & FAULT_FLAG_RETRY_NOWAIT)
+ if (fe->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out;
/* take the reference before dropping the mmap_sem */
@@ -332,10 +331,11 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.msg = userfault_msg(address, flags, reason);
+ uwq.msg = userfault_msg(fe->address, fe->flags, reason);
uwq.ctx = ctx;
- return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
+ return_to_userland =
+ (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
spin_lock(&ctx->fault_pending_wqh.lock);
@@ -353,7 +353,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
TASK_KILLABLE);
spin_unlock(&ctx->fault_pending_wqh.lock);
- must_wait = userfaultfd_must_wait(ctx, address, flags, reason);
+ must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason);
up_read(&mm->mmap_sem);
if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f0a7a0320300..9bed9249156f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -1,20 +1,12 @@
#ifndef _LINUX_HUGE_MM_H
#define _LINUX_HUGE_MM_H
-extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- unsigned int flags);
+extern int do_huge_pmd_anonymous_page(struct fault_env *fe);
extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma);
-extern void huge_pmd_set_accessed(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pmd_t orig_pmd, int dirty);
-extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pmd_t orig_pmd);
+extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd);
+extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd);
extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
@@ -134,8 +126,7 @@ static inline int hpage_nr_pages(struct page *page)
return 1;
}
-extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pmd_t pmd, pmd_t *pmdp);
+extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd);
extern struct page *huge_zero_page;
@@ -196,8 +187,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
return NULL;
}
-static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd)
{
return 0;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 646bc36b4d1b..8bd74558c0e4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -309,10 +309,27 @@ struct vm_fault {
* VM_FAULT_DAX_LOCKED and fill in
* entry here.
*/
- /* for ->map_pages() only */
- pgoff_t max_pgoff; /* map pages for offset from pgoff till
- * max_pgoff inclusive */
- pte_t *pte; /* pte entry associated with ->pgoff */
+};
+
+/*
+ * Page fault context: passes though page fault handler instead of endless list
+ * of function arguments.
+ */
+struct fault_env {
+ struct vm_area_struct *vma; /* Target VMA */
+ unsigned long address; /* Faulting virtual address */
+ unsigned int flags; /* FAULT_FLAG_xxx flags */
+ pmd_t *pmd; /* Pointer to pmd entry matching
+ * the 'address'
+ */
+ pte_t *pte; /* Pointer to pte entry matching
+ * the 'address'. NULL if the page
+ * table hasn't been allocated.
+ */
+ spinlock_t *ptl; /* Page table lock.
+ * Protects pte page table if 'pte'
+ * is not NULL, otherwise pmd.
+ */
};
/*
@@ -327,7 +344,8 @@ struct vm_operations_struct {
int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
pmd_t *, unsigned int flags);
- void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
+ void (*map_pages)(struct fault_env *fe,
+ pgoff_t start_pgoff, pgoff_t end_pgoff);
/* notification that a previously read-only page is about to become
* writable, if an error is returned it will cause a SIGBUS */
@@ -600,8 +618,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
return pte;
}
-void do_set_pte(struct vm_area_struct *vma, unsigned long address,
- struct page *page, pte_t *pte, bool write, bool anon);
+void do_set_pte(struct fault_env *fe, struct page *page);
#endif
/*
@@ -2062,7 +2079,8 @@ extern void truncate_inode_pages_final(struct address_space *);
/* generic vm_area_ops exported for stackable file systems */
extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
-extern void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern void filemap_map_pages(struct fault_env *fe,
+ pgoff_t start_pgoff, pgoff_t end_pgoff);
extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
/* mm/page-writeback.c */
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 587480ad41b7..dd66a952e8cd 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -27,8 +27,7 @@
#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
-extern int handle_userfault(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags, unsigned long reason);
+extern int handle_userfault(struct fault_env *fe, unsigned long reason);
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len);
@@ -56,10 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
#else /* CONFIG_USERFAULTFD */
/* mm helpers */
-static inline int handle_userfault(struct vm_area_struct *vma,
- unsigned long address,
- unsigned int flags,
- unsigned long reason)
+static inline int handle_userfault(struct fault_env *fe, unsigned long reason)
{
return VM_FAULT_SIGBUS;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 20f3b1f33f0e..54d5318f8d3f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2128,22 +2128,27 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
+void filemap_map_pages(struct fault_env *fe,
+ pgoff_t start_pgoff, pgoff_t end_pgoff)
{
struct radix_tree_iter iter;
void **slot;
- struct file *file = vma->vm_file;
+ struct file *file = fe->vma->vm_file;
struct address_space *mapping = file->f_mapping;
+ pgoff_t last_pgoff = start_pgoff;
loff_t size;
struct page *page;
- unsigned long address = (unsigned long) vmf->virtual_address;
- unsigned long addr;
- pte_t *pte;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) {
- if (iter.index > vmf->max_pgoff)
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
+ start_pgoff) {
+ if (iter.index > end_pgoff)
break;
+ fe->pte += iter.index - last_pgoff;
+ fe->address += (iter.index - last_pgoff) << PAGE_SHIFT;
+ last_pgoff = iter.index;
+ if (!pte_none(*fe->pte))
+ goto next;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -2179,14 +2184,9 @@ repeat:
if (page->index >= size >> PAGE_SHIFT)
goto unlock;
- pte = vmf->pte + page->index - vmf->pgoff;
- if (!pte_none(*pte))
- goto unlock;
-
if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--;
- addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
- do_set_pte(vma, addr, page, pte, false, false);
+ do_set_pte(fe, page);
unlock_page(page);
goto next;
unlock:
@@ -2194,7 +2194,7 @@ unlock:
skip:
put_page(page);
next:
- if (iter.index == vmf->max_pgoff)
+ if (iter.index == end_pgoff)
break;
}
rcu_read_unlock();
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1a90f55d930f..bc5abcbe376e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -821,26 +821,23 @@ void prep_transhuge_page(struct page *page)
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
-static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- struct page *page, gfp_t gfp,
- unsigned int flags)
+static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
+ gfp_t gfp)
{
+ struct vm_area_struct *vma = fe->vma;
struct mem_cgroup *memcg;
pgtable_t pgtable;
- spinlock_t *ptl;
- unsigned long haddr = address & HPAGE_PMD_MASK;
+ unsigned long haddr = fe->address & HPAGE_PMD_MASK;
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
+ if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- pgtable = pte_alloc_one(mm, haddr);
+ pgtable = pte_alloc_one(vma->vm_mm, haddr);
if (unlikely(!pgtable)) {
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
@@ -855,12 +852,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
*/
__SetPageUptodate(page);
- ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_none(*pmd))) {
- spin_unlock(ptl);
+ fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+ if (unlikely(!pmd_none(*fe->pmd))) {
+ spin_unlock(fe->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
- pte_free(mm, pgtable);
+ pte_free(vma->vm_mm, pgtable);
} else {
pmd_t entry;
@@ -868,12 +865,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
if (userfaultfd_missing(vma)) {
int ret;
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
- pte_free(mm, pgtable);
- ret = handle_userfault(vma, address, flags,
- VM_UFFD_MISSING);
+ pte_free(vma->vm_mm, pgtable);
+ ret = handle_userfault(fe, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
return ret;
}
@@ -883,11 +879,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
page_add_new_anon_rmap(page, vma, haddr, true);
mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
- set_pmd_at(mm, haddr, pmd, entry);
- add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
- atomic_long_inc(&mm->nr_ptes);
- spin_unlock(ptl);
+ pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable);
+ set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ atomic_long_inc(&vma->vm_mm->nr_ptes);
+ spin_unlock(fe->ptl);
count_vm_event(THP_FAULT_ALLOC);
}
@@ -937,13 +933,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return true;
}
-int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- unsigned int flags)
+int do_huge_pmd_anonymous_page(struct fault_env *fe)
{
+ struct vm_area_struct *vma = fe->vma;
gfp_t gfp;
struct page *page;
- unsigned long haddr = address & HPAGE_PMD_MASK;
+ unsigned long haddr = fe->address & HPAGE_PMD_MASK;
if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
return VM_FAULT_FALLBACK;
@@ -951,42 +946,40 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM;
- if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) &&
+ if (!(fe->flags & FAULT_FLAG_WRITE) &&
+ !mm_forbids_zeropage(vma->vm_mm) &&
transparent_hugepage_use_zero_page()) {
- spinlock_t *ptl;
pgtable_t pgtable;
struct page *zero_page;
bool set;
int ret;
- pgtable = pte_alloc_one(mm, haddr);
+ pgtable = pte_alloc_one(vma->vm_mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
zero_page = get_huge_zero_page();
if (unlikely(!zero_page)) {
- pte_free(mm, pgtable);
+ pte_free(vma->vm_mm, pgtable);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- ptl = pmd_lock(mm, pmd);
+ fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
ret = 0;
set = false;
- if (pmd_none(*pmd)) {
+ if (pmd_none(*fe->pmd)) {
if (userfaultfd_missing(vma)) {
- spin_unlock(ptl);
- ret = handle_userfault(vma, address, flags,
- VM_UFFD_MISSING);
+ spin_unlock(fe->ptl);
+ ret = handle_userfault(fe, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
} else {
- set_huge_zero_page(pgtable, mm, vma,
- haddr, pmd,
- zero_page);
- spin_unlock(ptl);
+ set_huge_zero_page(pgtable, vma->vm_mm, vma,
+ haddr, fe->pmd, zero_page);
+ spin_unlock(fe->ptl);
set = true;
}
} else
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
if (!set) {
- pte_free(mm, pgtable);
+ pte_free(vma->vm_mm, pgtable);
put_huge_zero_page();
}
return ret;
@@ -998,8 +991,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_FALLBACK;
}
prep_transhuge_page(page);
- return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
- flags);
+ return __do_huge_pmd_anonymous_page(fe, page, gfp);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1172,38 +1164,31 @@ out:
return ret;
}
-void huge_pmd_set_accessed(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address,
- pmd_t *pmd, pmd_t orig_pmd,
- int dirty)
+void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd)
{
- spinlock_t *ptl;
pmd_t entry;
unsigned long haddr;
- ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd);
+ if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
goto unlock;
entry = pmd_mkyoung(orig_pmd);
- haddr = address & HPAGE_PMD_MASK;
- if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
- update_mmu_cache_pmd(vma, address, pmd);
+ haddr = fe->address & HPAGE_PMD_MASK;
+ if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry,
+ fe->flags & FAULT_FLAG_WRITE))
+ update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd);
unlock:
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
}
-static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address,
- pmd_t *pmd, pmd_t orig_pmd,
- struct page *page,
- unsigned long haddr)
+static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
+ struct page *page)
{
+ struct vm_area_struct *vma = fe->vma;
+ unsigned long haddr = fe->address & HPAGE_PMD_MASK;
struct mem_cgroup *memcg;
- spinlock_t *ptl;
pgtable_t pgtable;
pmd_t _pmd;
int ret = 0, i;
@@ -1220,11 +1205,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
for (i = 0; i < HPAGE_PMD_NR; i++) {
pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
- __GFP_OTHER_NODE,
- vma, address, page_to_nid(page));
+ __GFP_OTHER_NODE, vma,
+ fe->address, page_to_nid(page));
if (unlikely(!pages[i] ||
- mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
- &memcg, false))) {
+ mem_cgroup_try_charge(pages[i], vma->vm_mm,
+ GFP_KERNEL, &memcg, false))) {
if (pages[i])
put_page(pages[i]);
while (--i >= 0) {
@@ -1250,41 +1235,41 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
- ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+ if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
- pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(mm, pmd);
- pmd_populate(mm, &_pmd, pgtable);
+ pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd);
+ pmd_populate(vma->vm_mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t *pte, entry;
+ pte_t entry;
entry = mk_pte(pages[i], vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], vma, haddr, false);
+ page_add_new_anon_rmap(pages[i], fe->vma, haddr, false);
mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
- pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
- pte_unmap(pte);
+ fe->pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*fe->pte));
+ set_pte_at(vma->vm_mm, haddr, fe->pte, entry);
+ pte_unmap(fe->pte);
}
kfree(pages);
smp_wmb(); /* make pte visible before pmd */
- pmd_populate(mm, pmd, pgtable);
+ pmd_populate(vma->vm_mm, fe->pmd, pgtable);
page_remove_rmap(page, true);
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
ret |= VM_FAULT_WRITE;
put_page(page);
@@ -1293,8 +1278,8 @@ out:
return ret;
out_free_pages:
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ spin_unlock(fe->ptl);
+ mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
@@ -1305,25 +1290,23 @@ out_free_pages:
goto out;
}
-int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
+int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
{
- spinlock_t *ptl;
- int ret = 0;
+ struct vm_area_struct *vma = fe->vma;
struct page *page = NULL, *new_page;
struct mem_cgroup *memcg;
- unsigned long haddr;
+ unsigned long haddr = fe->address & HPAGE_PMD_MASK;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
gfp_t huge_gfp; /* for allocation and charge */
+ int ret = 0;
- ptl = pmd_lockptr(mm, pmd);
+ fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
- haddr = address & HPAGE_PMD_MASK;
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
- spin_lock(ptl);
- if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ spin_lock(fe->ptl);
+ if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
goto out_unlock;
page = pmd_page(orig_pmd);
@@ -1336,13 +1319,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
- update_mmu_cache_pmd(vma, address, pmd);
+ if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1))
+ update_mmu_cache_pmd(vma, fe->address, fe->pmd);
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
get_page(page);
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
@@ -1355,13 +1338,12 @@ alloc:
prep_transhuge_page(new_page);
} else {
if (!page) {
- split_huge_pmd(vma, pmd, address);
+ split_huge_pmd(vma, fe->pmd, fe->address);
ret |= VM_FAULT_FALLBACK;
} else {
- ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
- pmd, orig_pmd, page, haddr);
+ ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page);
if (ret & VM_FAULT_OOM) {
- split_huge_pmd(vma, pmd, address);
+ split_huge_pmd(vma, fe->pmd, fe->address);
ret |= VM_FAULT_FALLBACK;
}
put_page(page);
@@ -1370,14 +1352,12 @@ alloc:
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg,
- true))) {
+ if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
+ huge_gfp, &memcg, true))) {
put_page(new_page);
- if (page) {
- split_huge_pmd(vma, pmd, address);
+ split_huge_pmd(vma, fe->pmd, fe->address);
+ if (page)
put_page(page);
- } else
- split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
count_vm_event(THP_FAULT_FALLBACK);
goto out;
@@ -1393,13 +1373,13 @@ alloc:
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
- spin_lock(ptl);
+ spin_lock(fe->ptl);
if (page)
put_page(page);
- if (unlikely(!pmd_same(*pmd, orig_pmd))) {
- spin_unlock(ptl);
+ if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) {
+ spin_unlock(fe->ptl);
mem_cgroup_cancel_charge(new_page, memcg, true);
put_page(new_page);
goto out_mn;
@@ -1407,14 +1387,14 @@ alloc:
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
page_add_new_anon_rmap(new_page, vma, haddr, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
- set_pmd_at(mm, haddr, pmd, entry);
- update_mmu_cache_pmd(vma, address, pmd);
+ set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+ update_mmu_cache_pmd(vma, fe->address, fe->pmd);
if (!page) {
- add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
put_huge_zero_page();
} else {
VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -1423,13 +1403,13 @@ alloc:
}
ret |= VM_FAULT_WRITE;
}
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
out_mn:
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
return ret;
}
@@ -1489,13 +1469,12 @@ out:
}
/* NUMA hinting page fault entry point for trans huge pmds */
-int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
{
- spinlock_t *ptl;
+ struct vm_area_struct *vma = fe->vma;
struct anon_vma *anon_vma = NULL;
struct page *page;
- unsigned long haddr = addr & HPAGE_PMD_MASK;
+ unsigned long haddr = fe->address & HPAGE_PMD_MASK;
int page_nid = -1, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
@@ -1506,8 +1485,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* A PROT_NONE fault should not end up here */
BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
- ptl = pmd_lock(mm, pmdp);
- if (unlikely(!pmd_same(pmd, *pmdp)))
+ fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+ if (unlikely(!pmd_same(pmd, *fe->pmd)))
goto out_unlock;
/*
@@ -1515,9 +1494,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* without disrupting NUMA hinting information. Do not relock and
* check_same as the page may no longer be mapped.
*/
- if (unlikely(pmd_trans_migrating(*pmdp))) {
- page = pmd_page(*pmdp);
- spin_unlock(ptl);
+ if (unlikely(pmd_trans_migrating(*fe->pmd))) {
+ page = pmd_page(*fe->pmd);
+ spin_unlock(fe->ptl);
wait_on_page_locked(page);
goto out;
}
@@ -1550,7 +1529,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
wait_on_page_locked(page);
page_nid = -1;
goto out;
@@ -1561,12 +1540,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* to serialises splits
*/
get_page(page);
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
anon_vma = page_lock_anon_vma_read(page);
/* Confirm the PMD did not change while page_table_lock was released */
- spin_lock(ptl);
- if (unlikely(!pmd_same(pmd, *pmdp))) {
+ spin_lock(fe->ptl);
+ if (unlikely(!pmd_same(pmd, *fe->pmd))) {
unlock_page(page);
put_page(page);
page_nid = -1;
@@ -1584,9 +1563,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* Migrate the THP to the requested node, returns with page unlocked
* and access rights restored.
*/
- spin_unlock(ptl);
- migrated = migrate_misplaced_transhuge_page(mm, vma,
- pmdp, pmd, addr, page, target_nid);
+ spin_unlock(fe->ptl);
+ migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
+ fe->pmd, pmd, fe->address, page, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
page_nid = target_nid;
@@ -1601,18 +1580,18 @@ clear_pmdnuma:
pmd = pmd_mkyoung(pmd);
if (was_writable)
pmd = pmd_mkwrite(pmd);
- set_pmd_at(mm, haddr, pmdp, pmd);
- update_mmu_cache_pmd(vma, addr, pmdp);
+ set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd);
+ update_mmu_cache_pmd(vma, fe->address, fe->pmd);
unlock_page(page);
out_unlock:
- spin_unlock(ptl);
+ spin_unlock(fe->ptl);
out:
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
if (page_nid != -1)
- task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
+ task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags);
return 0;
}
@@ -2413,20 +2392,23 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd)
{
- unsigned long _address;
- pte_t *pte, pteval;
+ pte_t pteval;
int swapped_in = 0, ret = 0;
-
- pte = pte_offset_map(pmd, address);
- for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE;
- pte++, _address += PAGE_SIZE) {
- pteval = *pte;
+ struct fault_env fe = {
+ .vma = vma,
+ .address = address,
+ .flags = FAULT_FLAG_ALLOW_RETRY,
+ .pmd = pmd,
+ };
+
+ fe.pte = pte_offset_map(pmd, address);
+ for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ fe.pte++, fe.address += PAGE_SIZE) {
+ pteval = *fe.pte;
if (!is_swap_pte(pteval))
continue;
swapped_in++;
- ret = do_swap_page(mm, vma, _address, pte, pmd,
- FAULT_FLAG_ALLOW_RETRY,
- pteval);
+ ret = do_swap_page(&fe, pteval);
/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
if (ret & VM_FAULT_RETRY) {
down_read(&mm->mmap_sem);
@@ -2442,10 +2424,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
return false;
}
/* pte is unmapped now, we need to map it */
- pte = pte_offset_map(pmd, _address);
+ fe.pte = pte_offset_map(pmd, fe.address);
}
- pte--;
- pte_unmap(pte);
+ fe.pte--;
+ pte_unmap(fe.pte);
trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
return true;
}
diff --git a/mm/internal.h b/mm/internal.h
index e1531758122b..9b6a6c43ac39 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -36,9 +36,7 @@
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
-extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags, pte_t orig_pte);
+int do_swap_page(struct fault_env *fe, pte_t orig_pte);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
diff --git a/mm/memory.c b/mm/memory.c
index 6bf2b8564376..72b520897339 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2070,13 +2070,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
* case, all we need to do here is to mark the page as writable and update
* any related book-keeping.
*/
-static inline int wp_page_reuse(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
- struct page *page, int page_mkwrite,
- int dirty_shared)
- __releases(ptl)
+static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
+ struct page *page, int page_mkwrite, int dirty_shared)
+ __releases(fe->ptl)
{
+ struct vm_area_struct *vma = fe->vma;
pte_t entry;
/*
* Clear the pages cpupid information as the existing
@@ -2086,12 +2084,12 @@ static inline int wp_page_reuse(struct mm_struct *mm,
if (page)
page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
- flush_cache_page(vma, address, pte_pfn(orig_pte));
+ flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (ptep_set_access_flags(vma, address, page_table, entry, 1))
- update_mmu_cache(vma, address, page_table);
- pte_unmap_unlock(page_table, ptl);
+ if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1))
+ update_mmu_cache(vma, fe->address, fe->pte);
+ pte_unmap_unlock(fe->pte, fe->ptl);
if (dirty_shared) {
struct address_space *mapping;
@@ -2137,30 +2135,31 @@ st