diff options
| -rw-r--r-- | Documentation/filesystems/Locking | 10 | ||||
| -rw-r--r-- | fs/userfaultfd.c | 22 | ||||
| -rw-r--r-- | include/linux/huge_mm.h | 20 | ||||
| -rw-r--r-- | include/linux/mm.h | 34 | ||||
| -rw-r--r-- | include/linux/userfaultfd_k.h | 8 | ||||
| -rw-r--r-- | mm/filemap.c | 28 | ||||
| -rw-r--r-- | mm/huge_memory.c | 280 | ||||
| -rw-r--r-- | mm/internal.h | 4 | ||||
| -rw-r--r-- | mm/memory.c | 582 | ||||
| -rw-r--r-- | mm/nommu.c | 3 |
10 files changed, 475 insertions, 516 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index dda6e3f8e203..5a7386e38e2d 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -548,13 +548,13 @@ subsequent truncate), and then return with VM_FAULT_LOCKED, and the page locked. The VM will unlock the page. ->map_pages() is called when VM asks to map easy accessible pages. -Filesystem should find and map pages associated with offsets from "pgoff" -till "max_pgoff". ->map_pages() is called with page table locked and must +Filesystem should find and map pages associated with offsets from "start_pgoff" +till "end_pgoff". ->map_pages() is called with page table locked and must not block. If it's not possible to reach a page without blocking, filesystem should skip it. Filesystem should use do_set_pte() to setup -page table entry. Pointer to entry associated with offset "pgoff" is -passed in "pte" field in vm_fault structure. Pointers to entries for other -offsets should be calculated relative to "pte". +page table entry. Pointer to entry associated with the page is passed in +"pte" field in fault_env structure. Pointers to entries for other offsets +should be calculated relative to "pte". ->page_mkwrite() is called when a previously read-only pte is about to become writeable. The filesystem again must ensure that there are diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 2d97952e341a..85959d8324df 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -257,10 +257,9 @@ out: * fatal_signal_pending()s, and the mmap_sem must be released before * returning it. */ -int handle_userfault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags, unsigned long reason) +int handle_userfault(struct fault_env *fe, unsigned long reason) { - struct mm_struct *mm = vma->vm_mm; + struct mm_struct *mm = fe->vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; int ret; @@ -269,7 +268,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); ret = VM_FAULT_SIGBUS; - ctx = vma->vm_userfaultfd_ctx.ctx; + ctx = fe->vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -302,17 +301,17 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, * without first stopping userland access to the memory. For * VM_UFFD_MISSING userfaults this is enough for now. */ - if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) { + if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { /* * Validate the invariant that nowait must allow retry * to be sure not to return SIGBUS erroneously on * nowait invocations. */ - BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT); + BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); #ifdef CONFIG_DEBUG_VM if (printk_ratelimit()) { printk(KERN_WARNING - "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags); + "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); dump_stack(); } #endif @@ -324,7 +323,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, * and wait. */ ret = VM_FAULT_RETRY; - if (flags & FAULT_FLAG_RETRY_NOWAIT) + if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; /* take the reference before dropping the mmap_sem */ @@ -332,10 +331,11 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(address, flags, reason); + uwq.msg = userfault_msg(fe->address, fe->flags, reason); uwq.ctx = ctx; - return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == + return_to_userland = + (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); spin_lock(&ctx->fault_pending_wqh.lock); @@ -353,7 +353,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, TASK_KILLABLE); spin_unlock(&ctx->fault_pending_wqh.lock); - must_wait = userfaultfd_must_wait(ctx, address, flags, reason); + must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); up_read(&mm->mmap_sem); if (likely(must_wait && !ACCESS_ONCE(ctx->released) && diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f0a7a0320300..9bed9249156f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -1,20 +1,12 @@ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H -extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - unsigned int flags); +extern int do_huge_pmd_anonymous_page(struct fault_env *fe); extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma); -extern void huge_pmd_set_accessed(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pmd_t orig_pmd, int dirty); -extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pmd_t orig_pmd); +extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd); +extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd); extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, @@ -134,8 +126,7 @@ static inline int hpage_nr_pages(struct page *page) return 1; } -extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, pmd_t *pmdp); +extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd); extern struct page *huge_zero_page; @@ -196,8 +187,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, return NULL; } -static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, pmd_t *pmdp) +static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd) { return 0; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 646bc36b4d1b..8bd74558c0e4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -309,10 +309,27 @@ struct vm_fault { * VM_FAULT_DAX_LOCKED and fill in * entry here. */ - /* for ->map_pages() only */ - pgoff_t max_pgoff; /* map pages for offset from pgoff till - * max_pgoff inclusive */ - pte_t *pte; /* pte entry associated with ->pgoff */ +}; + +/* + * Page fault context: passes though page fault handler instead of endless list + * of function arguments. + */ +struct fault_env { + struct vm_area_struct *vma; /* Target VMA */ + unsigned long address; /* Faulting virtual address */ + unsigned int flags; /* FAULT_FLAG_xxx flags */ + pmd_t *pmd; /* Pointer to pmd entry matching + * the 'address' + */ + pte_t *pte; /* Pointer to pte entry matching + * the 'address'. NULL if the page + * table hasn't been allocated. + */ + spinlock_t *ptl; /* Page table lock. + * Protects pte page table if 'pte' + * is not NULL, otherwise pmd. + */ }; /* @@ -327,7 +344,8 @@ struct vm_operations_struct { int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); int (*pmd_fault)(struct vm_area_struct *, unsigned long address, pmd_t *, unsigned int flags); - void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); + void (*map_pages)(struct fault_env *fe, + pgoff_t start_pgoff, pgoff_t end_pgoff); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ @@ -600,8 +618,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon); +void do_set_pte(struct fault_env *fe, struct page *page); #endif /* @@ -2062,7 +2079,8 @@ extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); -extern void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf); +extern void filemap_map_pages(struct fault_env *fe, + pgoff_t start_pgoff, pgoff_t end_pgoff); extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); /* mm/page-writeback.c */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 587480ad41b7..dd66a952e8cd 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -27,8 +27,7 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) -extern int handle_userfault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags, unsigned long reason); +extern int handle_userfault(struct fault_env *fe, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len); @@ -56,10 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) #else /* CONFIG_USERFAULTFD */ /* mm helpers */ -static inline int handle_userfault(struct vm_area_struct *vma, - unsigned long address, - unsigned int flags, - unsigned long reason) +static inline int handle_userfault(struct fault_env *fe, unsigned long reason) { return VM_FAULT_SIGBUS; } diff --git a/mm/filemap.c b/mm/filemap.c index 20f3b1f33f0e..54d5318f8d3f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2128,22 +2128,27 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +void filemap_map_pages(struct fault_env *fe, + pgoff_t start_pgoff, pgoff_t end_pgoff) { struct radix_tree_iter iter; void **slot; - struct file *file = vma->vm_file; + struct file *file = fe->vma->vm_file; struct address_space *mapping = file->f_mapping; + pgoff_t last_pgoff = start_pgoff; loff_t size; struct page *page; - unsigned long address = (unsigned long) vmf->virtual_address; - unsigned long addr; - pte_t *pte; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { - if (iter.index > vmf->max_pgoff) + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, + start_pgoff) { + if (iter.index > end_pgoff) break; + fe->pte += iter.index - last_pgoff; + fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; + last_pgoff = iter.index; + if (!pte_none(*fe->pte)) + goto next; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -2179,14 +2184,9 @@ repeat: if (page->index >= size >> PAGE_SHIFT) goto unlock; - pte = vmf->pte + page->index - vmf->pgoff; - if (!pte_none(*pte)) - goto unlock; - if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; - do_set_pte(vma, addr, page, pte, false, false); + do_set_pte(fe, page); unlock_page(page); goto next; unlock: @@ -2194,7 +2194,7 @@ unlock: skip: put_page(page); next: - if (iter.index == vmf->max_pgoff) + if (iter.index == end_pgoff) break; } rcu_read_unlock(); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1a90f55d930f..bc5abcbe376e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -821,26 +821,23 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } -static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - struct page *page, gfp_t gfp, - unsigned int flags) +static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, + gfp_t gfp) { + struct vm_area_struct *vma = fe->vma; struct mem_cgroup *memcg; pgtable_t pgtable; - spinlock_t *ptl; - unsigned long haddr = address & HPAGE_PMD_MASK; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - pgtable = pte_alloc_one(mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) { mem_cgroup_cancel_charge(page, memcg, true); put_page(page); @@ -855,12 +852,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, */ __SetPageUptodate(page); - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_none(*pmd))) { - spin_unlock(ptl); + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + if (unlikely(!pmd_none(*fe->pmd))) { + spin_unlock(fe->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); - pte_free(mm, pgtable); + pte_free(vma->vm_mm, pgtable); } else { pmd_t entry; @@ -868,12 +865,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, if (userfaultfd_missing(vma)) { int ret; - spin_unlock(ptl); + spin_unlock(fe->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); - pte_free(mm, pgtable); - ret = handle_userfault(vma, address, flags, - VM_UFFD_MISSING); + pte_free(vma->vm_mm, pgtable); + ret = handle_userfault(fe, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } @@ -883,11 +879,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, page_add_new_anon_rmap(page, vma, haddr, true); mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); - pgtable_trans_huge_deposit(mm, pmd, pgtable); - set_pmd_at(mm, haddr, pmd, entry); - add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&mm->nr_ptes); - spin_unlock(ptl); + pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); + set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + atomic_long_inc(&vma->vm_mm->nr_ptes); + spin_unlock(fe->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -937,13 +933,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return true; } -int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - unsigned int flags) +int do_huge_pmd_anonymous_page(struct fault_env *fe) { + struct vm_area_struct *vma = fe->vma; gfp_t gfp; struct page *page; - unsigned long haddr = address & HPAGE_PMD_MASK; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return VM_FAULT_FALLBACK; @@ -951,42 +946,40 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; - if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) && + if (!(fe->flags & FAULT_FLAG_WRITE) && + !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { - spinlock_t *ptl; pgtable_t pgtable; struct page *zero_page; bool set; int ret; - pgtable = pte_alloc_one(mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; zero_page = get_huge_zero_page(); if (unlikely(!zero_page)) { - pte_free(mm, pgtable); + pte_free(vma->vm_mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - ptl = pmd_lock(mm, pmd); + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); ret = 0; set = false; - if (pmd_none(*pmd)) { + if (pmd_none(*fe->pmd)) { if (userfaultfd_missing(vma)) { - spin_unlock(ptl); - ret = handle_userfault(vma, address, flags, - VM_UFFD_MISSING); + spin_unlock(fe->ptl); + ret = handle_userfault(fe, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); } else { - set_huge_zero_page(pgtable, mm, vma, - haddr, pmd, - zero_page); - spin_unlock(ptl); + set_huge_zero_page(pgtable, vma->vm_mm, vma, + haddr, fe->pmd, zero_page); + spin_unlock(fe->ptl); set = true; } } else - spin_unlock(ptl); + spin_unlock(fe->ptl); if (!set) { - pte_free(mm, pgtable); + pte_free(vma->vm_mm, pgtable); put_huge_zero_page(); } return ret; @@ -998,8 +991,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; } prep_transhuge_page(page); - return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, - flags); + return __do_huge_pmd_anonymous_page(fe, page, gfp); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -1172,38 +1164,31 @@ out: return ret; } -void huge_pmd_set_accessed(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmd, pmd_t orig_pmd, - int dirty) +void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) { - spinlock_t *ptl; pmd_t entry; unsigned long haddr; - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, orig_pmd))) + fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); + if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) goto unlock; entry = pmd_mkyoung(orig_pmd); - haddr = address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) - update_mmu_cache_pmd(vma, address, pmd); + haddr = fe->address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, + fe->flags & FAULT_FLAG_WRITE)) + update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); unlock: - spin_unlock(ptl); + spin_unlock(fe->ptl); } -static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmd, pmd_t orig_pmd, - struct page *page, - unsigned long haddr) +static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, + struct page *page) { + struct vm_area_struct *vma = fe->vma; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; struct mem_cgroup *memcg; - spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; int ret = 0, i; @@ -1220,11 +1205,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, for (i = 0; i < HPAGE_PMD_NR; i++) { pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | - __GFP_OTHER_NODE, - vma, address, page_to_nid(page)); + __GFP_OTHER_NODE, vma, + fe->address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, - &memcg, false))) { + mem_cgroup_try_charge(pages[i], vma->vm_mm, + GFP_KERNEL, &memcg, false))) { if (pages[i]) put_page(pages[i]); while (--i >= 0) { @@ -1250,41 +1235,41 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, mmun_start = haddr; mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, orig_pmd))) + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_huge_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm, pmd); - pmd_populate(mm, &_pmd, pgtable); + pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); + pmd_populate(vma->vm_mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t *pte, entry; + pte_t entry; entry = mk_pte(pages[i], vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], vma, haddr, false); + page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); mem_cgroup_commit_charge(pages[i], memcg, false, false); lru_cache_add_active_or_unevictable(pages[i], vma); - pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*pte)); - set_pte_at(mm, haddr, pte, entry); - pte_unmap(pte); + fe->pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*fe->pte)); + set_pte_at(vma->vm_mm, haddr, fe->pte, entry); + pte_unmap(fe->pte); } kfree(pages); smp_wmb(); /* make pte visible before pmd */ - pmd_populate(mm, pmd, pgtable); + pmd_populate(vma->vm_mm, fe->pmd, pgtable); page_remove_rmap(page, true); - spin_unlock(ptl); + spin_unlock(fe->ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); ret |= VM_FAULT_WRITE; put_page(page); @@ -1293,8 +1278,8 @@ out: return ret; out_free_pages: - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + spin_unlock(fe->ptl); + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); @@ -1305,25 +1290,23 @@ out_free_pages: goto out; } -int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, pmd_t orig_pmd) +int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) { - spinlock_t *ptl; - int ret = 0; + struct vm_area_struct *vma = fe->vma; struct page *page = NULL, *new_page; struct mem_cgroup *memcg; - unsigned long haddr; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t huge_gfp; /* for allocation and charge */ + int ret = 0; - ptl = pmd_lockptr(mm, pmd); + fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); - haddr = address & HPAGE_PMD_MASK; if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(ptl); - if (unlikely(!pmd_same(*pmd, orig_pmd))) + spin_lock(fe->ptl); + if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) goto out_unlock; page = pmd_page(orig_pmd); @@ -1336,13 +1319,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) - update_mmu_cache_pmd(vma, address, pmd); + if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) + update_mmu_cache_pmd(vma, fe->address, fe->pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } get_page(page); - spin_unlock(ptl); + spin_unlock(fe->ptl); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { @@ -1355,13 +1338,12 @@ alloc: prep_transhuge_page(new_page); } else { if (!page) { - split_huge_pmd(vma, pmd, address); + split_huge_pmd(vma, fe->pmd, fe->address); ret |= VM_FAULT_FALLBACK; } else { - ret = do_huge_pmd_wp_page_fallback(mm, vma, address, - pmd, orig_pmd, page, haddr); + ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); if (ret & VM_FAULT_OOM) { - split_huge_pmd(vma, pmd, address); + split_huge_pmd(vma, fe->pmd, fe->address); ret |= VM_FAULT_FALLBACK; } put_page(page); @@ -1370,14 +1352,12 @@ alloc: goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg, - true))) { + if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, + huge_gfp, &memcg, true))) { put_page(new_page); - if (page) { - split_huge_pmd(vma, pmd, address); + split_huge_pmd(vma, fe->pmd, fe->address); + if (page) put_page(page); - } else - split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; count_vm_event(THP_FAULT_FALLBACK); goto out; @@ -1393,13 +1373,13 @@ alloc: mmun_start = haddr; mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - spin_lock(ptl); + spin_lock(fe->ptl); if (page) put_page(page); - if (unlikely(!pmd_same(*pmd, orig_pmd))) { - spin_unlock(ptl); + if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { + spin_unlock(fe->ptl); mem_cgroup_cancel_charge(new_page, memcg, true); put_page(new_page); goto out_mn; @@ -1407,14 +1387,14 @@ alloc: pmd_t entry; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_huge_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); page_add_new_anon_rmap(new_page, vma, haddr, true); mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); - set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache_pmd(vma, address, pmd); + set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + update_mmu_cache_pmd(vma, fe->address, fe->pmd); if (!page) { - add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); put_huge_zero_page(); } else { VM_BUG_ON_PAGE(!PageHead(page), page); @@ -1423,13 +1403,13 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(ptl); + spin_unlock(fe->ptl); out_mn: - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(ptl); + spin_unlock(fe->ptl); return ret; } @@ -1489,13 +1469,12 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, pmd_t *pmdp) +int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) { - spinlock_t *ptl; + struct vm_area_struct *vma = fe->vma; struct anon_vma *anon_vma = NULL; struct page *page; - unsigned long haddr = addr & HPAGE_PMD_MASK; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; bool page_locked; @@ -1506,8 +1485,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* A PROT_NONE fault should not end up here */ BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); - ptl = pmd_lock(mm, pmdp); - if (unlikely(!pmd_same(pmd, *pmdp))) + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + if (unlikely(!pmd_same(pmd, *fe->pmd))) goto out_unlock; /* @@ -1515,9 +1494,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * without disrupting NUMA hinting information. Do not relock and * check_same as the page may no longer be mapped. */ - if (unlikely(pmd_trans_migrating(*pmdp))) { - page = pmd_page(*pmdp); - spin_unlock(ptl); + if (unlikely(pmd_trans_migrating(*fe->pmd))) { + page = pmd_page(*fe->pmd); + spin_unlock(fe->ptl); wait_on_page_locked(page); goto out; } @@ -1550,7 +1529,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { - spin_unlock(ptl); + spin_unlock(fe->ptl); wait_on_page_locked(page); page_nid = -1; goto out; @@ -1561,12 +1540,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * to serialises splits */ get_page(page); - spin_unlock(ptl); + spin_unlock(fe->ptl); anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ - spin_lock(ptl); - if (unlikely(!pmd_same(pmd, *pmdp))) { + spin_lock(fe->ptl); + if (unlikely(!pmd_same(pmd, *fe->pmd))) { unlock_page(page); put_page(page); page_nid = -1; @@ -1584,9 +1563,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * Migrate the THP to the requested node, returns with page unlocked * and access rights restored. */ - spin_unlock(ptl); - migrated = migrate_misplaced_transhuge_page(mm, vma, - pmdp, pmd, addr, page, target_nid); + spin_unlock(fe->ptl); + migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, + fe->pmd, pmd, fe->address, page, target_nid); if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; @@ -1601,18 +1580,18 @@ clear_pmdnuma: pmd = pmd_mkyoung(pmd); if (was_writable) pmd = pmd_mkwrite(pmd); - set_pmd_at(mm, haddr, pmdp, pmd); - update_mmu_cache_pmd(vma, addr, pmdp); + set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); + update_mmu_cache_pmd(vma, fe->address, fe->pmd); unlock_page(page); out_unlock: - spin_unlock(ptl); + spin_unlock(fe->ptl); out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); return 0; } @@ -2413,20 +2392,23 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd) { - unsigned long _address; - pte_t *pte, pteval; + pte_t pteval; int swapped_in = 0, ret = 0; - - pte = pte_offset_map(pmd, address); - for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE; - pte++, _address += PAGE_SIZE) { - pteval = *pte; + struct fault_env fe = { + .vma = vma, + .address = address, + .flags = FAULT_FLAG_ALLOW_RETRY, + .pmd = pmd, + }; + + fe.pte = pte_offset_map(pmd, address); + for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; + fe.pte++, fe.address += PAGE_SIZE) { + pteval = *fe.pte; if (!is_swap_pte(pteval)) continue; swapped_in++; - ret = do_swap_page(mm, vma, _address, pte, pmd, - FAULT_FLAG_ALLOW_RETRY, - pteval); + ret = do_swap_page(&fe, pteval); /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ if (ret & VM_FAULT_RETRY) { down_read(&mm->mmap_sem); @@ -2442,10 +2424,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, return false; } /* pte is unmapped now, we need to map it */ - pte = pte_offset_map(pmd, _address); + fe.pte = pte_offset_map(pmd, fe.address); } - pte--; - pte_unmap(pte); + fe.pte--; + pte_unmap(fe.pte); trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1); return true; } diff --git a/mm/internal.h b/mm/internal.h index e1531758122b..9b6a6c43ac39 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -36,9 +36,7 @@ /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) -extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - unsigned int flags, pte_t orig_pte); +int do_swap_page(struct fault_env *fe, pte_t orig_pte); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/memory.c b/mm/memory.c index 6bf2b8564376..72b520897339 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2070,13 +2070,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline int wp_page_reuse(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, - struct page *page, int page_mkwrite, - int dirty_shared) - __releases(ptl) +static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, + struct page *page, int page_mkwrite, int dirty_shared) + __releases(fe->ptl) { + struct vm_area_struct *vma = fe->vma; pte_t entry; /* * Clear the pages cpupid information as the existing @@ -2086,12 +2084,12 @@ static inline int wp_page_reuse(struct mm_struct *mm, if (page) page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); - flush_cache_page(vma, address, pte_pfn(orig_pte)); + flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, address, page_table, entry, 1)) - update_mmu_cache(vma, address, page_table); - pte_unmap_unlock(page_table, ptl); + if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) + update_mmu_cache(vma, fe->address, fe->pte); + pte_unmap_unlock(fe->pte, fe->ptl); if (dirty_shared) { struct address_space *mapping; @@ -2137,30 +2135,31 @@ st |
