diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-29 19:12:44 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-29 19:12:44 -0800 |
| commit | a0908a1b7d68706ee52ed4a039756e70c8e956e9 (patch) | |
| tree | ce23d955c37b04d816fc7d551db71d4ff86630f7 | |
| parent | b9151761021e25c024a6670df4e7c43ffbab0e1d (diff) | |
| parent | 72639e6df4128dfde8fee7638a35be7db8114000 (diff) | |
| download | linux-a0908a1b7d68706ee52ed4a039756e70c8e956e9.tar.gz linux-a0908a1b7d68706ee52ed4a039756e70c8e956e9.tar.bz2 linux-a0908a1b7d68706ee52ed4a039756e70c8e956e9.zip | |
Merge branch 'akpm' (patches from Andrew)
Mergr misc fixes from Andrew Morton:
"28 fixes"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (28 commits)
fs/hugetlbfs/inode.c: change put_page/unlock_page order in hugetlbfs_fallocate()
mm/hugetlb: fix NULL-pointer dereference on 5-level paging machine
autofs: revert "autofs: fix AT_NO_AUTOMOUNT not being honored"
autofs: revert "autofs: take more care to not update last_used on path walk"
fs/fat/inode.c: fix sb_rdonly() change
mm, memcg: fix mem_cgroup_swapout() for THPs
mm: migrate: fix an incorrect call of prep_transhuge_page()
kmemleak: add scheduling point to kmemleak_scan()
scripts/bloat-o-meter: don't fail with division by 0
fs/mbcache.c: make count_objects() more robust
Revert "mm/page-writeback.c: print a warning if the vm dirtiness settings are illogical"
mm/madvise.c: fix madvise() infinite loop under special circumstances
exec: avoid RLIMIT_STACK races with prlimit()
IB/core: disable memory registration of filesystem-dax vmas
v4l2: disable filesystem-dax mapping support
mm: fail get_vaddr_frames() for filesystem-dax mappings
mm: introduce get_user_pages_longterm
device-dax: implement ->split() to catch invalid munmap attempts
mm, hugetlbfs: introduce ->split() to vm_operations_struct
scripts/faddr2line: extend usage on generic arch
...
40 files changed, 238 insertions, 101 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index b920423f88cb..5025ff9307e6 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -158,10 +158,6 @@ Note: the minimum value allowed for dirty_bytes is two pages (in bytes); any value lower than this limit will be ignored and the old configuration will be retained. -Note: the value of dirty_bytes also must be set greater than -dirty_background_bytes or the amount of memory corresponding to -dirty_background_ratio. - ============================================================== dirty_expire_centisecs @@ -181,9 +177,6 @@ generating disk writes will itself start writing out dirty data. The total available memory is not equal to total system memory. -Note: dirty_ratio must be set greater than dirty_background_ratio or -ratio corresponding to dirty_background_bytes. - ============================================================== dirty_writeback_centisecs diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 2a029bceaf2f..1a7a17b2a1ba 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -221,7 +221,6 @@ static inline pte_t pte_mkspecial(pte_t pte) } #define __HAVE_ARCH_PTE_SPECIAL -#define __HAVE_ARCH_PMD_WRITE #define pmd_write(pmd) (pmd_isclear((pmd), L_PMD_SECT_RDONLY)) #define pmd_dirty(pmd) (pmd_isset((pmd), L_PMD_SECT_DIRTY)) #define pud_page(pud) pmd_page(__pmd(pud_val(pud))) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index c9530b5b5ca8..149d05fb9421 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -345,7 +345,6 @@ static inline int pmd_protnone(pmd_t pmd) #define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) -#define __HAVE_ARCH_PMD_WRITE #define pmd_write(pmd) pte_write(pmd_pte(pmd)) #define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT)) diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 9e9e94415d08..1a508a74d48d 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -552,7 +552,7 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd) extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); -#define __HAVE_ARCH_PMD_WRITE +#define pmd_write pmd_write static inline int pmd_write(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_WRITE); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 9a677cd5997f..44697817ccc6 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1005,7 +1005,6 @@ static inline int pmd_protnone(pmd_t pmd) } #endif /* CONFIG_NUMA_BALANCING */ -#define __HAVE_ARCH_PMD_WRITE #define pmd_write(pmd) pte_write(pmd_pte(pmd)) #define __pmd_write(pmd) __pte_write(pmd_pte(pmd)) #define pmd_savedwrite(pmd) pte_savedwrite(pmd_pte(pmd)) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index d7fe9838084d..57d7bc92e0b8 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -709,7 +709,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd) return (pmd_val(pmd) & origin_mask) >> PAGE_SHIFT; } -#define __HAVE_ARCH_PMD_WRITE +#define pmd_write pmd_write static inline int pmd_write(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) != 0; @@ -1264,6 +1264,12 @@ static inline pud_t pud_mkwrite(pud_t pud) return pud; } +#define pud_write pud_write +static inline int pud_write(pud_t pud) +{ + return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0; +} + static inline pud_t pud_mkclean(pud_t pud) { if (pud_large(pud)) { diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 5a9e96be1665..9937c5ff94a9 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -715,7 +715,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd) return pte_pfn(pte); } -#define __HAVE_ARCH_PMD_WRITE +#define pmd_write pmd_write static inline unsigned long pmd_write(pmd_t pmd) { pte_t pte = __pte(pmd_val(pmd)); diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index 5335ba3c850e..33c0f8bb0f33 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -75,7 +75,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, if (!(pmd_val(pmd) & _PAGE_VALID)) return 0; - if (write && !pmd_write(pmd)) + if (!pmd_access_permitted(pmd, write)) return 0; refs = 0; @@ -114,7 +114,7 @@ static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr, if (!(pud_val(pud) & _PAGE_VALID)) return 0; - if (write && !pud_write(pud)) + if (!pud_access_permitted(pud, write)) return 0; refs = 0; diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h index 2a26cc4fefc2..adfa21b18488 100644 --- a/arch/tile/include/asm/pgtable.h +++ b/arch/tile/include/asm/pgtable.h @@ -475,7 +475,6 @@ static inline void pmd_clear(pmd_t *pmdp) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_huge_page(pmd) pte_huge(pmd_pte(pmd)) #define pmd_mkhuge(pmd) pte_pmd(pte_mkhuge(pmd_pte(pmd))) -#define __HAVE_ARCH_PMD_WRITE #define pfn_pmd(pfn, pgprot) pte_pmd(pfn_pte((pfn), (pgprot))) #define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 09f9e1e00e3b..95e2dfd75521 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1061,7 +1061,7 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define __HAVE_ARCH_PMD_WRITE +#define pmd_write pmd_write static inline int pmd_write(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_RW; @@ -1088,6 +1088,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); } +#define pud_write pud_write +static inline int pud_write(pud_t pud) +{ + return pud_flags(pud) & _PAGE_RW; +} + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 6833ada237ab..7b0bf825c4e7 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -428,9 +428,21 @@ static int dev_dax_fault(struct vm_fault *vmf) return dev_dax_huge_fault(vmf, PE_SIZE_PTE); } +static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr) +{ + struct file *filp = vma->vm_file; + struct dev_dax *dev_dax = filp->private_data; + struct dax_region *dax_region = dev_dax->region; + + if (!IS_ALIGNED(addr, dax_region->align)) + return -EINVAL; + return 0; +} + static const struct vm_operations_struct dax_vm_ops = { .fault = dev_dax_fault, .huge_fault = dev_dax_huge_fault, + .split = dev_dax_split, }; static int dax_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 21e60b1e2ff4..130606c3b07c 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -191,7 +191,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, sg_list_start = umem->sg_head.sgl; while (npages) { - ret = get_user_pages(cur_base, + ret = get_user_pages_longterm(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), gup_flags, page_list, vma_list); diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 0b5c43f7e020..f412429cf5ba 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -185,12 +185,13 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n", data, size, dma->nr_pages); - err = get_user_pages(data & PAGE_MASK, dma->nr_pages, + err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages, flags, dma->pages, NULL); if (err != dma->nr_pages) { dma->nr_pages = (err >= 0) ? err : 0; - dprintk(1, "get_user_pages: err=%d [%d]\n", err, dma->nr_pages); + dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err, + dma->nr_pages); return err < 0 ? err : -EINVAL; } return 0; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index d79ced925861..82e8f6edfb48 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -281,8 +281,8 @@ static int autofs4_mount_wait(const struct path *path, bool rcu_walk) pr_debug("waiting for mount name=%pd\n", path->dentry); status = autofs4_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); - ino->last_used = jiffies; } + ino->last_used = jiffies; return status; } @@ -321,21 +321,16 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) */ if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; + struct autofs_info *ino; struct dentry *new; new = d_lookup(parent, &dentry->d_name); if (!new) return NULL; - if (new == dentry) - dput(new); - else { - struct autofs_info *ino; - - ino = autofs4_dentry_ino(new); - ino->last_used = jiffies; - dput(path->dentry); - path->dentry = new; - } + ino = autofs4_dentry_ino(new); + ino->last_used = jiffies; + dput(path->dentry); + path->dentry = new; } return path->dentry; } @@ -627,7 +627,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, if (pfn != pmd_pfn(*pmdp)) goto unlock_pmd; - if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) + if (!pmd_dirty(*pmdp) + && !pmd_access_permitted(*pmdp, WRITE)) goto unlock_pmd; flush_cache_page(vma, address, pfn); diff --git a/fs/exec.c b/fs/exec.c index 1d6243d9f2b6..6be2aa0ab26f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1340,10 +1340,15 @@ void setup_new_exec(struct linux_binprm * bprm) * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid - * needing to clean up the change on failure. + * races from other threads changing the limits. This also + * must be protected from races with prlimit() calls. */ + task_lock(current->group_leader); if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM) current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; + if (current->signal->rlim[RLIMIT_STACK].rlim_max > _STK_LIM) + current->signal->rlim[RLIMIT_STACK].rlim_max = _STK_LIM; + task_unlock(current->group_leader); } arch_pick_mmap_layout(current->mm); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 016c46b5e44c..20a0a89eaca5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -779,7 +779,7 @@ static void __exit fat_destroy_inodecache(void) static int fat_remount(struct super_block *sb, int *flags, char *data) { - int new_rdonly; + bool new_rdonly; struct msdos_sb_info *sbi = MSDOS_SB(sb); *flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 1e76730aac0d..8a85f3f53446 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -639,11 +639,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* - * page_put due to reference from alloc_huge_page() * unlock_page because locked by add_to_page_cache() + * page_put due to reference from alloc_huge_page() */ - put_page(page); unlock_page(page); + put_page(page); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) diff --git a/fs/mbcache.c b/fs/mbcache.c index d818fd236787..b8b8b9ced9f8 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -269,6 +269,9 @@ static unsigned long mb_cache_count(struct shrinker *shrink, struct mb_cache *cache = container_of(shrink, struct mb_cache, c_shrink); + /* Unlikely, but not impossible */ + if (unlikely(cache->c_entry_count < 0)) + return 0; return cache->c_entry_count; } diff --git a/fs/namei.c b/fs/namei.c index f0c7a7b9b6ca..9cc91fb7f156 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1129,18 +1129,9 @@ static int follow_automount(struct path *path, struct nameidata *nd, * of the daemon to instantiate them before they can be used. */ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | - LOOKUP_AUTOMOUNT))) { - /* Positive dentry that isn't meant to trigger an - * automount, EISDIR will allow it to be used, - * otherwise there's no mount here "now" so return - * ENOENT. - */ - if (path->dentry->d_inode) - return -EISDIR; - else - return -ENOENT; - } + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + path->dentry->d_inode) + return -EISDIR; if (path->dentry->d_sb->s_user_ns != &init_user_ns) return -EACCES; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 757dc6ffc7ba..b234d54f2cb6 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -805,15 +805,23 @@ static inline int pmd_trans_huge(pmd_t pmd) { return 0; } -#ifndef __HAVE_ARCH_PMD_WRITE +#ifndef pmd_write static inline int pmd_write(pmd_t pmd) { BUG(); return 0; } -#endif /* __HAVE_ARCH_PMD_WRITE */ +#endif /* pmd_write */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#ifndef pud_write +static inline int pud_write(pud_t pud) +{ + BUG(); + return 0; +} +#endif /* pud_write */ + #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) diff --git a/include/linux/fs.h b/include/linux/fs.h index bbd92da0946e..511fbaabf624 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3088,7 +3088,8 @@ static inline int vfs_lstat(const char __user *name, struct kstat *stat) static inline int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags) { - return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS); + return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, + stat, STATX_BASIC_STATS); } static inline int vfs_fstat(int fd, struct kstat *stat) { @@ -3194,6 +3195,20 @@ static inline bool vma_is_dax(struct vm_area_struct *vma) return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); } +static inline bool vma_is_fsdax(struct vm_area_struct *vma) +{ + struct inode *inode; + + if (!vma->vm_file) + return false; + if (!vma_is_dax(vma)) + return false; + inode = file_inode(vma->vm_file); + if (inode->i_mode == S_IFCHR) + return false; /* device-dax */ + return true; +} + static inline int iocb_flags(struct file *file) { int res = 0; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index fbf5b31d47ee..82a25880714a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -239,14 +239,6 @@ static inline int pgd_write(pgd_t pgd) } #endif -#ifndef pud_write -static inline int pud_write(pud_t pud) -{ - BUG(); - return 0; -} -#endif - #define HUGETLB_ANON_FILE "anon_hugepage" enum { diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 895ec0c4942e..a2246cf670ba 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -54,7 +54,7 @@ static inline struct page *new_page_nodemask(struct page *page, new_page = __alloc_pages_nodemask(gfp_mask, order, preferred_nid, nodemask); - if (new_page && PageTransHuge(page)) + if (new_page && PageTransHuge(new_page)) prep_transhuge_page(new_page); return new_page; diff --git a/include/linux/mm.h b/include/linux/mm.h index ee073146aaa7..ea818ff739cd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -377,6 +377,7 @@ enum page_entry_size { struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); + int (*split)(struct vm_area_struct * area, unsigned long addr); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_fault *vmf); int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); @@ -1379,6 +1380,19 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); +#ifdef CONFIG_FS_DAX +long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas); +#else +static inline long get_user_pages_longterm(unsigned long start, + unsigned long nr_pages, unsigned int gup_flags, + struct page **pages, struct vm_area_struct **vmas) +{ + return get_user_pages(start, nr_pages, gup_flags, pages, vmas); +} +#endif /* CONFIG_FS_DAX */ + int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 2f98df0d460e..297c7238f7d4 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -53,6 +53,18 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, ret = -EFAULT; goto out; } + + /* + * While get_vaddr_frames() could be used for transient (kernel + * controlled lifetime) pinning of memory pages all current + * users establish long term (userspace controlled lifetime) + * page pinning. Treat get_vaddr_frames() like + * get_user_pages_longterm() and disallow it for filesystem-dax + * mappings. + */ + if (vma_is_fsdax(vma)) + return -EOPNOTSUPP; + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; vec->is_pfns = false; @@ -66,7 +66,7 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, */ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) { - return pte_write(pte) || + return pte_access_permitted(pte, WRITE) || ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); } @@ -1095,6 +1095,70 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); +#ifdef CONFIG_FS_DAX +/* + * This is the same as get_user_pages() in that it assumes we are + * operating on the current task's mm, but it goes further to validate + * that the vmas associated with the address range are suitable for + * longterm elevated page reference counts. For example, filesystem-dax + * mappings are subject to the lifetime enforced by the filesystem and + * we need guarantees that longterm users like RDMA and V4L2 only + * establish mappings that have a kernel enforced revocation mechanism. + * + * "longterm" == userspace controlled elevated page count lifetime. + * Contrast this to iov_iter_get_pages() usages which are transient. + */ +long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas_arg) +{ + struct vm_area_struct **vmas = vmas_arg; + struct vm_area_struct *vma_prev = NULL; + long rc, i; + + if (!pages) + return -EINVAL; + + if (!vmas) { + vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas) + return -ENOMEM; + } + + rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + + for (i = 0; i < rc; i++) { + struct vm_area_struct *vma = vmas[i]; + + if (vma == vma_prev) + continue; + + vma_prev = vma; + + if (vma_is_fsdax(vma)) + break; + } + + /* + * Either get_user_pages() failed, or the vma validation + * succeeded, in either case we don't need to put_page() before + * returning. + */ + if (i >= rc) + goto out; + + for (i = 0; i < rc; i++) + put_page(pages[i]); + rc = -EOPNOTSUPP; +out: + if (vmas != vmas_arg) + kfree(vmas); + return rc; +} +EXPORT_SYMBOL(get_user_pages_longterm); +#endif /* CONFIG_FS_DAX */ + /** * populate_vma_page_range() - populate a range of pages in the vma. * @vma: target vma @@ -391,11 +391,11 @@ again: if (pmd_protnone(pmd)) return hmm_vma_walk_clear(start, end, walk); - if (write_fault && !pmd_write(pmd)) + if (!pmd_access_permitted(pmd, write_fault)) return hmm_vma_walk_clear(start, end, walk); pfn = pmd_pfn(pmd) + pte_index(addr); - flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; + flag |= pmd_access_permitted(pmd, WRITE) ? HMM_PFN_WRITE : 0; for (; addr < end; addr += PAGE_SIZE, i++, pfn++) pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag; return 0; @@ -456,11 +456,11 @@ again: continue; } - if (write_fault && !pte_write(pte)) + if (!pte_access_permitted(pte, write_fault)) goto fault; pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; - pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0; + pfns[i] |= pte_access_permitted(pte, WRITE) ? HMM_PFN_WRITE : 0; continue; fault: diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0e7ded98d114..2f2f5e774902 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -870,7 +870,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, */ WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set"); - if (flags & FOLL_WRITE && !pmd_write(*pmd) |
