summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-02-22 19:29:24 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2017-02-22 19:29:24 -0800
commitbc49a7831b1137ce1c2dda1c57e3631655f5d2ae (patch)
tree469380ac3a17e1d927ccf06abc99b6f509deb24a /fs
parentbe5165a51d2500ae1afa1236a8b09858831fdf7e (diff)
parentf201ebd87652cf1519792f8662bb3f862c76aa33 (diff)
downloadlinux-bc49a7831b1137ce1c2dda1c57e3631655f5d2ae.tar.gz
linux-bc49a7831b1137ce1c2dda1c57e3631655f5d2ae.tar.bz2
linux-bc49a7831b1137ce1c2dda1c57e3631655f5d2ae.zip
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton: "142 patches: - DAX updates - various misc bits - OCFS2 updates - most of MM" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (142 commits) mm/z3fold.c: limit first_num to the actual range of possible buddy indexes mm: fix <linux/pagemap.h> stray kernel-doc notation zram: remove obsolete sysfs attrs mm/memblock.c: remove unnecessary log and clean up oom-reaper: use madvise_dontneed() logic to decide if unmap the VMA mm: drop unused argument of zap_page_range() mm: drop zap_details::check_swap_entries mm: drop zap_details::ignore_dirty mm, page_alloc: warn_alloc nodemask is NULL when cpusets are disabled mm: help __GFP_NOFAIL allocations which do not trigger OOM killer mm, oom: do not enforce OOM killer for __GFP_NOFAIL automatically mm: consolidate GFP_NOFAIL checks in the allocator slowpath lib/show_mem.c: teach show_mem to work with the given nodemask arch, mm: remove arch specific show_mem mm, page_alloc: warn_alloc print nodemask mm, page_alloc: do not report all nodes in show_mem Revert "mm: bail out in shrink_inactive_list()" mm, vmscan: consider eligible zones in get_scan_count mm, vmscan: cleanup lru size claculations mm, vmscan: do not count freed pages as PGDEACTIVATE ...
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c2
-rw-r--r--fs/binfmt_elf.c30
-rw-r--r--fs/dax.c104
-rw-r--r--fs/ext4/file.c13
-rw-r--r--fs/nfs/write.c2
-rw-r--r--fs/ocfs2/acl.c29
-rw-r--r--fs/ocfs2/dlmglue.c105
-rw-r--r--fs/ocfs2/dlmglue.h18
-rw-r--r--fs/ocfs2/file.c58
-rw-r--r--fs/ocfs2/ocfs2.h1
-rw-r--r--fs/userfaultfd.c461
-rw-r--r--fs/xfs/xfs_file.c15
12 files changed, 710 insertions, 128 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index b3c2cc79c20d..082d227fa56b 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -277,6 +277,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
case ACL_TYPE_ACCESS:
if (acl) {
struct iattr iattr;
+ struct posix_acl *old_acl = acl;
retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
if (retval)
@@ -287,6 +288,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
* by the mode bits. So don't
* update ACL.
*/
+ posix_acl_release(old_acl);
value = NULL;
size = 0;
}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e7bf01373bc4..443a6f537d56 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = {
#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
-static int set_brk(unsigned long start, unsigned long end)
+static int set_brk(unsigned long start, unsigned long end, int prot)
{
start = ELF_PAGEALIGN(start);
end = ELF_PAGEALIGN(end);
if (end > start) {
- int error = vm_brk(start, end - start);
+ /*
+ * Map the last of the bss segment.
+ * If the header is requesting these pages to be
+ * executable, honour that (ppc32 needs this).
+ */
+ int error = vm_brk_flags(start, end - start,
+ prot & PROT_EXEC ? VM_EXEC : 0);
if (error)
return error;
}
@@ -524,6 +530,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
unsigned long load_addr = 0;
int load_addr_set = 0;
unsigned long last_bss = 0, elf_bss = 0;
+ int bss_prot = 0;
unsigned long error = ~0UL;
unsigned long total_size;
int i;
@@ -606,8 +613,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
* elf_bss and last_bss is the bss section.
*/
k = load_addr + eppnt->p_vaddr + eppnt->p_memsz;
- if (k > last_bss)
+ if (k > last_bss) {
last_bss = k;
+ bss_prot = elf_prot;
+ }
}
}
@@ -623,13 +632,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
/*
* Next, align both the file and mem bss up to the page size,
* since this is where elf_bss was just zeroed up to, and where
- * last_bss will end after the vm_brk() below.
+ * last_bss will end after the vm_brk_flags() below.
*/
elf_bss = ELF_PAGEALIGN(elf_bss);
last_bss = ELF_PAGEALIGN(last_bss);
/* Finally, if there is still more bss to allocate, do it. */
if (last_bss > elf_bss) {
- error = vm_brk(elf_bss, last_bss - elf_bss);
+ error = vm_brk_flags(elf_bss, last_bss - elf_bss,
+ bss_prot & PROT_EXEC ? VM_EXEC : 0);
if (error)
goto out;
}
@@ -674,6 +684,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
unsigned long elf_bss, elf_brk;
+ int bss_prot = 0;
int retval, i;
unsigned long elf_entry;
unsigned long interp_load_addr = 0;
@@ -882,7 +893,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
before this one. Map anonymous pages, if needed,
and clear the area. */
retval = set_brk(elf_bss + load_bias,
- elf_brk + load_bias);
+ elf_brk + load_bias,
+ bss_prot);
if (retval)
goto out_free_dentry;
nbyte = ELF_PAGEOFFSET(elf_bss);
@@ -976,8 +988,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
if (end_data < k)
end_data = k;
k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
- if (k > elf_brk)
+ if (k > elf_brk) {
+ bss_prot = elf_prot;
elf_brk = k;
+ }
}
loc->elf_ex.e_entry += load_bias;
@@ -993,7 +1007,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
* mapping in the interpreter, to make sure it doesn't wind
* up getting placed where the bss needs to go.
*/
- retval = set_brk(elf_bss, elf_brk);
+ retval = set_brk(elf_bss, elf_brk, bss_prot);
if (retval)
goto out_free_dentry;
if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
diff --git a/fs/dax.c b/fs/dax.c
index 99b5b4458a78..3f1181563fb1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -35,6 +35,9 @@
#include <linux/iomap.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/fs_dax.h>
+
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
@@ -1253,21 +1256,21 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault);
*/
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
-static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
- struct vm_fault *vmf, unsigned long address,
- struct iomap *iomap, loff_t pos, bool write, void **entryp)
+static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
+ loff_t pos, void **entryp)
{
- struct address_space *mapping = vma->vm_file->f_mapping;
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
struct block_device *bdev = iomap->bdev;
+ struct inode *inode = mapping->host;
struct blk_dax_ctl dax = {
.sector = dax_iomap_sector(iomap, pos),
.size = PMD_SIZE,
};
long length = dax_map_atomic(bdev, &dax);
- void *ret;
+ void *ret = NULL;
if (length < 0) /* dax_map_atomic() failed */
- return VM_FAULT_FALLBACK;
+ goto fallback;
if (length < PMD_SIZE)
goto unmap_fallback;
if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
@@ -1280,67 +1283,86 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
RADIX_DAX_PMD);
if (IS_ERR(ret))
- return VM_FAULT_FALLBACK;
+ goto fallback;
*entryp = ret;
- return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
+ trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
+ return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+ dax.pfn, vmf->flags & FAULT_FLAG_WRITE);
unmap_fallback:
dax_unmap_atomic(bdev, &dax);
+fallback:
+ trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
+ dax.pfn, ret);
return VM_FAULT_FALLBACK;
}
-static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
- struct vm_fault *vmf, unsigned long address,
- struct iomap *iomap, void **entryp)
+static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
+ void **entryp)
{
- struct address_space *mapping = vma->vm_file->f_mapping;
- unsigned long pmd_addr = address & PMD_MASK;
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ unsigned long pmd_addr = vmf->address & PMD_MASK;
+ struct inode *inode = mapping->host;
struct page *zero_page;
+ void *ret = NULL;
spinlock_t *ptl;
pmd_t pmd_entry;
- void *ret;
- zero_page = mm_get_huge_zero_page(vma->vm_mm);
+ zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
if (unlikely(!zero_page))
- return VM_FAULT_FALLBACK;
+ goto fallback;
ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
RADIX_DAX_PMD | RADIX_DAX_HZP);
if (IS_ERR(ret))
- return VM_FAULT_FALLBACK;
+ goto fallback;
*entryp = ret;
- ptl = pmd_lock(vma->vm_mm, pmd);
- if (!pmd_none(*pmd)) {
+ ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
+ if (!pmd_none(*(vmf->pmd))) {
spin_unlock(ptl);
- return VM_FAULT_FALLBACK;
+ goto fallback;
}
- pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
+ pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
pmd_entry = pmd_mkhuge(pmd_entry);
- set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
+ set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
spin_unlock(ptl);
+ trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
return VM_FAULT_NOPAGE;
+
+fallback:
+ trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
+ return VM_FAULT_FALLBACK;
}
-int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, unsigned int flags, const struct iomap_ops *ops)
+int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
{
+ struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
- unsigned long pmd_addr = address & PMD_MASK;
- bool write = flags & FAULT_FLAG_WRITE;
+ unsigned long pmd_addr = vmf->address & PMD_MASK;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
struct inode *inode = mapping->host;
int result = VM_FAULT_FALLBACK;
struct iomap iomap = { 0 };
pgoff_t max_pgoff, pgoff;
- struct vm_fault vmf;
void *entry;
loff_t pos;
int error;
+ /*
+ * Check whether offset isn't beyond end of file now. Caller is
+ * supposed to hold locks serializing us with truncate / punch hole so
+ * this is a reliable test.
+ */
+ pgoff = linear_page_index(vma, pmd_addr);
+ max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+
+ trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
+
/* Fall back to PTEs if we're going to COW */
if (write && !(vma->vm_flags & VM_SHARED))
goto fallback;
@@ -1351,16 +1373,10 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
if ((pmd_addr + PMD_SIZE) > vma->vm_end)
goto fallback;
- /*
- * Check whether offset isn't beyond end of file now. Caller is
- * supposed to hold locks serializing us with truncate / punch hole so
- * this is a reliable test.
- */
- pgoff = linear_page_index(vma, pmd_addr);
- max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
-
- if (pgoff > max_pgoff)
- return VM_FAULT_SIGBUS;
+ if (pgoff > max_pgoff) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
/* If the PMD would extend beyond the file size */
if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
@@ -1389,21 +1405,15 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
if (IS_ERR(entry))
goto finish_iomap;
- vmf.pgoff = pgoff;
- vmf.flags = flags;
- vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
-
switch (iomap.type) {
case IOMAP_MAPPED:
- result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
- &iomap, pos, write, &entry);
+ result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
break;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
goto unlock_entry;
- result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
- &entry);
+ result = dax_pmd_load_hole(vmf, &iomap, &entry);
break;
default:
WARN_ON_ONCE(1);
@@ -1429,9 +1439,11 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
}
fallback:
if (result == VM_FAULT_FALLBACK) {
- split_huge_pmd(vma, pmd, address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
count_vm_event(THP_FAULT_FALLBACK);
}
+out:
+ trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
return result;
}
EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 87e11dfe3cde..13021a054fc0 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -273,21 +273,20 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return result;
}
-static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned int flags)
+static int
+ext4_dax_pmd_fault(struct vm_fault *vmf)
{
int result;
- struct inode *inode = file_inode(vma->vm_file);
+ struct inode *inode = file_inode(vmf->vma->vm_file);
struct super_block *sb = inode->i_sb;
- bool write = flags & FAULT_FLAG_WRITE;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
if (write) {
sb_start_pagefault(sb);
- file_update_time(vma->vm_file);
+ file_update_time(vmf->vma->vm_file);
}
down_read(&EXT4_I(inode)->i_mmap_sem);
- result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
- &ext4_iomap_ops);
+ result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops);
up_read(&EXT4_I(inode)->i_mmap_sem);
if (write)
sb_end_pagefault(sb);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b00d53d13d47..006068526542 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -728,8 +728,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
if (likely(head->wb_page && !PageSwapCache(head->wb_page))) {
set_page_private(head->wb_page, 0);
ClearPagePrivate(head->wb_page);
- smp_mb__after_atomic();
- wake_up_page(head->wb_page, PG_private);
clear_bit(PG_MAPPED, &head->wb_flags);
}
nfsi->nrequests--;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index bed1fcb63088..dc22ba8c710f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -283,16 +283,14 @@ int ocfs2_set_acl(handle_t *handle,
int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
struct buffer_head *bh = NULL;
- int status = 0;
+ int status, had_lock;
+ struct ocfs2_lock_holder oh;
- status = ocfs2_inode_lock(inode, &bh, 1);
- if (status < 0) {
- if (status != -ENOENT)
- mlog_errno(status);
- return status;
- }
+ had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
+ if (had_lock < 0)
+ return had_lock;
status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
- ocfs2_inode_unlock(inode, 1);
+ ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
brelse(bh);
return status;
}
@@ -302,21 +300,20 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
struct ocfs2_super *osb;
struct buffer_head *di_bh = NULL;
struct posix_acl *acl;
- int ret;
+ int had_lock;
+ struct ocfs2_lock_holder oh;
osb = OCFS2_SB(inode->i_sb);
if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
return NULL;
- ret = ocfs2_inode_lock(inode, &di_bh, 0);
- if (ret < 0) {
- if (ret != -ENOENT)
- mlog_errno(ret);
- return ERR_PTR(ret);
- }
+
+ had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh);
+ if (had_lock < 0)
+ return ERR_PTR(had_lock);
acl = ocfs2_get_acl_nolock(inode, type, di_bh);
- ocfs2_inode_unlock(inode, 0);
+ ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
brelse(di_bh);
return acl;
}
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 77d1632e905d..8dce4099a6ca 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
init_waitqueue_head(&res->l_event);
INIT_LIST_HEAD(&res->l_blocked_list);
INIT_LIST_HEAD(&res->l_mask_waiters);
+ INIT_LIST_HEAD(&res->l_holders);
}
void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
@@ -749,6 +750,50 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
res->l_flags = 0UL;
}
+/*
+ * Keep a list of processes who have interest in a lockres.
+ * Note: this is now only uesed for check recursive cluster locking.
+ */
+static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres,
+ struct ocfs2_lock_holder *oh)
+{
+ INIT_LIST_HEAD(&oh->oh_list);
+ oh->oh_owner_pid = get_pid(task_pid(current));
+
+ spin_lock(&lockres->l_lock);
+ list_add_tail(&oh->oh_list, &lockres->l_holders);
+ spin_unlock(&lockres->l_lock);
+}
+
+static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres,
+ struct ocfs2_lock_holder *oh)
+{
+ spin_lock(&lockres->l_lock);
+ list_del(&oh->oh_list);
+ spin_unlock(&lockres->l_lock);
+
+ put_pid(oh->oh_owner_pid);
+}
+
+static inline int ocfs2_is_locked_by_me(struct ocfs2_lock_res *lockres)
+{
+ struct ocfs2_lock_holder *oh;
+ struct pid *pid;
+
+ /* look in the list of holders for one with the current task as owner */
+ spin_lock(&lockres->l_lock);
+ pid = task_pid(current);
+ list_for_each_entry(oh, &lockres->l_holders, oh_list) {
+ if (oh->oh_owner_pid == pid) {
+ spin_unlock(&lockres->l_lock);
+ return 1;
+ }
+ }
+ spin_unlock(&lockres->l_lock);
+
+ return 0;
+}
+
static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
int level)
{
@@ -2333,8 +2378,9 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
goto getbh;
}
- if (ocfs2_mount_local(osb))
- goto local;
+ if ((arg_flags & OCFS2_META_LOCK_GETBH) ||
+ ocfs2_mount_local(osb))
+ goto update;
if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
ocfs2_wait_for_recovery(osb);
@@ -2363,7 +2409,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
ocfs2_wait_for_recovery(osb);
-local:
+update:
/*
* We only see this flag if we're being called from
* ocfs2_read_locked_inode(). It means we're locking an inode
@@ -2497,6 +2543,59 @@ void ocfs2_inode_unlock(struct inode *inode,
ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
}
+/*
+ * This _tracker variantes are introduced to deal with the recursive cluster
+ * locking issue. The idea is to keep track of a lock holder on the stack of
+ * the current process. If there's a lock holder on the stack, we know the
+ * task context is already protected by cluster locking. Currently, they're
+ * used in some VFS entry routines.
+ *
+ * return < 0 on error, return == 0 if there's no lock holder on the stack
+ * before this call, return == 1 if this call would be a recursive locking.
+ */
+int ocfs2_inode_lock_tracker(struct inode *inode,
+ struct buffer_head **ret_bh,
+ int ex,
+ struct ocfs2_lock_holder *oh)
+{
+ int status;
+ int arg_flags = 0, has_locked;
+ struct ocfs2_lock_res *lockres;
+
+ lockres = &OCFS2_I(inode)->ip_inode_lockres;
+ has_locked = ocfs2_is_locked_by_me(lockres);
+ /* Just get buffer head if the cluster lock has been taken */
+ if (has_locked)
+ arg_flags = OCFS2_META_LOCK_GETBH;
+
+ if (likely(!has_locked || ret_bh)) {
+ status = ocfs2_inode_lock_full(inode, ret_bh, ex, arg_flags);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+ }
+ if (!has_locked)
+ ocfs2_add_holder(lockres, oh);
+
+ return has_locked;
+}
+
+void ocfs2_inode_unlock_tracker(struct inode *inode,
+ int ex,
+ struct ocfs2_lock_holder *oh,
+ int had_lock)
+{
+ struct ocfs2_lock_res *lockres;
+
+ lockres = &OCFS2_I(inode)->ip_inode_lockres;
+ if (!had_lock) {
+ ocfs2_remove_holder(lockres, oh);
+ ocfs2_inode_unlock(inode, ex);
+ }
+}
+
int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
{
struct ocfs2_lock_res *lockres;
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index d293a22c32c5..a7fc18ba0dc1 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -70,6 +70,11 @@ struct ocfs2_orphan_scan_lvb {
__be32 lvb_os_seqno;
};
+struct ocfs2_lock_holder {
+ struct list_head oh_list;
+ struct pid *oh_owner_pid;
+};
+
/* ocfs2_inode_lock_full() 'arg_flags' flags */
/* don't wait on recovery. */
#define OCFS2_META_LOCK_RECOVERY (0x01)
@@ -77,6 +82,8 @@ struct ocfs2_orphan_scan_lvb {
#define OCFS2_META_LOCK_NOQUEUE (0x02)
/* don't block waiting for the downconvert thread, instead return -EAGAIN */
#define OCFS2_LOCK_NONBLOCK (0x04)
+/* just get back disk inode bh if we've got cluster lock. */
+#define OCFS2_META_LOCK_GETBH (0x08)
/* Locking subclasses of inode cluster lock */
enum {
@@ -170,4 +177,15 @@ void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
/* To set the locking protocol on module initialization */
void ocfs2_set_locking_protocol(void);
+
+/* The _tracker pair is used to avoid cluster recursive locking */
+int ocfs2_inode_lock_tracker(struct inode *inode,
+ struct buffer_head **ret_bh,
+ int ex,
+ struct ocfs2_lock_holder *oh);
+void ocfs2_inode_unlock_tracker(struct inode *inode,
+ int ex,
+ struct ocfs2_lock_holder *oh,
+ int had_lock);
+
#endif /* DLMGLUE_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c4889655d32b..7b6a146327d7 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1138,6 +1138,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
handle_t *handle = NULL;
struct dquot *transfer_to[MAXQUOTAS] = { };
int qtype;
+ int had_lock;
+ struct ocfs2_lock_holder oh;
trace_ocfs2_setattr(inode, dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1173,11 +1175,30 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- status = ocfs2_inode_lock(inode, &bh, 1);
- if (status < 0) {
- if (status != -ENOENT)
- mlog_errno(status);
+ had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
+ if (had_lock < 0) {
+ status = had_lock;
goto bail_unlock_rw;
+ } else if (had_lock) {
+ /*
+ * As far as we know, ocfs2_setattr() could only be the first
+ * VFS entry point in the call chain of recursive cluster
+ * locking issue.
+ *
+ * For instance:
+ * chmod_common()
+ * notify_change()
+ * ocfs2_setattr()
+ * posix_acl_chmod()
+ * ocfs2_iop_get_acl()
+ *
+ * But, we're not 100% sure if it's always true, because the
+ * ordering of the VFS entry points in the call chain is out
+ * of our control. So, we'd better dump the stack here to
+ * catch the other cases of recursive locking.
+ */
+ mlog(ML_ERROR, "Another case of recursive locking:\n");
+ dump_stack();
}
inode_locked = 1;
@@ -1260,8 +1281,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
- if (status) {
- ocfs2_inode_unlock(inode, 1);
+ if (status && inode_locked) {
+ ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
inode_locked = 0;
}
bail_unlock_rw:
@@ -1279,7 +1300,7 @@ bail:
mlog_errno(status);
}
if (inode_locked)
- ocfs2_inode_unlock(inode, 1);
+ ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
brelse(bh);
return status;
@@ -1320,21 +1341,32 @@ bail:
int ocfs2_permission(struct inode *inode, int mask)
{
- int ret;
+ int ret, had_lock;
+ struct ocfs2_lock_holder oh;
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
- ret = ocfs2_inode_lock(inode, NULL, 0);
- if (ret) {
- if (ret != -ENOENT)
- mlog_errno(ret);
+ had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh);
+ if (had_lock < 0) {
+ ret = had_lock;
goto out;
+ } else if (had_lock) {
+ /* See comments in ocfs2_setattr() for details.
+ * The call chain of this case could be:
+ * do_sys_open()
+ * may_open()
+ * inode_permission()
+ * ocfs2_permission()
+ * ocfs2_iop_get_acl()
+ */
+ mlog(ML_ERROR, "Another case of recursive locking:\n");
+ dump_stack();
}
ret = generic_permission(inode, mask);
- ocfs2_inode_unlock(inode, 0);
+ ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
out:
return ret;
}
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7e5958b0be6b..0c39d71c67a1 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -172,6 +172,7 @@ struct ocfs2_lock_res {
struct list_head l_blocked_list;
struct list_head l_mask_waiters;
+ struct list_head l_holders;
unsigned long l_flags;
char l_name[OCFS2_LOCK_ID_MAX_LEN];
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 43953e03c356..18406158e13f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -12,6 +12,7 @@
* mm/ksm.c (mm hashing).
*/
+#include <linux/list.h>
#include <linux/hashtable.h>
#include <linux/sched.h>
#include <linux/mm.h>
@@ -26,6 +27,7 @@
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>
+#include <linux/hugetlb.h>
static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
@@ -45,12 +47,16 @@ struct userfaultfd_ctx {
wait_queue_head_t fault_wqh;
/* waitqueue head for the pseudo fd to wakeup poll/read */
wait_queue_head_t fd_wqh;
+ /* waitqueue head for events */
+ wait_queue_head_t event_wqh;
/* a refile sequence protected by fault_pending_wqh lock */
struct seqcount refile_seq;
/* pseudo fd refcounting */
atomic_t refcount;
/* userfaultfd syscall flags */
unsigned int flags;
+ /* features requested from the userspace */
+ unsigned int features;
/* state machine */
enum userfaultfd_state state;
/* released */
@@ -59,6 +65,12 @@ struct userfaultfd_ctx {
struct mm_struct *mm;
};
+struct userfaultfd_fork_ctx {
+ struct userfaultfd_ctx *orig;
+ struct userfaultfd_ctx *new;
+ struct list_head list;
+};
+
struct userfaultfd_wait_queue {
struct uffd_msg msg;
wait_queue_t wq;
@@ -142,6 +154,8 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
+ VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
mmdrop(ctx->mm);
@@ -169,7 +183,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
msg.arg.pagefault.address = address;
if (flags & FAULT_FLAG_WRITE)
/*
- * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the
+ * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
* uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
* was not set in a UFFD_EVENT_PAGEFAULT, it means it
* was a read fault, otherwise if set it means it's
@@ -188,6 +202,49 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
return msg;
}
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Same functionality as userfaultfd_must_wait below with modifications for
+ * hugepmd ranges.
+ */
+static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+ unsigned long address,
+ unsigned long flags,
+ unsigned long reason)
+{
+ struct mm_struct *mm = ctx->mm;
+ pte_t *pte;
+ bool ret = true;
+
+ VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+ pte = huge_pte_offset(mm, address);
+ if (!pte)
+ goto out;
+
+ ret = false;
+
+ /*
+ * Lockless access: we're in a wait_event so it's ok if it
+ * changes under us.
+ */
+ if (huge_pte_none(*pte))
+ ret = true;
+ if (!huge_pte_write(*pte) && (reason & VM_UFFD_WP))
+ ret = true;
+out:
+ return ret;
+}
+#else
+static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+ unsigned long address,
+ unsigned long flags,
+ unsigned long reason)
+{
+ return false; /* should never get here */
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
/*
* Verify the pagetables are still not ok after having reigstered into
* the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
@@ -364,8 +421,12 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
set_current_state(blocking_state);
spin_unlock(&ctx->fault_pending_wqh.lock);
- must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
- reason);
+ if (!is_vm_hugetlb_page(vmf->vma))
+ must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
+ reason);
+ else
+ must_wait = userfaultfd_huge_must_wait(ctx, vmf->address,
+ vmf->flags, reason);
up_read(&mm->mmap_sem);
if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
@@ -458,6 +519,196 @@ out:
return ret;
}
+static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wait_queue *ewq)
+{
+ int ret = 0;
+
+ ewq->ctx = ctx;
+ init_waitqueue_entry(&ewq->wq, current);
+
+ spin_lock(&ctx->event_wqh.lock);
+ /*
+ * After the __add_wait_queue the uwq is visible to userland
+ * through poll/read().
+ */
+ __add_wait_queue(&ctx->event_wqh, &ewq->wq);
+ for (;;) {
+ set_current_state(TASK_KILLABLE);
+ if (ewq->msg.event == 0)
+ break;
+ if (ACCESS_ONCE(ctx->released) ||
+ fatal_signal_pending(current)) {
+ ret = -1;
+ __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
+ break;
+ }
+
+ spin_unlock(&ctx->event_wqh.lock);
+
+ wake_up_poll(&ctx->fd_wqh, POLLIN);
+ schedule();
+
+ spin_lock(&ctx->event_wqh.lock);
+ }
+ __set_current_state(TASK_RUNNING);
+ spin_unlock(&ctx->event_wqh.lock);
+
+ /*
+ * ctx may go away after this if the userfault pseudo fd is
+ * already released.
+ */
+
+ userfaultfd_ctx_put(ctx);
+ return ret;
+}
+
+static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wait_queue *ewq)
+{
+ ewq->msg.event = 0;
+ wake_up_locked(&ctx->event_wqh);
+ __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
+}
+
+int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
+{
+ struct userfaultfd_ctx *ctx = NULL, *octx;
+ struct userfaultfd_fork_ctx *fctx;
+
+ octx = vma->vm_userfaultfd_ctx.ctx;
+ if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+ return 0;
+ }
+
+ list_for_each_entry(fctx, fcs, list)
+ if (fctx->orig == octx) {
+ ctx = fctx->new;
+ break;
+ }
+
+ if (!ctx) {
+ fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
+ if (!fctx)
+ return -ENOMEM;
+
+ ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KE