From ffc79664d15841025d90afdd902c4112ffe168d6 Mon Sep 17 00:00:00 2001 From: Milosz Tanski Date: Wed, 25 Sep 2013 11:18:14 -0400 Subject: ceph: hung on ceph fscache invalidate in some cases In some cases I'm on my ceph client cluster I'm seeing hunk kernel tasks in the invalidate page code path. This is due to the fact that we don't check if the page is marked as cache before calling fscache_wait_on_page_write(). This is the log from the hang INFO: task XXXXXX:12034 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. ... Call Trace: [] schedule+0x29/0x70 [] __fscache_wait_on_page_write+0x6d/0xb0 [fscache] [] ? add_wait_queue+0x60/0x60 [] ceph_invalidate_fscache_page+0x29/0x50 [ceph] [] ceph_invalidatepage+0x70/0x190 [ceph] [] ? delete_from_page_cache+0x5f/0x70 [] truncate_inode_page+0x8b/0x90 [] truncate_inode_pages_range.part.12+0x13d/0x620 [] truncate_inode_pages_range+0x4d/0x60 [] truncate_inode_pages+0x15/0x20 [] evict+0x1a6/0x1b0 [] iput+0x103/0x190 ... Signed-off-by: Milosz Tanski Reviewed-by: Sage Weil --- fs/ceph/cache.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 6bfe65e0b038..360b622b0be0 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -324,6 +324,9 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) { struct ceph_inode_info *ci = ceph_inode(inode); + if (!PageFsCache(page)) + return; + fscache_wait_on_page_write(ci->fscache, page); fscache_uncache_page(ci->fscache, page); } -- cgit v1.2.3 From 53e879a485f9def0e55c404dbc7187470a01602d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 18 Sep 2013 09:29:07 +0800 Subject: ceph: remove outdated frag information If directory fragments change, fill_inode() inserts new frags into the fragtree, but it does not remove outdated frags from the fragtree. This patch fixes it. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/inode.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8549a48115f7..1c9fea0c9834 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -577,6 +577,8 @@ static int fill_inode(struct inode *inode, int issued = 0, implemented; struct timespec mtime, atime, ctime; u32 nsplits; + struct ceph_inode_frag *frag; + struct rb_node *rb_node; struct ceph_buffer *xattr_blob = NULL; int err = 0; int queue_trunc = 0; @@ -751,15 +753,38 @@ no_change: /* FIXME: move me up, if/when version reflects fragtree changes */ nsplits = le32_to_cpu(info->fragtree.nsplits); mutex_lock(&ci->i_fragtree_mutex); + rb_node = rb_first(&ci->i_fragtree); for (i = 0; i < nsplits; i++) { u32 id = le32_to_cpu(info->fragtree.splits[i].frag); - struct ceph_inode_frag *frag = __get_or_create_frag(ci, id); - - if (IS_ERR(frag)) - continue; + frag = NULL; + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (ceph_frag_compare(frag->frag, id) >= 0) { + if (frag->frag != id) + frag = NULL; + else + rb_node = rb_next(rb_node); + break; + } + rb_node = rb_next(rb_node); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + frag = NULL; + } + if (!frag) { + frag = __get_or_create_frag(ci, id); + if (IS_ERR(frag)) + continue; + } frag->split_by = le32_to_cpu(info->fragtree.splits[i].by); dout(" frag %x split by %d\n", frag->frag, frag->split_by); } + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + rb_node = rb_next(rb_node); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } mutex_unlock(&ci->i_fragtree_mutex); /* were we issued a capability? */ -- cgit v1.2.3 From 81c6aea5275eae453719d7f3924da07e668265c5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 18 Sep 2013 09:44:13 +0800 Subject: ceph: handle frag mismatch between readdir request and reply If client has outdated directory fragments information, it may request readdir an non-existent directory fragment. In this case, the MDS finds an approximate directory fragment and sends its contents back to the client. When receiving a reply with fragment that is different than the requested one, the client need to reset the 'readdir offset'. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/dir.c | 11 ++++++++++- fs/ceph/inode.c | 16 ++++++++++++++-- fs/ceph/mds_client.c | 3 +-- 3 files changed, 25 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 868b61d56cac..2a0bcaeb189a 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -352,8 +352,18 @@ more: } /* note next offset and last dentry name */ + rinfo = &req->r_reply_info; + if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (ceph_frag_is_leftmost(frag)) + fi->next_offset = 2; + else + fi->next_offset = 0; + off = fi->next_offset; + } fi->offset = fi->next_offset; fi->last_readdir = req; + fi->frag = frag; if (req->r_reply_info.dir_end) { kfree(fi->last_name); @@ -363,7 +373,6 @@ more: else fi->next_offset = 0; } else { - rinfo = &req->r_reply_info; err = note_last_dentry(fi, rinfo->dir_dname[rinfo->dir_nr-1], rinfo->dir_dname_len[rinfo->dir_nr-1]); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1c9fea0c9834..9a8e396aed89 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1275,8 +1275,20 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, int err = 0, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - u64 frag = le32_to_cpu(rhead->args.readdir.frag); struct ceph_dentry_info *di; + u64 r_readdir_offset = req->r_readdir_offset; + u32 frag = le32_to_cpu(rhead->args.readdir.frag); + + if (rinfo->dir_dir && + le32_to_cpu(rinfo->dir_dir->frag) != frag) { + dout("readdir_prepopulate got new frag %x -> %x\n", + frag, le32_to_cpu(rinfo->dir_dir->frag)); + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (ceph_frag_is_leftmost(frag)) + r_readdir_offset = 2; + else + r_readdir_offset = 0; + } if (req->r_aborted) return readdir_prepopulate_inodes_only(req, session); @@ -1340,7 +1352,7 @@ retry_lookup: } di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); + di->offset = ceph_make_fpos(frag, i + r_readdir_offset); /* inode */ if (dn->d_inode) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b7bda5d9611d..f51ab2627b41 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2238,8 +2238,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); if (err == 0) { if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || - req->r_op == CEPH_MDS_OP_LSSNAP) && - rinfo->dir_nr) + req->r_op == CEPH_MDS_OP_LSSNAP)) ceph_readdir_prepopulate(req, req->r_session); ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } -- cgit v1.2.3 From e34ecee2ae791df674dfb466ce40692ca6218e43 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Oct 2013 19:31:47 -0700 Subject: aio: Fix a trinity splat aio kiocb refcounting was broken - it was relying on keeping track of the number of available ring buffer entries, which it needs to do anyways; then at shutdown time it'd wait for completions to be delivered until the # of available ring buffer entries equalled what it was initialized to. Problem with that is that the ring buffer is mapped writable into userspace, so userspace could futz with the head and tail pointers to cause the kernel to see extra completions, and cause free_ioctx() to return while there were still outstanding kiocbs. Which would be bad. Fix is just to directly refcount the kiocbs - which is more straightforward, and with the new percpu refcounting code doesn't cost us any cacheline bouncing which was the whole point of the original scheme. Also clean up ioctx_alloc()'s error path and fix a bug where it wasn't subtracting from aio_nr if ioctx_add_table() failed. Signed-off-by: Kent Overstreet --- fs/aio.c | 129 ++++++++++++++++++++++++--------------------------------------- 1 file changed, 48 insertions(+), 81 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 067e3d340c35..ee77dc13d5b2 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -80,6 +80,8 @@ struct kioctx { struct percpu_ref users; atomic_t dead; + struct percpu_ref reqs; + unsigned long user_id; struct __percpu kioctx_cpu *cpu; @@ -107,7 +109,6 @@ struct kioctx { struct page **ring_pages; long nr_pages; - struct rcu_head rcu_head; struct work_struct free_work; struct { @@ -412,26 +413,34 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) return cancel(kiocb); } -static void free_ioctx_rcu(struct rcu_head *head) +static void free_ioctx(struct work_struct *work) { - struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + struct kioctx *ctx = container_of(work, struct kioctx, free_work); + + pr_debug("freeing %p\n", ctx); + aio_free_ring(ctx); free_percpu(ctx->cpu); kmem_cache_free(kioctx_cachep, ctx); } +static void free_ioctx_reqs(struct percpu_ref *ref) +{ + struct kioctx *ctx = container_of(ref, struct kioctx, reqs); + + INIT_WORK(&ctx->free_work, free_ioctx); + schedule_work(&ctx->free_work); +} + /* * When this function runs, the kioctx has been removed from the "hash table" * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - * now it's safe to cancel any that need to be. */ -static void free_ioctx(struct work_struct *work) +static void free_ioctx_users(struct percpu_ref *ref) { - struct kioctx *ctx = container_of(work, struct kioctx, free_work); - struct aio_ring *ring; + struct kioctx *ctx = container_of(ref, struct kioctx, users); struct kiocb *req; - unsigned cpu, avail; - DEFINE_WAIT(wait); spin_lock_irq(&ctx->ctx_lock); @@ -445,54 +454,8 @@ static void free_ioctx(struct work_struct *work) spin_unlock_irq(&ctx->ctx_lock); - for_each_possible_cpu(cpu) { - struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu); - - atomic_add(kcpu->reqs_available, &ctx->reqs_available); - kcpu->reqs_available = 0; - } - - while (1) { - prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE); - - ring = kmap_atomic(ctx->ring_pages[0]); - avail = (ring->head <= ring->tail) - ? ring->tail - ring->head - : ctx->nr_events - ring->head + ring->tail; - - atomic_add(avail, &ctx->reqs_available); - ring->head = ring->tail; - kunmap_atomic(ring); - - if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1) - break; - - schedule(); - } - finish_wait(&ctx->wait, &wait); - - WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1); - - aio_free_ring(ctx); - - pr_debug("freeing %p\n", ctx); - - /* - * Here the call_rcu() is between the wait_event() for reqs_active to - * hit 0, and freeing the ioctx. - * - * aio_complete() decrements reqs_active, but it has to touch the ioctx - * after to issue a wakeup so we use rcu. - */ - call_rcu(&ctx->rcu_head, free_ioctx_rcu); -} - -static void free_ioctx_ref(struct percpu_ref *ref) -{ - struct kioctx *ctx = container_of(ref, struct kioctx, users); - - INIT_WORK(&ctx->free_work, free_ioctx); - schedule_work(&ctx->free_work); + percpu_ref_kill(&ctx->reqs); + percpu_ref_put(&ctx->reqs); } static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) @@ -551,6 +514,16 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) } } +static void aio_nr_sub(unsigned nr) +{ + spin_lock(&aio_nr_lock); + if (WARN_ON(aio_nr - nr > aio_nr)) + aio_nr = 0; + else + aio_nr -= nr; + spin_unlock(&aio_nr_lock); +} + /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */ @@ -588,8 +561,11 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->max_reqs = nr_events; - if (percpu_ref_init(&ctx->users, free_ioctx_ref)) - goto out_freectx; + if (percpu_ref_init(&ctx->users, free_ioctx_users)) + goto err; + + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) + goto err; spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); @@ -600,10 +576,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->cpu = alloc_percpu(struct kioctx_cpu); if (!ctx->cpu) - goto out_freeref; + goto err; if (aio_setup_ring(ctx) < 0) - goto out_freepcpu; + goto err; atomic_set(&ctx->reqs_available, ctx->nr_events - 1); ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); @@ -615,7 +591,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (aio_nr + nr_events > (aio_max_nr * 2UL) || aio_nr + nr_events < aio_nr) { spin_unlock(&aio_nr_lock); - goto out_cleanup; + err = -EAGAIN; + goto err; } aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); @@ -624,23 +601,19 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) err = ioctx_add_table(ctx, mm); if (err) - goto out_cleanup_put; + goto err_cleanup; pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; -out_cleanup_put: - percpu_ref_put(&ctx->users); -out_cleanup: - err = -EAGAIN; +err_cleanup: + aio_nr_sub(ctx->max_reqs); +err: aio_free_ring(ctx); -out_freepcpu: free_percpu(ctx->cpu); -out_freeref: + free_percpu(ctx->reqs.pcpu_count); free_percpu(ctx->users.pcpu_count); -out_freectx: - put_aio_ring_file(ctx); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); @@ -675,10 +648,7 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) * -EAGAIN with no ioctxs actually in use (as far as userspace * could tell). */ - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - ctx->max_reqs > aio_nr); - aio_nr -= ctx->max_reqs; - spin_unlock(&aio_nr_lock); + aio_nr_sub(ctx->max_reqs); if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); @@ -810,6 +780,8 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) if (unlikely(!req)) goto out_put; + percpu_ref_get(&ctx->reqs); + req->ki_ctx = ctx; return req; out_put: @@ -879,12 +851,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) return; } - /* - * Take rcu_read_lock() in case the kioctx is being destroyed, as we - * need to issue a wakeup after incrementing reqs_available. - */ - rcu_read_lock(); - if (iocb->ki_list.next) { unsigned long flags; @@ -959,7 +925,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - rcu_read_unlock(); + percpu_ref_put(&ctx->reqs); } EXPORT_SYMBOL(aio_complete); @@ -1370,6 +1336,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return 0; out_put_req: put_reqs_available(ctx, 1); + percpu_ref_put(&ctx->reqs); kiocb_free(req); return ret; } -- cgit v1.2.3 From 81407c84ace88368ff23abb81caaeacf050c8450 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 24 May 2013 09:49:14 -0400 Subject: audit: allow unsetting the loginuid (with priv) If a task has CAP_AUDIT_CONTROL allow that task to unset their loginuid. This would allow a child of that task to set their loginuid without CAP_AUDIT_CONTROL. Thus when launching a new login daemon, a priviledged helper would be able to unset the loginuid and then the daemon, which may be malicious user facing, do not need priv to function correctly. Signed-off-by: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- fs/proc/base.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 1485e38daaa3..03c8d747be48 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1151,10 +1151,16 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - kloginuid = make_kuid(file->f_cred->user_ns, loginuid); - if (!uid_valid(kloginuid)) { - length = -EINVAL; - goto out_free_page; + + /* is userspace tring to explicitly UNSET the loginuid? */ + if (loginuid == AUDIT_UID_UNSET) { + kloginuid = INVALID_UID; + } else { + kloginuid = make_kuid(file->f_cred->user_ns, loginuid); + if (!uid_valid(kloginuid)) { + length = -EINVAL; + goto out_free_page; + } } length = audit_set_loginuid(kloginuid); -- cgit v1.2.3 From 14e972b4517128ac8e30e3de2ee4fbd995084223 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 8 May 2013 10:25:58 -0400 Subject: audit: add child record before the create to handle case where create fails Historically, when a syscall that creates a dentry fails, you get an audit record that looks something like this (when trying to create a file named "new" in "/tmp/tmp.SxiLnCcv63"): type=PATH msg=audit(1366128956.279:965): item=0 name="/tmp/tmp.SxiLnCcv63/new" inode=2138308 dev=fd:02 mode=040700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmp_t:s15:c0.c1023 This record makes no sense since it's associating the inode information for "/tmp/tmp.SxiLnCcv63" with the path "/tmp/tmp.SxiLnCcv63/new". The recent patch I posted to fix the audit_inode call in do_last fixes this, by making it look more like this: type=PATH msg=audit(1366128765.989:13875): item=0 name="/tmp/tmp.DJ1O8V3e4f/" inode=141 dev=fd:02 mode=040700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmp_t:s15:c0.c1023 While this is more correct, if the creation of the file fails, then we have no record of the filename that the user tried to create. This patch adds a call to audit_inode_child to may_create. This creates an AUDIT_TYPE_CHILD_CREATE record that will sit in place until the create succeeds. When and if the create does succeed, then this record will be updated with the correct inode info from the create. This fixes what was broken in commit bfcec708. Commit 79f6530c should also be backported to stable v3.7+. Signed-off-by: Jeff Layton Signed-off-by: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- fs/namei.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 7720fbd5277b..df9946e83db4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2262,6 +2262,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) */ static inline int may_create(struct inode *dir, struct dentry *child) { + audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) -- cgit v1.2.3 From 9410d228a4cf434305306746bb799fb7acdd8648 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 30 Oct 2013 18:05:24 -0400 Subject: audit: call audit_bprm() only once to add AUDIT_EXECVE information Move the audit_bprm() call from search_binary_handler() to exec_binprm(). This allows us to get rid of the mm member of struct audit_aux_data_execve since bprm->mm will equal current->mm. This also mitigates the issue that ->argc could be modified by the load_binary() call in search_binary_handler(). audit_bprm() was being called to add an AUDIT_EXECVE record to the audit context every time search_binary_handler() was recursively called. Only one reference is necessary. Reported-by: Oleg Nesterov Cc: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- This patch is against 3.11, but was developed on Oleg's post-3.11 patches that introduce exec_binprm(). --- fs/exec.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index fd774c7cb483..c5c24f2fc44a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1383,10 +1383,6 @@ int search_binary_handler(struct linux_binprm *bprm) if (retval) return retval; - retval = audit_bprm(bprm); - if (retval) - return retval; - /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; rcu_read_lock(); @@ -1408,6 +1404,7 @@ int search_binary_handler(struct linux_binprm *bprm) bprm->recursion_depth = depth; if (retval >= 0) { if (depth == 0) { + audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); } -- cgit v1.2.3 From 2000f5beabc9c6baf084de5f7879975408e3652c Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Wed, 13 Nov 2013 10:09:42 -0800 Subject: eCryptfs: file->private_data is always valid When accessing the lower_file pointer located in private_data of eCryptfs files, there is no need to check to see if the private_data pointer has been initialized to a non-NULL value. The file->private_data and file->private_data->lower_file pointers are always initialized to non-NULL values in ecryptfs_open(). This change quiets a Smatch warning: CHECK /var/scm/kernel/linux/fs/ecryptfs/file.c fs/ecryptfs/file.c:321 ecryptfs_unlocked_ioctl() error: potential NULL dereference 'lower_file'. fs/ecryptfs/file.c:335 ecryptfs_compat_ioctl() error: potential NULL dereference 'lower_file'. Signed-off-by: Tyler Hicks Reported-by: Dan Carpenter Reviewed-by: Geyslan G. Bem Cc: Al Viro --- fs/ecryptfs/file.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 2229a74aeeed..b1eaa7a1f82c 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -313,11 +313,9 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag) static long ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct file *lower_file = NULL; + struct file *lower_file = ecryptfs_file_to_lower(file); long rc = -ENOTTY; - if (ecryptfs_file_to_private(file)) - lower_file = ecryptfs_file_to_lower(file); if (lower_file->f_op->unlocked_ioctl) rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); return rc; @@ -327,11 +325,9 @@ ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) static long ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct file *lower_file = NULL; + struct file *lower_file = ecryptfs_file_to_lower(file); long rc = -ENOIOCTLCMD; - if (ecryptfs_file_to_private(file)) - lower_file = ecryptfs_file_to_lower(file); if (lower_file->f_op && lower_file->f_op->compat_ioctl) rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); return rc; -- cgit v1.2.3 From db51242d89b3059a46a3cf2f3339f8cd975cb954 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Nov 2013 21:55:52 -0500 Subject: dump_align(): fix the dumb braino Mea culpa - original variant used 64-by-32-bit division, which got caught very late. Getting rid of that wasn't hard, but I'd managed to botch the calling conventions in process ;-/ Signed-off-by: Al Viro --- fs/coredump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 62406b6959b6..a2856f7bb613 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -733,7 +733,7 @@ int dump_align(struct coredump_params *cprm, int align) { unsigned mod = cprm->written & (align - 1); if (align & (align - 1)) - return -EINVAL; - return mod ? dump_skip(cprm, align - mod) : 0; + return 0; + return mod ? dump_skip(cprm, align - mod) : 1; } EXPORT_SYMBOL(dump_align); -- cgit v1.2.3 From 52da40ae67f6192a3bf70a98cb560e1423554953 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Nov 2013 21:58:33 -0500 Subject: dump_emit(): use __kernel_write(), not vfs_write() the caller has already done file_start_write()... Signed-off-by: Al Viro --- fs/coredump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index a2856f7bb613..bc3fbcd32558 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -695,7 +695,7 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr) while (nr) { if (dump_interrupted()) return 0; - n = vfs_write(file, addr, nr, &pos); + n = __kernel_write(file, addr, nr, &pos); if (n <= 0) return 0; file->f_pos = pos; -- cgit v1.2.3 From 951b4bd553e35a291e6b5732ab0124619e81da05 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 2 Jun 2013 19:53:40 -0400 Subject: gfs2: endianness misannotations Signed-off-by: Al Viro --- fs/gfs2/lock_dlm.c | 8 ++++---- fs/gfs2/quota.c | 23 ++++++++++------------- fs/gfs2/rgrp.c | 4 ++-- 3 files changed, 16 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index c8423d6de6c3..2a6ba06bee6f 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -466,19 +466,19 @@ static void gdlm_cancel(struct gfs2_glock *gl) static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen, char *lvb_bits) { - uint32_t gen; + __le32 gen; memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE); - memcpy(&gen, lvb_bits, sizeof(uint32_t)); + memcpy(&gen, lvb_bits, sizeof(__le32)); *lvb_gen = le32_to_cpu(gen); } static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen, char *lvb_bits) { - uint32_t gen; + __le32 gen; memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE); gen = cpu_to_le32(lvb_gen); - memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t)); + memcpy(ls->ls_control_lvb, &gen, sizeof(__le32)); } static int all_jid_bits_clear(char *lvb) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 453b50eaddec..98236d0df3ca 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -667,7 +667,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, struct buffer_head *bh; struct page *page; void *kaddr, *ptr; - struct gfs2_quota q, *qp; + struct gfs2_quota q; int err, nbytes; u64 size; @@ -683,28 +683,25 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, return err; err = -EIO; - qp = &q; - qp->qu_value = be64_to_cpu(qp->qu_value); - qp->qu_value += change; - qp->qu_value = cpu_to_be64(qp->qu_value); - qd->qd_qb.qb_value = qp->qu_value; + be64_add_cpu(&q.qu_value, change); + qd->qd_qb.qb_value = q.qu_value; if (fdq) { if (fdq->d_fieldmask & FS_DQ_BSOFT) { - qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); - qd->qd_qb.qb_warn = qp->qu_warn; + q.qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_warn = q.qu_warn; } if (fdq->d_fieldmask & FS_DQ_BHARD) { - qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); - qd->qd_qb.qb_limit = qp->qu_limit; + q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_limit = q.qu_limit; } if (fdq->d_fieldmask & FS_DQ_BCOUNT) { - qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); - qd->qd_qb.qb_value = qp->qu_value; + q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_value = q.qu_value; } } /* Write the quota into the quota file on disk */ - ptr = qp; + ptr = &q; nbytes = sizeof(struct gfs2_quota); get_a_page: page = find_or_create_page(mapping, index, GFP_NOFS); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 4d83abdd5635..c8d6161bd682 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1127,7 +1127,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); rgd->rd_free_clone = rgd->rd_free; } - if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { + if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); @@ -1161,7 +1161,7 @@ int update_rgrp_lvb(struct gfs2_rgrpd *rgd) if (rgd->rd_flags & GFS2_RDF_UPTODATE) return 0; - if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) + if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) return gfs2_rgrp_bh_get(rgd); rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags); -- cgit v1.2.3 From b26d4cd385fc51e8844e2cdf9ba2051f5bba11a5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 25 Oct 2013 18:47:37 -0400 Subject: consolidate simple ->d_delete() instances Rename simple_delete_dentry() to always_delete_dentry() and export it. Export simple_dentry_operations, while we are at it, and get rid of their duplicates Signed-off-by: Al Viro --- fs/9p/vfs_dentry.c | 19 +------------------ fs/configfs/dir.c | 12 +----------- fs/efivarfs/super.c | 11 +---------- fs/hostfs/hostfs_kern.c | 11 +---------- fs/libfs.c | 12 +++++++----- fs/proc/generic.c | 18 +----------------- fs/proc/namespaces.c | 8 +------- 7 files changed, 13 insertions(+), 78 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index f039b104a98e..b03dd23feda8 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -42,23 +42,6 @@ #include "v9fs_vfs.h" #include "fid.h" -/** - * v9fs_dentry_delete - called when dentry refcount equals 0 - * @dentry: dentry in question - * - * By returning 1 here we should remove cacheing of unused - * dentry components. - * - */ - -static int v9fs_dentry_delete(const struct dentry *dentry) -{ - p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", - dentry->d_name.name, dentry); - - return 1; -} - /** * v9fs_cached_dentry_delete - called when dentry refcount equals 0 * @dentry: dentry in question @@ -134,6 +117,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = { }; const struct dentry_operations v9fs_dentry_operations = { - .d_delete = v9fs_dentry_delete, + .d_delete = always_delete_dentry, .d_release = v9fs_dentry_release, }; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 277bd1be21fd..4522e0755773 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -66,19 +66,9 @@ static void configfs_d_iput(struct dentry * dentry, iput(inode); } -/* - * We _must_ delete our dentries on last dput, as the chain-to-parent - * behavior is required to clear the parents of default_groups. - */ -static int configfs_d_delete(const struct dentry *dentry) -{ - return 1; -} - const struct dentry_operations configfs_dentry_ops = { .d_iput = configfs_d_iput, - /* simple_delete_dentry() isn't exported */ - .d_delete = configfs_d_delete, + .d_delete = always_delete_dentry, }; #ifdef CONFIG_LOCKDEP diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index a8766b880c07..becc725a1953 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -83,19 +83,10 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) return 0; } -/* - * Retaining negative dentries for an in-memory filesystem just wastes - * memory and lookup time: arrange for them to be deleted immediately. - */ -static int efivarfs_delete_dentry(const struct dentry *dentry) -{ - return 1; -} - static struct dentry_operations efivarfs_d_ops = { .d_compare = efivarfs_d_compare, .d_hash = efivarfs_d_hash, - .d_delete = efivarfs_delete_dentry, + .d_delete = always_delete_dentry, }; static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 25437280a207..db23ce1bd903 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -33,15 +33,6 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) #define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file)) -static int hostfs_d_delete(const struct dentry *dentry) -{ - return 1; -} - -static const struct dentry_operations hostfs_dentry_ops = { - .d_delete = hostfs_d_delete, -}; - /* Changed in hostfs_args before the kernel starts running */ static char *root_ino = ""; static int append = 0; @@ -925,7 +916,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) sb->s_blocksize_bits = 10; sb->s_magic = HOSTFS_SUPER_MAGIC; sb->s_op = &hostfs_sbops; - sb->s_d_op = &hostfs_dentry_ops; + sb->s_d_op = &simple_dentry_operations; sb->s_maxbytes = MAX_LFS_FILESIZE; /* NULL is printed as by sprintf: avoid that. */ diff --git a/fs/libfs.c b/fs/libfs.c index 5de06947ba5e..a1844244246f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -47,10 +47,16 @@ EXPORT_SYMBOL(simple_statfs); * Retaining negative dentries for an in-memory filesystem just wastes * memory and lookup time: arrange for them to be deleted immediately. */ -static int simple_delete_dentry(const struct dentry *dentry) +int always_delete_dentry(const struct dentry *dentry) { return 1; } +EXPORT_SYMBOL(always_delete_dentry); + +const struct dentry_operations simple_dentry_operations = { + .d_delete = always_delete_dentry, +}; +EXPORT_SYMBOL(simple_dentry_operations); /* * Lookup the data. This is trivial - if the dentry didn't already @@ -58,10 +64,6 @@ static int simple_delete_dentry(const struct dentry *dentry) */ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - static const struct dentry_operations simple_dentry_operations = { - .d_delete = simple_delete_dentry, - }; - if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); if (!dentry->d_sb->s_d_op) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 737e15615b04..cca93b6fb9a9 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -174,22 +174,6 @@ static const struct inode_operations proc_link_inode_operations = { .follow_link = proc_follow_link, }; -/* - * As some entries in /proc are volatile, we want to - * get rid of unused dentries. This could be made - * smarter: we could keep a "volatile" flag in the - * inode to indicate which ones to keep. - */ -static int proc_delete_dentry(const struct dentry * dentry) -{ - return 1; -} - -static const struct dentry_operations proc_dentry_operations = -{ - .d_delete = proc_delete_dentry, -}; - /* * Don't create negative dentries here, return -ENOENT by hand * instead. @@ -209,7 +193,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &proc_dentry_operations); + d_set_d_op(dentry, &simple_dentry_operations); d_add(dentry, inode); return NULL; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 49a7fff2e83a..9ae46b87470d 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -42,12 +42,6 @@ static const struct inode_operations ns_inode_operations = { .setattr = proc_setattr, }; -static int ns_delete_dentry(const struct dentry *dentry) -{ - /* Don't cache namespace inodes when not in use */ - return 1; -} - static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { struct inode *inode = dentry->d_inode; @@ -59,7 +53,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) const struct dentry_operations ns_dentry_operations = { - .d_delete = ns_delete_dentry, + .d_delete = always_delete_dentry, .d_dname = ns_dname, }; -- cgit v1.2.3 From 2bc74feba12fbf052ec97aee8624c9f13593a9ac Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 25 Oct 2013 16:39:14 -0400 Subject: take read_seqbegin_or_lock() and friends to seqlock.h Signed-off-by: Al Viro --- fs/dcache.c | 29 ----------------------------- 1 file changed, 29 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 0a38ef8d7f00..667e23ab0b4c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -88,35 +88,6 @@ EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __read_mostly; -/** - * read_seqbegin_or_lock - begin a sequence number check or locking block - * @lock: sequence lock - * @seq : sequence number to be checked - * - * First try it once optimistically without taking the lock. If that fails, - * take the lock. The sequence number is also used as a marker for deciding - * whether to be a reader (even) or writer (odd). - * N.B. seq must be initialized to an even number to begin with. - */ -static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) -{ - if (!(*seq & 1)) /* Even */ - *seq = read_seqbegin(lock); - else /* Odd */ - read_seqlock_excl(lock); -} - -static inline int need_seqretry(seqlock_t *lock, int seq) -{ - return !(seq & 1) && read_seqretry(lock, seq); -} - -static inline void done_seqretry(seqlock_t *lock, int seq) -{ - if (seq & 1) - read_sequnlock_excl(lock); -} - /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try -- cgit v1.2.3 From 482db9066199813d6b999b65a3171afdbec040b6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 25 Oct 2013 16:41:01 -0400 Subject: dcache.c: get rid of pointless macros D_HASH{MASK,BITS} are used once each, both in the same function (d_hash()). At this point they are actively misguiding - they imply that values are compiler constants, which is no longer true. Signed-off-by: Al Viro --- fs/dcache.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 667e23ab0b4c..7f079d00475a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -96,8 +96,6 @@ static struct kmem_cache *dentry_cache __read_mostly; * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. */ -#define D_HASHBITS d_hash_shift -#define D_HASHMASK d_hash_mask static unsigned int d_hash_mask __read_mostly; static unsigned int d_hash_shift __read_mostly; @@ -108,8 +106,8 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; - hash = hash + (hash >> D_HASHBITS); - return dentry_hashtable + (hash & D_HASHMASK); + hash = hash + (hash >> d_hash_shift); + return dentry_hashtable + (hash & d_hash_mask); } /* Statistics gathering. */ -- cgit v1.2.3 From 31dec1327e377b6d91a8a6c92b5cd8513939a233 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 25 Oct 2013 17:04:27 -0400 Subject: fold try_to_ascend() into the sole remaining caller There used to be a bunch of tree-walkers in dcache.c, all alike. try_to_ascend() had been introduced to abstract a piece of logics duplicated in all of them. These days all these tree-walkers are implemented via the same iterator (d_walk()), which is the only remaining caller of try_to_ascend(), so let's fold it back... Signed-off-by: Al Viro --- fs/dcache.c | 49 ++++++++++++++++++------------------------------- 1 file changed, 18 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 7f079d00475a..4bdb300b16e2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -438,7 +438,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) { list_del(&dentry->d_u.d_child); /* - * Inform try_to_ascend() that we are no longer attached to the + * Inform d_walk() that we are no longer attached to the * dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; @@ -1038,34 +1038,6 @@ void shrink_dcache_sb(struct super_block *sb) } EXPORT_SYMBOL(shrink_dcache_sb); -/* - * This tries to ascend one level of parenthood, but - * we can race with renaming, so we need to re-check - * the parenthood after dropping the lock and check - * that the sequence number still matches. - */ -static struct dentry *try_to_ascend(struct dentry *old, unsigned seq) -{ - struct dentry *new = old->d_parent; - - rcu_read_lock(); - spin_unlock(&old->d_lock); - spin_lock(&new->d_lock); - - /* - * might go back up the wrong parent if we have had a rename - * or deletion - */ - if (new != old->d_parent || - (old->d_flags & DCACHE_DENTRY_KILLED) || - need_seqretry(&rename_lock, seq)) { - spin_unlock(&new->d_lock); - new = NULL; - } - rcu_read_unlock(); - return new; -} - /** * enum d_walk_ret - action to talke during tree walk * @D_WALK_CONTINUE: contrinue walk @@ -1154,9 +1126,24 @@ resume: */ if (this_parent != parent) { struct dentry *child = this_parent; - this_parent = try_to_ascend(this_parent, seq); - if (!this_parent) + this_parent = child->d_parent; + + rcu_read_lock(); + spin_unlock(&child->d_lock); + spin_lock(&this_parent->d_lock); + + /* + * might go back up the wrong parent if we have had a rename + * or deletion + */ + if (this_parent != child->d_parent || + (child->d_flags & DCACHE_DENTRY_KILLED) || + need_seqretry(&rename_lock, seq)) { + spin_unlock(&this_parent->d_lock); + rcu_read_unlock(); goto rename_retry; + } + rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } -- cgit v1.2.3 From 9e3908e342eba6684621e616529669c17e271e7e Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 7 Nov 2013 15:43:28 -0600 Subject: xfs: fix unlock in xfs_bmap_add_attrfork xfs_trans_ijoin() activates the inode in a transaction and also can specify which lock to free when the transaction is committed or canceled. xfs_bmap_add_attrfork call locks and adds the lock to the transaction but also manually removes the lock. Change the routine to not add the lock to the transaction and manually remove lock on completion. While here, clean up the xfs_trans_cancel flags and goto names. Signed-off-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_bmap.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 1c02da8bb7df..3ef11b22e750 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1137,6 +1137,7 @@ xfs_bmap_add_attrfork( int committed; /* xaction was committed */ int logflags; /* logging flags */ int error; /* error return value */ + int cancel_flags = 0; ASSERT(XFS_IFORK_Q(ip) == 0); @@ -1147,19 +1148,20 @@ xfs_bmap_add_attrfork( if (rsvd) tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); - if (error) - goto error0; + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); - return error; - } + if (error) + goto trans_cancel; + cancel_flags |= XFS_TRANS_ABORT; if (XFS_IFORK_Q(ip)) - goto error1; + goto trans_cancel; if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { /* * For inodes coming from pre-6.2 filesystems. @@ -1169,7 +1171,7 @@ xfs_bmap_add_attrfork( } ASSERT(ip->i_d.di_anextents == 0); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); switch (ip->i_d.di_format) { @@ -1191,7 +1193,7 @@ xfs_bmap_add_attrfork( default: ASSERT(0); error = XFS_ERROR(EINVAL); - goto error1; + goto trans_cancel; } ASSERT(ip->i_afp == NULL); @@ -1219,7 +1221,7 @@ xfs_bmap_add_attrfork( if (logflags) xfs_trans_log_inode(tp, ip, logflags); if (error) - goto error2; + goto bmap_cancel; if (!xfs_sb_version_hasattr(&mp->m_sb) || (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { __int64_t sbfields = 0; @@ -1242,14 +1244,16 @@ xfs_bmap_add_attrfork( error = xfs_bmap_finish(&tp, &flist, &committed); if (error) - goto error2; - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); -error2: + goto bmap_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +bmap_cancel: xfs_bmap_cancel(&flist); -error1: +trans_cancel: + xfs_trans_cancel(tp, cancel_flags); xfs_iunlock(ip, XFS_ILOCK_EXCL); -error0: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); return error; } -- cgit v1.2.3 From 8f80587bacb6eb893df0ee4e35fefa0dfcfdf9f4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 1 Nov 2013 15:27:20 +1100 Subject: xfs: increase inode cluster size for v5 filesystems v5 filesystems use 512 byte inodes as a minimum, so read inodes in clusters that are effectively half the size of a v4 filesystem with 256 byte inodes. For v5 fielsystems, scale the inode cluster size with the size of the inode so that we keep a constant 32 inodes per cluster ratio for all inode IO. This only works if mkfs.xfs sets the inode alignment appropriately for larger inode clusters, so this functionality is made conditional on mkfs doing the right thing. xfs_repair needs to know about the inode alignment changes, too. Wall time: create bulkstat find+stat ls -R unlink v4 237s 161s 173s 201s 299s v5 235s 163s 205s 31s 356s patched 234s 160s 182s 29s 317s System time: create bulkstat find+stat ls -R unlink v4 2601s 2490s 1653s 1656s 2960s v5 2637s 2497s 1681s 20s 3216s patched 2613s 2451s 1658s 20s 3007s So, wall time same or down across the board, system time same or down across the board, and cache hit rates all improve except for the ls -R case which is a pure cold cache directory read workload on v5 filesystems... So, this patch removes most of the performance and CPU usage differential between v4 and v5 filesystems on traversal related workloads. Note: while this patch is currently for v5 filesystems only, there is no reason it can't be ported back to v4 filesystems. This hasn't been done here because bringing the code back to v4 requires forwards and backwards kernel compatibility testing. i.e. to deterine if older kernels(*) do the right thing with larger inode alignments but still only using 8k inode cluster sizes. None of this testing and validation on v4 filesystems has been done, so for the moment larger inode clusters is limited to v5 superblocks. (*) a current default config v4 filesystem should mount just fine on 2.6.23 (when lazy-count support was introduced), and so if we change the alignment emitted by mkfs without a feature bit then we have to make sure it works properly on all kernels since 2.6.23. And if we allow it to be changed when the lazy-count bit is not set, then it's all kernels since v2 logs were introduced that need to be tested for compatibility... Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Eric Sandeen Signed-off-by: Ben Myers --- fs/xfs/xfs_mount.c | 15 +++++++++++++++ fs/xfs/xfs_mount.h | 2 +- fs/xfs/xfs_trans_resv.c | 3 +-- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index da88f167af78..02df7b408a26 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -41,6 +41,7 @@ #include "xfs_fsops.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_dinode.h" #ifdef HAVE_PERCPU_SB @@ -718,8 +719,22 @@ xfs_mountfs( * Set the inode cluster size. * This may still be overridden by the file system * block size if it is larger than the chosen cluster size. + * + * For v5 filesystems, scale the cluster size with the inode size to + * keep a constant ratio of inode per cluster buffer, but only if mkfs + * has set the inode alignment value appropriately for larger cluster + * sizes. */ mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int new_size = mp->m_inode_cluster_size; + + new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; + if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) + mp->m_inode_cluster_size = new_size; + xfs_info(mp, "Using inode cluster size of %d bytes", + mp->m_inode_cluster_size); + } /* * Set inode alignment fields diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1d8101a10d8e..a466c5e5826e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -112,7 +112,7 @@ typedef struct xfs_mount { __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ __uint8_t m_agno_log; /* log #ag's */ __uint8_t m_agino_log; /* #bits for agino in inum */ - __uint16_t m_inode_cluster_size;/* min inode buf size */ + uint m_inode_cluster_size;/* min inode buf size */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ uint m_blockwmask; /* blockwsize-1 */ diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c index d53d9f0627a7..2fd59c0dae66 100644 --- a/fs/xfs/xfs_trans_resv.c +++ b/fs/xfs/xfs_trans_resv.c @@ -385,8 +385,7 @@ xfs_calc_ifree_reservation( xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + - MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), - XFS_INODE_CLUSTER_SIZE(mp)) + + max_t(uint, XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) + xfs_calc_buf_res(1, 0) + xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels, 0) + -- cgit v1.2.3 From 2fe8c1c08b3fbd87dd2641c8f032ff6e965d5803 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 1 Nov 2013 15:27:17 +1100 Subject: xfs: open code inc_inode_iversion when logging an inode Michael L Semon reported that generic/069 runtime increased on v5 superblocks by 100% compared to v4 superblocks. his perf-based analysis pointed directly at the timestamp updates being done by the write path in this workload. The append writers are doing 4-byte writes, so there are lots of timestamp updates occurring. The thing is, they aren't being triggered by timestamp changes - they are being triggered by the inode change counter needing to be updated. That is, every write(2) system call needs to bump the inode version count, and it does that through the timestamp update mechanism. Hence for v5 filesystems, test generic/069 is running 3 orders of magnitude more timestmap update transactions on v5 filesystems due to the fact it does a huge number of *4 byte* write(2) calls. This isn't a real world scenario we really need to address - anyone doing such sequential IO should be using fwrite(3), not write(2). i.e. fwrite(3) buffers the writes in userspace to minimise the number of write(2) syscalls, and the problem goes away. However, there is a small change we can make to improve the situation - removing the expensive lock operation on the change counter update. All inode version counter changes in XFS occur under the ip->i_ilock during a transaction, and therefore we don't actually need the spin lock that provides exclusive access to it through inc_inode_iversion(). Hence avoid the lock and just open code the increment ourselves when logging the inode. Reported-by: Michael L. Semon Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_trans_inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 1bba7f60d94c..50c3f5614288 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -111,12 +111,14 @@ xfs_trans_log_inode( /* * First time we log the inode in a transaction, bump the inode change - * counter if it is configured for this to occur. + * counter if it is configured for this to occur. We don't use + * inode_inc_version() because there is no need for extra locking around + * i_version as we already hold the inode locked exclusively for + * metadata modification. */ if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) && IS_I_VERSION(VFS_I(ip))) { - inode_inc_iversion(VFS_I(ip)); - ip->i_d.di_changecount = VFS_I(ip)->i_version; + ip->i_d.di_changecount = ++VFS_I(ip)->i_version; flags |= XFS_ILOG_CORE; } -- cgit v1.2.3 From 818e5a22e907fbae75e9c1fd78233baec9fa64b6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Nov 2013 05:07:30 -0800 Subject: nfsd: split up nfsd_setattr Split out two helpers to make the code more readable and easier to verify for correctness. Signed-off-by: Christoph Hellwig Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 144 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 84 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 94b5f5d2bfed..c3f57cf14a63 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -298,41 +298,12 @@ commit_metadata(struct svc_fh *fhp) } /* - * Set various file attributes. - * N.B. After this call fhp needs an fh_put + * Go over the attributes and take care of the small differences between + * NFS semantics and what Linux expects. */ -__be32 -nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, - int check_guard, time_t guardtime) +static void +nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) { - struct dentry *dentry; - struct inode *inode; - int accmode = NFSD_MAY_SATTR; - umode_t ftype = 0; - __be32 err; - int host_err; - int size_change = 0; - - if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) - accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; - if (iap->ia_valid & ATTR_SIZE) - ftype = S_IFREG; - - /* Get inode */ - err = fh_verify(rqstp, fhp, ftype, accmode); - if (err) - goto out; - - dentry = fhp->fh_dentry; - inode = dentry->d_inode; - - /* Ignore any mode updates on symlinks */ - if (S_ISLNK(inode->i_mode)) - iap->ia_valid &= ~ATTR_MODE; - - if (!iap->ia_valid) - goto out; - /* * NFSv2 does not differentiate between "set-[ac]time-to-now" * which only requires access, and "set-[ac]time-to-X" which @@ -342,8 +313,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, * convert to "set to now" instead of "set to explicit time" * * We only call inode_change_ok as the last test as technically - * it is not an interface that we should be using. It is only - * valid if the filesystem does not define it's own i_op->setattr. + * it is not an interface that we should be using. */ #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) #define MAX_TOUCH_TIME_ERROR (30*60) @@ -369,30 +339,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid &= ~BOTH_TIME_SET; } } - - /* - * The size case is special. - * It changes the file as well as the attributes. - */ - if (iap->ia_valid & ATTR_SIZE) { - if (iap->ia_size < inode->i_size) { - err = nfsd_permission(rqstp, fhp->fh_export, dentry, - NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); - if (err) - goto out; - } - - host_err = get_write_access(inode); - if (host_err) - goto out_nfserr; - - size_change = 1; - host_err = locks_verify_truncate(inode, NULL, iap->ia_size); - if (host_err) { - put_write_access(inode); - goto out_nfserr; - } - } /* sanitize the mode change */ if (iap->ia_valid & ATTR_MODE) { @@ -415,8 +361,86 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); } } +} - /* Change the attributes. */ +static __be32 +nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct iattr *iap) +{ + struct inode *inode = fhp->fh_dentry->d_inode; + int host_err; + + if (iap->ia_size < inode->i_size) { + __be32 err; + + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, + NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE); + if (err) + return err; + } + + host_err = get_write_access(inode); + if (host_err) + goto out_nfserrno; + + host_err = locks_verify_truncate(inode, NULL, iap->ia_size); + if (host_err) + goto out_put_write_access; + return 0; + +out_put_write_access: + put_write_access(inode); +out_nfserrno: + return nfserrno(host_err); +} + +/* + * Set various file attributes. After this call fhp needs an fh_put. + */ +__be32 +nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, + int check_guard, time_t guardtime) +{ + struct dentry *dentry; + struct inode *inode; + int accmode = NFSD_MAY_SATTR; + umode_t ftype = 0; + __be32 err; + int host_err; + int size_change = 0; + + if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) + accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; + if (iap->ia_valid & ATTR_SIZE) + ftype = S_IFREG; + + /* Get inode */ + err = fh_verify(rqstp, fhp, ftype, accmode); + if (err) + goto out; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + /* Ignore any mode updates on symlinks */ + if (S_ISLNK(inode->i_mode)) + iap->ia_valid &= ~ATTR_MODE; + + if (!iap->ia_valid) + goto out; + + nfsd_sanitize_attrs(inode, iap); + + /* + * The size case is special, it changes the file in addition to the + * attributes. + */ + if (iap->ia_valid & ATTR_SIZE) { + err = nfsd_get_write_access(rqstp, fhp, iap); + if (err) + goto out; + size_change = 1; + } iap->ia_valid |= ATTR_CTIME; -- cgit v1.2.3 From 987da4791052fa298b7cfcde4dea9f6f2bbc786b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Nov 2013 05:07:47 -0800 Subject: nfsd: make sure to balance get/put_write_access Use a straight goto error label style in nfsd_setattr to make sure we always do the put_write_access call after we got it earlier. Note that the we have been failing to do that in the case nfsd_break_lease() returns an error, a bug introduced into 2.6.38 with 6a76bebefe15d9a08864f824d7f8d5beaf37c997 "nfsd4: break lease on nfsd setattr". Signed-off-by: Christoph Hellwig Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c3f57cf14a63..7eea63cada1d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -444,27 +444,28 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid |= ATTR_CTIME; - err = nfserr_notsync; - if (!check_guard || guardtime == inode->i_ctime.tv_sec) { - host_err = nfsd_break_lease(inode); - if (host_err) - goto out_nfserr; - fh_lock(fhp); - - host_err = notify_change(dentry, iap, NULL); - err = nfserrno(host_err); - fh_unlock(fhp); + if (check_guard && guardtime != inode->i_ctime.tv_sec) { + err = nfserr_notsync; + goto out_put_write_access; } + + host_err = nfsd_break_lease(inode); + if (host_err) + goto out_put_write_access_nfserror; + + fh_lock(fhp); + host_err = notify_change(dentry, iap, NULL); + fh_unlock(fhp); + +out_put_write_access_nfserror: + err = nfserrno(host_err); +out_put_write_access: if (size_change) put_write_access(inode); if (!err) commit_metadata(fhp); out: return err; - -out_nfserr: - err = nfserrno(host_err); - goto out; } #if defined(CONFIG_NFSD_V2_ACL) || \ -- cgit v1.2.3 From 34f2fd8dfe6185b0eaaf7d661281713a6170b077 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 18 Nov 2013 22:11:42 +0900 Subject: bio: fix argument of __bio_add_page() for max_sectors > 0xffff The data type of max_sectors and max_hw_sectors in queue settings are unsigned int. But these values are passed to __bio_add_page() as an argument whose data type is unsigned short. In the worst case such as max_sectors is 0x10000, bio_add_page() can't add a page and IOs can't proceed. Cc: Jens Axboe Cc: Alexander Viro Signed-off-by: Akinobu Mita Signed-off-by: Jens Axboe --- fs/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 2bdb4e25ee77..33d79a4eb92d 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -601,7 +601,7 @@ EXPORT_SYMBOL(bio_get_nr_vecs); static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, - unsigned short max_sectors) + unsigned int max_sectors) { int retried_segments = 0; struct bio_vec *bvec; -- cgit v1.2.3 From 9bf0c9cd431440a831e60c0a0fd0bc4f0e083e7f Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 16 Nov 2013 18:05:28 -0600 Subject: CIFS: Fix SMB2/SMB3 Copy offload support (refcopy) for large files This third version of the patch, incorparating feedback from David Disseldorp extends the ability of copychunk (refcopy) over smb2/smb3 mounts to handle servers with smaller than usual maximum chunk sizes and also fixes it to handle files bigger than the maximum chunk sizes In the future this can be extended further to handle sending multiple chunk requests in on SMB2 ioctl request which will further improve performance, but even with one 1MB chunk per request the speedup on cp is quite large. Reviewed-by: David Disseldorp Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++-------- fs/cifs/smb2pdu.c | 9 ++++- 2 files changed, 93 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 11dde4b24f8a..a3968eeb6fac 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -532,7 +532,10 @@ smb2_clone_range(const unsigned int xid, int rc; unsigned int ret_data_len; struct copychunk_ioctl *pcchunk; - char *retbuf = NULL; + struct copychunk_ioctl_rsp *retbuf = NULL; + struct cifs_tcon *tcon; + int chunks_copied = 0; + bool chunk_sizes_updated = false; pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL); @@ -547,27 +550,96 @@ smb2_clone_range(const unsigned int xid, /* Note: request_res_key sets res_key null only if rc !=0 */ if (rc) - return rc; + goto cchunk_out; /* For now array only one chunk long, will make more flexible later */ pcchunk->ChunkCount = __constant_cpu_to_le32(1); pcchunk->Reserved = 0; - pcchunk->SourceOffset = cpu_to_le64(src_off); - pcchunk->TargetOffset = cpu_to_le64(dest_off); - pcchunk->Length = cpu_to_le32(len); pcchunk->Reserved2 = 0; - /* Request that server copy to target from src file identified by key */ - rc = SMB2_ioctl(xid, tlink_tcon(trgtfile->tlink), - trgtfile->fid.persistent_fid, - trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, - true /* is_fsctl */, (char *)pcchunk, - sizeof(struct copychunk_ioctl), &retbuf, &ret_data_len); + tcon = tlink_tcon(trgtfile->tlink); - /* BB need to special case rc = EINVAL to alter chunk size */ + while (len > 0) { + pcchunk->SourceOffset = cpu_to_le64(src_off); + pcchunk->TargetOffset = cpu_to_le64(dest_off); + pcchunk->Length = + cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk)); - cifs_dbg(FYI, "rc %d data length out %d\n", rc, ret_data_len); + /* Request server copy to target from src identified by key */ + rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, + trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, + true /* is_fsctl */, (char *)pcchunk, + sizeof(struct copychunk_ioctl), (char **)&retbuf, + &ret_data_len); + if (rc == 0) { + if (ret_data_len != + sizeof(struct copychunk_ioctl_rsp)) { + cifs_dbg(VFS, "invalid cchunk response size\n"); + rc = -EIO; + goto cchunk_out; + } + if (retbuf->TotalBytesWritten == 0) { + cifs_dbg(FYI, "no bytes copied\n"); + rc = -EIO; + goto cchunk_out; + } + /* + * Check if s