From a6d7cff472eea87d96899a20fa718d2bab7109f3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Mar 2018 12:10:17 -0700 Subject: fs/aio: Add explicit RCU grace period when freeing kioctx While fixing refcounting, e34ecee2ae79 ("aio: Fix a trinity splat") incorrectly removed explicit RCU grace period before freeing kioctx. The intention seems to be depending on the internal RCU grace periods of percpu_ref; however, percpu_ref uses a different flavor of RCU, sched-RCU. This can lead to kioctx being freed while RCU read protected dereferences are still in progress. Fix it by updating free_ioctx() to go through call_rcu() explicitly. v2: Comment added to explain double bouncing. Signed-off-by: Tejun Heo Reported-by: Jann Horn Fixes: e34ecee2ae79 ("aio: Fix a trinity splat") Cc: Kent Overstreet Cc: Linus Torvalds Cc: stable@vger.kernel.org # v3.13+ --- fs/aio.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index a062d75109cb..eb2e0cfbe2d2 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -115,7 +115,8 @@ struct kioctx { struct page **ring_pages; long nr_pages; - struct work_struct free_work; + struct rcu_head free_rcu; + struct work_struct free_work; /* see free_ioctx() */ /* * signals when all in-flight requests are done @@ -588,6 +589,12 @@ static int kiocb_cancel(struct aio_kiocb *kiocb) return cancel(&kiocb->common); } +/* + * free_ioctx() should be RCU delayed to synchronize against the RCU + * protected lookup_ioctx() and also needs process context to call + * aio_free_ring(), so the double bouncing through kioctx->free_rcu and + * ->free_work. + */ static void free_ioctx(struct work_struct *work) { struct kioctx *ctx = container_of(work, struct kioctx, free_work); @@ -601,6 +608,14 @@ static void free_ioctx(struct work_struct *work) kmem_cache_free(kioctx_cachep, ctx); } +static void free_ioctx_rcufn(struct rcu_head *head) +{ + struct kioctx *ctx = container_of(head, struct kioctx, free_rcu); + + INIT_WORK(&ctx->free_work, free_ioctx); + schedule_work(&ctx->free_work); +} + static void free_ioctx_reqs(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, reqs); @@ -609,8 +624,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref) if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count)) complete(&ctx->rq_wait->comp); - INIT_WORK(&ctx->free_work, free_ioctx); - schedule_work(&ctx->free_work); + /* Synchronize against RCU protected table->table[] dereferences */ + call_rcu(&ctx->free_rcu, free_ioctx_rcufn); } /* @@ -838,7 +853,7 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, table->table[ctx->id] = NULL; spin_unlock(&mm->ioctx_lock); - /* percpu_ref_kill() will do the necessary call_rcu() */ + /* free_ioctx_reqs() will do the necessary RCU synchronization */ wake_up_all(&ctx->wait); /* -- cgit v1.2.3 From d0264c01e7587001a8c4608a5d1818dba9a4c11a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Mar 2018 12:10:17 -0700 Subject: fs/aio: Use RCU accessors for kioctx_table->table[] While converting ioctx index from a list to a table, db446a08c23d ("aio: convert the ioctx list to table lookup v3") missed tagging kioctx_table->table[] as an array of RCU pointers and using the appropriate RCU accessors. This introduces a small window in the lookup path where init and access may race. Mark kioctx_table->table[] with __rcu and use the approriate RCU accessors when using the field. Signed-off-by: Tejun Heo Reported-by: Jann Horn Fixes: db446a08c23d ("aio: convert the ioctx list to table lookup v3") Cc: Benjamin LaHaise Cc: Linus Torvalds Cc: stable@vger.kernel.org # v3.12+ --- fs/aio.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index eb2e0cfbe2d2..6bcd3fb5265a 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -68,9 +68,9 @@ struct aio_ring { #define AIO_RING_PAGES 8 struct kioctx_table { - struct rcu_head rcu; - unsigned nr; - struct kioctx *table[]; + struct rcu_head rcu; + unsigned nr; + struct kioctx __rcu *table[]; }; struct kioctx_cpu { @@ -330,7 +330,7 @@ static int aio_ring_mremap(struct vm_area_struct *vma) for (i = 0; i < table->nr; i++) { struct kioctx *ctx; - ctx = table->table[i]; + ctx = rcu_dereference(table->table[i]); if (ctx && ctx->aio_ring_file == file) { if (!atomic_read(&ctx->dead)) { ctx->user_id = ctx->mmap_base = vma->vm_start; @@ -666,9 +666,9 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) while (1) { if (table) for (i = 0; i < table->nr; i++) - if (!table->table[i]) { + if (!rcu_access_pointer(table->table[i])) { ctx->id = i; - table->table[i] = ctx; + rcu_assign_pointer(table->table[i], ctx); spin_unlock(&mm->ioctx_lock); /* While kioctx setup is in progress, @@ -849,8 +849,8 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, } table = rcu_dereference_raw(mm->ioctx_table); - WARN_ON(ctx != table->table[ctx->id]); - table->table[ctx->id] = NULL; + WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id])); + RCU_INIT_POINTER(table->table[ctx->id], NULL); spin_unlock(&mm->ioctx_lock); /* free_ioctx_reqs() will do the necessary RCU synchronization */ @@ -895,7 +895,8 @@ void exit_aio(struct mm_struct *mm) skipped = 0; for (i = 0; i < table->nr; ++i) { - struct kioctx *ctx = table->table[i]; + struct kioctx *ctx = + rcu_dereference_protected(table->table[i], true); if (!ctx) { skipped++; @@ -1084,7 +1085,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) if (!table || id >= table->nr) goto out; - ctx = table->table[id]; + ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { percpu_ref_get(&ctx->users); ret = ctx; -- cgit v1.2.3 From 74b44bbe80b4c62113ac1501482ea1ee40eb9d67 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Mar 2018 12:10:18 -0700 Subject: RDMAVT: Fix synchronization around percpu_ref rvt_mregion uses percpu_ref for reference counting and RCU to protect accesses from lkey_table. When a rvt_mregion needs to be freed, it first gets unregistered from lkey_table and then rvt_check_refs() is called to wait for in-flight usages before the rvt_mregion is freed. rvt_check_refs() seems to have a couple issues. * It has a fast exit path which tests percpu_ref_is_zero(). However, a percpu_ref reading zero doesn't mean that the object can be released. In fact, the ->release() callback might not even have started executing yet. Proceeding with freeing can lead to use-after-free. * lkey_table is RCU protected but there is no RCU grace period in the free path. percpu_ref uses RCU internally but it's sched-RCU whose grace periods are different from regular RCU. Also, it generally isn't a good idea to depend on internal behaviors like this. To address the above issues, this patch removes the fast exit and adds an explicit synchronize_rcu(). Signed-off-by: Tejun Heo Acked-by: Dennis Dalessandro Cc: Mike Marciniszyn Cc: linux-rdma@vger.kernel.org Cc: Linus Torvalds --- drivers/infiniband/sw/rdmavt/mr.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 1b2e5362a3ff..cc429b567d0a 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -489,11 +489,13 @@ static int rvt_check_refs(struct rvt_mregion *mr, const char *t) unsigned long timeout; struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device); - if (percpu_ref_is_zero(&mr->refcount)) - return 0; - /* avoid dma mr */ - if (mr->lkey) + if (mr->lkey) { + /* avoid dma mr */ rvt_dereg_clean_qps(mr); + /* @mr was indexed on rcu protected @lkey_table */ + synchronize_rcu(); + } + timeout = wait_for_completion_timeout(&mr->comp, 5 * HZ); if (!timeout) { rvt_pr_err(rdi, -- cgit v1.2.3