summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-03-21 16:24:45 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-03-21 16:24:45 -0700
commitaf472a9efdf65cbb3398cb6478ec0e89fbc84109 (patch)
tree11ec956e35851d6b579cbab5c555f70598adc52c
parent93e220a62da36f766b3188e76e234607e41488f9 (diff)
parent5e929367468c8f97cd1ffb0417316cecfebef94b (diff)
downloadlinux-af472a9efdf65cbb3398cb6478ec0e89fbc84109.tar.gz
linux-af472a9efdf65cbb3398cb6478ec0e89fbc84109.tar.bz2
linux-af472a9efdf65cbb3398cb6478ec0e89fbc84109.zip
Merge tag 'for-5.18/io_uring-2022-03-18' of git://git.kernel.dk/linux-block
Pull io_uring updates from Jens Axboe: - Fixes for current file position. Still doesn't have the f_pos_lock sorted, but it's a step in the right direction (Dylan) - Tracing updates (Dylan, Stefan) - Improvements to io-wq locking (Hao) - Improvements for provided buffers (me, Pavel) - Support for registered file descriptors (me, Xiaoguang) - Support for ring messages (me) - Poll improvements (me) - Fix for fixed buffers and non-iterator reads/writes (me) - Support for NAPI on sockets (Olivier) - Ring quiesce improvements (Usama) - Misc fixes (Olivier, Pavel) * tag 'for-5.18/io_uring-2022-03-18' of git://git.kernel.dk/linux-block: (42 commits) io_uring: terminate manual loop iterator loop correctly for non-vecs io_uring: don't check unrelated req->open.how in accept request io_uring: manage provided buffers strictly ordered io_uring: fold evfd signalling under a slower path io_uring: thin down io_commit_cqring() io_uring: shuffle io_eventfd_signal() bits around io_uring: remove extra barrier for non-sqpoll iopoll io_uring: fix provided buffer return on failure for kiocb_done() io_uring: extend provided buf return to fails io_uring: refactor timeout cancellation cqe posting io_uring: normilise naming for fill_cqe* io_uring: cache poll/double-poll state with a request flag io_uring: cache req->apoll->events in req->cflags io_uring: move req->poll_refs into previous struct hole io_uring: make tracing format consistent io_uring: recycle apoll_poll entries io_uring: remove duplicated member check for io_msg_ring_prep() io_uring: allow submissions to continue on error io_uring: recycle provided buffers if request goes async io_uring: ensure reads re-import for selected buffers ...
-rw-r--r--fs/io-wq.c114
-rw-r--r--fs/io_uring.c1251
-rw-r--r--include/linux/io_uring.h5
-rw-r--r--include/trace/events/io_uring.h333
-rw-r--r--include/uapi/linux/io_uring.h17
5 files changed, 1200 insertions, 520 deletions
diff --git a/fs/io-wq.c b/fs/io-wq.c
index bb7f161bb19c..5b93fa67d346 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -76,6 +76,7 @@ struct io_wqe_acct {
unsigned max_workers;
int index;
atomic_t nr_running;
+ raw_spinlock_t lock;
struct io_wq_work_list work_list;
unsigned long flags;
};
@@ -91,7 +92,7 @@ enum {
*/
struct io_wqe {
raw_spinlock_t lock;
- struct io_wqe_acct acct[2];
+ struct io_wqe_acct acct[IO_WQ_ACCT_NR];
int node;
@@ -224,12 +225,12 @@ static void io_worker_exit(struct io_worker *worker)
if (worker->flags & IO_WORKER_F_FREE)
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
- preempt_disable();
+ raw_spin_unlock(&wqe->lock);
io_wqe_dec_running(worker);
worker->flags = 0;
+ preempt_disable();
current->flags &= ~PF_IO_WORKER;
preempt_enable();
- raw_spin_unlock(&wqe->lock);
kfree_rcu(worker, rcu);
io_worker_ref_put(wqe->wq);
@@ -238,10 +239,15 @@ static void io_worker_exit(struct io_worker *worker)
static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
{
+ bool ret = false;
+
+ raw_spin_lock(&acct->lock);
if (!wq_list_empty(&acct->work_list) &&
!test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
- return true;
- return false;
+ ret = true;
+ raw_spin_unlock(&acct->lock);
+
+ return ret;
}
/*
@@ -385,7 +391,6 @@ fail:
}
static void io_wqe_dec_running(struct io_worker *worker)
- __must_hold(wqe->lock)
{
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
@@ -393,13 +398,14 @@ static void io_wqe_dec_running(struct io_worker *worker)
if (!(worker->flags & IO_WORKER_F_UP))
return;
- if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) {
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- raw_spin_unlock(&wqe->lock);
- io_queue_worker_create(worker, acct, create_worker_cb);
- raw_spin_lock(&wqe->lock);
- }
+ if (!atomic_dec_and_test(&acct->nr_running))
+ return;
+ if (!io_acct_run_queue(acct))
+ return;
+
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
+ io_queue_worker_create(worker, acct, create_worker_cb);
}
/*
@@ -407,11 +413,12 @@ static void io_wqe_dec_running(struct io_worker *worker)
* it's currently on the freelist
*/
static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker)
- __must_hold(wqe->lock)
{
if (worker->flags & IO_WORKER_F_FREE) {
worker->flags &= ~IO_WORKER_F_FREE;
+ raw_spin_lock(&wqe->lock);
hlist_nulls_del_init_rcu(&worker->nulls_node);
+ raw_spin_unlock(&wqe->lock);
}
}
@@ -456,7 +463,7 @@ static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
struct io_worker *worker)
- __must_hold(wqe->lock)
+ __must_hold(acct->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work, *tail;
@@ -498,9 +505,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
* work being added and clearing the stalled bit.
*/
set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- raw_spin_unlock(&wqe->lock);
+ raw_spin_unlock(&acct->lock);
unstalled = io_wait_on_hash(wqe, stall_hash);
- raw_spin_lock(&wqe->lock);
+ raw_spin_lock(&acct->lock);
if (unstalled) {
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
if (wq_has_sleeper(&wqe->wq->hash->wait))
@@ -538,7 +545,6 @@ static void io_assign_current_work(struct io_worker *worker,
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
static void io_worker_handle_work(struct io_worker *worker)
- __releases(wqe->lock)
{
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
@@ -555,7 +561,9 @@ static void io_worker_handle_work(struct io_worker *worker)
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
+ raw_spin_lock(&acct->lock);
work = io_get_next_work(acct, worker);
+ raw_spin_unlock(&acct->lock);
if (work) {
__io_worker_busy(wqe, worker);
@@ -569,10 +577,9 @@ static void io_worker_handle_work(struct io_worker *worker)
raw_spin_lock(&worker->lock);
worker->next_work = work;
raw_spin_unlock(&worker->lock);
- }
- raw_spin_unlock(&wqe->lock);
- if (!work)
+ } else {
break;
+ }
io_assign_current_work(worker, work);
__set_current_state(TASK_RUNNING);
@@ -608,8 +615,6 @@ static void io_worker_handle_work(struct io_worker *worker)
wake_up(&wq->hash->wait);
}
} while (work);
-
- raw_spin_lock(&wqe->lock);
} while (1);
}
@@ -633,12 +638,10 @@ static int io_wqe_worker(void *data)
long ret;
set_current_state(TASK_INTERRUPTIBLE);
-loop:
- raw_spin_lock(&wqe->lock);
- if (io_acct_run_queue(acct)) {
+ while (io_acct_run_queue(acct))
io_worker_handle_work(worker);
- goto loop;
- }
+
+ raw_spin_lock(&wqe->lock);
/* timed out, exit unless we're the last worker */
if (last_timeout && acct->nr_workers > 1) {
acct->nr_workers--;
@@ -662,10 +665,8 @@ loop:
last_timeout = !ret;
}
- if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- raw_spin_lock(&wqe->lock);
+ if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
io_worker_handle_work(worker);
- }
audit_free(current);
io_worker_exit(worker);
@@ -705,10 +706,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
return;
worker->flags &= ~IO_WORKER_F_RUNNING;
-
- raw_spin_lock(&worker->wqe->lock);
io_wqe_dec_running(worker);
- raw_spin_unlock(&worker->wqe->lock);
}
static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker,
@@ -778,10 +776,12 @@ static void create_worker_cont(struct callback_head *cb)
.cancel_all = true,
};
+ raw_spin_unlock(&wqe->lock);
while (io_acct_cancel_pending_work(wqe, acct, &match))
- raw_spin_lock(&wqe->lock);
+ ;
+ } else {
+ raw_spin_unlock(&wqe->lock);
}
- raw_spin_unlock(&wqe->lock);
io_worker_ref_put(wqe->wq);
kfree(worker);
return;
@@ -914,6 +914,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
+ struct io_cb_cancel_data match;
unsigned work_flags = work->flags;
bool do_create;
@@ -927,10 +928,12 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
- raw_spin_lock(&wqe->lock);
+ raw_spin_lock(&acct->lock);
io_wqe_insert_work(wqe, work);
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
+ raw_spin_unlock(&acct->lock);
+ raw_spin_lock(&wqe->lock);
rcu_read_lock();
do_create = !io_wqe_activate_free_worker(wqe, acct);
rcu_read_unlock();
@@ -946,18 +949,18 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
raw_spin_lock(&wqe->lock);
- /* fatal condition, failed to create the first worker */
- if (!acct->nr_workers) {
- struct io_cb_cancel_data match = {
- .fn = io_wq_work_match_item,
- .data = work,
- .cancel_all = false,
- };
-
- if (io_acct_cancel_pending_work(wqe, acct, &match))
- raw_spin_lock(&wqe->lock);
+ if (acct->nr_workers) {
+ raw_spin_unlock(&wqe->lock);
+ return;
}
raw_spin_unlock(&wqe->lock);
+
+ /* fatal condition, failed to create the first worker */
+ match.fn = io_wq_work_match_item,
+ match.data = work,
+ match.cancel_all = false,
+
+ io_acct_cancel_pending_work(wqe, acct, &match);
}
}
@@ -1032,22 +1035,23 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
struct io_wqe_acct *acct,
struct io_cb_cancel_data *match)
- __releases(wqe->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
+ raw_spin_lock(&acct->lock);
wq_list_for_each(node, prev, &acct->work_list) {
work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data))
continue;
io_wqe_remove_pending(wqe, work, prev);
- raw_spin_unlock(&wqe->lock);
+ raw_spin_unlock(&acct->lock);
io_run_cancel(work, wqe);
match->nr_pending++;
/* not safe to continue after unlock */
return true;
}
+ raw_spin_unlock(&acct->lock);
return false;
}
@@ -1061,7 +1065,6 @@ retry:
struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
if (io_acct_cancel_pending_work(wqe, acct, match)) {
- raw_spin_lock(&wqe->lock);
if (match->cancel_all)
goto retry;
break;
@@ -1103,13 +1106,11 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
- raw_spin_lock(&wqe->lock);
io_wqe_cancel_pending_work(wqe, &match);
- if (match.nr_pending && !match.cancel_all) {
- raw_spin_unlock(&wqe->lock);
+ if (match.nr_pending && !match.cancel_all)
return IO_WQ_CANCEL_OK;
- }
+ raw_spin_lock(&wqe->lock);
io_wqe_cancel_running_work(wqe, &match);
raw_spin_unlock(&wqe->lock);
if (match.nr_running && !match.cancel_all)
@@ -1190,6 +1191,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
acct->index = i;
atomic_set(&acct->nr_running, 0);
INIT_WQ_LIST(&acct->work_list);
+ raw_spin_lock_init(&acct->lock);
}
wqe->wq = wq;
raw_spin_lock_init(&wqe->lock);
@@ -1282,9 +1284,7 @@ static void io_wq_destroy(struct io_wq *wq)
.fn = io_wq_work_match_all,
.cancel_all = true,
};
- raw_spin_lock(&wqe->lock);
io_wqe_cancel_pending_work(wqe, &match);
- raw_spin_unlock(&wqe->lock);
free_cpumask_var(wqe->cpu_mask);
kfree(wqe);
}
@@ -1376,7 +1376,7 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count)
BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2);
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < IO_WQ_ACCT_NR; i++) {
if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
new_count[i] = task_rlimit(current, RLIMIT_NPROC);
}
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4715980e9015..5fa736344b67 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -63,6 +63,7 @@
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
+#include <net/busy_poll.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
@@ -263,11 +264,18 @@ struct io_rsrc_data {
bool quiesce;
};
+struct io_buffer_list {
+ struct list_head list;
+ struct list_head buf_list;
+ __u16 bgid;
+};
+
struct io_buffer {
struct list_head list;
__u64 addr;
__u32 len;
__u16 bid;
+ __u16 bgid;
};
struct io_restriction {
@@ -326,6 +334,14 @@ struct io_submit_state {
struct blk_plug plug;
};
+struct io_ev_fd {
+ struct eventfd_ctx *cq_ev_fd;
+ unsigned int eventfd_async: 1;
+ struct rcu_head rcu;
+};
+
+#define IO_BUFFERS_HASH_BITS 5
+
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
@@ -335,11 +351,11 @@ struct io_ring_ctx {
unsigned int flags;
unsigned int compat: 1;
unsigned int drain_next: 1;
- unsigned int eventfd_async: 1;
unsigned int restricted: 1;
unsigned int off_timeout_used: 1;
unsigned int drain_active: 1;
unsigned int drain_disabled: 1;
+ unsigned int has_evfd: 1;
} ____cacheline_aligned_in_smp;
/* submission data */
@@ -378,7 +394,9 @@ struct io_ring_ctx {
struct list_head timeout_list;
struct list_head ltimeout_list;
struct list_head cq_overflow_list;
- struct xarray io_buffers;
+ struct list_head *io_buffers;
+ struct list_head io_buffers_cache;
+ struct list_head apoll_cache;
struct xarray personalities;
u32 pers_next;
unsigned sq_thread_idle;
@@ -395,11 +413,16 @@ struct io_ring_ctx {
struct list_head sqd_list;
unsigned long check_cq_overflow;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ /* used to track busy poll napi_id */
+ struct list_head napi_list;
+ spinlock_t napi_lock; /* napi_list lock */
+#endif
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
- struct eventfd_ctx *cq_ev_fd;
+ struct io_ev_fd __rcu *io_ev_fd;
struct wait_queue_head cq_wait;
unsigned cq_extra;
atomic_t cq_timeouts;
@@ -421,6 +444,8 @@ struct io_ring_ctx {
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_queue;
+
+ struct list_head io_buffers_comp;
} ____cacheline_aligned_in_smp;
struct io_restriction restrictions;
@@ -436,6 +461,8 @@ struct io_ring_ctx {
struct llist_head rsrc_put_llist;
struct list_head rsrc_ref_list;
spinlock_t rsrc_ref_lock;
+
+ struct list_head io_buffers_pages;
};
/* Keep this last, we don't need it for the fast path */
@@ -461,6 +488,11 @@ struct io_ring_ctx {
};
};
+/*
+ * Arbitrary limit, can be raised if need be
+ */
+#define IO_RINGFD_REG_MAX 16
+
struct io_uring_task {
/* submission side */
int cached_refs;
@@ -476,6 +508,7 @@ struct io_uring_task {
struct io_wq_work_list task_list;
struct io_wq_work_list prior_task_list;
struct callback_head task_work;
+ struct file **registered_rings;
bool task_running;
};
@@ -690,6 +723,12 @@ struct io_hardlink {
int flags;
};
+struct io_msg {
+ struct file *file;
+ u64 user_data;
+ u32 len;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -741,6 +780,8 @@ enum {
REQ_F_ARM_LTIMEOUT_BIT,
REQ_F_ASYNC_DATA_BIT,
REQ_F_SKIP_LINK_CQES_BIT,
+ REQ_F_SINGLE_POLL_BIT,
+ REQ_F_DOUBLE_POLL_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
@@ -799,6 +840,10 @@ enum {
REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
/* don't post CQEs while failing linked requests */
REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
+ /* single poll may be active */
+ REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
+ /* double poll may active */
+ REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
};
struct async_poll {
@@ -825,7 +870,7 @@ enum {
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
* access the file pointer through any of the sub-structs,
- * or directly as just 'ki_filp' in this struct.
+ * or directly as just 'file' in this struct.
*/
struct io_kiocb {
union {
@@ -855,6 +900,7 @@ struct io_kiocb {
struct io_mkdir mkdir;
struct io_symlink symlink;
struct io_hardlink hardlink;
+ struct io_msg msg;
};
u8 opcode;
@@ -877,6 +923,7 @@ struct io_kiocb {
/* used by request caches, completion batching and iopoll */
struct io_wq_work_node comp_list;
atomic_t refs;
+ atomic_t poll_refs;
struct io_kiocb *link;
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
@@ -885,12 +932,11 @@ struct io_kiocb {
struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */
void *async_data;
- struct io_wq_work work;
/* custom credentials, valid IFF REQ_F_CREDS is set */
- const struct cred *creds;
/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
struct io_buffer *kbuf;
- atomic_t poll_refs;
+ const struct cred *creds;
+ struct io_wq_work work;
};
struct io_tctx_node {
@@ -1105,6 +1151,9 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_MKDIRAT] = {},
[IORING_OP_SYMLINKAT] = {},
[IORING_OP_LINKAT] = {},
+ [IORING_OP_MSG_RING] = {
+ .needs_file = 1,
+ },
};
/* requests with any of those set should undergo io_disarm_next() */
@@ -1141,6 +1190,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
+static void io_eventfd_signal(struct io_ring_ctx *ctx);
static struct kmem_cache *req_cachep;
@@ -1267,36 +1317,88 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
}
}
-static unsigned int __io_put_kbuf(struct io_kiocb *req)
+static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
{
struct io_buffer *kbuf = req->kbuf;
unsigned int cflags;
- cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
- cflags |= IORING_CQE_F_BUFFER;
+ cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
req->flags &= ~REQ_F_BUFFER_SELECTED;
- kfree(kbuf);
+ list_add(&kbuf->list, list);
req->kbuf = NULL;
return cflags;
}
-static inline unsigned int io_put_kbuf(struct io_kiocb *req)
+static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
+{
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return 0;
+ return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
+}
+
+static inline unsigned int io_put_kbuf(struct io_kiocb *req,
+ unsigned issue_flags)
{
+ unsigned int cflags;
+
if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
return 0;
- return __io_put_kbuf(req);
+
+ /*
+ * We can add this buffer back to two lists:
+ *
+ * 1) The io_buffers_cache list. This one is protected by the
+ * ctx->uring_lock. If we already hold this lock, add back to this
+ * list as we can grab it from issue as well.
+ * 2) The io_buffers_comp list. This one is protected by the
+ * ctx->completion_lock.
+ *
+ * We migrate buffers from the comp_list to the issue cache list
+ * when we need one.
+ */
+ if (issue_flags & IO_URING_F_UNLOCKED) {
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock(&ctx->completion_lock);
+ cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
+ spin_unlock(&ctx->completion_lock);
+ } else {
+ cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
+ }
+
+ return cflags;
}
-static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
+static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
+ unsigned int bgid)
{
- bool got = percpu_ref_tryget(ref);
+ struct list_head *hash_list;
+ struct io_buffer_list *bl;
- /* already at zero, wait for ->release() */
- if (!got)
- wait_for_completion(compl);
- percpu_ref_resurrect(ref);
- if (got)
- percpu_ref_put(ref);
+ hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
+ list_for_each_entry(bl, hash_list, list)
+ if (bl->bgid == bgid || bgid == -1U)
+ return bl;
+
+ return NULL;
+}
+
+static void io_kbuf_recycle(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ struct io_buffer *buf;
+
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ buf = req->kbuf;
+ bl = io_buffer_get_list(ctx, buf->bgid);
+ list_add(&buf->list, &bl->buf_list);
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ req->kbuf = NULL;
}
static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
@@ -1409,7 +1511,7 @@ static __cold void io_fallback_req_func(struct work_struct *work)
static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
- int hash_bits;
+ int i, hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
@@ -1436,6 +1538,13 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
/* set invalid range, so io_import_fixed() fails meeting it */
ctx->dummy_ubuf->ubuf = -1UL;
+ ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
+ sizeof(struct list_head), GFP_KERNEL);
+ if (!ctx->io_buffers)
+ goto err;
+ for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
+ INIT_LIST_HEAD(&ctx->io_buffers[i]);
+
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err;
@@ -1444,14 +1553,17 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
+ INIT_LIST_HEAD(&ctx->io_buffers_cache);
+ INIT_LIST_HEAD(&ctx->apoll_cache);
init_completion(&ctx->ref_comp);
- xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->cq_wait);
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
INIT_WQ_LIST(&ctx->iopoll_list);
+ INIT_LIST_HEAD(&ctx->io_buffers_pages);
+ INIT_LIST_HEAD(&ctx->io_buffers_comp);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
INIT_LIST_HEAD(&ctx->ltimeout_list);
@@ -1464,10 +1576,15 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_WQ_LIST(&ctx->locked_free_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ INIT_LIST_HEAD(&ctx->napi_list);
+ spin_lock_init(&ctx->napi_lock);
+#endif
return ctx;
err:
kfree(ctx->dummy_ubuf);
kfree(ctx->cancel_hash);
+ kfree(ctx->io_buffers);
kfree(ctx);
return NULL;
}
@@ -1610,8 +1727,8 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
req->work.flags |= IO_WQ_WORK_CANCEL;
- trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
- &req->work, req->flags);
+ trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags,
+ &req->work, io_wq_is_hashed(&req->work));
io_wq_enqueue(tctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
@@ -1681,22 +1798,27 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->timeout_lock);
}
-static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
-{
- if (ctx->off_timeout_used)
- io_flush_timeouts(ctx);
- if (ctx->drain_active)
- io_queue_deferred(ctx);
-}
-
static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active))
- __io_commit_cqring_flush(ctx);
/* order cqe stores with ring update */
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}
+static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
+{
+ if (ctx->off_timeout_used || ctx->drain_active) {
+ spin_lock(&ctx->completion_lock);
+ if (ctx->off_timeout_used)
+ io_flush_timeouts(ctx);
+ if (ctx->drain_active)
+ io_queue_deferred(ctx);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ }
+ if (ctx->has_evfd)
+ io_eventfd_signal(ctx);
+}
+
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
struct io_rings *r = ctx->rings;
@@ -1726,23 +1848,34 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
return &rings->cqes[tail & mask];
}
-static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
+static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
- if (likely(!ctx->cq_ev_fd))
- return false;
+ struct io_ev_fd *ev_fd;
+
+ rcu_read_lock();
+ /*
+ * rcu_dereference ctx->io_ev_fd once and use it for both for checking
+ * and eventfd_signal
+ */
+ ev_fd = rcu_dereference(ctx->io_ev_fd);
+
+ /*
+ * Check again if ev_fd exists incase an io_eventfd_unregister call
+ * completed between the NULL check of ctx->io_ev_fd at the start of
+ * the function and rcu_read_lock.
+ */
+ if (unlikely(!ev_fd))
+ goto out;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- return false;
- return !ctx->eventfd_async || io_wq_current_is_worker();
+ goto out;
+
+ if (!ev_fd->eventfd_async || io_wq_current_is_worker())
+ eventfd_signal(ev_fd->cq_ev_fd, 1);
+out:
+ rcu_read_unlock();
}
-/*
- * This should only get called when at least one event has been posted.
- * Some applications rely on the eventfd notification count only changing
- * IFF a new CQE has been added to the CQ ring. There's no depedency on
- * 1:1 relationship between how many times this function is called (and
- * hence the eventfd count) and number of CQEs posted to the CQ ring.
- */
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
/*
* wake_up_all() may seem excessive, but io_wake_function() and
@@ -1751,21 +1884,32 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
*/
if (wq_has_sleeper(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
+}
+
+/*
+ * This should only get called when at least one event has been posted.
+ * Some applications rely on the eventfd notification count only changing
+ * IFF a new CQE has been added to the CQ ring. There's no depedency on
+ * 1:1 relationship between how many times this function is called (and
+ * hence the eventfd count) and number of CQEs posted to the CQ ring.
+ */
+static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ ctx->has_evfd))
+ __io_commit_cqring_flush(ctx);
+
+ io_cqring_wake(ctx);
}
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
- /* see waitqueue_active() comment */
- smp_mb();
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ ctx->has_evfd))
+ __io_commit_cqring_flush(ctx);
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (waitqueue_active(&ctx->cq_wait))
- wake_up_all(&ctx->cq_wait);
- }
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ io_cqring_wake(ctx);
}
/* Returns true if there are no backlogged entries after the flush */
@@ -1905,8 +2049,6 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
{
struct io_uring_cqe *cqe;
- trace_io_uring_complete(ctx, user_data, res, cflags);
-
/*
* If we can't get a cq entry, userspace overflowed the
* submission (by quite a lot). Increment the overflow count in
@@ -1922,16 +2064,23 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
return io_cqring_event_overflow(ctx, user_data, res, cflags);
}
+static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
+{
+ trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
+ return __io_fill_cqe(req->ctx, req->user_data, res, cflags);
+}
+
static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
{
if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe(req->ctx, req->user_data, res, cflags);
+ __io_fill_cqe_req(req, res, cflags);
}
static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
s32 res, u32 cflags)
{
ctx->cq_extra++;
+ trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
return __io_fill_cqe(ctx, user_data, res, cflags);
}
@@ -1941,7 +2090,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
struct io_ring_ctx *ctx = req->ctx;
if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe(ctx, req->user_data, res, cflags);
+ __io_fill_cqe_req(req, res, cflags);
/*
* If we're the last reference to this request, add to our locked
* free_list cache.
@@ -2000,7 +2149,7 @@ static inline void io_req_complete(struct io_kiocb *req, s32 res)
static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
req_set_fail(req);
- io_req_complete_post(req, res, 0);
+ io_req_complete_post(req, res, io_put_kbuf(req, 0));
}
static void io_req_complete_fail_submit(struct io_kiocb *req)
@@ -2183,7 +2332,9 @@ static void io_fail_links(struct io_kiocb *req)
nxt = link->link;
link->link = NULL;
- trace_io_uring_fail_link(req, link);
+ trace_io_uring_fail_link(req->ctx, req, req->user_data,
+ req->opcode, link);
+
if (!ignore_cqes) {
link->flags &= ~REQ_F_CQE_SKIP;
io_fill_cqe_req(link, res, 0);
@@ -2302,7 +2453,8 @@ static void handle_prev_tw_list(struct io_wq_work_node *node,
if (likely(*uring_locked))
req->io_task_work.func(req, uring_locked);
else
- __io_req_complete_post(req, req->result, io_put_kbuf(req));
+ __io_req_complete_post(req, req->result,
+ io_put_kbuf_comp(req));
node = next;
} while (node);
@@ -2530,8 +2682,16 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
comp_list);
if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe(ctx, req->user_data, req->result,
- req->cflags);
+ __io_fill_cqe_req(req, req->result, req->cflags);
+ if ((req->flags & REQ_F_POLLED) && req->apoll) {
+ struct async_poll *apoll = req->apoll;
+
+ if (apoll->double_poll)
+ kfree(apoll->double_poll);
+ list_add(&apoll->poll.wait.entry,
+ &ctx->apoll_cache);
+ req->flags &= ~REQ_F_POLLED;
+ }
}
io_commit_cqring(ctx);
@@ -2653,7 +2813,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
if (unlikely(req->flags & REQ_F_CQE_SKIP))
continue;
- __io_fill_cqe(ctx, req->user_data, req->result, io_put_kbuf(req));
+ __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0));
nr_events++;
}
@@ -2829,14 +2989,14 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
- unsigned int cflags = io_put_kbuf(req);
int res = req->result;
if (*locked) {
- io_req_complete_state(req, res, cflags);
+ io_req_complete_state(req, res, io_put_kbuf(req, 0));
io_req_add_compl_list(req);
} else {
- io_req_complete_post(req, res, cflags);
+ io_req_complete_post(req, res,
+ io_put_kbuf(req, IO_URING_F_UNLOCKED));
}
}
@@ -2845,7 +3005,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res,
{
if (__io_complete_rw_common(req, res))
return;
- __io_req_complete(req, issue_flags, req->result, io_put_kbuf(req));
+ __io_req_complete(req, issue_flags, req->result,
+ io_put_kbuf(req, issue_flags));
}
static void io_complete_rw(struct kiocb *kiocb, long res)
@@ -3000,14 +3161,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
kiocb->ki_pos = READ_ONCE(sqe->off);
- if (kiocb->ki_pos == -1) {
- if (!(file->f_mode & FMODE_STREAM)) {
- req->flags |= REQ_F_CUR_POS;
- kiocb->ki_pos = file->f_pos;
- } else {
- kiocb->ki_pos = 0;
- }
- }
kiocb->ki_flags = iocb_flags(file);
ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));