summaryrefslogtreecommitdiff
path: root/fs/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--fs/io_uring.c1144
1 files changed, 791 insertions, 353 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index fdd497f92a6e..e4c9f776919b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -95,7 +95,7 @@
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
/* only define max */
-#define IORING_MAX_FIXED_FILES (1U << 15)
+#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -117,6 +117,8 @@
#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
IO_REQ_CLEAN_FLAGS)
+#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
+
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
struct io_uring {
@@ -170,7 +172,7 @@ struct io_rings {
* The application needs a full memory barrier before checking
* for IORING_SQ_NEED_WAKEUP after updating the sq tail.
*/
- u32 sq_flags;
+ atomic_t sq_flags;
/*
* Runtime CQ flags
*
@@ -258,6 +260,8 @@ struct io_rsrc_put {
struct io_file_table {
struct io_fixed_file *files;
+ unsigned long *bitmap;
+ unsigned int alloc_hint;
};
struct io_rsrc_node {
@@ -282,10 +286,26 @@ struct io_rsrc_data {
bool quiesce;
};
+#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
struct io_buffer_list {
- struct list_head list;
- struct list_head buf_list;
+ /*
+ * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
+ * then these are classic provided buffers and ->buf_list is used.
+ */
+ union {
+ struct list_head buf_list;
+ struct {
+ struct page **buf_pages;
+ struct io_uring_buf_ring *buf_ring;
+ };
+ };
__u16 bgid;
+
+ /* below is for ring provided buffers */
+ __u16 buf_nr_pages;
+ __u16 nr_entries;
+ __u32 head;
+ __u32 mask;
};
struct io_buffer {
@@ -358,7 +378,7 @@ struct io_ev_fd {
struct rcu_head rcu;
};
-#define IO_BUFFERS_HASH_BITS 5
+#define BGID_ARRAY 64
struct io_ring_ctx {
/* const or read-mostly hot data */
@@ -367,6 +387,7 @@ struct io_ring_ctx {
struct io_rings *rings;
unsigned int flags;
+ enum task_work_notify_mode notify_method;
unsigned int compat: 1;
unsigned int drain_next: 1;
unsigned int restricted: 1;
@@ -411,11 +432,14 @@ struct io_ring_ctx {
struct io_mapped_ubuf **user_bufs;
struct io_submit_state submit_state;
+
+ struct io_buffer_list *io_bl;
+ struct xarray io_bl_xa;
+ struct list_head io_buffers_cache;
+
struct list_head timeout_list;
struct list_head ltimeout_list;
struct list_head cq_overflow_list;
- struct list_head *io_buffers;
- struct list_head io_buffers_cache;
struct list_head apoll_cache;
struct xarray personalities;
u32 pers_next;
@@ -617,7 +641,7 @@ struct io_rw {
struct kiocb kiocb;
u64 addr;
u32 len;
- u32 flags;
+ rwf_t flags;
};
struct io_connect {
@@ -634,9 +658,9 @@ struct io_sr_msg {
void __user *buf;
};
int msg_flags;
- int bgid;
size_t len;
size_t done_io;
+ unsigned int flags;
};
struct io_open {
@@ -804,6 +828,7 @@ enum {
REQ_F_NEED_CLEANUP_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
+ REQ_F_BUFFER_RING_BIT,
REQ_F_COMPLETE_INLINE_BIT,
REQ_F_REISSUE_BIT,
REQ_F_CREDS_BIT,
@@ -814,6 +839,7 @@ enum {
REQ_F_SINGLE_POLL_BIT,
REQ_F_DOUBLE_POLL_BIT,
REQ_F_PARTIAL_IO_BIT,
+ REQ_F_APOLL_MULTISHOT_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
@@ -854,6 +880,8 @@ enum {
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
/* buffer already selected */
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
+ /* buffer selected from ring, needs commit */
+ REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),
/* completion is deferred through io_comp_state */
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
/* caller should reissue async */
@@ -878,6 +906,8 @@ enum {
REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
/* request has already done partial IO */
REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
+ /* fast poll multishot mode */
+ REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
};
struct async_poll {
@@ -956,6 +986,11 @@ struct io_kiocb {
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
+ /*
+ * Can be either a fixed buffer index, or used with provided buffers.
+ * For the latter, before issue it points to the buffer group ID,
+ * and after selection it points to the buffer ID itself.
+ */
u16 buf_index;
unsigned int flags;
@@ -965,14 +1000,26 @@ struct io_kiocb {
struct task_struct *task;
struct io_rsrc_node *rsrc_node;
- /* store used ubuf, so we can prevent reloading */
- struct io_mapped_ubuf *imu;
+
+ union {
+ /* store used ubuf, so we can prevent reloading */
+ struct io_mapped_ubuf *imu;
+
+ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+ struct io_buffer *kbuf;
+
+ /*
+ * stores buffer ID for ring provided buffers, valid IFF
+ * REQ_F_BUFFER_RING is set.
+ */
+ struct io_buffer_list *buf_list;
+ };
union {
/* used by request caches, completion batching and iopoll */
struct io_wq_work_node comp_list;
/* cache ->apoll->events */
- int apoll_events;
+ __poll_t apoll_events;
};
atomic_t refs;
atomic_t poll_refs;
@@ -983,8 +1030,6 @@ struct io_kiocb {
struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */
void *async_data;
- /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
- struct io_buffer *kbuf;
/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
struct io_kiocb *link;
/* custom credentials, valid IFF REQ_F_CREDS is set */
@@ -1035,12 +1080,20 @@ struct io_op_def {
unsigned not_supported : 1;
/* skip auditing */
unsigned audit_skip : 1;
+ /* supports ioprio */
+ unsigned ioprio : 1;
+ /* supports iopoll */
+ unsigned iopoll : 1;
/* size of async data needed, if any */
unsigned short async_size;
};
static const struct io_op_def io_op_defs[] = {
- [IORING_OP_NOP] = {},
+ [IORING_OP_NOP] = {
+ .audit_skip = 1,
+ .iopoll = 1,
+ .buffer_select = 1,
+ },
[IORING_OP_READV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
@@ -1049,6 +1102,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_async_setup = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITEV] = {
@@ -1059,6 +1114,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_async_setup = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FSYNC] = {
@@ -1071,6 +1128,8 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE_FIXED] = {
@@ -1080,6 +1139,8 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_POLL_ADD] = {
@@ -1122,6 +1183,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollin = 1,
.poll_exclusive = 1,
+ .ioprio = 1, /* used for flags */
},
[IORING_OP_ASYNC_CANCEL] = {
.audit_skip = 1,
@@ -1144,6 +1206,7 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_CLOSE] = {},
[IORING_OP_FILES_UPDATE] = {
.audit_skip = 1,
+ .iopoll = 1,
},
[IORING_OP_STATX] = {
.audit_skip = 1,
@@ -1155,6 +1218,8 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE] = {
@@ -1164,6 +1229,8 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FADVISE] = {
@@ -1198,9 +1265,11 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_PROVIDE_BUFFERS] = {
.audit_skip = 1,
+ .iopoll = 1,
},
[IORING_OP_REMOVE_BUFFERS] = {
.audit_skip = 1,
+ .iopoll = 1,
},
[IORING_OP_TEE] = {
.needs_file = 1,
@@ -1218,6 +1287,7 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_LINKAT] = {},
[IORING_OP_MSG_RING] = {
.needs_file = 1,
+ .iopoll = 1,
},
[IORING_OP_FSETXATTR] = {
.needs_file = 1
@@ -1249,7 +1319,7 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
static void io_clean_op(struct io_kiocb *req);
static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
-static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd);
+static struct file *io_file_get_normal(struct io_kiocb *req, int fd);
static void io_drop_inflight_file(struct io_kiocb *req);
static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags);
static void io_queue_sqe(struct io_kiocb *req);
@@ -1442,21 +1512,23 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
{
- struct io_buffer *kbuf = req->kbuf;
- unsigned int cflags;
+ if (req->flags & REQ_F_BUFFER_RING) {
+ if (req->buf_list)
+ req->buf_list->head++;
+ req->flags &= ~REQ_F_BUFFER_RING;
+ } else {
+ list_add(&req->kbuf->list, list);
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ }
- cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
- req->flags &= ~REQ_F_BUFFER_SELECTED;
- list_add(&kbuf->list, list);
- req->kbuf = NULL;
- return cflags;
+ return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
}
static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
{
lockdep_assert_held(&req->ctx->completion_lock);
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return 0;
return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
}
@@ -1466,7 +1538,7 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
{
unsigned int cflags;
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return 0;
/*
@@ -1481,7 +1553,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
* We migrate buffers from the comp_list to the issue cache list
* when we need one.
*/
- if (issue_flags & IO_URING_F_UNLOCKED) {
+ if (req->flags & REQ_F_BUFFER_RING) {
+ /* no buffers to recycle for this case */
+ cflags = __io_put_kbuf(req, NULL);
+ } else if (issue_flags & IO_URING_F_UNLOCKED) {
struct io_ring_ctx *ctx = req->ctx;
spin_lock(&ctx->completion_lock);
@@ -1499,15 +1574,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
- struct list_head *hash_list;
- struct io_buffer_list *bl;
-
- hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
- list_for_each_entry(bl, hash_list, list)
- if (bl->bgid == bgid || bgid == -1U)
- return bl;
+ if (ctx->io_bl && bgid < BGID_ARRAY)
+ return &ctx->io_bl[bgid];
- return NULL;
+ return xa_load(&ctx->io_bl_xa, bgid);
}
static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
@@ -1516,11 +1586,23 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
struct io_buffer_list *bl;
struct io_buffer *buf;
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return;
/* don't recycle if we already did IO to this buffer */
if (req->flags & REQ_F_PARTIAL_IO)
return;
+ /*
+ * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
+ * the flag and hence ensure that bl->head doesn't get incremented.
+ * If the tail has already been incremented, hang on to it.
+ */
+ if (req->flags & REQ_F_BUFFER_RING) {
+ if (req->buf_list) {
+ req->buf_index = req->buf_list->bgid;
+ req->flags &= ~REQ_F_BUFFER_RING;
+ }
+ return;
+ }
io_ring_submit_lock(ctx, issue_flags);
@@ -1528,7 +1610,7 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
bl = io_buffer_get_list(ctx, buf->bgid);
list_add(&buf->list, &bl->buf_list);
req->flags &= ~REQ_F_BUFFER_SELECTED;
- req->kbuf = NULL;
+ req->buf_index = buf->bgid;
io_ring_submit_unlock(ctx, issue_flags);
}
@@ -1613,12 +1695,14 @@ static __cold void io_fallback_req_func(struct work_struct *work)
static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
- int i, hash_bits;
+ int hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return NULL;
+ xa_init(&ctx->io_bl_xa);
+
/*
* Use 5 bits less than the max cq entries, that should give us around
* 32 entries per hash list if totally full and uniformly spread.
@@ -1640,13 +1724,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
/* set invalid range, so io_import_fixed() fails meeting it */
ctx->dummy_ubuf->ubuf = -1UL;
- ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
- sizeof(struct list_head), GFP_KERNEL);
- if (!ctx->io_buffers)
- goto err;
- for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
- INIT_LIST_HEAD(&ctx->io_buffers[i]);
-
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err;
@@ -1682,7 +1759,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
err:
kfree(ctx->dummy_ubuf);
kfree(ctx->cancel_hash);
- kfree(ctx->io_buffers);
+ kfree(ctx->io_bl);
+ xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
return NULL;
}
@@ -2046,8 +2124,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
all_flushed = list_empty(&ctx->cq_overflow_list);
if (all_flushed) {
clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
+ atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
io_commit_cqring(ctx);
@@ -2141,8 +2218,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
}
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
+ atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
ocqe->cqe.user_data = user_data;
@@ -2525,6 +2601,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
if (!ctx)
return;
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (*locked) {
io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
@@ -2641,8 +2719,8 @@ static void tctx_task_work(struct callback_head *cb)
static void io_req_task_work_add(struct io_kiocb *req, bool priority)
{
struct task_struct *tsk = req->task;
+ struct io_ring_ctx *ctx = req->ctx;
struct io_uring_task *tctx = tsk->io_uring;
- enum task_work_notify_mode notify;
struct io_wq_work_node *node;
unsigned long flags;
bool running;
@@ -2665,18 +2743,11 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority)
if (running)
return;
- /*
- * SQPOLL kernel thread doesn't need notification, just a wakeup. For
- * all other cases, use TWA_SIGNAL unconditionally to ensure we're
- * processing task_work. There's no reliable way to tell if TWA_RESUME
- * will do the job.
- */
- notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
- if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
- if (notify == TWA_NONE)
- wake_up_process(tsk);
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+
+ if (likely(!task_work_add(tsk, &tctx->task_work, ctx->notify_method)))
return;
- }
spin_lock_irqsave(&tctx->task_lock, flags);
tctx->task_running = false;
@@ -3299,6 +3370,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
req->rw.flags = READ_ONCE(sqe->rw_flags);
+ /* used for fixed read/write too - just read unconditionally */
req->buf_index = READ_ONCE(sqe->buf_index);
return 0;
}
@@ -3447,56 +3519,96 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
return __io_import_fixed(req, rw, iter, imu);
}
-static void io_buffer_add_list(struct io_ring_ctx *ctx,
- struct io_buffer_list *bl, unsigned int bgid)
+static int io_buffer_add_list(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl, unsigned int bgid)
{
- struct list_head *list;
-
- list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
- INIT_LIST_HEAD(&bl->buf_list);
bl->bgid = bgid;
- list_add(&bl->list, list);
+ if (bgid < BGID_ARRAY)
+ return 0;
+
+ return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}
-static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
- int bgid, unsigned int issue_flags)
+static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
+ struct io_buffer_list *bl)
{
- struct io_buffer *kbuf = req->kbuf;
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
-
- if (req->flags & REQ_F_BUFFER_SELECTED)
- return kbuf;
-
- io_ring_submit_lock(req->ctx, issue_flags);
+ if (!list_empty(&bl->buf_list)) {
+ struct io_buffer *kbuf;
- bl = io_buffer_get_list(ctx, bgid);
- if (bl && !list_empty(&bl->buf_list)) {
kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&kbuf->list);
if (*len > kbuf->len)
*len = kbuf->len;
req->flags |= REQ_F_BUFFER_SELECTED;
req->kbuf = kbuf;
+ req->buf_index = kbuf->bid;
+ return u64_to_user_ptr(kbuf->addr);
+ }
+ return NULL;
+}
+
+static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
+ struct io_buffer_list *bl,
+ unsigned int issue_flags)
+{
+ struct io_uring_buf_ring *br = bl->buf_ring;
+ struct io_uring_buf *buf;
+ __u32 head = bl->head;
+
+ if (unlikely(smp_load_acquire(&br->tail) == head)) {
+ io_ring_submit_unlock(req->ctx, issue_flags);
+ return NULL;
+ }
+
+ head &= bl->mask;
+ if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
+ buf = &br->bufs[head];
} else {
- kbuf = ERR_PTR(-ENOBUFS);
+ int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
+ int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1;
+ buf = page_address(bl->buf_pages[index]);
+ buf += off;
}
+ if (*len > buf->len)
+ *len = buf->len;
+ req->flags |= REQ_F_BUFFER_RING;
+ req->buf_list = bl;
+ req->buf_index = buf->bid;
- io_ring_submit_unlock(req->ctx, issue_flags);
- return kbuf;
+ if (issue_flags & IO_URING_F_UNLOCKED) {
+ /*
+ * If we came in unlocked, we have no choice but to consume the
+ * buffer here. This does mean it'll be pinned until the IO
+ * completes. But coming in unlocked means we're in io-wq
+ * context, hence there should be no further retry. For the
+ * locked case, the caller must ensure to call the commit when
+ * the transfer completes (or if we get -EAGAIN and must poll
+ * or retry).
+ */
+ req->buf_list = NULL;
+ bl->head++;
+ }
+ return u64_to_user_ptr(buf->addr);
}
-static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
- unsigned int issue_flags)
+static void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
+ unsigned int issue_flags)
{
- struct io_buffer *kbuf;
- u16 bgid;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ void __user *ret = NULL;
- bgid = req->buf_index;
- kbuf = io_buffer_select(req, len, bgid, issue_flags);
- if (IS_ERR(kbuf))
- return kbuf;
- return u64_to_user_ptr(kbuf->addr);
+ io_ring_submit_lock(req->ctx, issue_flags);
+
+ bl = io_buffer_get_list(ctx, req->buf_index);
+ if (likely(bl)) {
+ if (bl->buf_nr_pages)
+ ret = io_ring_buffer_select(req, len, bl, issue_flags);
+ else
+ ret = io_provided_buffer_select(req, len, bl);
+ }
+ io_ring_submit_unlock(req->ctx, issue_flags);
+ return ret;
}
#ifdef CONFIG_COMPAT
@@ -3506,7 +3618,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
struct compat_iovec __user *uiov;
compat_ssize_t clen;
void __user *buf;
- ssize_t len;
+ size_t len;
uiov = u64_to_user_ptr(req->rw.addr);
if (!access_ok(uiov, sizeof(*uiov)))
@@ -3517,11 +3629,12 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
return -EINVAL;
len = clen;
- buf = io_rw_buffer_select(req, &len, issue_flags);
- if (IS_ERR(buf))
- return PTR_ERR(buf);
+ buf = io_buffer_select(req, &len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ req->rw.addr = (unsigned long) buf;
iov[0].iov_base = buf;
- iov[0].iov_len = (compat_size_t) len;
+ req->rw.len = iov[0].iov_len = (compat_size_t) len;
return 0;
}
#endif
@@ -3539,22 +3652,21 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
len = iov[0].iov_len;
if (len < 0)
return -EINVAL;
- buf = io_rw_buffer_select(req, &len, issue_flags);
- if (IS_ERR(buf))
- return PTR_ERR(buf);
+ buf = io_buffer_select(req, &len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ req->rw.addr = (unsigned long) buf;
iov[0].iov_base = buf;
- iov[0].iov_len = len;
+ req->rw.len = iov[0].iov_len = len;
return 0;
}
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
unsigned int issue_flags)
{
- if (req->flags & REQ_F_BUFFER_SELECTED) {
- struct io_buffer *kbuf = req->kbuf;
-
- iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
- iov[0].iov_len = kbuf->len;
+ if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
+ iov[0].iov_base = u64_to_user_ptr(req->rw.addr);
+ iov[0].iov_len = req->rw.len;
return 0;
}
if (req->rw.len != 1)
@@ -3568,6 +3680,13 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
return __io_iov_buffer_select(req, iov, issue_flags);
}
+static inline bool io_do_buffer_select(struct io_kiocb *req)
+{
+ if (!(req->flags & REQ_F_BUFFER_SELECT))
+ return false;
+ return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING));
+}
+
static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
struct io_rw_state *s,
unsigned int issue_flags)
@@ -3586,18 +3705,15 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
return NULL;
}
- /* buffer index only valid with fixed read/write, or buffer select */
- if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
- return ERR_PTR(-EINVAL);
-
buf = u64_to_user_ptr(req->rw.addr);
sqe_len = req->rw.len;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
- if (req->flags & REQ_F_BUFFER_SELECT) {
- buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
- if (IS_ERR(buf))
- return ERR_CAST(buf);
+ if (io_do_buffer_select(req)) {
+ buf = io_buffer_select(req, &sqe_len, issue_flags);
+ if (!buf)
+ return ERR_PTR(-ENOBUFS);
+ req->rw.addr = (unsigned long) buf;
req->rw.len = sqe_len;
}
@@ -3899,6 +4015,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
return -EOPNOTSUPP;
+ kiocb->private = NULL;
kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
kiocb->ki_complete = io_complete_rw_iopoll;
req->iopoll_completed = 0;
@@ -4163,9 +4280,7 @@ static int io_renameat_prep(struct io_kiocb *req,
struct io_rename *ren = &req->rename;
const char __user *oldf, *newf;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4460,10 +4575,7 @@ static int io_unlinkat_prep(struct io_kiocb *req,
struct io_unlink *un = &req->unlink;
const char __user *fname;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
- sqe->splice_fd_in)
+ if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4507,10 +4619,7 @@ static int io_mkdirat_prep(struct io_kiocb *req,
struct io_mkdir *mkd = &req->mkdir;
const char __user *fname;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
- sqe->splice_fd_in)
+ if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4548,10 +4657,7 @@ static int io_symlinkat_prep(struct io_kiocb *req,
struct io_symlink *sl = &req->symlink;
const char __user *oldpath, *newpath;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
- sqe->splice_fd_in)
+ if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4595,9 +4701,7 @@ static int io_linkat_prep(struct io_kiocb *req,
struct io_hardlink *lnk = &req->hardlink;
const char __user *oldf, *newf;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
+ if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4642,9 +4746,7 @@ static int io_shutdown_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
+ if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
sqe->buf_index || sqe->splice_fd_in))
return -EINVAL;
@@ -4682,9 +4784,6 @@ static int __io_splice_prep(struct io_kiocb *req,
struct io_splice *sp = &req->splice;
unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sp->len = READ_ONCE(sqe->len);
sp->flags = READ_ONCE(sqe->splice_flags);
if (unlikely(sp->flags & ~valid_flags))
@@ -4783,20 +4882,25 @@ done:
*/
static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
{
- struct io_ring_ctx *ctx = req->ctx;
+ void __user *buf;
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ size_t len = 1;
- __io_req_complete(req, issue_flags, 0, 0);
+ buf = io_buffer_select(req, &len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ }
+
+ __io_req_complete(req, issue_flags, 0, io_put_kbuf(req, issue_flags));
return 0;
}
static int io_msg_ring_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
- sqe->splice_fd_in || sqe->buf_index || sqe->personality))
+ if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in ||
+ sqe->buf_index || sqe->personality))
return -EINVAL;
req->msg.user_data = READ_ONCE(sqe->off);
@@ -4832,17 +4936,15 @@ done:
if (ret < 0)
req_set_fail(req);
__io_req_complete(req, issue_flags, ret, 0);
+ /* put file to avoid an attempt to IOPOLL the req */
+ io_put_file(req->file);
+ req->file = NULL;
return 0;
}
static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
- sqe->splice_fd_in))
+ if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
return -EINVAL;
req->sync.flags = READ_ONCE(sqe->fsync_flags);
@@ -4873,10 +4975,7 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
static int io_fallocate_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
- sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
@@ -4905,9 +5004,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
const char __user *fname;
int ret;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->ioprio || sqe->buf_index))
+ if (unlikely(sqe->buf_index))
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4962,6 +5059,61 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return __io_openat_prep(req, sqe);
}
+static int io_file_bitmap_get(struct io_ring_ctx *ctx)
+{
+ struct io_file_table *table = &ctx->file_table;
+ unsigned long nr = ctx->nr_user_files;
+ int ret;
+
+ if (table->alloc_hint >= nr)
+ table->alloc_hint = 0;
+
+ do {
+ ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint);
+ if (ret != nr) {
+ table->alloc_hint = ret + 1;
+ return ret;
+ }
+ if (!table->alloc_hint)
+ break;
+
+ nr = table->alloc_hint;
+ table->alloc_hint = 0;
+ } while (1);
+
+ return -ENFILE;
+}
+
+static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
+ struct file *file, unsigned int file_slot)
+{
+ bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret;
+
+ if (alloc_slot) {
+ io_ring_submit_lock(ctx, issue_flags);
+ ret = io_file_bitmap_get(ctx);
+ if (unlikely(ret < 0)) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ return ret;
+ }
+
+ file_slot = ret;
+ } else {
+ file_slot--;
+ }
+
+ ret = io_install_fixed_file(req, file, issue_flags, file_slot);
+ if (alloc_slot) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ if (!ret)
+ return file_slot;
+ }
+
+ return ret;
+}
+
static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
struct open_flags op;
@@ -5017,8 +5169,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
if (!fixed)
fd_install(ret, file);
else
- ret = io_install_fixed_file(req, file, issue_flags,
- req->open.file_slot - 1);
+ ret = io_fixed_fd_install(req, issue_flags, file,
+ req->open.file_slot);
err:
putname(req->open.filename);
req->flags &= ~REQ_F_NEED_CLEANUP;
@@ -5039,7 +5191,7 @@ static int io_remove_buffers_prep(struct io_kiocb *req,
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
- if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
+ if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
sqe->splice_fd_in)
return -EINVAL;
@@ -5062,6 +5214,20 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (!nbufs)
return 0;
+ if (bl->buf_nr_pages) {
+ int j;
+
+ i = bl->buf_ring->tail - bl->head;
+ for (j = 0; j < bl->buf_nr_pages; j++)
+ unpin_user_page(bl->buf_pages[j]);
+ kvfree(bl->buf_pages);
+ bl->buf_pages = NULL;
+ bl->buf_nr_pages = 0;
+ /* make sure it's seen as empty */
+ INIT_LIST_HEAD(&bl->buf_list);
+ return i;
+ }
+
/* the head kbuf is the list itself */
while (!list_empty(&bl->buf_list)) {
struct io_buffer *nxt;
@@ -5088,8 +5254,12 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
ret = -ENOENT;
bl = io_buffer_get_list(ctx, p->bgid);
- if (bl)
- ret = __io_remove_buffers(ctx, bl, p->nbufs);
+ if (bl) {
+ ret = -EINVAL;
+ /* can't use provide/remove buffers command on mapped buffers */
+ if (!bl->buf_nr_pages)
+ ret = __io_remove_buffers(ctx, bl, p->nbufs);
+ }
if (ret < 0)
req_set_fail(req);
@@ -5106,7 +5276,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
- if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
+ if (sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
@@ -5203,6 +5373,23 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
return i ? 0 : -ENOMEM;
}
+static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
+{
+ int i;
+
+ ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
+ GFP_KERNEL);
+ if (!ctx->io_bl)
+ return -ENOMEM;
+
+ for (i = 0; i < BGID_ARRAY; i++) {