summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/android/binder.c6
-rw-r--r--fs/eventpoll.c87
-rw-r--r--fs/file.c6
-rw-r--r--fs/internal.h8
-rw-r--r--fs/io-wq.c103
-rw-r--r--fs/io-wq.h11
-rw-r--r--fs/io_uring.c2200
-rw-r--r--fs/open.c5
-rw-r--r--fs/stat.c34
-rw-r--r--include/linux/eventpoll.h9
-rw-r--r--include/linux/mm.h1
-rw-r--r--include/linux/percpu-refcount.h26
-rw-r--r--include/trace/events/io_uring.h13
-rw-r--r--include/uapi/linux/io_uring.h73
-rw-r--r--mm/madvise.c7
15 files changed, 2112 insertions, 477 deletions
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 9fcc761031d8..a6b2082c24f8 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2249,10 +2249,12 @@ static void binder_deferred_fd_close(int fd)
return;
init_task_work(&twcb->twork, binder_do_fd_close);
__close_fd_get_file(fd, &twcb->file);
- if (twcb->file)
+ if (twcb->file) {
+ filp_close(twcb->file, current->files);
task_work_add(current, &twcb->twork, true);
- else
+ } else {
kfree(twcb);
+ }
}
static void binder_transaction_buffer_release(struct binder_proc *proc,
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 67a395039268..b041b66002db 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -354,12 +354,6 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
return container_of(p, struct ep_pqueue, pt)->epi;
}
-/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
-static inline int ep_op_has_event(int op)
-{
- return op != EPOLL_CTL_DEL;
-}
-
/* Initialize the poll safe wake up structure */
static void ep_nested_calls_init(struct nested_calls *ncalls)
{
@@ -2074,27 +2068,28 @@ SYSCALL_DEFINE1(epoll_create, int, size)
return do_epoll_create(0);
}
-/*
- * The following function implements the controller interface for
- * the eventpoll file that enables the insertion/removal/change of
- * file descriptors inside the interest set.
- */
-SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
- struct epoll_event __user *, event)
+static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
+ bool nonblock)
+{
+ if (!nonblock) {
+ mutex_lock_nested(mutex, depth);
+ return 0;
+ }
+ if (mutex_trylock(mutex))
+ return 0;
+ return -EAGAIN;
+}
+
+int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
+ bool nonblock)
{
int error;
int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
- struct epoll_event epds;
struct eventpoll *tep = NULL;
- error = -EFAULT;
- if (ep_op_has_event(op) &&
- copy_from_user(&epds, event, sizeof(struct epoll_event)))
- goto error_return;
-
error = -EBADF;
f = fdget(epfd);
if (!f.file)
@@ -2112,7 +2107,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
- ep_take_care_of_epollwakeup(&epds);
+ ep_take_care_of_epollwakeup(epds);
/*
* We have to check that the file structure underneath the file descriptor
@@ -2128,11 +2123,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
* Also, we do not currently supported nested exclusive wakeups.
*/
- if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
+ if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
- (epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
+ (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
@@ -2157,13 +2152,17 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* deep wakeup paths from forming in parallel through multiple
* EPOLL_CTL_ADD operations.
*/
- mutex_lock_nested(&ep->mtx, 0);
+ error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
+ if (error)
+ goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
if (!list_empty(&f.file->f_ep_links) ||
is_file_epoll(tf.file)) {
- full_check = 1;
mutex_unlock(&ep->mtx);
- mutex_lock(&epmutex);
+ error = epoll_mutex_lock(&epmutex, 0, nonblock);
+ if (error)
+ goto error_tgt_fput;
+ full_check = 1;
if (is_file_epoll(tf.file)) {
error = -ELOOP;
if (ep_loop_check(ep, tf.file) != 0) {
@@ -2173,10 +2172,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
} else
list_add(&tf.file->f_tfile_llink,
&tfile_check_list);
- mutex_lock_nested(&ep->mtx, 0);
+ error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
+ if (error) {
+out_del:
+ list_del(&tf.file->f_tfile_llink);
+ goto error_tgt_fput;
+ }
if (is_file_epoll(tf.file)) {
tep = tf.file->private_data;
- mutex_lock_nested(&tep->mtx, 1);
+ error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
+ if (error) {
+ mutex_unlock(&ep->mtx);
+ goto out_del;
+ }
}
}
}
@@ -2192,8 +2200,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
- epds.events |= EPOLLERR | EPOLLHUP;
- error = ep_insert(ep, &epds, tf.file, fd, full_check);
+ epds->events |= EPOLLERR | EPOLLHUP;
+ error = ep_insert(ep, epds, tf.file, fd, full_check);
} else
error = -EEXIST;
if (full_check)
@@ -2208,8 +2216,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
case EPOLL_CTL_MOD:
if (epi) {
if (!(epi->event.events & EPOLLEXCLUSIVE)) {
- epds.events |= EPOLLERR | EPOLLHUP;
- error = ep_modify(ep, epi, &epds);
+ epds->events |= EPOLLERR | EPOLLHUP;
+ error = ep_modify(ep, epi, epds);
}
} else
error = -ENOENT;
@@ -2232,6 +2240,23 @@ error_return:
}
/*
+ * The following function implements the controller interface for
+ * the eventpoll file that enables the insertion/removal/change of
+ * file descriptors inside the interest set.
+ */
+SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
+ struct epoll_event __user *, event)
+{
+ struct epoll_event epds;
+
+ if (ep_op_has_event(op) &&
+ copy_from_user(&epds, event, sizeof(struct epoll_event)))
+ return -EFAULT;
+
+ return do_epoll_ctl(epfd, op, fd, &epds, false);
+}
+
+/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
*/
diff --git a/fs/file.c b/fs/file.c
index 3da91a112bab..fb7081bfac2b 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -642,7 +642,9 @@ out_unlock:
EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
/*
- * variant of __close_fd that gets a ref on the file for later fput
+ * variant of __close_fd that gets a ref on the file for later fput.
+ * The caller must ensure that filp_close() called on the file, and then
+ * an fput().
*/
int __close_fd_get_file(unsigned int fd, struct file **res)
{
@@ -662,7 +664,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res)
spin_unlock(&files->file_lock);
get_file(file);
*res = file;
- return filp_close(file, files);
+ return 0;
out_unlock:
spin_unlock(&files->file_lock);
diff --git a/fs/internal.h b/fs/internal.h
index cf6ca30e93de..f3f280b952a3 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -124,6 +124,8 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op);
extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
const char *, const struct open_flags *);
+extern struct open_how build_open_how(int flags, umode_t mode);
+extern int build_open_flags(const struct open_how *how, struct open_flags *op);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
long do_faccessat(int dfd, const char __user *filename, int mode);
@@ -182,3 +184,9 @@ extern const struct dentry_operations ns_dentry_operations;
/* direct-io.c: */
int sb_init_dio_done_wq(struct super_block *sb);
+
+/*
+ * fs/stat.c:
+ */
+unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags);
+int cp_statx(const struct kstat *stat, struct statx __user *buffer);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 5147d2213b01..cb60a42b9fdf 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -56,7 +56,8 @@ struct io_worker {
struct rcu_head rcu;
struct mm_struct *mm;
- const struct cred *creds;
+ const struct cred *cur_creds;
+ const struct cred *saved_creds;
struct files_struct *restore_files;
};
@@ -109,10 +110,10 @@ struct io_wq {
struct task_struct *manager;
struct user_struct *user;
- const struct cred *creds;
- struct mm_struct *mm;
refcount_t refs;
struct completion done;
+
+ refcount_t use_refs;
};
static bool io_worker_get(struct io_worker *worker)
@@ -135,9 +136,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
{
bool dropped_lock = false;
- if (worker->creds) {
- revert_creds(worker->creds);
- worker->creds = NULL;
+ if (worker->saved_creds) {
+ revert_creds(worker->saved_creds);
+ worker->cur_creds = worker->saved_creds = NULL;
}
if (current->files != worker->restore_files) {
@@ -396,6 +397,43 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
return NULL;
}
+static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
+{
+ if (worker->mm) {
+ unuse_mm(worker->mm);
+ mmput(worker->mm);
+ worker->mm = NULL;
+ }
+ if (!work->mm) {
+ set_fs(KERNEL_DS);
+ return;
+ }
+ if (mmget_not_zero(work->mm)) {
+ use_mm(work->mm);
+ if (!worker->mm)
+ set_fs(USER_DS);
+ worker->mm = work->mm;
+ /* hang on to this mm */
+ work->mm = NULL;
+ return;
+ }
+
+ /* failed grabbing mm, ensure work gets cancelled */
+ work->flags |= IO_WQ_WORK_CANCEL;
+}
+
+static void io_wq_switch_creds(struct io_worker *worker,
+ struct io_wq_work *work)
+{
+ const struct cred *old_creds = override_creds(work->creds);
+
+ worker->cur_creds = work->creds;
+ if (worker->saved_creds)
+ put_cred(old_creds); /* creds set by previous switch */
+ else
+ worker->saved_creds = old_creds;
+}
+
static void io_worker_handle_work(struct io_worker *worker)
__releases(wqe->lock)
{
@@ -438,24 +476,19 @@ next:
if (work->flags & IO_WQ_WORK_CB)
work->func(&work);
- if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
- current->files != work->files) {
+ if (work->files && current->files != work->files) {
task_lock(current);
current->files = work->files;
task_unlock(current);
}
- if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm &&
- wq->mm) {
- if (mmget_not_zero(wq->mm)) {
- use_mm(wq->mm);
- set_fs(USER_DS);
- worker->mm = wq->mm;
- } else {
- work->flags |= IO_WQ_WORK_CANCEL;
- }
- }
- if (!worker->creds)
- worker->creds = override_creds(wq->creds);
+ if (work->mm != worker->mm)
+ io_wq_switch_mm(worker, work);
+ if (worker->cur_creds != work->creds)
+ io_wq_switch_creds(worker, work);
+ /*
+ * OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
+ * the worker function will do the right thing.
+ */
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
if (worker->mm)
@@ -720,6 +753,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
+ int work_flags;
unsigned long flags;
/*
@@ -734,12 +768,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
+ work_flags = work->flags;
spin_lock_irqsave(&wqe->lock, flags);
wq_list_add_tail(&work->list, &wqe->work_list);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
spin_unlock_irqrestore(&wqe->lock, flags);
- if (!atomic_read(&acct->nr_running))
+ if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
+ !atomic_read(&acct->nr_running))
io_wqe_wake_worker(wqe, acct);
}
@@ -828,6 +864,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
*/
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work &&
+ !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
data->cancel(worker->cur_work, data->caller_data)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
@@ -902,7 +939,8 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
return false;
spin_lock_irqsave(&worker->lock, flags);
- if (worker->cur_work == work) {
+ if (worker->cur_work == work &&
+ !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
}
@@ -1026,7 +1064,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
/* caller must already hold a reference to this */
wq->user = data->user;
- wq->creds = data->creds;
for_each_node(node) {
struct io_wqe *wqe;
@@ -1053,9 +1090,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
init_completion(&wq->done);
- /* caller must have already done mmgrab() on this mm */
- wq->mm = data->mm;
-
wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
if (!IS_ERR(wq->manager)) {
wake_up_process(wq->manager);
@@ -1064,6 +1098,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
ret = -ENOMEM;
goto err;
}
+ refcount_set(&wq->use_refs, 1);
reinit_completion(&wq->done);
return wq;
}
@@ -1078,13 +1113,21 @@ err:
return ERR_PTR(ret);
}
+bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
+{
+ if (data->get_work != wq->get_work || data->put_work != wq->put_work)
+ return false;
+
+ return refcount_inc_not_zero(&wq->use_refs);
+}
+
static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{
wake_up_process(worker->task);
return false;
}
-void io_wq_destroy(struct io_wq *wq)
+static void __io_wq_destroy(struct io_wq *wq)
{
int node;
@@ -1104,3 +1147,9 @@ void io_wq_destroy(struct io_wq *wq)
kfree(wq->wqes);
kfree(wq);
}
+
+void io_wq_destroy(struct io_wq *wq)
+{
+ if (refcount_dec_and_test(&wq->use_refs))
+ __io_wq_destroy(wq);
+}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 3f5e356de980..50b3378febf2 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -7,11 +7,11 @@ enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HAS_MM = 2,
IO_WQ_WORK_HASHED = 4,
- IO_WQ_WORK_NEEDS_USER = 8,
- IO_WQ_WORK_NEEDS_FILES = 16,
IO_WQ_WORK_UNBOUND = 32,
IO_WQ_WORK_INTERNAL = 64,
IO_WQ_WORK_CB = 128,
+ IO_WQ_WORK_NO_CANCEL = 256,
+ IO_WQ_WORK_CONCURRENT = 512,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
@@ -72,6 +72,8 @@ struct io_wq_work {
};
void (*func)(struct io_wq_work **);
struct files_struct *files;
+ struct mm_struct *mm;
+ const struct cred *creds;
unsigned flags;
};
@@ -81,21 +83,22 @@ struct io_wq_work {
(work)->func = _func; \
(work)->flags = 0; \
(work)->files = NULL; \
+ (work)->mm = NULL; \
+ (work)->creds = NULL; \
} while (0) \
typedef void (get_work_fn)(struct io_wq_work *);
typedef void (put_work_fn)(struct io_wq_work *);
struct io_wq_data {
- struct mm_struct *mm;
struct user_struct *user;
- const struct cred *creds;
get_work_fn *get_work;
put_work_fn *put_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
+bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e54556b0fcc6..ac5340fdcdfe 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -46,6 +46,7 @@
#include <linux/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
+#include <linux/bits.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
@@ -70,6 +71,10 @@
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
+#include <linux/namei.h>
+#include <linux/fsnotify.h>
+#include <linux/fadvise.h>
+#include <linux/eventpoll.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -177,6 +182,21 @@ struct fixed_file_table {
struct file **files;
};
+enum {
+ FFD_F_ATOMIC,
+};
+
+struct fixed_file_data {
+ struct fixed_file_table *table;
+ struct io_ring_ctx *ctx;
+
+ struct percpu_ref refs;
+ struct llist_head put_llist;
+ unsigned long state;
+ struct work_struct ref_work;
+ struct completion done;
+};
+
struct io_ring_ctx {
struct {
struct percpu_ref refs;
@@ -184,10 +204,11 @@ struct io_ring_ctx {
struct {
unsigned int flags;
- bool compat;
- bool account_mem;
- bool cq_overflow_flushed;
- bool drain_next;
+ int compat: 1;
+ int account_mem: 1;
+ int cq_overflow_flushed: 1;
+ int drain_next: 1;
+ int eventfd_async: 1;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
@@ -207,13 +228,14 @@ struct io_ring_ctx {
unsigned sq_thread_idle;
unsigned cached_sq_dropped;
atomic_t cached_cq_overflow;
- struct io_uring_sqe *sq_sqes;
+ unsigned long sq_check_overflow;
struct list_head defer_list;
struct list_head timeout_list;
struct list_head cq_overflow_list;
wait_queue_head_t inflight_wait;
+ struct io_uring_sqe *sq_sqes;
} ____cacheline_aligned_in_smp;
struct io_rings *rings;
@@ -229,8 +251,10 @@ struct io_ring_ctx {
* readers must ensure that ->refs is alive as long as the file* is
* used. Only updated through io_uring_register(2).
*/
- struct fixed_file_table *file_table;
+ struct fixed_file_data *file_data;
unsigned nr_user_files;
+ int ring_fd;
+ struct file *ring_file;
/* if used, fixed mapped user buffers */
unsigned nr_user_bufs;
@@ -250,11 +274,14 @@ struct io_ring_ctx {
struct socket *ring_sock;
#endif
+ struct idr personality_idr;
+
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
unsigned cq_mask;
atomic_t cq_timeouts;
+ unsigned long cq_check_overflow;
struct wait_queue_head cq_wait;
struct fasync_struct *cq_fasync;
struct eventfd_ctx *cq_ev_fd;
@@ -267,7 +294,8 @@ struct io_ring_ctx {
struct {
spinlock_t completion_lock;
- bool poll_multi_file;
+ struct llist_head poll_llist;
+
/*
* ->poll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
@@ -277,6 +305,7 @@ struct io_ring_ctx {
struct list_head poll_list;
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
+ bool poll_multi_file;
spinlock_t inflight_lock;
struct list_head inflight_list;
@@ -299,6 +328,12 @@ struct io_poll_iocb {
struct wait_queue_entry wait;
};
+struct io_close {
+ struct file *file;
+ struct file *put_file;
+ int fd;
+};
+
struct io_timeout_data {
struct io_kiocb *req;
struct hrtimer timer;
@@ -319,6 +354,7 @@ struct io_sync {
loff_t len;
loff_t off;
int flags;
+ int mode;
};
struct io_cancel {
@@ -348,8 +384,52 @@ struct io_connect {
struct io_sr_msg {
struct file *file;
- struct user_msghdr __user *msg;
+ union {
+ struct user_msghdr __user *msg;
+ void __user *buf;
+ };
int msg_flags;
+ size_t len;
+};
+
+struct io_open {
+ struct file *file;
+ int dfd;
+ union {
+ unsigned mask;
+ };
+ struct filename *filename;
+ struct statx __user *buffer;
+ struct open_how how;
+};
+
+struct io_files_update {
+ struct file *file;
+ u64 arg;
+ u32 nr_args;
+ u32 offset;
+};
+
+struct io_fadvise {
+ struct file *file;
+ u64 offset;
+ u32 len;
+ u32 advice;
+};
+
+struct io_madvise {
+ struct file *file;
+ u64 addr;
+ u32 len;
+ u32 advice;
+};
+
+struct io_epoll {
+ struct file *file;
+ int epfd;
+ int op;
+ int fd;
+ struct epoll_event event;
};
struct io_async_connect {
@@ -370,15 +450,79 @@ struct io_async_rw {
ssize_t size;
};
+struct io_async_open {
+ struct filename *filename;
+};
+
struct io_async_ctx {
union {
struct io_async_rw rw;
struct io_async_msghdr msg;
struct io_async_connect connect;
struct io_timeout_data timeout;
+ struct io_async_open open;
};
};
+enum {
+ REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
+ REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
+ REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
+ REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
+ REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
+
+ REQ_F_LINK_NEXT_BIT,
+ REQ_F_FAIL_LINK_BIT,
+ REQ_F_INFLIGHT_BIT,
+ REQ_F_CUR_POS_BIT,
+ REQ_F_NOWAIT_BIT,
+ REQ_F_IOPOLL_COMPLETED_BIT,
+ REQ_F_LINK_TIMEOUT_BIT,
+ REQ_F_TIMEOUT_BIT,
+ REQ_F_ISREG_BIT,
+ REQ_F_MUST_PUNT_BIT,
+ REQ_F_TIMEOUT_NOSEQ_BIT,
+ REQ_F_COMP_LOCKED_BIT,
+};
+
+enum {
+ /* ctx owns file */
+ REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
+ /* drain existing IO first */
+ REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
+ /* linked sqes */
+ REQ_F_LINK = BIT(REQ_F_LINK_BIT),
+ /* doesn't sever on completion < 0 */
+ REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
+ /* IOSQE_ASYNC */
+ REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
+
+ /* already grabbed next link */
+ REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
+ /* fail rest of links */
+ REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
+ /* on inflight list */
+ REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
+ /* read/write uses file position */
+ REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
+ /* must not punt to workers */
+ REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
+ /* polled IO has completed */
+ REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
+ /* has linked timeout */
+ REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
+ /* timeout request */
+ REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT),
+ /* regular file */
+ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
+ /* must be punted even for NONBLOCK */
+ REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT),
+ /* no timeout sequence */
+ REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
+ /* completion under lock */
+ REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
+};
+
/*
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
@@ -396,11 +540,19 @@ struct io_kiocb {
struct io_timeout timeout;
struct io_connect connect;
struct io_sr_msg sr_msg;
+ struct io_open open;
+ struct io_close close;
+ struct io_files_update files_update;
+ struct io_fadvise fadvise;
+ struct io_madvise madvise;
+ struct io_epoll epoll;
};
struct io_async_ctx *io;
- struct file *ring_file;
- int ring_fd;
+ /*
+ * llist_node is only used for poll deferred completions
+ */
+ struct llist_node llist_node;
bool has_user;
bool in_async;
bool needs_fixed_file;
@@ -414,23 +566,6 @@ struct io_kiocb {
struct list_head link_list;
unsigned int flags;
refcount_t refs;
-#define REQ_F_NOWAIT 1 /* must not punt to workers */
-#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
-#define REQ_F_FIXED_FILE 4 /* ctx owns file */
-#define REQ_F_LINK_NEXT 8 /* already grabbed next link */
-#define REQ_F_IO_DRAIN 16 /* drain existing IO first */
-#define REQ_F_IO_DRAINED 32 /* drain done */
-#define REQ_F_LINK 64 /* linked sqes */
-#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */
-#define REQ_F_FAIL_LINK 256 /* fail rest of links */
-#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */
-#define REQ_F_TIMEOUT 1024 /* timeout request */
-#define REQ_F_ISREG 2048 /* regular file */
-#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */
-#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
-#define REQ_F_INFLIGHT 16384 /* on inflight list */
-#define REQ_F_COMP_LOCKED 32768 /* completion under lock */
-#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */
u64 user_data;
u32 result;
u32 sequence;
@@ -463,14 +598,162 @@ struct io_submit_state {
unsigned int ios_left;
};
+struct io_op_def {
+ /* needs req->io allocated for deferral/async */
+ unsigned async_ctx : 1;
+ /* needs current->mm setup, does mm access */
+ unsigned needs_mm : 1;
+ /* needs req->file assigned */
+ unsigned needs_file : 1;
+ /* needs req->file assigned IFF fd is >= 0 */
+ unsigned fd_non_neg : 1;
+ /* hash wq insertion if file is a regular file */
+ unsigned hash_reg_file : 1;
+ /* unbound wq insertion if file is a non-regular file */
+ unsigned unbound_nonreg_file : 1;
+ /* opcode is not supported by this kernel */
+ unsigned not_supported : 1;
+ /* needs file table */
+ unsigned file_table : 1;
+};
+
+static const struct io_op_def io_op_defs[] = {
+ [IORING_OP_NOP] = {},
+ [IORING_OP_READV] = {
+ .async_ctx = 1,
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_WRITEV] = {
+ .async_ctx = 1,
+ .needs_mm = 1,
+ .needs_file = 1,
+ .hash_reg_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_FSYNC] = {
+ .needs_file = 1,
+ },
+ [IORING_OP_READ_FIXED] = {
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_WRITE_FIXED] = {
+ .needs_file = 1,
+ .hash_reg_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_POLL_ADD] = {
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_POLL_REMOVE] = {},
+ [IORING_OP_SYNC_FILE_RANGE] = {
+ .needs_file = 1,
+ },
+ [IORING_OP_SENDMSG] = {
+ .async_ctx = 1,
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_RECVMSG] = {
+ .async_ctx = 1,
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_TIMEOUT] = {
+ .async_ctx = 1,
+ .needs_mm = 1,
+ },
+ [IORING_OP_TIMEOUT_REMOVE] = {},
+ [IORING_OP_ACCEPT] = {
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ .file_table = 1,
+ },
+ [IORING_OP_ASYNC_CANCEL] = {},
+ [IORING_OP_LINK_TIMEOUT] = {
+ .async_ctx = 1,
+ .needs_mm = 1,
+ },
+ [IORING_OP_CONNECT] = {
+ .async_ctx = 1,
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_FALLOCATE] = {
+ .needs_file = 1,
+ },
+ [IORING_OP_OPENAT] = {
+ .needs_file = 1,
+ .fd_non_neg = 1,
+ .file_table = 1,
+ },
+ [IORING_OP_CLOSE] = {
+ .needs_file = 1,
+ .file_table = 1,
+ },
+ [IORING_OP_FILES_UPDATE] = {
+ .needs_mm = 1,
+ .file_table = 1,
+ },
+ [IORING_OP_STATX] = {
+ .needs_mm = 1,
+ .needs_file = 1,
+ .fd_non_neg = 1,
+ },
+ [IORING_OP_READ] = {
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_WRITE] = {
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_FADVISE] = {
+ .needs_file = 1,
+ },
+ [IORING_OP_MADVISE] = {
+ .needs_mm = 1,
+ },
+ [IORING_OP_SEND] = {
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_RECV] = {
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_OPENAT2] = {
+ .needs_file = 1,
+ .fd_non_neg = 1,
+ .file_table = 1,
+ },
+ [IORING_OP_EPOLL_CTL] = {
+ .unbound_nonreg_file = 1,
+ .file_table = 1,
+ },
+};
+
static void io_wq_submit_work(struct io_wq_work **workptr);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
-static void __io_free_req(struct io_kiocb *req);
static void io_put_req(struct io_kiocb *req);
-static void io_double_put_req(struct io_kiocb *req);
static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
+static int __io_sqe_files_update(struct io_ring_ctx *ctx,
+ struct io_uring_files_update *ip,
+ unsigned nr_args);
+static int io_grab_files(struct io_kiocb *req);
static struct kmem_cache *req_cachep;
@@ -537,9 +820,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->completions[0]);
init_completion(&ctx->completions[1]);
+ idr_init(&ctx->personality_idr);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
+ init_llist_head(&ctx->poll_llist);
INIT_LIST_HEAD(&ctx->poll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
@@ -566,7 +851,7 @@ static inline bool __req_need_defer(struct io_kiocb *req)
static inline bool req_need_defer(struct io_kiocb *req)
{
- if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN)
+ if (unlikely(req->flags & REQ_F_IO_DRAIN))
return __req_need_defer(req);
return false;
@@ -606,53 +891,53 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
- if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
- /* order cqe stores with ring update */
- smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
+ /* order cqe stores with ring update */
+ smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
- if (wq_has_sleeper(&ctx->cq_wait)) {
- wake_up_interruptible(&ctx->cq_wait);
- kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
- }
+ if (wq_has_sleeper(&ctx->cq_wait)) {
+ wake_up_interruptible(&ctx->cq_wait);
+ kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
+ }
+}
+
+static inline void io_req_work_grab_env(struct io_kiocb *req,
+ const struct io_op_def *def)
+{
+ if (!req->work.mm && def->needs_mm) {
+ mmgrab(current->mm);
+ req->work.mm = current->mm;
}
+ if (!req->work.creds)
+ req->work.creds = get_current_cred();
}
-static inline bool io_req_needs_user(struct io_kiocb *req)
+static inline void io_req_work_drop_env(struct io_kiocb *req)
{
- return !(req->opcode == IORING_OP_READ_FIXED ||
- req->opcode == IORING_OP_WRITE_FIXED);
+ if (req->work.mm) {
+ mmdrop(req->work.mm);
+ req->w