diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-02 13:20:44 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-02 13:20:44 -0700 |
| commit | b349b1181d24af1c151134a3c39725e94a5619dd (patch) | |
| tree | 7347cc4035de947c22e575ac7c649c0fa8658dd1 | |
| parent | efb2883060afc79638bb1eb19e2c30e7f6c5a178 (diff) | |
| parent | f6b543fd03d347e8bf245cee4f2d54eb6ffd8fcb (diff) | |
| download | linux-b349b1181d24af1c151134a3c39725e94a5619dd.tar.gz linux-b349b1181d24af1c151134a3c39725e94a5619dd.tar.bz2 linux-b349b1181d24af1c151134a3c39725e94a5619dd.zip | |
Merge tag 'for-5.20/io_uring-2022-07-29' of git://git.kernel.dk/linux-block
Pull io_uring updates from Jens Axboe:
- As per (valid) complaint in the last merge window, fs/io_uring.c has
grown quite large these days. io_uring isn't really tied to fs
either, as it supports a wide variety of functionality outside of
that.
Move the code to io_uring/ and split it into files that either
implement a specific request type, and split some code into helpers
as well. The code is organized a lot better like this, and io_uring.c
is now < 4K LOC (me).
- Deprecate the epoll_ctl opcode. It'll still work, just trigger a
warning once if used. If we don't get any complaints on this, and I
don't expect any, then we can fully remove it in a future release
(me).
- Improve the cancel hash locking (Hao)
- kbuf cleanups (Hao)
- Efficiency improvements to the task_work handling (Dylan, Pavel)
- Provided buffer improvements (Dylan)
- Add support for recv/recvmsg multishot support. This is similar to
the accept (or poll) support for have for multishot, where a single
SQE can trigger everytime data is received. For applications that
expect to do more than a few receives on an instantiated socket, this
greatly improves efficiency (Dylan).
- Efficiency improvements for poll handling (Pavel)
- Poll cancelation improvements (Pavel)
- Allow specifiying a range for direct descriptor allocations (Pavel)
- Cleanup the cqe32 handling (Pavel)
- Move io_uring types to greatly cleanup the tracing (Pavel)
- Tons of great code cleanups and improvements (Pavel)
- Add a way to do sync cancelations rather than through the sqe -> cqe
interface, as that's a lot easier to use for some use cases (me).
- Add support to IORING_OP_MSG_RING for sending direct descriptors to a
different ring. This avoids the usually problematic SCM case, as we
disallow those. (me)
- Make the per-command alloc cache we use for apoll generic, place
limits on it, and use it for netmsg as well (me).
- Various cleanups (me, Michal, Gustavo, Uros)
* tag 'for-5.20/io_uring-2022-07-29' of git://git.kernel.dk/linux-block: (172 commits)
io_uring: ensure REQ_F_ISREG is set async offload
net: fix compat pointer in get_compat_msghdr()
io_uring: Don't require reinitable percpu_ref
io_uring: fix types in io_recvmsg_multishot_overflow
io_uring: Use atomic_long_try_cmpxchg in __io_account_mem
io_uring: support multishot in recvmsg
net: copy from user before calling __get_compat_msghdr
net: copy from user before calling __copy_msghdr
io_uring: support 0 length iov in buffer select in compat
io_uring: fix multishot ending when not polled
io_uring: add netmsg cache
io_uring: impose max limit on apoll cache
io_uring: add abstraction around apoll cache
io_uring: move apoll cache to poll.c
io_uring: consolidate hash_locked io-wq handling
io_uring: clear REQ_F_HASH_LOCKED on hash removal
io_uring: don't race double poll setting REQ_F_ASYNC_DATA
io_uring: don't miss setting REQ_F_DOUBLE_POLL
io_uring: disable multishot recvmsg
io_uring: only trace one of complete or overflow
...
66 files changed, 15141 insertions, 13529 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index d0e31f59c314..b2e6ce3df1e6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7811,9 +7811,6 @@ F: include/linux/fs.h F: include/linux/fs_types.h F: include/uapi/linux/fs.h F: include/uapi/linux/openat2.h -X: fs/io-wq.c -X: fs/io-wq.h -X: fs/io_uring.c FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER M: Riku Voipio <riku.voipio@iki.fi> @@ -10521,9 +10518,7 @@ L: io-uring@vger.kernel.org S: Maintained T: git git://git.kernel.dk/linux-block T: git git://git.kernel.dk/liburing -F: fs/io-wq.c -F: fs/io-wq.h -F: fs/io_uring.c +F: io_uring/ F: include/linux/io_uring.h F: include/uapi/linux/io_uring.h F: tools/io_uring/ @@ -1097,6 +1097,7 @@ export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps ifeq ($(KBUILD_EXTMOD),) core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ core-$(CONFIG_BLOCK) += block/ +core-$(CONFIG_IO_URING) += io_uring/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ diff --git a/fs/Makefile b/fs/Makefile index 208a74e0b00e..93b80529f8e8 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -34,8 +34,6 @@ obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o -obj-$(CONFIG_IO_URING) += io_uring.o -obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ diff --git a/fs/io_uring.c b/fs/io_uring.c deleted file mode 100644 index e8e769be9ed0..000000000000 --- a/fs/io_uring.c +++ /dev/null @@ -1,13273 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Shared application/kernel submission and completion ring pairs, for - * supporting fast/efficient IO. - * - * A note on the read/write ordering memory barriers that are matched between - * the application and kernel side. - * - * After the application reads the CQ ring tail, it must use an - * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses - * before writing the tail (using smp_load_acquire to read the tail will - * do). It also needs a smp_mb() before updating CQ head (ordering the - * entry load(s) with the head store), pairing with an implicit barrier - * through a control-dependency in io_get_cqe (smp_store_release to - * store head will do). Failure to do so could lead to reading invalid - * CQ entries. - * - * Likewise, the application must use an appropriate smp_wmb() before - * writing the SQ tail (ordering SQ entry stores with the tail store), - * which pairs with smp_load_acquire in io_get_sqring (smp_store_release - * to store the tail will do). And it needs a barrier ordering the SQ - * head load before writing new SQ entries (smp_load_acquire to read - * head will do). - * - * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application - * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* - * updating the SQ tail; a full memory barrier smp_mb() is needed - * between. - * - * Also see the examples in the liburing library: - * - * git://git.kernel.dk/liburing - * - * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens - * from data shared between the kernel and application. This is done both - * for ordering purposes, but also to ensure that once a value is loaded from - * data that the application could potentially modify, it remains stable. - * - * Copyright (C) 2018-2019 Jens Axboe - * Copyright (c) 2018-2019 Christoph Hellwig - */ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/errno.h> -#include <linux/syscalls.h> -#include <linux/compat.h> -#include <net/compat.h> -#include <linux/refcount.h> -#include <linux/uio.h> -#include <linux/bits.h> - -#include <linux/sched/signal.h> -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/fdtable.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/percpu.h> -#include <linux/slab.h> -#include <linux/blk-mq.h> -#include <linux/bvec.h> -#include <linux/net.h> -#include <net/sock.h> -#include <net/af_unix.h> -#include <net/scm.h> -#include <linux/anon_inodes.h> -#include <linux/sched/mm.h> -#include <linux/uaccess.h> -#include <linux/nospec.h> -#include <linux/sizes.h> -#include <linux/hugetlb.h> -#include <linux/highmem.h> -#include <linux/namei.h> -#include <linux/fsnotify.h> -#include <linux/fadvise.h> -#include <linux/eventpoll.h> -#include <linux/splice.h> -#include <linux/task_work.h> -#include <linux/pagemap.h> -#include <linux/io_uring.h> -#include <linux/audit.h> -#include <linux/security.h> -#include <linux/xattr.h> - -#define CREATE_TRACE_POINTS -#include <trace/events/io_uring.h> - -#include <uapi/linux/io_uring.h> - -#include "internal.h" -#include "io-wq.h" - -#define IORING_MAX_ENTRIES 32768 -#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) -#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 - -/* only define max */ -#define IORING_MAX_FIXED_FILES (1U << 20) -#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ - IORING_REGISTER_LAST + IORING_OP_LAST) - -#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) -#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) -#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) - -#define IORING_MAX_REG_BUFFERS (1U << 14) - -#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ - IOSQE_IO_HARDLINK | IOSQE_ASYNC) - -#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ - IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) - -#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ - REQ_F_ASYNC_DATA) - -#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ - IO_REQ_CLEAN_FLAGS) - -#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) - -#define IO_TCTX_REFS_CACHE_NR (1U << 10) - -struct io_uring { - u32 head ____cacheline_aligned_in_smp; - u32 tail ____cacheline_aligned_in_smp; -}; - -/* - * This data is shared with the application through the mmap at offsets - * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. - * - * The offsets to the member fields are published through struct - * io_sqring_offsets when calling io_uring_setup. - */ -struct io_rings { - /* - * Head and tail offsets into the ring; the offsets need to be - * masked to get valid indices. - * - * The kernel controls head of the sq ring and the tail of the cq ring, - * and the application controls tail of the sq ring and the head of the - * cq ring. - */ - struct io_uring sq, cq; - /* - * Bitmasks to apply to head and tail offsets (constant, equals - * ring_entries - 1) - */ - u32 sq_ring_mask, cq_ring_mask; - /* Ring sizes (constant, power of 2) */ - u32 sq_ring_entries, cq_ring_entries; - /* - * Number of invalid entries dropped by the kernel due to - * invalid index stored in array - * - * Written by the kernel, shouldn't be modified by the - * application (i.e. get number of "new events" by comparing to - * cached value). - * - * After a new SQ head value was read by the application this - * counter includes all submissions that were dropped reaching - * the new SQ head (and possibly more). - */ - u32 sq_dropped; - /* - * Runtime SQ flags - * - * Written by the kernel, shouldn't be modified by the - * application. - * - * The application needs a full memory barrier before checking - * for IORING_SQ_NEED_WAKEUP after updating the sq tail. - */ - atomic_t sq_flags; - /* - * Runtime CQ flags - * - * Written by the application, shouldn't be modified by the - * kernel. - */ - u32 cq_flags; - /* - * Number of completion events lost because the queue was full; - * this should be avoided by the application by making sure - * there are not more requests pending than there is space in - * the completion queue. - * - * Written by the kernel, shouldn't be modified by the - * application (i.e. get number of "new events" by comparing to - * cached value). - * - * As completion events come in out of order this counter is not - * ordered with any other data. - */ - u32 cq_overflow; - /* - * Ring buffer of completion events. - * - * The kernel writes completion events fresh every time they are - * produced, so the application is allowed to modify pending - * entries. - */ - struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; -}; - -struct io_mapped_ubuf { - u64 ubuf; - u64 ubuf_end; - unsigned int nr_bvecs; - unsigned long acct_pages; - struct bio_vec bvec[]; -}; - -struct io_ring_ctx; - -struct io_overflow_cqe { - struct list_head list; - struct io_uring_cqe cqe; -}; - -/* - * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 - * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we - * can't safely always dereference the file when the task has exited and ring - * cleanup is done. If a file is tracked and part of SCM, then unix gc on - * process exit may reap it before __io_sqe_files_unregister() is run. - */ -#define FFS_NOWAIT 0x1UL -#define FFS_ISREG 0x2UL -#if defined(CONFIG_64BIT) -#define FFS_SCM 0x4UL -#else -#define IO_URING_SCM_ALL -#define FFS_SCM 0x0UL -#endif -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) - -struct io_fixed_file { - /* file * with additional FFS_* flags */ - unsigned long file_ptr; -}; - -struct io_rsrc_put { - struct list_head list; - u64 tag; - union { - void *rsrc; - struct file *file; - struct io_mapped_ubuf *buf; - }; -}; - -struct io_file_table { - struct io_fixed_file *files; - unsigned long *bitmap; - unsigned int alloc_hint; -}; - -struct io_rsrc_node { - struct percpu_ref refs; - struct list_head node; - struct list_head rsrc_list; - struct io_rsrc_data *rsrc_data; - struct llist_node llist; - bool done; -}; - -typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); - -struct io_rsrc_data { - struct io_ring_ctx *ctx; - - u64 **tags; - unsigned int nr; - rsrc_put_fn *do_put; - atomic_t refs; - struct completion done; - bool quiesce; -}; - -#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) -struct io_buffer_list { - /* - * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, - * then these are classic provided buffers and ->buf_list is used. - */ - union { - struct list_head buf_list; - struct { - struct page **buf_pages; - struct io_uring_buf_ring *buf_ring; - }; - }; - __u16 bgid; - - /* below is for ring provided buffers */ - __u16 buf_nr_pages; - __u16 nr_entries; - __u16 head; - __u16 mask; -}; - -struct io_buffer { - struct list_head list; - __u64 addr; - __u32 len; - __u16 bid; - __u16 bgid; -}; - -struct io_restriction { - DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); - DECLARE_BITMAP(sqe_op, IORING_OP_LAST); - u8 sqe_flags_allowed; - u8 sqe_flags_required; - bool registered; -}; - -enum { - IO_SQ_THREAD_SHOULD_STOP = 0, - IO_SQ_THREAD_SHOULD_PARK, -}; - -struct io_sq_data { - refcount_t refs; - atomic_t park_pending; - struct mutex lock; - - /* ctx's that are using this sqd */ - struct list_head ctx_list; - - struct task_struct *thread; - struct wait_queue_head wait; - - unsigned sq_thread_idle; - int sq_cpu; - pid_t task_pid; - pid_t task_tgid; - - unsigned long state; - struct completion exited; -}; - -#define IO_COMPL_BATCH 32 -#define IO_REQ_CACHE_SIZE 32 -#define IO_REQ_ALLOC_BATCH 8 - -struct io_submit_link { - struct io_kiocb *head; - struct io_kiocb *last; -}; - -struct io_submit_state { - /* inline/task_work completion list, under ->uring_lock */ - struct io_wq_work_node free_list; - /* batch completion logic */ - struct io_wq_work_list compl_reqs; - struct io_submit_link link; - - bool plug_started; - bool need_plug; - bool flush_cqes; - unsigned short submit_nr; - struct blk_plug plug; -}; - -struct io_ev_fd { - struct eventfd_ctx *cq_ev_fd; - unsigned int eventfd_async: 1; - struct rcu_head rcu; -}; - -#define BGID_ARRAY 64 - -struct io_ring_ctx { - /* const or read-mostly hot data */ - struct { - struct percpu_ref refs; - - struct io_rings *rings; - unsigned int flags; - enum task_work_notify_mode notify_method; - unsigned int compat: 1; - unsigned int drain_next: 1; - unsigned int restricted: 1; - unsigned int off_timeout_used: 1; - unsigned int drain_active: 1; - unsigned int drain_disabled: 1; - unsigned int has_evfd: 1; - unsigned int syscall_iopoll: 1; - } ____cacheline_aligned_in_smp; - - /* submission data */ - struct { - struct mutex uring_lock; - - /* - * Ring buffer of indices into array of io_uring_sqe, which is - * mmapped by the application using the IORING_OFF_SQES offset. - * - * This indirection could e.g. be used to assign fixed - * io_uring_sqe entries to operations and only submit them to - * the queue when needed. - * - * The kernel modifies neither the indices array nor the entries - * array. - */ - u32 *sq_array; - struct io_uring_sqe *sq_sqes; - unsigned cached_sq_head; - unsigned sq_entries; - struct list_head defer_list; - - /* - * Fixed resources fast path, should be accessed only under - * uring_lock, and updated through io_uring_register(2) - */ - struct io_rsrc_node *rsrc_node; - int rsrc_cached_refs; - atomic_t cancel_seq; - struct io_file_table file_table; - unsigned nr_user_files; - unsigned nr_user_bufs; - struct io_mapped_ubuf **user_bufs; - - struct io_submit_state submit_state; - - struct io_buffer_list *io_bl; - struct xarray io_bl_xa; - struct list_head io_buffers_cache; - - struct list_head timeout_list; - struct list_head ltimeout_list; - struct list_head cq_overflow_list; - struct list_head apoll_cache; - struct xarray personalities; - u32 pers_next; - unsigned sq_thread_idle; - } ____cacheline_aligned_in_smp; - - /* IRQ completion list, under ->completion_lock */ - struct io_wq_work_list locked_free_list; - unsigned int locked_free_nr; - - const struct cred *sq_creds; /* cred used for __io_sq_thread() */ - struct io_sq_data *sq_data; /* if using sq thread polling */ - - struct wait_queue_head sqo_sq_wait; - struct list_head sqd_list; - - unsigned long check_cq; - - struct { - /* - * We cache a range of free CQEs we can use, once exhausted it - * should go through a slower range setup, see __io_get_cqe() - */ - struct io_uring_cqe *cqe_cached; - struct io_uring_cqe *cqe_sentinel; - - unsigned cached_cq_tail; - unsigned cq_entries; - struct io_ev_fd __rcu *io_ev_fd; - struct wait_queue_head cq_wait; - unsigned cq_extra; - atomic_t cq_timeouts; - unsigned cq_last_tm_flush; - } ____cacheline_aligned_in_smp; - - struct { - spinlock_t completion_lock; - - spinlock_t timeout_lock; - - /* - * ->iopoll_list is protected by the ctx->uring_lock for - * io_uring instances that don't use IORING_SETUP_SQPOLL. - * For SQPOLL, only the single threaded io_sq_thread() will - * manipulate the list, hence no extra locking is needed there. - */ - struct io_wq_work_list iopoll_list; - struct hlist_head *cancel_hash; - unsigned cancel_hash_bits; - bool poll_multi_queue; - - struct list_head io_buffers_comp; - } ____cacheline_aligned_in_smp; - - struct io_restriction restrictions; - - /* slow path rsrc auxilary data, used by update/register */ - struct { - struct io_rsrc_node *rsrc_backup_node; - struct io_mapped_ubuf *dummy_ubuf; - struct io_rsrc_data *file_data; - struct io_rsrc_data *buf_data; - - struct delayed_work rsrc_put_work; - struct llist_head rsrc_put_llist; - struct list_head rsrc_ref_list; - spinlock_t rsrc_ref_lock; |
