diff options
| -rw-r--r-- | include/linux/io_uring_types.h | 3 | ||||
| -rw-r--r-- | include/uapi/linux/io_uring.h | 29 | ||||
| -rw-r--r-- | init/Kconfig | 13 | ||||
| -rw-r--r-- | io_uring/Makefile | 4 | ||||
| -rw-r--r-- | io_uring/eventfd.c | 13 | ||||
| -rw-r--r-- | io_uring/fdinfo.c | 14 | ||||
| -rw-r--r-- | io_uring/io-wq.c | 25 | ||||
| -rw-r--r-- | io_uring/io_uring.c | 212 | ||||
| -rw-r--r-- | io_uring/io_uring.h | 12 | ||||
| -rw-r--r-- | io_uring/kbuf.c | 96 | ||||
| -rw-r--r-- | io_uring/kbuf.h | 94 | ||||
| -rw-r--r-- | io_uring/napi.c | 35 | ||||
| -rw-r--r-- | io_uring/napi.h | 16 | ||||
| -rw-r--r-- | io_uring/net.c | 27 | ||||
| -rw-r--r-- | io_uring/register.c | 31 | ||||
| -rw-r--r-- | io_uring/rsrc.c | 149 | ||||
| -rw-r--r-- | io_uring/rsrc.h | 12 | ||||
| -rw-r--r-- | io_uring/rw.c | 19 | ||||
| -rw-r--r-- | io_uring/sqpoll.c | 7 |
19 files changed, 573 insertions, 238 deletions
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3315005df117..4b9ba523978d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -239,6 +239,9 @@ struct io_ring_ctx { struct io_rings *rings; struct percpu_ref refs; + clockid_t clockid; + enum tk_offsets clock_offset; + enum task_work_notify_mode notify_method; unsigned sq_thread_idle; } ____cacheline_aligned_in_smp; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index adc2524fd8e3..a275f91d2ac0 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -440,11 +440,21 @@ struct io_uring_cqe { * IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv * IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct * them from sends. + * IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get + * more completions. In other words, the buffer is being + * partially consumed, and will be used by the kernel for + * more completions. This is only set for buffers used via + * the incremental buffer consumption, as provided by + * a ring buffer setup with IOU_PBUF_RING_INC. For any + * other provided buffer type, all completions with a + * buffer passed back is automatically returned to the + * application. */ #define IORING_CQE_F_BUFFER (1U << 0) #define IORING_CQE_F_MORE (1U << 1) #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) #define IORING_CQE_F_NOTIF (1U << 3) +#define IORING_CQE_F_BUF_MORE (1U << 4) #define IORING_CQE_BUFFER_SHIFT 16 @@ -507,6 +517,7 @@ struct io_cqring_offsets { #define IORING_ENTER_SQ_WAIT (1U << 2) #define IORING_ENTER_EXT_ARG (1U << 3) #define IORING_ENTER_REGISTERED_RING (1U << 4) +#define IORING_ENTER_ABS_TIMER (1U << 5) /* * Passed in for io_uring_setup(2). Copied back with updated info on success @@ -542,6 +553,7 @@ struct io_uring_params { #define IORING_FEAT_LINKED_FILE (1U << 12) #define IORING_FEAT_REG_REG_RING (1U << 13) #define IORING_FEAT_RECVSEND_BUNDLE (1U << 14) +#define IORING_FEAT_MIN_TIMEOUT (1U << 15) /* * io_uring_register(2) opcodes and arguments @@ -595,6 +607,8 @@ enum io_uring_register_op { IORING_REGISTER_NAPI = 27, IORING_UNREGISTER_NAPI = 28, + IORING_REGISTER_CLOCK = 29, + /* this goes last */ IORING_REGISTER_LAST, @@ -675,6 +689,11 @@ struct io_uring_restriction { __u32 resv2[3]; }; +struct io_uring_clock_register { + __u32 clockid; + __u32 __resv[3]; +}; + struct io_uring_buf { __u64 addr; __u32 len; @@ -707,9 +726,17 @@ struct io_uring_buf_ring { * mmap(2) with the offset set as: * IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) * to get a virtual mapping for the ring. + * IOU_PBUF_RING_INC: If set, buffers consumed from this buffer ring can be + * consumed incrementally. Normally one (or more) buffers + * are fully consumed. With incremental consumptions, it's + * feasible to register big ranges of buffers, and each + * use of it will consume only as much as it needs. This + * requires that both the kernel and application keep + * track of where the current read/recv index is at. */ enum io_uring_register_pbuf_ring_flags { IOU_PBUF_RING_MMAP = 1, + IOU_PBUF_RING_INC = 2, }; /* argument for IORING_(UN)REGISTER_PBUF_RING */ @@ -758,7 +785,7 @@ enum io_uring_register_restriction_op { struct io_uring_getevents_arg { __u64 sigmask; __u32 sigmask_sz; - __u32 pad; + __u32 min_wait_usec; __u64 ts; }; diff --git a/init/Kconfig b/init/Kconfig index 5783a0b87517..3b6ca7cce03b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1687,6 +1687,19 @@ config IO_URING applications to submit and complete IO through submission and completion rings that are shared between the kernel and application. +config GCOV_PROFILE_URING + bool "Enable GCOV profiling on the io_uring subsystem" + depends on GCOV_KERNEL + help + Enable GCOV profiling on the io_uring subsystem, to facilitate + code coverage testing. + + If unsure, say N. + + Note that this will have a negative impact on the performance of + the io_uring subsystem, hence this should only be enabled for + specific test purposes. + config ADVISE_SYSCALLS bool "Enable madvise/fadvise syscalls" if EXPERT default y diff --git a/io_uring/Makefile b/io_uring/Makefile index 61923e11c767..53167bef37d7 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,6 +2,10 @@ # # Makefile for io_uring +ifdef CONFIG_GCOV_PROFILE_URING +GCOV_PROFILE := y +endif + obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ tctx.o filetable.o rw.o net.o poll.o \ eventfd.o uring_cmd.o openclose.o \ diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index b9384503a2b7..e37fddd5d9ce 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -15,7 +15,7 @@ struct io_ev_fd { struct eventfd_ctx *cq_ev_fd; unsigned int eventfd_async: 1; struct rcu_head rcu; - atomic_t refs; + refcount_t refs; atomic_t ops; }; @@ -37,7 +37,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu) eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - if (atomic_dec_and_test(&ev_fd->refs)) + if (refcount_dec_and_test(&ev_fd->refs)) io_eventfd_free(rcu); } @@ -63,7 +63,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx) */ if (unlikely(!ev_fd)) return; - if (!atomic_inc_not_zero(&ev_fd->refs)) + if (!refcount_inc_not_zero(&ev_fd->refs)) return; if (ev_fd->eventfd_async && !io_wq_current_is_worker()) goto out; @@ -77,7 +77,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx) } } out: - if (atomic_dec_and_test(&ev_fd->refs)) + if (refcount_dec_and_test(&ev_fd->refs)) call_rcu(&ev_fd->rcu, io_eventfd_free); } @@ -126,6 +126,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); if (IS_ERR(ev_fd->cq_ev_fd)) { int ret = PTR_ERR(ev_fd->cq_ev_fd); + kfree(ev_fd); return ret; } @@ -136,7 +137,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, ev_fd->eventfd_async = eventfd_async; ctx->has_evfd = true; - atomic_set(&ev_fd->refs, 1); + refcount_set(&ev_fd->refs, 1); atomic_set(&ev_fd->ops, 0); rcu_assign_pointer(ctx->io_ev_fd, ev_fd); return 0; @@ -151,7 +152,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx) if (ev_fd) { ctx->has_evfd = false; rcu_assign_pointer(ctx->io_ev_fd, NULL); - if (atomic_dec_and_test(&ev_fd->refs)) + if (refcount_dec_and_test(&ev_fd->refs)) call_rcu(&ev_fd->rcu, io_eventfd_free); return 0; } diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index b1e0e0d85349..d43e1b5fcb36 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -221,7 +221,19 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) cqe->user_data, cqe->res, cqe->flags); } - spin_unlock(&ctx->completion_lock); + +#ifdef CONFIG_NET_RX_BUSY_POLL + if (ctx->napi_enabled) { + seq_puts(m, "NAPI:\tenabled\n"); + seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt); + if (ctx->napi_prefer_busy_poll) + seq_puts(m, "napi_prefer_busy_poll:\ttrue\n"); + else + seq_puts(m, "napi_prefer_busy_poll:\tfalse\n"); + } else { + seq_puts(m, "NAPI:\tdisabled\n"); + } +#endif } #endif diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index f1e7c670add8..a38f36b68060 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/rculist_nulls.h> #include <linux/cpu.h> +#include <linux/cpuset.h> #include <linux/task_work.h> #include <linux/audit.h> #include <linux/mmu_context.h> @@ -1167,7 +1168,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL)) goto err; - cpumask_copy(wq->cpu_mask, cpu_possible_mask); + cpuset_cpus_allowed(data->task, wq->cpu_mask); wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; wq->acct[IO_WQ_ACCT_UNBOUND].max_workers = task_rlimit(current, RLIMIT_NPROC); @@ -1322,17 +1323,29 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask) { + cpumask_var_t allowed_mask; + int ret = 0; + if (!tctx || !tctx->io_wq) return -EINVAL; + if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL)) + return -ENOMEM; + rcu_read_lock(); - if (mask) - cpumask_copy(tctx->io_wq->cpu_mask, mask); - else - cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask); + cpuset_cpus_allowed(tctx->io_wq->task, allowed_mask); + if (mask) { + if (cpumask_subset(mask, allowed_mask)) + cpumask_copy(tctx->io_wq->cpu_mask, mask); + else + ret = -EINVAL; + } else { + cpumask_copy(tctx->io_wq->cpu_mask, allowed_mask); + } rcu_read_unlock(); - return 0; + free_cpumask_var(allowed_mask); + return ret; } /* diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3942db160f18..1aca501efaf6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -904,7 +904,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) lockdep_assert_held(&req->ctx->uring_lock); req_set_fail(req); - io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); + io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED)); if (def->fail) def->fail(req); io_req_complete_defer(req); @@ -2350,22 +2350,92 @@ static bool current_pending_io(void) return percpu_counter_read_positive(&tctx->inflight); } -/* when returns >0, the caller should retry */ -static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq) +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) { - int ret; + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); - if (unlikely(READ_ONCE(ctx->check_cq))) - return 1; - if (unlikely(!llist_empty(&ctx->work_llist))) - return 1; - if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) - return 1; - if (unlikely(task_sigpending(current))) - return -EINTR; - if (unlikely(io_should_wake(iowq))) - return 0; + WRITE_ONCE(iowq->hit_timeout, 1); + iowq->min_timeout = 0; + wake_up_process(iowq->wq.private); + return HRTIMER_NORESTART; +} + +/* + * Doing min_timeout portion. If we saw any timeouts, events, or have work, + * wake up. If not, and we have a normal timeout, switch to that and keep + * sleeping. + */ +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) +{ + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); + struct io_ring_ctx *ctx = iowq->ctx; + + /* no general timeout, or shorter (or equal), we are done */ + if (iowq->timeout == KTIME_MAX || + ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) + goto out_wake; + /* work we may need to run, wake function will see if we need to wake */ + if (io_has_work(ctx)) + goto out_wake; + /* got events since we started waiting, min timeout is done */ + if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail)) + goto out_wake; + /* if we have any events and min timeout expired, we're done */ + if (io_cqring_events(ctx)) + goto out_wake; + + /* + * If using deferred task_work running and application is waiting on + * more than one request, ensure we reset it now where we are switching + * to normal sleeps. Any request completion post min_wait should wake + * the task and return. + */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + if (!llist_empty(&ctx->work_llist)) + goto out_wake; + } + + iowq->t.function = io_cqring_timer_wakeup; + hrtimer_set_expires(timer, iowq->timeout); + return HRTIMER_RESTART; +out_wake: + return io_cqring_timer_wakeup(timer); +} + +static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, + clockid_t clock_id, ktime_t start_time) +{ + ktime_t timeout; + + hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS); + if (iowq->min_timeout) { + timeout = ktime_add_ns(iowq->min_timeout, start_time); + iowq->t.function = io_cqring_min_timer_wakeup; + } else { + timeout = iowq->timeout; + iowq->t.function = io_cqring_timer_wakeup; + } + + hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); + hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); + + if (!READ_ONCE(iowq->hit_timeout)) + schedule(); + + hrtimer_cancel(&iowq->t); + destroy_hrtimer_on_stack(&iowq->t); + __set_current_state(TASK_RUNNING); + + return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; +} + +static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + ktime_t start_time) +{ + int ret = 0; /* * Mark us as being in io_wait if we have pending requests, so cpufreq @@ -2374,25 +2444,50 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, */ if (current_pending_io()) current->in_iowait = 1; - ret = 0; - if (iowq->timeout == KTIME_MAX) + if (iowq->timeout != KTIME_MAX || iowq->min_timeout) + ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); + else schedule(); - else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) - ret = -ETIME; current->in_iowait = 0; return ret; } +/* If this returns > 0, the caller should retry */ +static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + ktime_t start_time) +{ + if (unlikely(READ_ONCE(ctx->check_cq))) + return 1; + if (unlikely(!llist_empty(&ctx->work_llist))) + return 1; + if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) + return 1; + if (unlikely(task_sigpending(current))) + return -EINTR; + if (unlikely(io_should_wake(iowq))) + return 0; + + return __io_cqring_wait_schedule(ctx, iowq, start_time); +} + +struct ext_arg { + size_t argsz; + struct __kernel_timespec __user *ts; + const sigset_t __user *sig; + ktime_t min_time; +}; + /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. */ -static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, - const sigset_t __user *sig, size_t sigsz, - struct __kernel_timespec __user *uts) +static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, + struct ext_arg *ext_arg) { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; + ktime_t start_time; int ret; if (!io_allowed_run_tw(ctx)) @@ -2410,30 +2505,33 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); iowq.ctx = ctx; - iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; + iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail); + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + iowq.hit_timeout = 0; + iowq.min_timeout = ext_arg->min_time; iowq.timeout = KTIME_MAX; + start_time = io_get_time(ctx); - if (uts) { + if (ext_arg->ts) { struct timespec64 ts; - ktime_t dt; - if (get_timespec64(&ts, uts)) + if (get_timespec64(&ts, ext_arg->ts)) return -EFAULT; - dt = timespec64_to_ktime(ts); - iowq.timeout = ktime_add(dt, ktime_get()); - io_napi_adjust_timeout(ctx, &iowq, dt); + iowq.timeout = timespec64_to_ktime(ts); + if (!(flags & IORING_ENTER_ABS_TIMER)) + iowq.timeout = ktime_add(iowq.timeout, start_time); } - if (sig) { + if (ext_arg->sig) { #ifdef CONFIG_COMPAT if (in_compat_syscall()) - ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, - sigsz); + ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, + ext_arg->argsz); else #endif - ret = set_user_sigmask(sig, sigsz); + ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); if (ret) return ret; @@ -2443,8 +2541,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, trace_io_uring_cqring_wait(ctx, min_events); do { - int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); unsigned long check_cq; + int nr_wait; + + /* if min timeout has been hit, don't reset wait count */ + if (!iowq.hit_timeout) + nr_wait = (int) iowq.cq_tail - + READ_ONCE(ctx->rings->cq.tail); + else + nr_wait = 1; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, nr_wait); @@ -2454,7 +2559,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, TASK_INTERRUPTIBLE); } - ret = io_cqring_wait_schedule(ctx, &iowq); + ret = io_cqring_wait_schedule(ctx, &iowq, start_time); __set_current_state(TASK_RUNNING); atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); @@ -3112,9 +3217,8 @@ static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t a return 0; } -static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, - struct __kernel_timespec __user **ts, - const sigset_t __user **sig) +static int io_get_ext_arg(unsigned flags, const void __user *argp, + struct ext_arg *ext_arg) { struct io_uring_getevents_arg arg; @@ -3123,8 +3227,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz * is just a pointer to the sigset_t. */ if (!(flags & IORING_ENTER_EXT_ARG)) { - *sig = (const sigset_t __user *) argp; - *ts = NULL; + ext_arg->sig = (const sigset_t __user *) argp; + ext_arg->ts = NULL; return 0; } @@ -3132,15 +3236,14 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz * EXT_ARG is set - ensure we agree on the size of it and copy in our * timespec and sigset_t pointers if good. */ - if (*argsz != sizeof(arg)) + if (ext_arg->argsz != sizeof(arg)) return -EINVAL; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; - if (arg.pad) - return -EINVAL; - *sig = u64_to_user_ptr(arg.sigmask); - *argsz = arg.sigmask_sz; - *ts = u64_to_user_ptr(arg.ts); + ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC; + ext_arg->sig = u64_to_user_ptr(arg.sigmask); + ext_arg->argsz = arg.sigmask_sz; + ext_arg->ts = u64_to_user_ptr(arg.ts); return 0; } @@ -3154,7 +3257,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | - IORING_ENTER_REGISTERED_RING))) + IORING_ENTER_REGISTERED_RING | + IORING_ENTER_ABS_TIMER))) return -EINVAL; /* @@ -3245,15 +3349,14 @@ iopoll_locked: } mutex_unlock(&ctx->uring_lock); } else { - const sigset_t __user *sig; - struct __kernel_timespec __user *ts; + struct ext_arg ext_arg = { .argsz = argsz }; - ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); + ret2 = io_get_ext_arg(flags, argp, &ext_arg); if (likely(!ret2)) { min_complete = min(min_complete, ctx->cq_entries); - ret2 = io_cqring_wait(ctx, min_complete, sig, - argsz, ts); + ret2 = io_cqring_wait(ctx, min_complete, flags, + &ext_arg); } } @@ -3424,6 +3527,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (!ctx) return -ENOMEM; + ctx->clockid = CLOCK_MONOTONIC; + ctx->clock_offset = 0; + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && !(ctx->flags & IORING_SETUP_IOPOLL) && !(ctx->flags & IORING_SETUP_SQPOLL)) @@ -3535,7 +3641,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | - IORING_FEAT_RECVSEND_BUNDLE; + IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index c2acf6180845..65078e641390 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -39,8 +39,12 @@ struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; unsigned cq_tail; + unsigned cq_min_tail; unsigned nr_timeouts; + int hit_timeout; + ktime_t min_timeout; ktime_t timeout; + struct hrtimer t; #ifdef CONFIG_NET_RX_BUSY_POLL ktime_t napi_busy_poll_dt; @@ -437,6 +441,14 @@ static inline bool io_file_can_poll(struct io_kiocb *req) return false; } +static inline ktime_t io_get_time(struct io_ring_ctx *ctx) +{ + if (ctx->clockid == CLOCK_MONOTONIC) + return ktime_get(); + + return ktime_get_with_offset(ctx->clock_offset); +} + enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index bdfa30b38321..d407576ddfb7 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -70,7 +70,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) return true; } -void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) +void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) { /* * We can add this buffer back to two lists: @@ -88,12 +88,12 @@ void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); - __io_put_kbuf_list(req, &ctx->io_buffers_comp); + __io_put_kbuf_list(req, len, &ctx->io_buffers_comp); spin_unlock(&ctx->completion_lock); } else { lockdep_assert_held(&req->ctx->uring_lock); - __io_put_kbuf_list(req, &req->ctx->io_buffers_cache); + __io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache); } } @@ -132,12 +132,6 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, return 1; } -static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br, - __u16 head, __u16 mask) -{ - return &br->bufs[head & mask]; -} - static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl, unsigned int issue_flags) @@ -171,9 +165,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, * the transfer completes (or if we get -EAGAIN and must poll of * retry). */ - req->flags &= ~REQ_F_BUFFERS_COMMIT; + io_kbuf_commit(req, bl, *len, 1); req->buf_list = NULL; - bl->head++; } return u64_to_user_ptr(buf->addr); } @@ -189,7 +182,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, bl = io_buffer_get_list(ctx, req->buf_index); if (likely(bl)) { - if (bl->is_buf_ring) + if (bl->flags & IOBL_BUF_RING) ret = io_ring_buffer_select(req, len, bl, issue_flags); else ret = io_provided_buffer_select(req, len, bl); @@ -219,14 +212,25 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, buf = io_ring_head_to_buf(br, head, bl->mask); if (arg->max_len) { u32 len = READ_ONCE(buf->len); - size_t needed; if (unlikely(!len)) return -ENOBUFS; - needed = (arg->max_len + len - 1) / len; - needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); - if (nr_avail > needed) - nr_avail = needed; + /* + * Limit incremental buffers to 1 segment. No point trying + * to peek ahead and map more than we need, when the buffers + * themselves should be large when setup with + * IOU_PBUF_RING_INC. + */ + if (bl->flags & IOBL_INC) { + nr_avail = 1; + } else { + size_t needed; + + needed = (arg->max_len + len - 1) / len; + needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); + if (nr_avail > needed) + nr_avail = needed; + } } /* @@ -251,16 +255,21 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, req->buf_index = buf->bid; do { - /* truncate end piece, if needed */ - if (buf->len > arg->max_len) - buf->len = arg->max_len; + u32 len = buf->len; + + /* truncate end piece, if needed, for non partial buffers */ + if (len > arg->max_len) { + len = arg->max_len; + if (!(bl->flags & IOBL_INC)) + buf->len = len; + } iov->iov_base = u64_to_user_ptr(buf->addr); - iov->iov_len = buf->len; + iov->iov_len = len; iov++; - arg->out_len += buf->len; - arg->max_len -= buf->len; + arg->out_len += len; + arg->max_len -= len; if (!arg->max_len) break; @@ -287,7 +296,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, if (unlikely(!bl)) goto out_unlock; - if (bl->is_buf_ring) { + if (bl->flags & IOBL_BUF_RING) { ret = io_ring_buffers_peek(req, arg, bl); /* * Don't recycle these buffers if we need to go through poll. @@ -297,8 +306,8 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, * committed them, they cannot be put back in the queue. */ if (ret > 0) { - req->flags |= REQ_F_BL_NO_RECYCLE; - req->buf_list->head += ret; + req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE; + io_kbuf_commit(req, bl, arg->out_len, ret); } } else { ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); @@ -320,7 +329,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) if (unlikely(!bl)) return -ENOENT; - if (bl->is_buf_ring) { + if (bl->flags & IOBL_BUF_RING) { ret = io_ring_buffers_peek(req, arg, bl); if (ret > 0) req->flags |= REQ_F_BUFFERS_COMMIT; @@ -340,22 +349,22 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; - if (bl->is_buf_ring) { + if (bl->flags & IOBL_BUF_RING) { i = bl->buf_ring->tail - bl->head; if (bl->buf_nr_pages) { int j; - if (!bl->is_mmap) { + if (!(bl->flags & IOBL_MMAP)) { for (j = 0; j < bl->buf_nr_pages; j++) unpin_user_page(bl->buf_pages[j]); } io_pages_unmap(bl->buf_ring, &bl->buf_pages, - &bl->buf_nr_pages, bl->is_mmap); - bl->is_mmap = 0; + &bl->buf_nr_pages, bl->flags & IOBL_MMAP); + bl->flags &= ~IOBL_MMAP; } /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); - bl->is_buf_ring = 0; + bl->flags &= ~IOBL_BUF_RING; return i; } @@ -442,7 +451,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) if (bl) { ret = -EINVAL; /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->is_buf_ring) + if (!(bl->flags & IOBL_BUF_RING)) ret = __io_remove_buffers(ctx, bl, p->nbufs); } io_ring_submit_unlock(ctx, issue_flags); @@ -589,7 +598,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) } } /* can't add buffers via this command for a mapped buffer ring */ - if (bl->is_buf_ring) { + if (bl->flags & IOBL_BUF_RING) { ret = -EINVAL; goto err; } @@ -641,8 +650,8 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, bl->buf_pages = pages; bl->buf_nr_pages = nr_pages; bl->buf_ring = br; - bl->is_buf_ring = 1; - bl->is_mmap = 0; + bl->flags |= IOBL_BUF_RING; + bl->flags &= ~IOBL_MMAP; return 0; error_unpin: unpin_user_pages(pages, nr_pages); @@ -665,8 +674,7 @@ static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, return -ENOMEM; } - bl->is_buf_ring = 1; - bl->is_mmap = 1; + bl->flags |= (IOBL_BUF_RING | IOBL_MMAP); return 0; } @@ -683,7 +691,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (reg.resv[0] || reg.resv[1] || reg.resv[2]) return -EINVAL; - if (reg.flags & ~IOU_PBUF_RING_MMAP) + if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; if (!(reg.flags & IOU_PBUF_RING_MMAP)) { if (!reg.ring_addr) @@ -705,7 +713,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->is_buf_ring || !list_empty(&bl->buf_list)) + if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list)) return -EEXIST; } else { free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); @@ -721,6 +729,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (!ret) { bl->nr_entries = reg.ring_entries; bl->mask = reg.ring_entries - 1; + if (reg.flags & IOU_PBUF_RING_INC) + bl->flags |= IOBL_INC; io_buffer_add_list(ctx, bl, reg.bgid); return 0; @@ -747,7 +757,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, reg.bgid); if (!bl) return -ENOENT; - if (!bl->is_buf_ring) + if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; xa_erase(&ctx->io_bl_xa, bl->bgid); @@ -771,7 +781,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, buf_status.buf_group); if (!bl) return -ENOENT; - if (!bl->is_buf_ring) + if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; buf_status.head = bl->head; @@ -802,7 +812,7 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, bl = xa_load(&ctx->io_bl_xa, bgid); /* must be a mmap'able buffer ring and have pages */ ret = false; - if (bl && bl->is_mmap) + if (bl && bl->flags & IOBL_MMAP) ret = atomic_inc_not_zero(&bl->refs); rcu_read_unlock(); diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index b90aca3a57fa..36aadfe5ac0 |
