author    | Jens Axboe <axboe@kernel.dk> | 2025-05-17 18:47:41 -0600
committer | Jens Axboe <axboe@kernel.dk> | 2025-05-17 18:47:41 -0600
commit    | f6873e3f089c6a386df3dac965abb5516c2eccda (patch)
tree      | 53d49ad886d76202f08cd41d40f71bafa7064b54
parent    | c2d8018325b6f0d0940c2544678822e88e41adbc (diff)
parent    | f660fd2ca15a3743f65f6110ae60d5b80500d856 (diff)
download  | linux-block-for-next.tar.gz
Merge branch 'for-6.16/io_uring' into for-next
* for-6.16/io_uring: (53 commits)
io_uring: add new helpers for posting overflows
io_uring: pass in struct io_big_cqe to io_alloc_ocqe()
io_uring: make io_alloc_ocqe() take a struct io_cqe pointer
io_uring: split alloc and add of overflow
io_uring: open code io_req_cqe_overflow()
io_uring/fdinfo: get rid of dumping credentials
io_uring/fdinfo: only compile if CONFIG_PROC_FS is set
io_uring/kbuf: unify legacy buf provision and removal
io_uring/kbuf: refactor __io_remove_buffers
io_uring/kbuf: don't compute size twice on prep
io_uring/kbuf: drop extra vars in io_register_pbuf_ring
io_uring/kbuf: use mem_is_zero()
io_uring/kbuf: account ring io_buffer_list memory
io_uring: drain based on allocates reqs
io_uring: count allocated requests
io_uring: open code io_account_cq_overflow()
io_uring: consolidate drain seq checking
io_uring: remove drain prealloc checks
io_uring: simplify drain ret passing
io_uring: fix spurious drain flushing
...
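Several of the commits above rework the internal bookkeeping for IOSQE_IO_DRAIN (draining is now tracked against allocated requests rather than CQE sequence numbers). The userspace-visible contract is unchanged: a drained SQE is only issued once every earlier submission has completed. A minimal liburing sketch of that contract, assuming liburing is available (the helper names are liburing's, not part of this diff):

```c
#include <liburing.h>

/* Submit two ordinary requests plus one drained request; the drained
 * one only runs after everything submitted before it has completed. */
int submit_with_drain(struct io_uring *ring)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_nop(sqe);

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_nop(sqe);

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_nop(sqe);
	io_uring_sqe_set_flags(sqe, IOSQE_IO_DRAIN);	/* hold back until prior SQEs complete */

	return io_uring_submit(ring);
}
```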
-rw-r--r-- | include/linux/io_uring_types.h | 15
-rw-r--r-- | include/uapi/linux/io_uring.h | 8
-rw-r--r-- | io_uring/Makefile | 6
-rw-r--r-- | io_uring/cmd_net.c | 83
-rw-r--r-- | io_uring/eventfd.c | 66
-rw-r--r-- | io_uring/eventfd.h | 3
-rw-r--r-- | io_uring/fdinfo.c | 40
-rw-r--r-- | io_uring/io-wq.c | 15
-rw-r--r-- | io_uring/io-wq.h | 5
-rw-r--r-- | io_uring/io_uring.c | 283
-rw-r--r-- | io_uring/io_uring.h | 3
-rw-r--r-- | io_uring/kbuf.c | 146
-rw-r--r-- | io_uring/kbuf.h | 8
-rw-r--r-- | io_uring/memmap.c | 11
-rw-r--r-- | io_uring/memmap.h | 4
-rw-r--r-- | io_uring/net.c | 48
-rw-r--r-- | io_uring/notif.c | 1
-rw-r--r-- | io_uring/opdef.c | 11
-rw-r--r-- | io_uring/openclose.c | 133
-rw-r--r-- | io_uring/openclose.h | 3
-rw-r--r-- | io_uring/rsrc.c | 89
-rw-r--r-- | io_uring/rsrc.h | 28
-rw-r--r-- | io_uring/rw.c | 5
-rw-r--r-- | io_uring/rw.h | 2
-rw-r--r-- | io_uring/tctx.c | 2
-rw-r--r-- | io_uring/timeout.c | 11
-rw-r--r-- | io_uring/timeout.h | 13
-rw-r--r-- | io_uring/uring_cmd.c | 83
-rw-r--r-- | io_uring/zcrx.c | 372
-rw-r--r-- | io_uring/zcrx.h | 26
30 files changed, 866 insertions, 657 deletions
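The most user-visible addition in the diff below is the new IORING_OP_PIPE opcode (uapi enum plus io_pipe_prep()/io_pipe() in io_uring/openclose.c): it creates a pipe and returns the two descriptors through the pointer in sqe->addr, either as regular fds or, via sqe->file_index, as fixed-file slots. A rough userspace sketch, assuming a uapi header new enough to define IORING_OP_PIPE and using liburing only for ring setup; the field usage mirrors io_pipe_prep():

```c
#include <liburing.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fds[2] = { -1, -1 };

	if (io_uring_queue_init(8, &ring, 0))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_PIPE;		/* new opcode added in this merge */
	sqe->addr = (unsigned long) fds;	/* int[2], filled in on success */
	/* sqe->pipe_flags (op-specific flags union) left 0: plain pipe */
	/* sqe->file_index left 0: install regular fds, not fixed-file slots */

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	if (cqe->res == 0)
		printf("pipe: read fd %d, write fd %d\n", fds[0], fds[1]);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}
```

With sqe->file_index set to a slot (or IORING_FILE_INDEX_ALLOC), io_pipe_fixed() instead installs both pipe ends into the fixed-file table, and O_CLOEXEC is rejected on that path.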
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b44d201520d857..2922635986f521 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -40,8 +40,6 @@ enum io_uring_cmd_flags { IO_URING_F_TASK_DEAD = (1 << 13), }; -struct io_zcrx_ifq; - struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -343,7 +341,6 @@ struct io_ring_ctx { unsigned cached_cq_tail; unsigned cq_entries; struct io_ev_fd __rcu *io_ev_fd; - unsigned cq_extra; void *cq_wait_arg; size_t cq_wait_size; @@ -394,7 +391,8 @@ struct io_ring_ctx { struct wait_queue_head poll_wq; struct io_restriction restrictions; - struct io_zcrx_ifq *ifq; + /* Stores zcrx object pointers of type struct io_zcrx_ifq */ + struct xarray zcrx_ctxs; u32 pers_next; struct xarray personalities; @@ -418,6 +416,7 @@ struct io_ring_ctx { struct callback_head poll_wq_task_work; struct list_head defer_list; + unsigned nr_drained; struct io_alloc_cache msg_cache; spinlock_t msg_lock; @@ -436,6 +435,7 @@ struct io_ring_ctx { /* protected by ->completion_lock */ unsigned evfd_last_cq_tail; + unsigned nr_req_allocated; /* * Protection for resize vs mmap races - both the mmap and resize @@ -448,8 +448,6 @@ struct io_ring_ctx { struct io_mapped_region ring_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; - /* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */ - struct io_mapped_region zcrx_region; }; /* @@ -653,8 +651,7 @@ struct io_kiocb { u8 iopoll_completed; /* * Can be either a fixed buffer index, or used with provided buffers. - * For the latter, before issue it points to the buffer group ID, - * and after selection it points to the buffer ID itself. + * For the latter, it points to the selected buffer ID. 
*/ u16 buf_index; @@ -713,7 +710,7 @@ struct io_kiocb { const struct cred *creds; struct io_wq_work work; - struct { + struct io_big_cqe { u64 extra1; u64 extra2; } big_cqe; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 50e372ea97c549..cfd17e382082fa 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -73,6 +73,7 @@ struct io_uring_sqe { __u32 futex_flags; __u32 install_fd_flags; __u32 nop_flags; + __u32 pipe_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -287,6 +288,7 @@ enum io_uring_op { IORING_OP_EPOLL_WAIT, IORING_OP_READV_FIXED, IORING_OP_WRITEV_FIXED, + IORING_OP_PIPE, /* this goes last, obviously */ IORING_OP_LAST, @@ -992,12 +994,16 @@ struct io_uring_zcrx_offsets { __u64 __resv[2]; }; +enum io_uring_zcrx_area_flags { + IORING_ZCRX_AREA_DMABUF = 1, +}; + struct io_uring_zcrx_area_reg { __u64 addr; __u64 len; __u64 rq_area_token; __u32 flags; - __u32 __resv1; + __u32 dmabuf_fd; __u64 __resv2[2]; }; diff --git a/io_uring/Makefile b/io_uring/Makefile index 3e28a741ca15c3..d97c6b51d58491 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,11 +7,11 @@ GCOV_PROFILE := y endif obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ - tctx.o filetable.o rw.o net.o poll.o \ + tctx.o filetable.o rw.o poll.o \ eventfd.o uring_cmd.o openclose.o \ sqpoll.o xattr.o nop.o fs.o splice.o \ sync.o msg_ring.o advise.o openclose.o \ - statx.o timeout.o fdinfo.o cancel.o \ + statx.o timeout.o cancel.o \ waitid.o register.o truncate.o \ memmap.o alloc_cache.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o @@ -19,3 +19,5 @@ obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_EPOLL) += epoll.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o +obj-$(CONFIG_NET) += net.o cmd_net.o +obj-$(CONFIG_PROC_FS) += fdinfo.o diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c new file mode 100644 index 00000000000000..e99170c7d41add --- /dev/null +++ b/io_uring/cmd_net.c @@ -0,0 +1,83 @@ +#include <asm/ioctls.h> +#include <linux/io_uring/net.h> +#include <net/sock.h> + +#include "uring_cmd.h" + +static inline int io_uring_cmd_getsockopt(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + bool compat = !!(issue_flags & IO_URING_F_COMPAT); + int optlen, optname, level, err; + void __user *optval; + + level = READ_ONCE(sqe->level); + if (level != SOL_SOCKET) + return -EOPNOTSUPP; + + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + + err = do_sock_getsockopt(sock, compat, level, optname, + USER_SOCKPTR(optval), + KERNEL_SOCKPTR(&optlen)); + if (err) + return err; + + /* On success, return optlen */ + return optlen; +} + +static inline int io_uring_cmd_setsockopt(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + bool compat = !!(issue_flags & IO_URING_F_COMPAT); + int optname, optlen, level; + void __user *optval; + sockptr_t optval_s; + + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + level = READ_ONCE(sqe->level); + optval_s = USER_SOCKPTR(optval); + + return do_sock_setsockopt(sock, compat, level, optname, optval_s, + optlen); +} + +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct socket *sock = 
cmd->file->private_data; + struct sock *sk = sock->sk; + struct proto *prot = READ_ONCE(sk->sk_prot); + int ret, arg = 0; + + if (!prot || !prot->ioctl) + return -EOPNOTSUPP; + + switch (cmd->cmd_op) { + case SOCKET_URING_OP_SIOCINQ: + ret = prot->ioctl(sk, SIOCINQ, &arg); + if (ret) + return ret; + return arg; + case SOCKET_URING_OP_SIOCOUTQ: + ret = prot->ioctl(sk, SIOCOUTQ, &arg); + if (ret) + return ret; + return arg; + case SOCKET_URING_OP_GETSOCKOPT: + return io_uring_cmd_getsockopt(sock, cmd, issue_flags); + case SOCKET_URING_OP_SETSOCKOPT: + return io_uring_cmd_setsockopt(sock, cmd, issue_flags); + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(io_uring_cmd_sock); diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index 100d5da94cb95c..78f8ab7db104f0 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -47,13 +47,6 @@ static void io_eventfd_do_signal(struct rcu_head *rcu) io_eventfd_put(ev_fd); } -static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref) -{ - if (put_ref) - io_eventfd_put(ev_fd); - rcu_read_unlock(); -} - /* * Returns true if the caller should put the ev_fd reference, false if not. */ @@ -72,63 +65,34 @@ static bool __io_eventfd_signal(struct io_ev_fd *ev_fd) /* * Trigger if eventfd_async isn't set, or if it's set and the caller is - * an async worker. If ev_fd isn't valid, obviously return false. + * an async worker. */ static bool io_eventfd_trigger(struct io_ev_fd *ev_fd) { - if (ev_fd) - return !ev_fd->eventfd_async || io_wq_current_is_worker(); - return false; + return !ev_fd->eventfd_async || io_wq_current_is_worker(); } -/* - * On success, returns with an ev_fd reference grabbed and the RCU read - * lock held. - */ -static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx) +void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) { + bool skip = false; struct io_ev_fd *ev_fd; if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - return NULL; - - rcu_read_lock(); + return; - /* - * rcu_dereference ctx->io_ev_fd once and use it for both for checking - * and eventfd_signal - */ + guard(rcu)(); ev_fd = rcu_dereference(ctx->io_ev_fd); - /* * Check again if ev_fd exists in case an io_eventfd_unregister call * completed between the NULL check of ctx->io_ev_fd at the start of * the function and rcu_read_lock. */ - if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs)) - return ev_fd; - - rcu_read_unlock(); - return NULL; -} - -void io_eventfd_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = io_eventfd_grab(ctx); - if (ev_fd) - io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd)); -} - -void io_eventfd_flush_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = io_eventfd_grab(ctx); - if (ev_fd) { - bool skip, put_ref = true; + if (!ev_fd) + return; + if (!io_eventfd_trigger(ev_fd) || !refcount_inc_not_zero(&ev_fd->refs)) + return; + if (cqe_event) { /* * Eventfd should only get triggered when at least one event * has been posted. 
Some applications rely on the eventfd @@ -142,12 +106,10 @@ void io_eventfd_flush_signal(struct io_ring_ctx *ctx) skip = ctx->cached_cq_tail == ev_fd->last_cq_tail; ev_fd->last_cq_tail = ctx->cached_cq_tail; spin_unlock(&ctx->completion_lock); - - if (!skip) - put_ref = __io_eventfd_signal(ev_fd); - - io_eventfd_release(ev_fd, put_ref); } + + if (skip || __io_eventfd_signal(ev_fd)) + io_eventfd_put(ev_fd); } int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h index d394f49c632105..e2f1985c2cf98e 100644 --- a/io_uring/eventfd.h +++ b/io_uring/eventfd.h @@ -4,5 +4,4 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int eventfd_async); int io_eventfd_unregister(struct io_ring_ctx *ctx); -void io_eventfd_flush_signal(struct io_ring_ctx *ctx); -void io_eventfd_signal(struct io_ring_ctx *ctx); +void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event); diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index e0d6a59a89fa1b..e9355276ab5d52 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -15,37 +15,6 @@ #include "cancel.h" #include "rsrc.h" -#ifdef CONFIG_PROC_FS -static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, - const struct cred *cred) -{ - struct user_namespace *uns = seq_user_ns(m); - struct group_info *gi; - kernel_cap_t cap; - int g; - - seq_printf(m, "%5d\n", id); - seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); - seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); - seq_puts(m, "\n\tGroups:\t"); - gi = cred->group_info; - for (g = 0; g < gi->ngroups; g++) { - seq_put_decimal_ull(m, g ? 
" " : "", - from_kgid_munged(uns, gi->gid[g])); - } - seq_puts(m, "\n\tCapEff:\t"); - cap = cred->cap_effective; - seq_put_hex_ll(m, NULL, cap.val, 16); - seq_putc(m, '\n'); - return 0; -} - #ifdef CONFIG_NET_RX_BUSY_POLL static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m, @@ -214,14 +183,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) else seq_printf(m, "%5u: <none>\n", i); } - if (!xa_empty(&ctx->personalities)) { - unsigned long index; - const struct cred *cred; - - seq_printf(m, "Personalities:\n"); - xa_for_each(&ctx->personalities, index, cred) - io_uring_show_cred(m, index, cred); - } seq_puts(m, "PollList:\n"); for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { @@ -264,4 +225,3 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) mutex_unlock(&ctx->uring_lock); } } -#endif diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 04a75d66619510..d52069b1177b0a 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -114,9 +114,6 @@ enum { struct io_wq { unsigned long state; - free_work_fn *free_work; - io_wq_work_fn *do_work; - struct io_wq_hash *hash; atomic_t worker_refs; @@ -612,10 +609,10 @@ static void io_worker_handle_work(struct io_wq_acct *acct, if (do_kill && (work_flags & IO_WQ_WORK_UNBOUND)) atomic_or(IO_WQ_WORK_CANCEL, &work->flags); - wq->do_work(work); + io_wq_submit_work(work); io_assign_current_work(worker, NULL); - linked = wq->free_work(work); + linked = io_wq_free_work(work); work = next_hashed; if (!work && linked && !io_wq_is_hashed(linked)) { work = linked; @@ -934,8 +931,8 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { do { atomic_or(IO_WQ_WORK_CANCEL, &work->flags); - wq->do_work(work); - work = wq->free_work(work); + io_wq_submit_work(work); + work = io_wq_free_work(work); } while (work); } @@ -1195,8 +1192,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) int ret, i; struct io_wq *wq; - if (WARN_ON_ONCE(!data->free_work || !data->do_work)) - return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(!bounded)) return ERR_PTR(-EINVAL); @@ -1206,8 +1201,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) refcount_inc(&data->hash->refs); wq->hash = data->hash; - wq->free_work = data->free_work; - wq->do_work = data->do_work; ret = -ENOMEM; diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index d4fb2940e435f7..774abab54732ef 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -21,9 +21,6 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; -typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *); -typedef void (io_wq_work_fn)(struct io_wq_work *); - struct io_wq_hash { refcount_t refs; unsigned long map; @@ -39,8 +36,6 @@ static inline void io_wq_put_hash(struct io_wq_hash *hash) struct io_wq_data { struct io_wq_hash *hash; struct task_struct *task; - io_wq_work_fn *do_work; - free_work_fn *free_work; }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3f079713914154..d85657ecb8933e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -129,7 +129,6 @@ struct io_defer_entry { struct list_head list; struct io_kiocb *req; - u32 seq; }; /* requests with any of those set should undergo io_disarm_next() */ @@ -149,6 +148,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool is_sqpoll_thread); static void io_queue_sqe(struct io_kiocb *req); +static void 
__io_req_caches_free(struct io_ring_ctx *ctx); static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); @@ -359,6 +359,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; INIT_HLIST_HEAD(&ctx->waitid_list); + xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); #endif @@ -380,25 +381,6 @@ err: return NULL; } -static void io_account_cq_overflow(struct io_ring_ctx *ctx) -{ - struct io_rings *r = ctx->rings; - - WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); - ctx->cq_extra--; -} - -static bool req_need_defer(struct io_kiocb *req, u32 seq) -{ - if (unlikely(req->flags & REQ_F_IO_DRAIN)) { - struct io_ring_ctx *ctx = req->ctx; - - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; - } - - return false; -} - static void io_clean_op(struct io_kiocb *req) { if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) @@ -537,20 +519,37 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } +static unsigned io_linked_nr(struct io_kiocb *req) +{ + struct io_kiocb *tmp; + unsigned nr = 0; + + io_for_each_link(tmp, req) + nr++; + return nr; +} + static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) { - spin_lock(&ctx->completion_lock); + bool drain_seen = false, first = true; + + lockdep_assert_held(&ctx->uring_lock); + __io_req_caches_free(ctx); + while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); - if (req_need_defer(de->req, de->seq)) - break; + drain_seen |= de->req->flags & REQ_F_IO_DRAIN; + if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained) + return; + list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue(de->req); kfree(de); + first = false; } - spin_unlock(&ctx->completion_lock); } void __io_commit_cqring_flush(struct io_ring_ctx *ctx) @@ -559,10 +558,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); - if (ctx->drain_active) - io_queue_deferred(ctx); if (ctx->has_evfd) - io_eventfd_flush_signal(ctx); + io_eventfd_signal(ctx, true); } static inline void __io_cq_lock(struct io_ring_ctx *ctx) @@ -700,27 +697,20 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags, u64 extra1, u64 extra2) +static __cold bool io_cqring_add_overflow(struct io_ring_ctx *ctx, + struct io_overflow_cqe *ocqe) { - struct io_overflow_cqe *ocqe; - size_t ocq_size = sizeof(struct io_overflow_cqe); - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - lockdep_assert_held(&ctx->completion_lock); - if (is_cqe32) - ocq_size += sizeof(struct io_uring_cqe); - - ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); - trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { + struct io_rings *r = ctx->rings; + /* * If we're in ring overflow flush mode, or in task cancel mode, * or cannot allocate an overflow entry, then we need to drop it * on the floor. 
*/ - io_account_cq_overflow(ctx); + WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } @@ -729,23 +719,35 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - ocqe->cqe.user_data = user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; - if (is_cqe32) { - ocqe->cqe.big_cqe[0] = extra1; - ocqe->cqe.big_cqe[1] = extra2; - } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } -static void io_req_cqe_overflow(struct io_kiocb *req) +static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, + struct io_cqe *cqe, + struct io_big_cqe *big_cqe, gfp_t gfp) { - io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, req->big_cqe.extra2); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); + struct io_overflow_cqe *ocqe; + size_t ocq_size = sizeof(struct io_overflow_cqe); + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + + if (is_cqe32) + ocq_size += sizeof(struct io_uring_cqe); + + ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); + trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); + if (ocqe) { + ocqe->cqe.user_data = cqe->user_data; + ocqe->cqe.res = cqe->res; + ocqe->cqe.flags = cqe->flags; + if (is_cqe32 && big_cqe) { + ocqe->cqe.big_cqe[0] = big_cqe->extra1; + ocqe->cqe.big_cqe[1] = big_cqe->extra2; + } + } + if (big_cqe) + big_cqe->extra1 = big_cqe->extra2 = 0; + return ocqe; } /* @@ -790,13 +792,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, { struct io_uring_cqe *cqe; - ctx->cq_extra++; - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. 
- */ if (likely(io_get_cqe(ctx, &cqe))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); @@ -813,14 +808,43 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } +static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags) +{ + return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags }; +} + +static __cold void io_cqe_overflow(struct io_ring_ctx *ctx, struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_KERNEL); + spin_lock(&ctx->completion_lock); + io_cqring_add_overflow(ctx, ocqe); + spin_unlock(&ctx->completion_lock); +} + +static __cold bool io_cqe_overflow_locked(struct io_ring_ctx *ctx, + struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_ATOMIC); + return io_cqring_add_overflow(ctx, ocqe); +} + bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - if (!filled) - filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + if (unlikely(!filled)) { + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); + + filled = io_cqe_overflow_locked(ctx, &cqe, NULL); + } io_cq_unlock_post(ctx); return filled; } @@ -831,10 +855,13 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags */ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { + lockdep_assert_held(&ctx->uring_lock); + lockdep_assert(ctx->lockless_cq); + if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { - spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); - spin_unlock(&ctx->completion_lock); + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); + + io_cqe_overflow(ctx, &cqe, NULL); } ctx->submit_state.cq_flush = true; } @@ -924,22 +951,6 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) } /* - * Don't initialise the fields below on every allocation, but do that in - * advance and keep them valid across allocations. - */ -static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) -{ - req->ctx = ctx; - req->buf_node = NULL; - req->file_node = NULL; - req->link = NULL; - req->async_data = NULL; - /* not necessary, but safer to zero */ - memset(&req->cqe, 0, sizeof(req->cqe)); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); -} - -/* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. 
* Because of that, io_alloc_req() should be called only under ->uring_lock @@ -948,7 +959,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO; void *reqs[IO_REQ_ALLOC_BATCH]; int ret; @@ -966,10 +977,11 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) } percpu_ref_get_many(&ctx->refs, ret); + ctx->nr_req_allocated += ret; + while (ret--) { struct io_kiocb *req = reqs[ret]; - io_preinit_req(req, ctx); io_req_add_to_cache(req, ctx); } return true; @@ -1191,7 +1203,7 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ctx->has_evfd) - io_eventfd_signal(ctx); + io_eventfd_signal(ctx, false); } nr_wait = atomic_read(&ctx->cq_wait_nr); @@ -1383,6 +1395,16 @@ void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } +static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) +{ + if (req->file_node) { + io_put_rsrc_node(req->ctx, req->file_node); + req->file_node = NULL; + } + if (req->flags & REQ_F_BUF_NODE) + io_put_rsrc_node(req->ctx, req->buf_node); +} + static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) @@ -1443,13 +1465,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) */ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { - if (ctx->lockless_cq) { - spin_lock(&ctx->completion_lock); - io_req_cqe_overflow(req); - spin_unlock(&ctx->completion_lock); - } else { - io_req_cqe_overflow(req); - } + if (ctx->lockless_cq) + io_cqe_overflow(ctx, &req->cqe, &req->big_cqe); + else + io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe); } } __io_cq_unlock_post(ctx); @@ -1458,6 +1477,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } + + if (unlikely(ctx->drain_active)) + io_queue_deferred(ctx); + ctx->submit_state.cq_flush = false; } @@ -1645,56 +1668,28 @@ io_req_flags_t io_file_get_flags(struct file *file) return res; } -static u32 io_get_sequence(struct io_kiocb *req) -{ - u32 seq = req->ctx->cached_sq_head; - struct io_kiocb *cur; - - /* need original cached_sq_head, but it was increased for each req */ - io_for_each_link(cur, req) - seq--; - return seq; -} - static __cold void io_drain_req(struct io_kiocb *req) __must_hold(&ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; + bool drain = req->flags & IOSQE_IO_DRAIN; struct io_defer_entry *de; - int ret; - u32 seq = io_get_sequence(req); - - /* Still need defer if there is pending req in defer list. 
*/ - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); -queue: - ctx->drain_active = false; - io_req_task_queue(req); - return; - } - spin_unlock(&ctx->completion_lock); - io_prep_async_link(req); - de = kmalloc(sizeof(*de), GFP_KERNEL); + de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT); if (!de) { - ret = -ENOMEM; - io_req_defer_failed(req, ret); + io_req_defer_failed(req, -ENOMEM); return; } - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); - kfree(de); - goto queue; - } - + io_prep_async_link(req); trace_io_uring_defer(req); de->req = req; - de->seq = seq; + + ctx->nr_drained += io_linked_nr(req); list_add_tail(&de->list, &ctx->defer_list); - spin_unlock(&ctx->completion_lock); + io_queue_deferred(ctx); + if (!drain && list_empty(&ctx->defer_list)) + ctx->drain_active = false; } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, @@ -1815,7 +1810,7 @@ void io_wq_submit_work(struct io_wq_work *work) bool needs_poll = false; int ret = 0, err = -ECANCELED; - /* one will be dropped by ->io_wq_free_work() after returning to io-wq */ + /* one will be dropped by io_wq_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) __io_req_set_refcount(req, 2); else @@ -1913,7 +1908,8 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->file_table.data, fd); if (node) { - io_req_assign_rsrc_node(&req->file_node, node); + node->refs++; + req->file_node = node; req->flags |= io_slot_flags(node); file = io_slot_file(node); } @@ -2046,7 +2042,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, int personality; u8 opcode; - /* req is partially pre-initialised, see io_preinit_req() */ + req->ctx = ctx; req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ sqe_flags = READ_ONCE(sqe->flags); @@ -2277,10 +2273,6 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) { head = READ_ONCE(ctx->sq_array[head]); if (unlikely(head >= ctx->sq_entries)) { - /* drop invalid entries */ - spin_lock(&ctx->completion_lock); - ctx->cq_extra--; - spin_unlock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_dropped, READ_ONCE(ctx->rings->sq_dropped) + 1); return false; @@ -2698,21 +2690,26 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, return off; } -static void io_req_caches_free(struct io_ring_ctx *ctx) +static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) { struct io_kiocb *req; int nr = 0; - mutex_lock(&ctx->uring_lock); - while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); kmem_cache_free(req_cachep, req); nr++; } - if (nr) + if (nr) { + ctx->nr_req_allocated -= nr; percpu_ref_put_many(&ctx->refs, nr); - mutex_unlock(&ctx->uring_lock); + } +} + +static __cold void io_req_caches_free(struct io_ring_ctx *ctx) +{ + guard(mutex)(&ctx->uring_lock); + __io_req_caches_free(ctx); } static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) @@ -2748,6 +2745,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); + + WARN_ON_ONCE(ctx->nr_req_allocated); + if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); io_napi_free(ctx); @@ 
-2882,7 +2882,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } - if (ctx->ifq) { + if (!xa_empty(&ctx->zcrx_ctxs)) { mutex_lock(&ctx->uring_lock); io_shutdown_zcrx_ifqs(ctx); mutex_unlock(&ctx->uring_lock); @@ -3014,20 +3014,19 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, struct io_defer_entry *de; LIST_HEAD(list); - spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { if (io_match_task_safe(de->req, tctx, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } } - spin_unlock(&ctx->completion_lock); if (list_empty(&list)) return false; while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue_fail(de->req, -ECANCELED); kfree(de); } @@ -3102,8 +3101,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && io_allowed_defer_tw_run(ctx)) ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; - ret |= io_cancel_defer_files(ctx, tctx, cancel_all); mutex_lock(&ctx->uring_lock); + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); ret |= io_poll_remove_all(ctx, tctx, cancel_all); ret |= io_waitid_remove_all(ctx, tctx, cancel_all); ret |= io_futex_remove_all(ctx, tctx, cancel_all); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e4050b2d0821f5..81f22196a57de6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -196,7 +196,6 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, { io_lockdep_assert_cq_locked(ctx); - ctx->cq_extra++; ctx->submit_state.cq_flush = true; return io_get_cqe(ctx, cqe_ret); } @@ -414,7 +413,7 @@ static inline void io_req_complete_defer(struct io_kiocb *req) static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || + if (unlikely(ctx->off_timeout_used || ctx->has_evfd || ctx->poll_activated)) __io_commit_cqring_flush(ctx); } diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 953d5e74256916..823e7eb15fb2a8 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -92,7 +92,6 @@ void io_kbuf_drop_legacy(struct io_kiocb *req) { if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED))) return; - req->buf_index = req->kbuf->bgid; req->flags &= ~REQ_F_BUFFER_SELECTED; kfree(req->kbuf); req->kbuf = NULL; @@ -110,7 +109,6 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) bl = io_buffer_get_list(ctx, buf->bgid); list_add(&buf->list, &bl->buf_list); req->flags &= ~REQ_F_BUFFER_SELECTED; - req->buf_index = buf->bgid; io_ring_submit_unlock(ctx, issue_flags); return true; @@ -193,7 +191,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, } void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) + unsigned buf_group, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; @@ -201,7 +199,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, io_ring_submit_lock(req->ctx, issue_flags); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, buf_group); if (likely(bl)) { if (bl->flags & IOBL_BUF_RING) ret = io_ring_buffer_select(req, len, bl, issue_flags); @@ -302,7 +300,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, int ret = -ENOENT; 
io_ring_submit_lock(ctx, issue_flags); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, arg->buf_group); if (unlikely(!bl)) goto out_unlock; @@ -335,7 +333,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) lockdep_assert_held(&ctx->uring_lock); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, arg->buf_group); if (unlikely(!bl)) return -ENOENT; @@ -355,10 +353,9 @@ static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) struct io_buffer_list *bl = req->buf_list; bool ret = true; - if (bl) { + if (bl) ret = io_kbuf_commit(req, bl, len, nr); - req->buf_index = bl->bgid; - } + req->flags &= ~REQ_F_BUFFER_RING; return ret; } @@ -379,45 +376,33 @@ unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs) return ret; } -static int __io_remove_buffers(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned nbufs) +static int io_remove_buffers_legacy(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, + unsigned long nbufs) { - unsigned i = 0; - - /* shouldn't happen */ - if (!nbufs) - return 0; - - if (bl->flags & IOBL_BUF_RING) { - i = bl->buf_ring->tail - bl->head; - io_free_region(ctx, &bl->region); - /* make sure it's seen as empty */ - INIT_LIST_HEAD(&bl->buf_list); - bl->flags &= ~IOBL_BUF_RING; - return i; - } + unsigned long i = 0; + struct io_buffer *nxt; /* protects io_buffers_cache */ lockdep_assert_held(&ctx->uring_lock); + WARN_ON_ONCE(bl->flags & IOBL_BUF_RING); - while (!list_empty(&bl->buf_list)) { - struct io_buffer *nxt; - + for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) { nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&nxt->list); kfree(nxt); - - if (++i == nbufs) - return i; cond_resched(); } - return i; } static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { - __io_remove_buffers(ctx, bl, -1U); + if (bl->flags & IOBL_BUF_RING) + io_free_region(ctx, &bl->region); + else + io_remove_buffers_legacy(ctx, bl, -1U); + kfree(bl); } @@ -465,30 +450,6 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - ret = -ENOENT; - bl = io_buffer_get_list(ctx, p->bgid); - if (bl) { - ret = -EINVAL; - /* can't use provide/remove buffers command on mapped buffers */ - if (!(bl->flags & IOBL_BUF_RING)) - ret = __io_remove_buffers(ctx, bl, p->nbufs); - } - io_ring_submit_unlock(ctx, issue_flags); - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned long size, tmp_check; @@ -512,8 +473,6 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EOVERFLOW; if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) return -EOVERFLOW; - - size = (unsigned long)p->len * p->nbufs; if (!access_ok(u64_to_user_ptr(p->addr), size)) return -EFAULT; @@ -552,37 +511,44 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, return i ? 
0 : -ENOMEM; } -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) +static int __io_manage_buffers_legacy(struct io_kiocb *req, + struct io_buffer_list *bl) { struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); + int ret; - bl = io_buffer_get_list(ctx, p->bgid); - if (unlikely(!bl)) { + if (!bl) { + if (req->opcode != IORING_OP_PROVIDE_BUFFERS) + return -ENOENT; bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); - if (!bl) { - ret = -ENOMEM; - goto err; - } + if (!bl) + return -ENOMEM; + INIT_LIST_HEAD(&bl->buf_list); - ret = io_buffer_add_list(ctx, bl, p->bgid); + ret = io_buffer_add_list(req->ctx, bl, p->bgid); if (ret) { kfree(bl); - goto err; + return ret; } } - /* can't add buffers via this command for a mapped buffer ring */ - if (bl->flags & IOBL_BUF_RING) { - ret = -EINVAL; - goto err; - } + /* can't use provide/remove buffers command on mapped buffers */ + if (bl->flags & IOBL_BUF_RING) + return -EINVAL; + if (req->opcode == IORING_OP_PROVIDE_BUFFERS) + return io_add_buffers(req->ctx, p, bl); + return io_remove_buffers_legacy(req->ctx, bl, p->nbufs); +} + +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret; - ret = io_add_buffers(ctx, p, bl); -err: + io_ring_submit_lock(ctx, issue_flags); + bl = io_buffer_get_list(ctx, p->bgid); + ret = __io_manage_buffers_legacy(req, bl); io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) @@ -594,7 +560,7 @@ err: int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; + struct io_buffer_list *bl; struct io_uring_region_desc rd; struct io_uring_buf_ring *br; unsigned long mmap_offset; @@ -605,8 +571,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (!mem_is_zero(reg.resv, sizeof(reg.resv))) return -EINVAL; if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; @@ -624,7 +589,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) io_destroy_bl(ctx, bl); } - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); + bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); if (!bl) return -ENOMEM; @@ -669,7 +634,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return 0; fail: io_free_region(ctx, &bl->region); - kfree(free_bl); + kfree(bl); return ret; } @@ -682,9 +647,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - if (reg.flags) + if (!mem_is_zero(reg.resv, sizeof(reg.resv)) || reg.flags) return -EINVAL; bl = io_buffer_get_list(ctx, reg.bgid); @@ -704,14 +667,11 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_status buf_status; struct io_buffer_list *bl; - int i; if (copy_from_user(&buf_status, arg, sizeof(buf_status))) return -EFAULT; - - for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++) - if (buf_status.resv[i]) - return -EINVAL; + if (!mem_is_zero(buf_status.resv, sizeof(buf_status.resv))) + return -EINVAL; bl = io_buffer_get_list(ctx, 
buf_status.buf_group); if (!bl) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 2ec0b983ce243c..4d2c209d1a4112 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -55,20 +55,19 @@ struct buf_sel_arg { size_t max_len; unsigned short nr_iovs; unsigned short mode; + unsigned buf_group; }; void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags); + unsigned buf_group, unsigned int issue_flags); int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, unsigned int issue_flags); int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags); - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); @@ -94,7 +93,6 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) * to monopolize the buffer. */ if (req->buf_list) { - req->buf_index = req->buf_list->bgid; req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); return true; } diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 07f8a5cbd37ec7..725dc0bec24c42 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -13,6 +13,7 @@ #include "memmap.h" #include "kbuf.h" #include "rsrc.h" +#include "zcrx.h" static void *io_mem_alloc_compound(struct page **pages, int nr_pages, size_t size, gfp_t gfp) @@ -258,7 +259,8 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, loff_t pgoff) { loff_t offset = pgoff << PAGE_SHIFT; - unsigned int bgid; + unsigned int id; + switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: @@ -267,12 +269,13 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, case IORING_OFF_SQES: return &ctx->sq_region; case IORING_OFF_PBUF_RING: - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - return io_pbuf_get_region(ctx, bgid); + id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + return io_pbuf_get_region(ctx, id); case IORING_MAP_OFF_PARAM_REGION: return &ctx->param_region; case IORING_MAP_OFF_ZCRX_REGION: - return &ctx->zcrx_region; + id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_ZCRX_SHIFT; + return io_zcrx_get_region(ctx, id); } return NULL; } diff --git a/io_uring/memmap.h b/io_uring/memmap.h index dad0aa5b1b4574..08419684e4bc91 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -4,7 +4,9 @@ #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL #define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); +#define IORING_OFF_ZCRX_SHIFT 16 + +struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages); #ifndef CONFIG_MMU unsigned int io_uring_nommu_mmap_capabilities(struct file *file); diff --git a/io_uring/net.c b/io_uring/net.c index 24040bc3916a1b..1fbdb2bbb3f3fb 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -18,7 +18,6 @@ #include "rsrc.h" #include "zcrx.h" -#if defined(CONFIG_NET) struct io_shutdown { struct file *file; int how; @@ -190,7 +189,6 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req, sr->done_io = 0; sr->retry = 
false; sr->len = 0; /* get from the provided buffer */ - req->buf_index = sr->buf_group; } static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg, @@ -359,15 +357,13 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) kmsg->msg.msg_name = &kmsg->addr; kmsg->msg.msg_namelen = addr_len; } - if (sr->flags & IORING_RECVSEND_FIXED_BUF) + if (sr->flags & IORING_RECVSEND_FIXED_BUF) { + req->flags |= REQ_F_IMPORT_BUFFER; return 0; - if (!io_do_buffer_select(req)) { - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret < 0)) - return ret; } - return 0; + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); } static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -409,13 +405,12 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + if (req->flags & REQ_F_BUFFER_SELECT) + sr->buf_group = req->buf_index; if (sr->flags & IORING_RECVSEND_BUNDLE) { if (req->opcode == IORING_OP_SENDMSG) return -EINVAL; - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; sr->msg_flags |= MSG_WAITALL; - sr->buf_group = req->buf_index; req->buf_list = NULL; req->flags |= REQ_F_MULTISHOT; } @@ -571,6 +566,7 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, .iovs = &kmsg->fast_iov, .max_len = min_not_zero(sr->len, INT_MAX), .nr_iovs = 1, + .buf_group = sr->buf_group, }; if (kmsg->vec.iovec) { @@ -723,7 +719,6 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg; - int ret; kmsg = io_msg_alloc_async(req); if (unlikely(!kmsg)) @@ -739,13 +734,10 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) kmsg->msg.msg_iocb = NULL; kmsg->msg.msg_ubuf = NULL; - if (!io_do_buffer_select(req)) { - ret = import_ubuf(ITER_DEST, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret)) - return ret; - } - return 0; + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + return import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); } return io_recvmsg_copy_hdr(req, kmsg); @@ -985,7 +977,7 @@ retry_multishot: void __user *buf; size_t len = sr->len; - buf = io_buffer_select(req, &len, issue_flags); + buf = io_buffer_select(req, &len, sr->buf_group, issue_flags); if (!buf) return -ENOBUFS; @@ -1063,6 +1055,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg .iovs = &kmsg->fast_iov, .nr_iovs = 1, .mode = KBUF_MODE_EXPAND, + .buf_group = sr->buf_group, }; if (kmsg->vec.iovec) { @@ -1095,7 +1088,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg void __user *buf; *len = sr->len; - buf = io_buffer_select(req, len, issue_flags); + buf = io_buffer_select(req, len, sr->buf_group, issue_flags); if (!buf) return -ENOBUFS; sr->buf = buf; @@ -1191,16 +1184,14 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); unsigned ifq_idx; - if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr || - sqe->addr3)) + if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3)) return -EINVAL; ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx); - if (ifq_idx != 0) - return -EINVAL; - zc->ifq = req->ctx->ifq; + zc->ifq = 
xa_load(&req->ctx->zcrx_ctxs, ifq_idx); if (!zc->ifq) return -EINVAL; + zc->len = READ_ONCE(sqe->len); zc->flags = READ_ONCE(sqe->ioprio); zc->msg_flags = READ_ONCE(sqe->msg_flags); @@ -1321,8 +1312,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -ENOMEM; if (req->opcode == IORING_OP_SEND_ZC) { - if (zc->flags & IORING_RECVSEND_FIXED_BUF) - req->flags |= REQ_F_IMPORT_BUFFER; ret = io_send_setup(req, sqe); } else { if (unlikely(sqe->addr2 || sqe->file_index)) @@ -1846,4 +1835,3 @@ void io_netmsg_cache_free(const void *entry) io_vec_free(&kmsg->vec); kfree(kmsg); } -#endif diff --git a/io_uring/notif.c b/io_uring/notif.c index 7bd92538dccbc7..9a6f6e92d74242 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -112,6 +112,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) if (unlikely(!io_alloc_req(ctx, ¬if))) return NULL; + notif->ctx = ctx; notif->opcode = IORING_OP_NOP; notif->flags = 0; notif->file = NULL; diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 489384c0438bd8..6e0882b051f93b 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -333,13 +333,13 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .iopoll = 1, .prep = io_provide_buffers_prep, - .issue = io_provide_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, .prep = io_remove_buffers_prep, - .issue = io_remove_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_TEE] = { .needs_file = 1, @@ -569,6 +569,10 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_prep_writev_fixed, .issue = io_write, }, + [IORING_OP_PIPE] = { + .prep = io_pipe_prep, + .issue = io_pipe, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -815,6 +819,9 @@ const struct io_cold_def io_cold_defs[] = { .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, + [IORING_OP_PIPE] = { + .name = "PIPE", + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/openclose.c b/io_uring/openclose.c index e3357dfa14ca42..4dd46116345783 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -6,6 +6,8 @@ #include <linux/fdtable.h> #include <linux/fsnotify.h> #include <linux/namei.h> +#include <linux/pipe_fs_i.h> +#include <linux/watch_queue.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> @@ -302,3 +304,134 @@ int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +struct io_pipe { + struct file *file; + int __user *fds; + int flags; + int file_slot; + unsigned long nofile; +}; + +int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + + if (sqe->fd || sqe->off || sqe->addr3) + return -EINVAL; + + p->fds = u64_to_user_ptr(READ_ONCE(sqe->addr)); + p->flags = READ_ONCE(sqe->pipe_flags); + if (p->flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE)) + return -EINVAL; + + p->file_slot = READ_ONCE(sqe->file_index); + p->nofile = rlimit(RLIMIT_NOFILE); + return 0; +} + +static int io_pipe_fixed(struct io_kiocb *req, struct file **files, + unsigned int issue_flags) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + struct io_ring_ctx *ctx = req->ctx; + int ret, fds[2] = { -1, -1 }; + int slot = p->file_slot; + + if (p->flags & O_CLOEXEC) + return -EINVAL; + + io_ring_submit_lock(ctx, issue_flags); + + ret = __io_fixed_fd_install(ctx, files[0], slot); + if (ret < 0) + goto err; + fds[0] = ret; + files[0] = NULL; + + 
/* + * If a specific slot is given, next one will be used for + * the write side. + */ + if (slot != IORING_FILE_INDEX_ALLOC) + slot++; + + ret = __io_fixed_fd_install(ctx, files[1], slot); + if (ret < 0) + goto err; + fds[1] = ret; + files[1] = NULL; + + io_ring_submit_unlock(ctx, issue_flags); + + if (!copy_to_user(p->fds, fds, sizeof(fds))) + return 0; + + ret = -EFAULT; + io_ring_submit_lock(ctx, issue_flags); +err: + if (fds[0] != -1) + io_fixed_fd_remove(ctx, fds[0]); + if (fds[1] != -1) + io_fixed_fd_remove(ctx, fds[1]); + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +static int io_pipe_fd(struct io_kiocb *req, struct file **files) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + int ret, fds[2] = { -1, -1 }; + + ret = __get_unused_fd_flags(p->flags, p->nofile); + if (ret < 0) + goto err; + fds[0] = ret; + + ret = __get_unused_fd_flags(p->flags, p->nofile); + if (ret < 0) + goto err; + fds[1] = ret; + + if (!copy_to_user(p->fds, fds, sizeof(fds))) { + fd_install(fds[0], files[0]); + fd_install(fds[1], files[1]); + return 0; + } + ret = -EFAULT; +err: + if (fds[0] != -1) + put_unused_fd(fds[0]); + if (fds[1] != -1) + put_unused_fd(fds[1]); + return ret; +} + +int io_pipe(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + struct file *files[2]; + int ret; + + ret = create_pipe_files(files, p->flags); + if (ret) + return ret; + files[0]->f_mode |= FMODE_NOWAIT; + files[1]->f_mode |= FMODE_NOWAIT; + + if (!!p->file_slot) + ret = io_pipe_fixed(req, files, issue_flags); + else + ret = io_pipe_fd(req, files); + + io_req_set_res(req, ret, 0); + if (!ret) + return IOU_OK; + + req_set_fail(req); + if (files[0]) + fput(files[0]); + if (files[1]) + fput(files[1]); + return ret; +} diff --git a/io_uring/openclose.h b/io_uring/openclose.h index 8a93c98ad0adc6..4ca2a9935abc9c 100644 --- a/io_uring/openclose.h +++ b/io_uring/openclose.h @@ -13,5 +13,8 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags); int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_close(struct io_kiocb *req, unsigned int issue_flags); +int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_pipe(struct io_kiocb *req, unsigned int issue_flags); + int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f80a77c4973f30..1657d775c8bab2 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -80,10 +80,21 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -int io_buffer_validate(struct iovec *iov) +int io_validate_user_buf_range(u64 uaddr, u64 ulen) { - unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); + unsigned long tmp, base = (unsigned long)uaddr; + unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen); + /* arbitrary limit, but we need something */ + if (ulen > SZ_1G || !ulen) + return -EFAULT; + if (check_add_overflow(base, acct_len, &tmp)) + return -EOVERFLOW; + return 0; +} + +static int io_buffer_validate(struct iovec *iov) +{ /* * Don't impose further limits on the size and buffer * constraints here, we'll -EINVAL later when IO is @@ -91,17 +102,9 @@ int io_buffer_validate(struct iovec *iov) */ if (!iov->iov_base) return iov->iov_len ? 
-EFAULT : 0; - if (!iov->iov_len) - return -EFAULT; - - /* arbitrary limit, but we need something */ - if (iov->iov_len > SZ_1G) - return -EFAULT; - if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) - return -EOVERFLOW; - - return 0; + return io_validate_user_buf_range((unsigned long)iov->iov_base, + iov->iov_len); } static void io_release_ubuf(void *priv) @@ -685,38 +688,34 @@ static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, struct io_imu_folio_data *data) { struct page **page_array = *pages, **new_array = NULL; - int nr_pages_left = *nr_pages, i, j; - int nr_folios = data->nr_folios; + unsigned nr_pages_left = *nr_pages; + unsigned nr_folios = data->nr_folios; + unsigned i, j; /* Store head pages only*/ - new_array = kvmalloc_array(nr_folios, sizeof(struct page *), - GFP_KERNEL); + new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL); if (!new_array) return false; - new_array[0] = compound_head(page_array[0]); - /* - * The pages are bound to the folio, it doesn't - * actually unpin them but drops all but one reference, - * which is usually put down by io_buffer_unmap(). - * Note, needs a better helper. - */ - if (data->nr_pages_head > 1) - unpin_user_pages(&page_array[1], data->nr_pages_head - 1); - - j = data->nr_pages_head; - nr_pages_left -= data->nr_pages_head; - for (i = 1; i < nr_folios; i++) { - unsigned int nr_unpin; - - new_array[i] = page_array[j]; - nr_unpin = min_t(unsigned int, nr_pages_left - 1, - data->nr_pages_mid - 1); - if (nr_unpin) - unpin_user_pages(&page_array[j+1], nr_unpin); - j += data->nr_pages_mid; - nr_pages_left -= data->nr_pages_mid; + for (i = 0, j = 0; i < nr_folios; i++) { + struct page *p = compound_head(page_array[j]); + struct folio *folio = page_folio(p); + unsigned int nr; + + WARN_ON_ONCE(i > 0 && p != page_array[j]); + + nr = i ? data->nr_pages_mid : data->nr_pages_head; + nr = min(nr, nr_pages_left); + /* Drop all but one ref, the entire folio will remain pinned. 
*/ + if (nr > 1) + unpin_user_folio(folio, nr - 1); + j += nr; + nr_pages_left -= nr; + new_array[i] = p; } + + WARN_ON_ONCE(j != *nr_pages); + kvfree(page_array); *pages = new_array; *nr_pages = nr_folios; @@ -1062,8 +1061,6 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, size_t offset; int ret; - if (WARN_ON_ONCE(!imu)) - return -EFAULT; ret = validate_fixed_range(buf_addr, len, imu); if (unlikely(ret)) return ret; @@ -1110,13 +1107,19 @@ inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, if (req->flags & REQ_F_BUF_NODE) return req->buf_node; + req->flags |= REQ_F_BUF_NODE; io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); - if (node) - io_req_assign_buf_node(req, node); + if (node) { + node->refs++; + req->buf_node = node; + io_ring_submit_unlock(ctx, issue_flags); + return node; + } + req->flags &= ~REQ_F_BUF_NODE; io_ring_submit_unlock(ctx, issue_flags); - return node; + return NULL; } int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index b52242852ff342..0d2138f16322b2 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -83,7 +83,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type); int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); -int io_buffer_validate(struct iovec *iov); +int io_validate_user_buf_range(u64 uaddr, u64 ulen); bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, struct io_imu_folio_data *data); @@ -115,32 +115,6 @@ static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx, return true; } -static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) -{ - if (req->file_node) { - io_put_rsrc_node(req->ctx, req->file_node); - req->file_node = NULL; - } - if (req->flags & REQ_F_BUF_NODE) { - io_put_rsrc_node(req->ctx, req->buf_node); - req->buf_node = NULL; - } -} - -static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node, - struct io_rsrc_node *node) -{ - node->refs++; - *dst_node = node; -} - -static inline void io_req_assign_buf_node(struct io_kiocb *req, - struct io_rsrc_node *node) -{ - io_req_assign_rsrc_node(&req->buf_node, node); - req->flags |= REQ_F_BUF_NODE; -} - int io_files_update(struct io_kiocb *req, unsigned int issue_flags); int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); diff --git a/io_uring/rw.c b/io_uring/rw.c index b8389674a747fc..303fdded375851 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -119,7 +119,7 @@ static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, return io_import_vec(ddir, req, io, buf, sqe_len); if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); + buf = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags); if (!buf) return -ENOBUFS; rw->addr = (unsigned long) buf; @@ -253,16 +253,19 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, int ddir) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct io_async_rw *io; unsigned ioprio; u64 attr_type_mask; int ret; if (io_rw_alloc_async(req)) return -ENOMEM; + io = req->async_data; rw->kiocb.ki_pos = READ_ONCE(sqe->off); /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); + io->buf_group = req->buf_index; ioprio = READ_ONCE(sqe->ioprio); if (ioprio) { diff --git a/io_uring/rw.h b/io_uring/rw.h index 
81d6d9a8cf6937..129a53fe54825d 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -16,6 +16,8 @@ struct io_async_rw { struct iov_iter iter; struct iov_iter_state iter_state; struct iovec fast_iov; + unsigned buf_group; + /* * wpq is for buffered io, while meta fields are used with * direct io diff --git a/io_uring/tctx.c b/io_uring/tctx.c index adc6e42c14df6c..5b66755579c08f 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -35,8 +35,6 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, data.hash = hash; data.task = task; - data.free_work = io_wq_free_work; - data.do_work = io_wq_submit_work; /* Do QD, or 4 * CPUS, whatever is smallest */ concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 2a107665230b07..a6ff8c026b1f22 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -35,6 +35,9 @@ struct io_timeout_rem { bool ltimeout; }; +static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link); + static inline bool io_is_timeout_noseq(struct io_kiocb *req) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); @@ -218,7 +221,9 @@ void io_disarm_next(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; raw_spin_lock_irq(&ctx->timeout_lock); - link = io_disarm_linked_timeout(req); + if (req->link && req->link->opcode == IORING_OP_LINK_TIMEOUT) + link = __io_disarm_linked_timeout(req, req->link); + raw_spin_unlock_irq(&ctx->timeout_lock); if (link) io_req_queue_tw_complete(link, -ECANCELED); @@ -228,8 +233,8 @@ void io_disarm_next(struct io_kiocb *req) io_fail_links(req); } -struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, - struct io_kiocb *link) +static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link) __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { diff --git a/io_uring/timeout.h b/io_uring/timeout.h index e91b32448dcf90..2b7c9ad729925c 100644 --- a/io_uring/timeout.h +++ b/io_uring/timeout.h @@ -8,19 +8,6 @@ struct io_timeout_data { u32 flags; }; -struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, - struct io_kiocb *link); - -static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) -{ - struct io_kiocb *link = req->link; - - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) - return __io_disarm_linked_timeout(req, link); - - return NULL; -} - __cold void io_flush_timeouts(struct io_ring_ctx *ctx); struct io_cancel_data; int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 430ed620ddfe01..8a6b0ddef79644 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -3,13 +3,10 @@ #include <linux/errno.h> #include <linux/file.h> #include <linux/io_uring/cmd.h> -#include <linux/io_uring/net.h> #include <linux/security.h> #include <linux/nospec.h> -#include <net/sock.h> #include <uapi/linux/io_uring.h> -#include <asm/ioctls.h> #include "io_uring.h" #include "alloc_cache.h" @@ -307,83 +304,3 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) io_req_queue_iowq(req); } - -static inline int io_uring_cmd_getsockopt(struct socket *sock, - struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ - const struct io_uring_sqe *sqe = cmd->sqe; - bool compat = !!(issue_flags & IO_URING_F_COMPAT); - int optlen, optname, level, err; - void __user *optval; - - level = READ_ONCE(sqe->level); - if (level != SOL_SOCKET) - return -EOPNOTSUPP; - 
- optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); - optname = READ_ONCE(sqe->optname); - optlen = READ_ONCE(sqe->optlen); - - err = do_sock_getsockopt(sock, compat, level, optname, - USER_SOCKPTR(optval), - KERNEL_SOCKPTR(&optlen)); - if (err) - return err; - - /* On success, return optlen */ - return optlen; -} - -static inline int io_uring_cmd_setsockopt(struct socket *sock, - struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ - const struct io_uring_sqe *sqe = cmd->sqe; - bool compat = !!(issue_flags & IO_URING_F_COMPAT); - int optname, optlen, level; - void __user *optval; - sockptr_t optval_s; - - optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); - optname = READ_ONCE(sqe->optname); - optlen = READ_ONCE(sqe->optlen); - level = READ_ONCE(sqe->level); - optval_s = USER_SOCKPTR(optval); - - return do_sock_setsockopt(sock, compat, level, optname, optval_s, - optlen); -} - -#if defined(CONFIG_NET) -int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) -{ - struct socket *sock = cmd->file->private_data; - struct sock *sk = sock->sk; - struct proto *prot = READ_ONCE(sk->sk_prot); - int ret, arg = 0; - - if (!prot || !prot->ioctl) - return -EOPNOTSUPP; - - switch (cmd->cmd_op) { - case SOCKET_URING_OP_SIOCINQ: - ret = prot->ioctl(sk, SIOCINQ, &arg); - if (ret) - return ret; - return arg; - case SOCKET_URING_OP_SIOCOUTQ: - ret = prot->ioctl(sk, SIOCOUTQ, &arg); - if (ret) - return ret; - return arg; - case SOCKET_URING_OP_GETSOCKOPT: - return io_uring_cmd_getsockopt(sock, cmd, issue_flags); - case SOCKET_URING_OP_SETSOCKOPT: - return io_uring_cmd_setsockopt(sock, cmd, issue_flags); - default: - return -EOPNOTSUPP; - } -} -EXPORT_SYMBOL_GPL(io_uring_cmd_sock); -#endif diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index fe86606b9f304d..9a568d04920470 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -26,29 +26,207 @@ #include "zcrx.h" #include "rsrc.h" +#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) { return pp->mp_priv; } -#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); -static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, - struct io_zcrx_area *area, int nr_mapped) + return container_of(owner, struct io_zcrx_area, nia); +} + +static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + return area->mem.pages[net_iov_idx(niov)]; +} + +static void io_release_dmabuf(struct io_zcrx_mem *mem) +{ + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return; + + if (mem->sgt) + dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt, + DMA_FROM_DEVICE); + if (mem->attach) + dma_buf_detach(mem->dmabuf, mem->attach); + if (mem->dmabuf) + dma_buf_put(mem->dmabuf); + + mem->sgt = NULL; + mem->attach = NULL; + mem->dmabuf = NULL; +} + +static int io_import_dmabuf(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + unsigned long off = (unsigned long)area_reg->addr; + unsigned long len = (unsigned long)area_reg->len; + unsigned long total_size = 0; + struct scatterlist *sg; + int dmabuf_fd = area_reg->dmabuf_fd; + int i, ret; + + if (WARN_ON_ONCE(!ifq->dev)) + return -EFAULT; + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return -EINVAL; + + mem->is_dmabuf = true; + mem->dmabuf = dma_buf_get(dmabuf_fd); + 
if (IS_ERR(mem->dmabuf)) { + ret = PTR_ERR(mem->dmabuf); + mem->dmabuf = NULL; + goto err; + } + + mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev); + if (IS_ERR(mem->attach)) { + ret = PTR_ERR(mem->attach); + mem->attach = NULL; + goto err; + } + + mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE); + if (IS_ERR(mem->sgt)) { + ret = PTR_ERR(mem->sgt); + mem->sgt = NULL; + goto err; + } + + for_each_sgtable_dma_sg(mem->sgt, sg, i) + total_size += sg_dma_len(sg); + + if (total_size < off + len) + return -EINVAL; + + mem->dmabuf_offset = off; + mem->size = len; + return 0; +err: + io_release_dmabuf(mem); + return ret; +} + +static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + unsigned long off = area->mem.dmabuf_offset; + struct scatterlist *sg; + unsigned i, niov_idx = 0; + + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return -EINVAL; + + for_each_sgtable_dma_sg(area->mem.sgt, sg, i) { + dma_addr_t dma = sg_dma_address(sg); + unsigned long sg_len = sg_dma_len(sg); + unsigned long sg_off = min(sg_len, off); + + off -= sg_off; + sg_len -= sg_off; + dma += sg_off; + + while (sg_len && niov_idx < area->nia.num_niovs) { + struct net_iov *niov = &area->nia.niovs[niov_idx]; + + if (net_mp_niov_set_dma_addr(niov, dma)) + return 0; + sg_len -= PAGE_SIZE; + dma += PAGE_SIZE; + niov_idx++; + } + } + return niov_idx; +} + +static int io_import_umem(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + struct page **pages; + int nr_pages; + + if (area_reg->dmabuf_fd) + return -EINVAL; + if (!area_reg->addr) + return -EFAULT; + pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, + &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + mem->pages = pages; + mem->nr_folios = nr_pages; + mem->size = area_reg->len; + return 0; +} + +static void io_release_area_mem(struct io_zcrx_mem *mem) +{ + if (mem->is_dmabuf) { + io_release_dmabuf(mem); + return; + } + if (mem->pages) { + unpin_user_pages(mem->pages, mem->nr_folios); + kvfree(mem->pages); + } +} + +static int io_import_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + int ret; + + ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); + if (ret) + return ret; + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) + return -EINVAL; + + if (area_reg->flags & IORING_ZCRX_AREA_DMABUF) + return io_import_dmabuf(ifq, mem, area_reg); + return io_import_umem(ifq, mem, area_reg); +} + +static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area, int nr_mapped) { int i; for (i = 0; i < nr_mapped; i++) { - struct net_iov *niov = &area->nia.niovs[i]; - dma_addr_t dma; + netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]); + dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem); - dma = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov)); dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR); - net_mp_niov_set_dma_addr(niov, 0); } } +static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area, int nr_mapped) +{ + int i; + + if (area->mem.is_dmabuf) + io_release_dmabuf(&area->mem); + else + io_zcrx_unmap_umem(ifq, area, nr_mapped); + + for (i = 0; i < area->nia.num_niovs; i++) + net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); +} + static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { guard(mutex)(&ifq->dma_lock); @@ -58,20 +236,16 @@ static void 
io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *are area->is_mapped = false; } -static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { int i; - guard(mutex)(&ifq->dma_lock); - if (area->is_mapped) - return 0; - for (i = 0; i < area->nia.num_niovs; i++) { struct net_iov *niov = &area->nia.niovs[i]; dma_addr_t dma; - dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE, - DMA_FROM_DEVICE, IO_DMA_ATTR); + dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0, + PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR); if (dma_mapping_error(ifq->dev, dma)) break; if (net_mp_niov_set_dma_addr(niov, dma)) { @@ -80,9 +254,24 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) break; } } + return i; +} + +static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + unsigned nr; + + guard(mutex)(&ifq->dma_lock); + if (area->is_mapped) + return 0; + + if (area->mem.is_dmabuf) + nr = io_zcrx_map_area_dmabuf(ifq, area); + else + nr = io_zcrx_map_area_umem(ifq, area); - if (i != area->nia.num_niovs) { - __io_zcrx_unmap_area(ifq, area, i); + if (nr != area->nia.num_niovs) { + __io_zcrx_unmap_area(ifq, area, nr); return -EINVAL; } @@ -118,13 +307,6 @@ struct io_zcrx_args { static const struct memory_provider_ops io_uring_pp_zc_ops; -static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) -{ - struct net_iov_area *owner = net_iov_owner(niov); - - return container_of(owner, struct io_zcrx_area, nia); -} - static inline atomic_t *io_get_user_counter(struct net_iov *niov) { struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); @@ -147,17 +329,12 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov) atomic_inc(io_get_user_counter(niov)); } -static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) -{ - struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); - - return area->pages[net_iov_idx(niov)]; -} - static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_ifq_reg *reg, - struct io_uring_region_desc *rd) + struct io_uring_region_desc *rd, + u32 id) { + u64 mmap_offset; size_t off, size; void *ptr; int ret; @@ -167,12 +344,14 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, if (size > rd->size) return -EINVAL; - ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd, - IORING_MAP_OFF_ZCRX_REGION); + mmap_offset = IORING_MAP_OFF_ZCRX_REGION; + mmap_offset += id << IORING_OFF_PBUF_SHIFT; + + ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset); if (ret < 0) return ret; - ptr = io_region_get_ptr(&ifq->ctx->zcrx_region); + ptr = io_region_get_ptr(&ifq->region); ifq->rq_ring = (struct io_uring *)ptr; ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); return 0; @@ -180,7 +359,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) { - io_free_region(ifq->ctx, &ifq->ctx->zcrx_region); + io_free_region(ifq->ctx, &ifq->region); ifq->rq_ring = NULL; ifq->rqes = NULL; } @@ -188,53 +367,44 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) static void io_zcrx_free_area(struct io_zcrx_area *area) { io_zcrx_unmap_area(area->ifq, area); + io_release_area_mem(&area->mem); kvfree(area->freelist); kvfree(area->nia.niovs); kvfree(area->user_refs); - if (area->pages) { - unpin_user_pages(area->pages, area->nr_folios); - kvfree(area->pages); - } kfree(area); } +#define 
IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF) + static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area **res, struct io_uring_zcrx_area_reg *area_reg) { struct io_zcrx_area *area; - int i, ret, nr_pages, nr_iovs; - struct iovec iov; + unsigned nr_iovs; + int i, ret; - if (area_reg->flags || area_reg->rq_area_token) + if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS) return -EINVAL; - if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) + if (area_reg->rq_area_token) return -EINVAL; - if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) + if (area_reg->__resv2[0] || area_reg->__resv2[1]) return -EINVAL; - iov.iov_base = u64_to_user_ptr(area_reg->addr); - iov.iov_len = area_reg->len; - ret = io_buffer_validate(&iov); - if (ret) - return ret; - ret = -ENOMEM; area = kzalloc(sizeof(*area), GFP_KERNEL); if (!area) goto err; - area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, - &nr_pages); - if (IS_ERR(area->pages)) { - ret = PTR_ERR(area->pages); - area->pages = NULL; + ret = io_import_area(ifq, &area->mem, area_reg); + if (ret) goto err; - } - area->nr_folios = nr_iovs = nr_pages; + + nr_iovs = area->mem.size >> PAGE_SHIFT; area->nia.num_niovs = nr_iovs; + ret = -ENOMEM; area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]), GFP_KERNEL | __GFP_ZERO); if (!area->nia.niovs) @@ -245,9 +415,6 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, if (!area->freelist) goto err; - for (i = 0; i < nr_iovs; i++) - area->freelist[i] = i; - area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]), GFP_KERNEL | __GFP_ZERO); if (!area->user_refs) @@ -341,6 +508,16 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) kfree(ifq); } +struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id) +{ + struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id); + + lockdep_assert_held(&ctx->mmap_lock); + + return ifq ? &ifq->region : NULL; +} + int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) { @@ -350,6 +527,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_region_desc rd; struct io_zcrx_ifq *ifq; int ret; + u32 id; /* * 1. Interface queue allocation. 
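io_zcrx_get_region() above resolves a zcrx instance by id with xa_load() under the mmap lock, and the registration hunks that follow reserve that id with xa_alloc() before doing any setup, publishing the ifq with xa_store() only once everything has succeeded (which also lets the id be folded into the region's mmap offset early). As a standalone illustration of that reserve-then-publish xarray pattern - not code from this patch, hypothetical demo_* names, locking and full teardown omitted:

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/xarray.h>

struct demo_obj {
	u32 id;
};

/* id-allocating xarray, analogous to ctx->zcrx_ctxs */
static DEFINE_XARRAY_ALLOC(demo_ctxs);

static struct demo_obj *demo_register(void)
{
	struct demo_obj *obj;
	u32 id;
	int ret;

	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	if (!obj)
		return ERR_PTR(-ENOMEM);

	/* reserve an index with a NULL entry; lookups still return NULL */
	ret = xa_alloc(&demo_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
	if (ret) {
		kfree(obj);
		return ERR_PTR(ret);
	}
	obj->id = id;

	/* ... setup that may fail goes here; on failure, xa_erase() the id ... */

	/* publish: from here on xa_load(&demo_ctxs, id) returns obj */
	if (xa_is_err(xa_store(&demo_ctxs, id, obj, GFP_KERNEL))) {
		xa_erase(&demo_ctxs, id);
		kfree(obj);
		return ERR_PTR(-ENOMEM);
	}
	return obj;
}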
@@ -362,8 +540,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && ctx->flags & IORING_SETUP_CQE32)) return -EINVAL; - if (ctx->ifq) - return -EBUSY; if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) @@ -386,29 +562,37 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ifq = io_zcrx_ifq_alloc(ctx); if (!ifq) return -ENOMEM; + ifq->rq_entries = reg.rq_entries; - ret = io_allocate_rbuf_ring(ifq, ®, &rd); - if (ret) - goto err; + scoped_guard(mutex, &ctx->mmap_lock) { + /* preallocate id */ + ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); + if (ret) + goto ifq_free; + } - ret = io_zcrx_create_area(ifq, &ifq->area, &area); + ret = io_allocate_rbuf_ring(ifq, ®, &rd, id); if (ret) goto err; - ifq->rq_entries = reg.rq_entries; - - ret = -ENODEV; ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx, &ifq->netdev_tracker, GFP_KERNEL); - if (!ifq->netdev) + if (!ifq->netdev) { + ret = -ENODEV; goto err; + } ifq->dev = ifq->netdev->dev.parent; - ret = -EOPNOTSUPP; - if (!ifq->dev) + if (!ifq->dev) { + ret = -EOPNOTSUPP; goto err; + } get_device(ifq->dev); + ret = io_zcrx_create_area(ifq, &ifq->area, &area); + if (ret) + goto err; + mp_param.mp_ops = &io_uring_pp_zc_ops; mp_param.mp_priv = ifq; ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param); @@ -419,6 +603,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, reg.offsets.rqes = sizeof(struct io_uring); reg.offsets.head = offsetof(struct io_uring, head); reg.offsets.tail = offsetof(struct io_uring, tail); + reg.zcrx_id = id; + + scoped_guard(mutex, &ctx->mmap_lock) { + /* publish ifq */ + ret = -ENOMEM; + if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) + goto err; + } if (copy_to_user(arg, ®, sizeof(reg)) || copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) || @@ -426,24 +618,34 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ret = -EFAULT; goto err; } - ctx->ifq = ifq; return 0; err: + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->zcrx_ctxs, id); +ifq_free: io_zcrx_ifq_free(ifq); return ret; } void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) { - struct io_zcrx_ifq *ifq = ctx->ifq; + struct io_zcrx_ifq *ifq; + unsigned long id; lockdep_assert_held(&ctx->uring_lock); - if (!ifq) - return; + while (1) { + scoped_guard(mutex, &ctx->mmap_lock) { + ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); + if (ifq) + xa_erase(&ctx->zcrx_ctxs, id); + } + if (!ifq) + break; + io_zcrx_ifq_free(ifq); + } - ctx->ifq = NULL; - io_zcrx_ifq_free(ifq); + xa_destroy(&ctx->zcrx_ctxs); } static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) @@ -500,12 +702,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) { + struct io_zcrx_ifq *ifq; + unsigned long index; + lockdep_assert_held(&ctx->uring_lock); - if (!ctx->ifq) - return; - io_zcrx_scrub(ctx->ifq); - io_close_queue(ctx->ifq); + xa_for_each(&ctx->zcrx_ctxs, index, ifq) { + io_zcrx_scrub(ifq); + io_close_queue(ifq); + } } static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) @@ -742,6 +947,9 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, size_t copied = 0; int ret = 0; + if (area->mem.is_dmabuf) + return -EFAULT; + while (len) { size_t copy_size = min_t(size_t, PAGE_SIZE, len); const int dst_off = 0; diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 
f2bc811f022c67..2f5e26389f2218 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -3,10 +3,24 @@ #define IOU_ZC_RX_H #include <linux/io_uring_types.h> +#include <linux/dma-buf.h> #include <linux/socket.h> #include <net/page_pool/types.h> #include <net/net_trackers.h> +struct io_zcrx_mem { + unsigned long size; + bool is_dmabuf; + + struct page **pages; + unsigned long nr_folios; + + struct dma_buf_attachment *attach; + struct dma_buf *dmabuf; + struct sg_table *sgt; + unsigned long dmabuf_offset; +}; + struct io_zcrx_area { struct net_iov_area nia; struct io_zcrx_ifq *ifq; @@ -14,13 +28,13 @@ struct io_zcrx_area { bool is_mapped; u16 area_id; - struct page **pages; - unsigned long nr_folios; /* freelist */ spinlock_t freelist_lock ____cacheline_aligned_in_smp; u32 free_count; u32 *freelist; + + struct io_zcrx_mem mem; }; struct io_zcrx_ifq { @@ -39,6 +53,7 @@ struct io_zcrx_ifq { netdevice_tracker netdev_tracker; spinlock_t lock; struct mutex dma_lock; + struct io_mapped_region region; }; #if defined(CONFIG_IO_URING_ZCRX) @@ -49,6 +64,8 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx); int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, struct socket *sock, unsigned int flags, unsigned issue_flags, unsigned int *len); +struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id); #else static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) @@ -67,6 +84,11 @@ static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, { return -EOPNOTSUPP; } +static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id) +{ + return NULL; +} #endif int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); |
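The attach/sgt/dmabuf fields added to struct io_zcrx_mem above carry standard dma-buf importer state, driven by io_import_dmabuf() and io_release_dmabuf() earlier in this diff. For reference, a self-contained sketch of that importer lifecycle - not the patch's code, hypothetical demo_* naming, with "dev" standing in for the netdev's parent device and the error unwinding condensed:

#include <linux/device.h>
#include <linux/dma-buf.h>
#include <linux/dma-direction.h>
#include <linux/scatterlist.h>

/* Returns the number of DMA-mapped bytes the buffer provides, or -errno. */
static long demo_probe_dmabuf(struct device *dev, int fd)
{
	struct dma_buf *dmabuf;
	struct dma_buf_attachment *attach;
	struct sg_table *sgt;
	struct scatterlist *sg;
	long total = 0;
	int i;

	dmabuf = dma_buf_get(fd);		/* takes a reference on the fd's buffer */
	if (IS_ERR(dmabuf))
		return PTR_ERR(dmabuf);

	attach = dma_buf_attach(dmabuf, dev);	/* bind the importing device */
	if (IS_ERR(attach)) {
		total = PTR_ERR(attach);
		goto put;
	}

	sgt = dma_buf_map_attachment_unlocked(attach, DMA_FROM_DEVICE);
	if (IS_ERR(sgt)) {
		total = PTR_ERR(sgt);
		goto detach;
	}

	/* each sg entry is one DMA-contiguous chunk; zcrx slices these into page-sized niovs */
	for_each_sgtable_dma_sg(sgt, sg, i)
		total += sg_dma_len(sg);

	dma_buf_unmap_attachment_unlocked(attach, sgt, DMA_FROM_DEVICE);
detach:
	dma_buf_detach(dmabuf, attach);
put:
	dma_buf_put(dmabuf);
	return total;
}

Unlike this sketch, the patch keeps the attachment and sg_table alive in io_zcrx_mem for the lifetime of the area and only unmaps, detaches and drops the dma_buf in io_release_dmabuf().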