aboutsummaryrefslogtreecommitdiffstats
diff options
authorJens Axboe <axboe@kernel.dk>2025-05-17 18:47:41 -0600
committerJens Axboe <axboe@kernel.dk>2025-05-17 18:47:41 -0600
commitf6873e3f089c6a386df3dac965abb5516c2eccda (patch)
tree53d49ad886d76202f08cd41d40f71bafa7064b54
parentc2d8018325b6f0d0940c2544678822e88e41adbc (diff)
parentf660fd2ca15a3743f65f6110ae60d5b80500d856 (diff)
downloadlinux-block-for-next.tar.gz
Merge branch 'for-6.16/io_uring' into for-nextfor-next
* for-6.16/io_uring: (53 commits) io_uring: add new helpers for posting overflows io_uring: pass in struct io_big_cqe to io_alloc_ocqe() io_uring: make io_alloc_ocqe() take a struct io_cqe pointer io_uring: split alloc and add of overflow io_uring: open code io_req_cqe_overflow() io_uring/fdinfo: get rid of dumping credentials io_uring/fdinfo: only compile if CONFIG_PROC_FS is set io_uring/kbuf: unify legacy buf provision and removal io_uring/kbuf: refactor __io_remove_buffers io_uring/kbuf: don't compute size twice on prep io_uring/kbuf: drop extra vars in io_register_pbuf_ring io_uring/kbuf: use mem_is_zero() io_uring/kbuf: account ring io_buffer_list memory io_uring: drain based on allocates reqs io_uring: count allocated requests io_uring: open code io_account_cq_overflow() io_uring: consolidate drain seq checking io_uring: remove drain prealloc checks io_uring: simplify drain ret passing io_uring: fix spurious drain flushing ...
-rw-r--r--include/linux/io_uring_types.h15
-rw-r--r--include/uapi/linux/io_uring.h8
-rw-r--r--io_uring/Makefile6
-rw-r--r--io_uring/cmd_net.c83
-rw-r--r--io_uring/eventfd.c66
-rw-r--r--io_uring/eventfd.h3
-rw-r--r--io_uring/fdinfo.c40
-rw-r--r--io_uring/io-wq.c15
-rw-r--r--io_uring/io-wq.h5
-rw-r--r--io_uring/io_uring.c283
-rw-r--r--io_uring/io_uring.h3
-rw-r--r--io_uring/kbuf.c146
-rw-r--r--io_uring/kbuf.h8
-rw-r--r--io_uring/memmap.c11
-rw-r--r--io_uring/memmap.h4
-rw-r--r--io_uring/net.c48
-rw-r--r--io_uring/notif.c1
-rw-r--r--io_uring/opdef.c11
-rw-r--r--io_uring/openclose.c133
-rw-r--r--io_uring/openclose.h3
-rw-r--r--io_uring/rsrc.c89
-rw-r--r--io_uring/rsrc.h28
-rw-r--r--io_uring/rw.c5
-rw-r--r--io_uring/rw.h2
-rw-r--r--io_uring/tctx.c2
-rw-r--r--io_uring/timeout.c11
-rw-r--r--io_uring/timeout.h13
-rw-r--r--io_uring/uring_cmd.c83
-rw-r--r--io_uring/zcrx.c372
-rw-r--r--io_uring/zcrx.h26
30 files changed, 866 insertions, 657 deletions
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index b44d201520d857..2922635986f521 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -40,8 +40,6 @@ enum io_uring_cmd_flags {
IO_URING_F_TASK_DEAD = (1 << 13),
};
-struct io_zcrx_ifq;
-
struct io_wq_work_node {
struct io_wq_work_node *next;
};
@@ -343,7 +341,6 @@ struct io_ring_ctx {
unsigned cached_cq_tail;
unsigned cq_entries;
struct io_ev_fd __rcu *io_ev_fd;
- unsigned cq_extra;
void *cq_wait_arg;
size_t cq_wait_size;
@@ -394,7 +391,8 @@ struct io_ring_ctx {
struct wait_queue_head poll_wq;
struct io_restriction restrictions;
- struct io_zcrx_ifq *ifq;
+ /* Stores zcrx object pointers of type struct io_zcrx_ifq */
+ struct xarray zcrx_ctxs;
u32 pers_next;
struct xarray personalities;
@@ -418,6 +416,7 @@ struct io_ring_ctx {
struct callback_head poll_wq_task_work;
struct list_head defer_list;
+ unsigned nr_drained;
struct io_alloc_cache msg_cache;
spinlock_t msg_lock;
@@ -436,6 +435,7 @@ struct io_ring_ctx {
/* protected by ->completion_lock */
unsigned evfd_last_cq_tail;
+ unsigned nr_req_allocated;
/*
* Protection for resize vs mmap races - both the mmap and resize
@@ -448,8 +448,6 @@ struct io_ring_ctx {
struct io_mapped_region ring_region;
/* used for optimised request parameter and wait argument passing */
struct io_mapped_region param_region;
- /* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */
- struct io_mapped_region zcrx_region;
};
/*
@@ -653,8 +651,7 @@ struct io_kiocb {
u8 iopoll_completed;
/*
* Can be either a fixed buffer index, or used with provided buffers.
- * For the latter, before issue it points to the buffer group ID,
- * and after selection it points to the buffer ID itself.
+ * For the latter, it points to the selected buffer ID.
*/
u16 buf_index;
@@ -713,7 +710,7 @@ struct io_kiocb {
const struct cred *creds;
struct io_wq_work work;
- struct {
+ struct io_big_cqe {
u64 extra1;
u64 extra2;
} big_cqe;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 50e372ea97c549..cfd17e382082fa 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -73,6 +73,7 @@ struct io_uring_sqe {
__u32 futex_flags;
__u32 install_fd_flags;
__u32 nop_flags;
+ __u32 pipe_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
@@ -287,6 +288,7 @@ enum io_uring_op {
IORING_OP_EPOLL_WAIT,
IORING_OP_READV_FIXED,
IORING_OP_WRITEV_FIXED,
+ IORING_OP_PIPE,
/* this goes last, obviously */
IORING_OP_LAST,
@@ -992,12 +994,16 @@ struct io_uring_zcrx_offsets {
__u64 __resv[2];
};
+enum io_uring_zcrx_area_flags {
+ IORING_ZCRX_AREA_DMABUF = 1,
+};
+
struct io_uring_zcrx_area_reg {
__u64 addr;
__u64 len;
__u64 rq_area_token;
__u32 flags;
- __u32 __resv1;
+ __u32 dmabuf_fd;
__u64 __resv2[2];
};
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 3e28a741ca15c3..d97c6b51d58491 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -7,11 +7,11 @@ GCOV_PROFILE := y
endif
obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
- tctx.o filetable.o rw.o net.o poll.o \
+ tctx.o filetable.o rw.o poll.o \
eventfd.o uring_cmd.o openclose.o \
sqpoll.o xattr.o nop.o fs.o splice.o \
sync.o msg_ring.o advise.o openclose.o \
- statx.o timeout.o fdinfo.o cancel.o \
+ statx.o timeout.o cancel.o \
waitid.o register.o truncate.o \
memmap.o alloc_cache.o
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
@@ -19,3 +19,5 @@ obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_EPOLL) += epoll.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
+obj-$(CONFIG_NET) += net.o cmd_net.o
+obj-$(CONFIG_PROC_FS) += fdinfo.o
diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c
new file mode 100644
index 00000000000000..e99170c7d41add
--- /dev/null
+++ b/io_uring/cmd_net.c
@@ -0,0 +1,83 @@
+#include <asm/ioctls.h>
+#include <linux/io_uring/net.h>
+#include <net/sock.h>
+
+#include "uring_cmd.h"
+
+static inline int io_uring_cmd_getsockopt(struct socket *sock,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ const struct io_uring_sqe *sqe = cmd->sqe;
+ bool compat = !!(issue_flags & IO_URING_F_COMPAT);
+ int optlen, optname, level, err;
+ void __user *optval;
+
+ level = READ_ONCE(sqe->level);
+ if (level != SOL_SOCKET)
+ return -EOPNOTSUPP;
+
+ optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
+ optname = READ_ONCE(sqe->optname);
+ optlen = READ_ONCE(sqe->optlen);
+
+ err = do_sock_getsockopt(sock, compat, level, optname,
+ USER_SOCKPTR(optval),
+ KERNEL_SOCKPTR(&optlen));
+ if (err)
+ return err;
+
+ /* On success, return optlen */
+ return optlen;
+}
+
+static inline int io_uring_cmd_setsockopt(struct socket *sock,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ const struct io_uring_sqe *sqe = cmd->sqe;
+ bool compat = !!(issue_flags & IO_URING_F_COMPAT);
+ int optname, optlen, level;
+ void __user *optval;
+ sockptr_t optval_s;
+
+ optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
+ optname = READ_ONCE(sqe->optname);
+ optlen = READ_ONCE(sqe->optlen);
+ level = READ_ONCE(sqe->level);
+ optval_s = USER_SOCKPTR(optval);
+
+ return do_sock_setsockopt(sock, compat, level, optname, optval_s,
+ optlen);
+}
+
+int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ struct socket *sock = cmd->file->private_data;
+ struct sock *sk = sock->sk;
+ struct proto *prot = READ_ONCE(sk->sk_prot);
+ int ret, arg = 0;
+
+ if (!prot || !prot->ioctl)
+ return -EOPNOTSUPP;
+
+ switch (cmd->cmd_op) {
+ case SOCKET_URING_OP_SIOCINQ:
+ ret = prot->ioctl(sk, SIOCINQ, &arg);
+ if (ret)
+ return ret;
+ return arg;
+ case SOCKET_URING_OP_SIOCOUTQ:
+ ret = prot->ioctl(sk, SIOCOUTQ, &arg);
+ if (ret)
+ return ret;
+ return arg;
+ case SOCKET_URING_OP_GETSOCKOPT:
+ return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
+ case SOCKET_URING_OP_SETSOCKOPT:
+ return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 100d5da94cb95c..78f8ab7db104f0 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -47,13 +47,6 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
io_eventfd_put(ev_fd);
}
-static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref)
-{
- if (put_ref)
- io_eventfd_put(ev_fd);
- rcu_read_unlock();
-}
-
/*
* Returns true if the caller should put the ev_fd reference, false if not.
*/
@@ -72,63 +65,34 @@ static bool __io_eventfd_signal(struct io_ev_fd *ev_fd)
/*
* Trigger if eventfd_async isn't set, or if it's set and the caller is
- * an async worker. If ev_fd isn't valid, obviously return false.
+ * an async worker.
*/
static bool io_eventfd_trigger(struct io_ev_fd *ev_fd)
{
- if (ev_fd)
- return !ev_fd->eventfd_async || io_wq_current_is_worker();
- return false;
+ return !ev_fd->eventfd_async || io_wq_current_is_worker();
}
-/*
- * On success, returns with an ev_fd reference grabbed and the RCU read
- * lock held.
- */
-static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx)
+void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event)
{
+ bool skip = false;
struct io_ev_fd *ev_fd;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- return NULL;
-
- rcu_read_lock();
+ return;
- /*
- * rcu_dereference ctx->io_ev_fd once and use it for both for checking
- * and eventfd_signal
- */
+ guard(rcu)();
ev_fd = rcu_dereference(ctx->io_ev_fd);
-
/*
* Check again if ev_fd exists in case an io_eventfd_unregister call
* completed between the NULL check of ctx->io_ev_fd at the start of
* the function and rcu_read_lock.
*/
- if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs))
- return ev_fd;
-
- rcu_read_unlock();
- return NULL;
-}
-
-void io_eventfd_signal(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = io_eventfd_grab(ctx);
- if (ev_fd)
- io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd));
-}
-
-void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = io_eventfd_grab(ctx);
- if (ev_fd) {
- bool skip, put_ref = true;
+ if (!ev_fd)
+ return;
+ if (!io_eventfd_trigger(ev_fd) || !refcount_inc_not_zero(&ev_fd->refs))
+ return;
+ if (cqe_event) {
/*
* Eventfd should only get triggered when at least one event
* has been posted. Some applications rely on the eventfd
@@ -142,12 +106,10 @@ void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
skip = ctx->cached_cq_tail == ev_fd->last_cq_tail;
ev_fd->last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
-
- if (!skip)
- put_ref = __io_eventfd_signal(ev_fd);
-
- io_eventfd_release(ev_fd, put_ref);
}
+
+ if (skip || __io_eventfd_signal(ev_fd))
+ io_eventfd_put(ev_fd);
}
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h
index d394f49c632105..e2f1985c2cf98e 100644
--- a/io_uring/eventfd.h
+++ b/io_uring/eventfd.h
@@ -4,5 +4,4 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int eventfd_async);
int io_eventfd_unregister(struct io_ring_ctx *ctx);
-void io_eventfd_flush_signal(struct io_ring_ctx *ctx);
-void io_eventfd_signal(struct io_ring_ctx *ctx);
+void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event);
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index e0d6a59a89fa1b..e9355276ab5d52 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -15,37 +15,6 @@
#include "cancel.h"
#include "rsrc.h"
-#ifdef CONFIG_PROC_FS
-static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
- const struct cred *cred)
-{
- struct user_namespace *uns = seq_user_ns(m);
- struct group_info *gi;
- kernel_cap_t cap;
- int g;
-
- seq_printf(m, "%5d\n", id);
- seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
- seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
- seq_puts(m, "\n\tGroups:\t");
- gi = cred->group_info;
- for (g = 0; g < gi->ngroups; g++) {
- seq_put_decimal_ull(m, g ? " " : "",
- from_kgid_munged(uns, gi->gid[g]));
- }
- seq_puts(m, "\n\tCapEff:\t");
- cap = cred->cap_effective;
- seq_put_hex_ll(m, NULL, cap.val, 16);
- seq_putc(m, '\n');
- return 0;
-}
-
#ifdef CONFIG_NET_RX_BUSY_POLL
static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
struct seq_file *m,
@@ -214,14 +183,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
else
seq_printf(m, "%5u: <none>\n", i);
}
- if (!xa_empty(&ctx->personalities)) {
- unsigned long index;
- const struct cred *cred;
-
- seq_printf(m, "Personalities:\n");
- xa_for_each(&ctx->personalities, index, cred)
- io_uring_show_cred(m, index, cred);
- }
seq_puts(m, "PollList:\n");
for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) {
@@ -264,4 +225,3 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
mutex_unlock(&ctx->uring_lock);
}
}
-#endif
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 04a75d66619510..d52069b1177b0a 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -114,9 +114,6 @@ enum {
struct io_wq {
unsigned long state;
- free_work_fn *free_work;
- io_wq_work_fn *do_work;
-
struct io_wq_hash *hash;
atomic_t worker_refs;
@@ -612,10 +609,10 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
if (do_kill &&
(work_flags & IO_WQ_WORK_UNBOUND))
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
- wq->do_work(work);
+ io_wq_submit_work(work);
io_assign_current_work(worker, NULL);
- linked = wq->free_work(work);
+ linked = io_wq_free_work(work);
work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) {
work = linked;
@@ -934,8 +931,8 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
{
do {
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
- wq->do_work(work);
- work = wq->free_work(work);
+ io_wq_submit_work(work);
+ work = io_wq_free_work(work);
} while (work);
}
@@ -1195,8 +1192,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
int ret, i;
struct io_wq *wq;
- if (WARN_ON_ONCE(!data->free_work || !data->do_work))
- return ERR_PTR(-EINVAL);
if (WARN_ON_ONCE(!bounded))
return ERR_PTR(-EINVAL);
@@ -1206,8 +1201,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
refcount_inc(&data->hash->refs);
wq->hash = data->hash;
- wq->free_work = data->free_work;
- wq->do_work = data->do_work;
ret = -ENOMEM;
diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index d4fb2940e435f7..774abab54732ef 100644
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -21,9 +21,6 @@ enum io_wq_cancel {
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};
-typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
-typedef void (io_wq_work_fn)(struct io_wq_work *);
-
struct io_wq_hash {
refcount_t refs;
unsigned long map;
@@ -39,8 +36,6 @@ static inline void io_wq_put_hash(struct io_wq_hash *hash)
struct io_wq_data {
struct io_wq_hash *hash;
struct task_struct *task;
- io_wq_work_fn *do_work;
- free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3f079713914154..d85657ecb8933e 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -129,7 +129,6 @@
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
- u32 seq;
};
/* requests with any of those set should undergo io_disarm_next() */
@@ -149,6 +148,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
bool is_sqpoll_thread);
static void io_queue_sqe(struct io_kiocb *req);
+static void __io_req_caches_free(struct io_ring_ctx *ctx);
static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray);
@@ -359,6 +359,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
INIT_HLIST_HEAD(&ctx->waitid_list);
+ xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC);
#ifdef CONFIG_FUTEX
INIT_HLIST_HEAD(&ctx->futex_list);
#endif
@@ -380,25 +381,6 @@ err:
return NULL;
}
-static void io_account_cq_overflow(struct io_ring_ctx *ctx)
-{
- struct io_rings *r = ctx->rings;
-
- WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
- ctx->cq_extra--;
-}
-
-static bool req_need_defer(struct io_kiocb *req, u32 seq)
-{
- if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
- struct io_ring_ctx *ctx = req->ctx;
-
- return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
- }
-
- return false;
-}
-
static void io_clean_op(struct io_kiocb *req)
{
if (unlikely(req->flags & REQ_F_BUFFER_SELECTED))
@@ -537,20 +519,37 @@ void io_req_queue_iowq(struct io_kiocb *req)
io_req_task_work_add(req);
}
+static unsigned io_linked_nr(struct io_kiocb *req)
+{
+ struct io_kiocb *tmp;
+ unsigned nr = 0;
+
+ io_for_each_link(tmp, req)
+ nr++;
+ return nr;
+}
+
static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
{
- spin_lock(&ctx->completion_lock);
+ bool drain_seen = false, first = true;
+
+ lockdep_assert_held(&ctx->uring_lock);
+ __io_req_caches_free(ctx);
+
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
- if (req_need_defer(de->req, de->seq))
- break;
+ drain_seen |= de->req->flags & REQ_F_IO_DRAIN;
+ if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained)
+ return;
+
list_del_init(&de->list);
+ ctx->nr_drained -= io_linked_nr(de->req);
io_req_task_queue(de->req);
kfree(de);
+ first = false;
}
- spin_unlock(&ctx->completion_lock);
}
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
@@ -559,10 +558,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
io_poll_wq_wake(ctx);
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
- if (ctx->drain_active)
- io_queue_deferred(ctx);
if (ctx->has_evfd)
- io_eventfd_flush_signal(ctx);
+ io_eventfd_signal(ctx, true);
}
static inline void __io_cq_lock(struct io_ring_ctx *ctx)
@@ -700,27 +697,20 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
}
}
-static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
- s32 res, u32 cflags, u64 extra1, u64 extra2)
+static __cold bool io_cqring_add_overflow(struct io_ring_ctx *ctx,
+ struct io_overflow_cqe *ocqe)
{
- struct io_overflow_cqe *ocqe;
- size_t ocq_size = sizeof(struct io_overflow_cqe);
- bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
-
lockdep_assert_held(&ctx->completion_lock);
- if (is_cqe32)
- ocq_size += sizeof(struct io_uring_cqe);
-
- ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
- trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
if (!ocqe) {
+ struct io_rings *r = ctx->rings;
+
/*
* If we're in ring overflow flush mode, or in task cancel mode,
* or cannot allocate an overflow entry, then we need to drop it
* on the floor.
*/
- io_account_cq_overflow(ctx);
+ WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
return false;
}
@@ -729,23 +719,35 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
- ocqe->cqe.user_data = user_data;
- ocqe->cqe.res = res;
- ocqe->cqe.flags = cflags;
- if (is_cqe32) {
- ocqe->cqe.big_cqe[0] = extra1;
- ocqe->cqe.big_cqe[1] = extra2;
- }
list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
return true;
}
-static void io_req_cqe_overflow(struct io_kiocb *req)
+static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
+ struct io_cqe *cqe,
+ struct io_big_cqe *big_cqe, gfp_t gfp)
{
- io_cqring_event_overflow(req->ctx, req->cqe.user_data,
- req->cqe.res, req->cqe.flags,
- req->big_cqe.extra1, req->big_cqe.extra2);
- memset(&req->big_cqe, 0, sizeof(req->big_cqe));
+ struct io_overflow_cqe *ocqe;
+ size_t ocq_size = sizeof(struct io_overflow_cqe);
+ bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+
+ if (is_cqe32)
+ ocq_size += sizeof(struct io_uring_cqe);
+
+ ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT);
+ trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe);
+ if (ocqe) {
+ ocqe->cqe.user_data = cqe->user_data;
+ ocqe->cqe.res = cqe->res;
+ ocqe->cqe.flags = cqe->flags;
+ if (is_cqe32 && big_cqe) {
+ ocqe->cqe.big_cqe[0] = big_cqe->extra1;
+ ocqe->cqe.big_cqe[1] = big_cqe->extra2;
+ }
+ }
+ if (big_cqe)
+ big_cqe->extra1 = big_cqe->extra2 = 0;
+ return ocqe;
}
/*
@@ -790,13 +792,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
{
struct io_uring_cqe *cqe;
- ctx->cq_extra++;
-
- /*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
- */
if (likely(io_get_cqe(ctx, &cqe))) {
WRITE_ONCE(cqe->user_data, user_data);
WRITE_ONCE(cqe->res, res);
@@ -813,14 +808,43 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
return false;
}
+static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags)
+{
+ return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags };
+}
+
+static __cold void io_cqe_overflow(struct io_ring_ctx *ctx, struct io_cqe *cqe,
+ struct io_big_cqe *big_cqe)
+{
+ struct io_overflow_cqe *ocqe;
+
+ ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_KERNEL);
+ spin_lock(&ctx->completion_lock);
+ io_cqring_add_overflow(ctx, ocqe);
+ spin_unlock(&ctx->completion_lock);
+}
+
+static __cold bool io_cqe_overflow_locked(struct io_ring_ctx *ctx,
+ struct io_cqe *cqe,
+ struct io_big_cqe *big_cqe)
+{
+ struct io_overflow_cqe *ocqe;
+
+ ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_ATOMIC);
+ return io_cqring_add_overflow(ctx, ocqe);
+}
+
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
bool filled;
io_cq_lock(ctx);
filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
- if (!filled)
- filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
+ if (unlikely(!filled)) {
+ struct io_cqe cqe = io_init_cqe(user_data, res, cflags);
+
+ filled = io_cqe_overflow_locked(ctx, &cqe, NULL);
+ }
io_cq_unlock_post(ctx);
return filled;
}
@@ -831,10 +855,13 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
*/
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
+ lockdep_assert_held(&ctx->uring_lock);
+ lockdep_assert(ctx->lockless_cq);
+
if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) {
- spin_lock(&ctx->completion_lock);
- io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
- spin_unlock(&ctx->completion_lock);
+ struct io_cqe cqe = io_init_cqe(user_data, res, cflags);
+
+ io_cqe_overflow(ctx, &cqe, NULL);
}
ctx->submit_state.cq_flush = true;
}
@@ -924,22 +951,6 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res)
}
/*
- * Don't initialise the fields below on every allocation, but do that in
- * advance and keep them valid across allocations.
- */
-static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
-{
- req->ctx = ctx;
- req->buf_node = NULL;
- req->file_node = NULL;
- req->link = NULL;
- req->async_data = NULL;
- /* not necessary, but safer to zero */
- memset(&req->cqe, 0, sizeof(req->cqe));
- memset(&req->big_cqe, 0, sizeof(req->big_cqe));
-}
-
-/*
* A request might get retired back into the request caches even before opcode
* handlers and io_issue_sqe() are done with it, e.g. inline completion path.
* Because of that, io_alloc_req() should be called only under ->uring_lock
@@ -948,7 +959,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
__cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
- gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+ gfp_t gfp = GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO;
void *reqs[IO_REQ_ALLOC_BATCH];
int ret;
@@ -966,10 +977,11 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
}
percpu_ref_get_many(&ctx->refs, ret);
+ ctx->nr_req_allocated += ret;
+
while (ret--) {
struct io_kiocb *req = reqs[ret];
- io_preinit_req(req, ctx);
io_req_add_to_cache(req, ctx);
}
return true;
@@ -1191,7 +1203,7 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (ctx->has_evfd)
- io_eventfd_signal(ctx);
+ io_eventfd_signal(ctx, false);
}
nr_wait = atomic_read(&ctx->cq_wait_nr);
@@ -1383,6 +1395,16 @@ void io_queue_next(struct io_kiocb *req)
io_req_task_queue(nxt);
}
+static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
+{
+ if (req->file_node) {
+ io_put_rsrc_node(req->ctx, req->file_node);
+ req->file_node = NULL;
+ }
+ if (req->flags & REQ_F_BUF_NODE)
+ io_put_rsrc_node(req->ctx, req->buf_node);
+}
+
static void io_free_batch_list(struct io_ring_ctx *ctx,
struct io_wq_work_node *node)
__must_hold(&ctx->uring_lock)
@@ -1443,13 +1465,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
*/
if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
unlikely(!io_fill_cqe_req(ctx, req))) {
- if (ctx->lockless_cq) {
- spin_lock(&ctx->completion_lock);
- io_req_cqe_overflow(req);
- spin_unlock(&ctx->completion_lock);
- } else {
- io_req_cqe_overflow(req);
- }
+ if (ctx->lockless_cq)
+ io_cqe_overflow(ctx, &req->cqe, &req->big_cqe);
+ else
+ io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe);
}
}
__io_cq_unlock_post(ctx);
@@ -1458,6 +1477,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
io_free_batch_list(ctx, state->compl_reqs.first);
INIT_WQ_LIST(&state->compl_reqs);
}
+
+ if (unlikely(ctx->drain_active))
+ io_queue_deferred(ctx);
+
ctx->submit_state.cq_flush = false;
}
@@ -1645,56 +1668,28 @@ io_req_flags_t io_file_get_flags(struct file *file)
return res;
}
-static u32 io_get_sequence(struct io_kiocb *req)
-{
- u32 seq = req->ctx->cached_sq_head;
- struct io_kiocb *cur;
-
- /* need original cached_sq_head, but it was increased for each req */
- io_for_each_link(cur, req)
- seq--;
- return seq;
-}
-
static __cold void io_drain_req(struct io_kiocb *req)
__must_hold(&ctx->uring_lock)
{
struct io_ring_ctx *ctx = req->ctx;
+ bool drain = req->flags & IOSQE_IO_DRAIN;
struct io_defer_entry *de;
- int ret;
- u32 seq = io_get_sequence(req);
-
- /* Still need defer if there is pending req in defer list. */
- spin_lock(&ctx->completion_lock);
- if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
- spin_unlock(&ctx->completion_lock);
-queue:
- ctx->drain_active = false;
- io_req_task_queue(req);
- return;
- }
- spin_unlock(&ctx->completion_lock);
- io_prep_async_link(req);
- de = kmalloc(sizeof(*de), GFP_KERNEL);
+ de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT);
if (!de) {
- ret = -ENOMEM;
- io_req_defer_failed(req, ret);
+ io_req_defer_failed(req, -ENOMEM);
return;
}
- spin_lock(&ctx->completion_lock);
- if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
- spin_unlock(&ctx->completion_lock);
- kfree(de);
- goto queue;
- }
-
+ io_prep_async_link(req);
trace_io_uring_defer(req);
de->req = req;
- de->seq = seq;
+
+ ctx->nr_drained += io_linked_nr(req);
list_add_tail(&de->list, &ctx->defer_list);
- spin_unlock(&ctx->completion_lock);
+ io_queue_deferred(ctx);
+ if (!drain && list_empty(&ctx->defer_list))
+ ctx->drain_active = false;
}
static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
@@ -1815,7 +1810,7 @@ void io_wq_submit_work(struct io_wq_work *work)
bool needs_poll = false;
int ret = 0, err = -ECANCELED;
- /* one will be dropped by ->io_wq_free_work() after returning to io-wq */
+ /* one will be dropped by io_wq_free_work() after returning to io-wq */
if (!(req->flags & REQ_F_REFCOUNT))
__io_req_set_refcount(req, 2);
else
@@ -1913,7 +1908,8 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
io_ring_submit_lock(ctx, issue_flags);
node = io_rsrc_node_lookup(&ctx->file_table.data, fd);
if (node) {
- io_req_assign_rsrc_node(&req->file_node, node);
+ node->refs++;
+ req->file_node = node;
req->flags |= io_slot_flags(node);
file = io_slot_file(node);
}
@@ -2046,7 +2042,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
int personality;
u8 opcode;
- /* req is partially pre-initialised, see io_preinit_req() */
+ req->ctx = ctx;
req->opcode = opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
sqe_flags = READ_ONCE(sqe->flags);
@@ -2277,10 +2273,6 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
(!(ctx->flags & IORING_SETUP_NO_SQARRAY))) {
head = READ_ONCE(ctx->sq_array[head]);
if (unlikely(head >= ctx->sq_entries)) {
- /* drop invalid entries */
- spin_lock(&ctx->completion_lock);
- ctx->cq_extra--;
- spin_unlock(&ctx->completion_lock);
WRITE_ONCE(ctx->rings->sq_dropped,
READ_ONCE(ctx->rings->sq_dropped) + 1);
return false;
@@ -2698,21 +2690,26 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
return off;
}
-static void io_req_caches_free(struct io_ring_ctx *ctx)
+static __cold void __io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_kiocb *req;
int nr = 0;
- mutex_lock(&ctx->uring_lock);
-
while (!io_req_cache_empty(ctx)) {
req = io_extract_req(ctx);
kmem_cache_free(req_cachep, req);
nr++;
}
- if (nr)
+ if (nr) {
+ ctx->nr_req_allocated -= nr;
percpu_ref_put_many(&ctx->refs, nr);
- mutex_unlock(&ctx->uring_lock);
+ }
+}
+
+static __cold void io_req_caches_free(struct io_ring_ctx *ctx)
+{
+ guard(mutex)(&ctx->uring_lock);
+ __io_req_caches_free(ctx);
}
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
@@ -2748,6 +2745,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
io_req_caches_free(ctx);
+
+ WARN_ON_ONCE(ctx->nr_req_allocated);
+
if (ctx->hash_map)
io_wq_put_hash(ctx->hash_map);
io_napi_free(ctx);
@@ -2882,7 +2882,7 @@ static __cold void io_ring_exit_work(struct work_struct *work)
io_cqring_overflow_kill(ctx);
mutex_unlock(&ctx->uring_lock);
}
- if (ctx->ifq) {
+ if (!xa_empty(&ctx->zcrx_ctxs)) {
mutex_lock(&ctx->uring_lock);
io_shutdown_zcrx_ifqs(ctx);
mutex_unlock(&ctx->uring_lock);
@@ -3014,20 +3014,19 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
struct io_defer_entry *de;
LIST_HEAD(list);
- spin_lock(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
if (io_match_task_safe(de->req, tctx, cancel_all)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
}
- spin_unlock(&ctx->completion_lock);
if (list_empty(&list))
return false;
while (!list_empty(&list)) {
de = list_first_entry(&list, struct io_defer_entry, list);
list_del_init(&de->list);
+ ctx->nr_drained -= io_linked_nr(de->req);
io_req_task_queue_fail(de->req, -ECANCELED);
kfree(de);
}
@@ -3102,8 +3101,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
io_allowed_defer_tw_run(ctx))
ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0;
- ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
mutex_lock(&ctx->uring_lock);
+ ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
ret |= io_poll_remove_all(ctx, tctx, cancel_all);
ret |= io_waitid_remove_all(ctx, tctx, cancel_all);
ret |= io_futex_remove_all(ctx, tctx, cancel_all);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index e4050b2d0821f5..81f22196a57de6 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -196,7 +196,6 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
{
io_lockdep_assert_cq_locked(ctx);
- ctx->cq_extra++;
ctx->submit_state.cq_flush = true;
return io_get_cqe(ctx, cqe_ret);
}
@@ -414,7 +413,7 @@ static inline void io_req_complete_defer(struct io_kiocb *req)
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ if (unlikely(ctx->off_timeout_used ||
ctx->has_evfd || ctx->poll_activated))
__io_commit_cqring_flush(ctx);
}
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 953d5e74256916..823e7eb15fb2a8 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -92,7 +92,6 @@ void io_kbuf_drop_legacy(struct io_kiocb *req)
{
if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED)))
return;
- req->buf_index = req->kbuf->bgid;
req->flags &= ~REQ_F_BUFFER_SELECTED;
kfree(req->kbuf);
req->kbuf = NULL;
@@ -110,7 +109,6 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
bl = io_buffer_get_list(ctx, buf->bgid);
list_add(&buf->list, &bl->buf_list);
req->flags &= ~REQ_F_BUFFER_SELECTED;
- req->buf_index = buf->bgid;
io_ring_submit_unlock(ctx, issue_flags);
return true;
@@ -193,7 +191,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
}
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
- unsigned int issue_flags)
+ unsigned buf_group, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
@@ -201,7 +199,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
io_ring_submit_lock(req->ctx, issue_flags);
- bl = io_buffer_get_list(ctx, req->buf_index);
+ bl = io_buffer_get_list(ctx, buf_group);
if (likely(bl)) {
if (bl->flags & IOBL_BUF_RING)
ret = io_ring_buffer_select(req, len, bl, issue_flags);
@@ -302,7 +300,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
int ret = -ENOENT;
io_ring_submit_lock(ctx, issue_flags);
- bl = io_buffer_get_list(ctx, req->buf_index);
+ bl = io_buffer_get_list(ctx, arg->buf_group);
if (unlikely(!bl))
goto out_unlock;
@@ -335,7 +333,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
lockdep_assert_held(&ctx->uring_lock);
- bl = io_buffer_get_list(ctx, req->buf_index);
+ bl = io_buffer_get_list(ctx, arg->buf_group);
if (unlikely(!bl))
return -ENOENT;
@@ -355,10 +353,9 @@ static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr)
struct io_buffer_list *bl = req->buf_list;
bool ret = true;
- if (bl) {
+ if (bl)
ret = io_kbuf_commit(req, bl, len, nr);
- req->buf_index = bl->bgid;
- }
+
req->flags &= ~REQ_F_BUFFER_RING;
return ret;
}
@@ -379,45 +376,33 @@ unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs)
return ret;
}
-static int __io_remove_buffers(struct io_ring_ctx *ctx,
- struct io_buffer_list *bl, unsigned nbufs)
+static int io_remove_buffers_legacy(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl,
+ unsigned long nbufs)
{
- unsigned i = 0;
-
- /* shouldn't happen */
- if (!nbufs)
- return 0;
-
- if (bl->flags & IOBL_BUF_RING) {
- i = bl->buf_ring->tail - bl->head;
- io_free_region(ctx, &bl->region);
- /* make sure it's seen as empty */
- INIT_LIST_HEAD(&bl->buf_list);
- bl->flags &= ~IOBL_BUF_RING;
- return i;
- }
+ unsigned long i = 0;
+ struct io_buffer *nxt;
/* protects io_buffers_cache */
lockdep_assert_held(&ctx->uring_lock);
+ WARN_ON_ONCE(bl->flags & IOBL_BUF_RING);
- while (!list_empty(&bl->buf_list)) {
- struct io_buffer *nxt;
-
+ for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) {
nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&nxt->list);
kfree(nxt);
-
- if (++i == nbufs)
- return i;
cond_resched();
}
-
return i;
}
static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
- __io_remove_buffers(ctx, bl, -1U);
+ if (bl->flags & IOBL_BUF_RING)
+ io_free_region(ctx, &bl->region);
+ else
+ io_remove_buffers_legacy(ctx, bl, -1U);
+
kfree(bl);
}
@@ -465,30 +450,6 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
- int ret = 0;
-
- io_ring_submit_lock(ctx, issue_flags);
-
- ret = -ENOENT;
- bl = io_buffer_get_list(ctx, p->bgid);
- if (bl) {
- ret = -EINVAL;
- /* can't use provide/remove buffers command on mapped buffers */
- if (!(bl->flags & IOBL_BUF_RING))
- ret = __io_remove_buffers(ctx, bl, p->nbufs);
- }
- io_ring_submit_unlock(ctx, issue_flags);
- if (ret < 0)
- req_set_fail(req);
- io_req_set_res(req, ret, 0);
- return IOU_OK;
-}
-
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
unsigned long size, tmp_check;
@@ -512,8 +473,6 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return -EOVERFLOW;
if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
return -EOVERFLOW;
-
- size = (unsigned long)p->len * p->nbufs;
if (!access_ok(u64_to_user_ptr(p->addr), size))
return -EFAULT;
@@ -552,37 +511,44 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
return i ? 0 : -ENOMEM;
}
-int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_manage_buffers_legacy(struct io_kiocb *req,
+ struct io_buffer_list *bl)
{
struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
- int ret = 0;
-
- io_ring_submit_lock(ctx, issue_flags);
+ int ret;
- bl = io_buffer_get_list(ctx, p->bgid);
- if (unlikely(!bl)) {
+ if (!bl) {
+ if (req->opcode != IORING_OP_PROVIDE_BUFFERS)
+ return -ENOENT;
bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
- if (!bl) {
- ret = -ENOMEM;
- goto err;
- }
+ if (!bl)
+ return -ENOMEM;
+
INIT_LIST_HEAD(&bl->buf_list);
- ret = io_buffer_add_list(ctx, bl, p->bgid);
+ ret = io_buffer_add_list(req->ctx, bl, p->bgid);
if (ret) {
kfree(bl);
- goto err;
+ return ret;
}
}
- /* can't add buffers via this command for a mapped buffer ring */
- if (bl->flags & IOBL_BUF_RING) {
- ret = -EINVAL;
- goto err;
- }
+ /* can't use provide/remove buffers command on mapped buffers */
+ if (bl->flags & IOBL_BUF_RING)
+ return -EINVAL;
+ if (req->opcode == IORING_OP_PROVIDE_BUFFERS)
+ return io_add_buffers(req->ctx, p, bl);
+ return io_remove_buffers_legacy(req->ctx, bl, p->nbufs);
+}
+
+int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ int ret;
- ret = io_add_buffers(ctx, p, bl);
-err:
+ io_ring_submit_lock(ctx, issue_flags);
+ bl = io_buffer_get_list(ctx, p->bgid);
+ ret = __io_manage_buffers_legacy(req, bl);
io_ring_submit_unlock(ctx, issue_flags);
if (ret < 0)
@@ -594,7 +560,7 @@ err:
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
- struct io_buffer_list *bl, *free_bl = NULL;
+ struct io_buffer_list *bl;
struct io_uring_region_desc rd;
struct io_uring_buf_ring *br;
unsigned long mmap_offset;
@@ -605,8 +571,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
-
- if (reg.resv[0] || reg.resv[1] || reg.resv[2])
+ if (!mem_is_zero(reg.resv, sizeof(reg.resv)))
return -EINVAL;
if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
return -EINVAL;
@@ -624,7 +589,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
io_destroy_bl(ctx, bl);
}
- free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+ bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
if (!bl)
return -ENOMEM;
@@ -669,7 +634,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return 0;
fail:
io_free_region(ctx, &bl->region);
- kfree(free_bl);
+ kfree(bl);
return ret;
}
@@ -682,9 +647,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
- if (reg.resv[0] || reg.resv[1] || reg.resv[2])
- return -EINVAL;
- if (reg.flags)
+ if (!mem_is_zero(reg.resv, sizeof(reg.resv)) || reg.flags)
return -EINVAL;
bl = io_buffer_get_list(ctx, reg.bgid);
@@ -704,14 +667,11 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_status buf_status;
struct io_buffer_list *bl;
- int i;
if (copy_from_user(&buf_status, arg, sizeof(buf_status)))
return -EFAULT;
-
- for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++)
- if (buf_status.resv[i])
- return -EINVAL;
+ if (!mem_is_zero(buf_status.resv, sizeof(buf_status.resv)))
+ return -EINVAL;
bl = io_buffer_get_list(ctx, buf_status.buf_group);
if (!bl)
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 2ec0b983ce243c..4d2c209d1a4112 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -55,20 +55,19 @@ struct buf_sel_arg {
size_t max_len;
unsigned short nr_iovs;
unsigned short mode;
+ unsigned buf_group;
};
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
- unsigned int issue_flags);
+ unsigned buf_group, unsigned int issue_flags);
int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
unsigned int issue_flags);
int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg);
void io_destroy_buffers(struct io_ring_ctx *ctx);
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
-int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags);
-
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
-int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
+int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
@@ -94,7 +93,6 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
* to monopolize the buffer.
*/
if (req->buf_list) {
- req->buf_index = req->buf_list->bgid;
req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT);
return true;
}
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 07f8a5cbd37ec7..725dc0bec24c42 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -13,6 +13,7 @@
#include "memmap.h"
#include "kbuf.h"
#include "rsrc.h"
+#include "zcrx.h"
static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
size_t size, gfp_t gfp)
@@ -258,7 +259,8 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
loff_t pgoff)
{
loff_t offset = pgoff << PAGE_SHIFT;
- unsigned int bgid;
+ unsigned int id;
+
switch (offset & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
@@ -267,12 +269,13 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
case IORING_OFF_SQES:
return &ctx->sq_region;
case IORING_OFF_PBUF_RING:
- bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
- return io_pbuf_get_region(ctx, bgid);
+ id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
+ return io_pbuf_get_region(ctx, id);
case IORING_MAP_OFF_PARAM_REGION:
return &ctx->param_region;
case IORING_MAP_OFF_ZCRX_REGION:
- return &ctx->zcrx_region;
+ id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_ZCRX_SHIFT;
+ return io_zcrx_get_region(ctx, id);
}
return NULL;
}
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index dad0aa5b1b4574..08419684e4bc91 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -4,7 +4,9 @@
#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL
#define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL
-struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
+#define IORING_OFF_ZCRX_SHIFT 16
+
+struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages);
#ifndef CONFIG_MMU
unsigned int io_uring_nommu_mmap_capabilities(struct file *file);
diff --git a/io_uring/net.c b/io_uring/net.c
index 24040bc3916a1b..1fbdb2bbb3f3fb 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -18,7 +18,6 @@
#include "rsrc.h"
#include "zcrx.h"
-#if defined(CONFIG_NET)
struct io_shutdown {
struct file *file;
int how;
@@ -190,7 +189,6 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req,
sr->done_io = 0;
sr->retry = false;
sr->len = 0; /* get from the provided buffer */
- req->buf_index = sr->buf_group;
}
static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
@@ -359,15 +357,13 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
kmsg->msg.msg_name = &kmsg->addr;
kmsg->msg.msg_namelen = addr_len;
}
- if (sr->flags & IORING_RECVSEND_FIXED_BUF)
+ if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
+ req->flags |= REQ_F_IMPORT_BUFFER;
return 0;
- if (!io_do_buffer_select(req)) {
- ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
- &kmsg->msg.msg_iter);
- if (unlikely(ret < 0))
- return ret;
}
- return 0;
+ if (req->flags & REQ_F_BUFFER_SELECT)
+ return 0;
+ return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
}
static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -409,13 +405,12 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
+ if (req->flags & REQ_F_BUFFER_SELECT)
+ sr->buf_group = req->buf_index;
if (sr->flags & IORING_RECVSEND_BUNDLE) {
if (req->opcode == IORING_OP_SENDMSG)
return -EINVAL;
- if (!(req->flags & REQ_F_BUFFER_SELECT))
- return -EINVAL;
sr->msg_flags |= MSG_WAITALL;
- sr->buf_group = req->buf_index;
req->buf_list = NULL;
req->flags |= REQ_F_MULTISHOT;
}
@@ -571,6 +566,7 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
.iovs = &kmsg->fast_iov,
.max_len = min_not_zero(sr->len, INT_MAX),
.nr_iovs = 1,
+ .buf_group = sr->buf_group,
};
if (kmsg->vec.iovec) {
@@ -723,7 +719,6 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr *kmsg;
- int ret;
kmsg = io_msg_alloc_async(req);
if (unlikely(!kmsg))
@@ -739,13 +734,10 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req)
kmsg->msg.msg_iocb = NULL;
kmsg->msg.msg_ubuf = NULL;
- if (!io_do_buffer_select(req)) {
- ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
- &kmsg->msg.msg_iter);
- if (unlikely(ret))
- return ret;
- }
- return 0;
+ if (req->flags & REQ_F_BUFFER_SELECT)
+ return 0;
+ return import_ubuf(ITER_DEST, sr->buf, sr->len,
+ &kmsg->msg.msg_iter);
}
return io_recvmsg_copy_hdr(req, kmsg);
@@ -985,7 +977,7 @@ retry_multishot:
void __user *buf;
size_t len = sr->len;
- buf = io_buffer_select(req, &len, issue_flags);
+ buf = io_buffer_select(req, &len, sr->buf_group, issue_flags);
if (!buf)
return -ENOBUFS;
@@ -1063,6 +1055,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
.iovs = &kmsg->fast_iov,
.nr_iovs = 1,
.mode = KBUF_MODE_EXPAND,
+ .buf_group = sr->buf_group,
};
if (kmsg->vec.iovec) {
@@ -1095,7 +1088,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
void __user *buf;
*len = sr->len;
- buf = io_buffer_select(req, len, issue_flags);
+ buf = io_buffer_select(req, len, sr->buf_group, issue_flags);
if (!buf)
return -ENOBUFS;
sr->buf = buf;
@@ -1191,16 +1184,14 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
unsigned ifq_idx;
- if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr ||
- sqe->addr3))
+ if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3))
return -EINVAL;
ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
- if (ifq_idx != 0)
- return -EINVAL;
- zc->ifq = req->ctx->ifq;
+ zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx);
if (!zc->ifq)
return -EINVAL;
+
zc->len = READ_ONCE(sqe->len);
zc->flags = READ_ONCE(sqe->ioprio);
zc->msg_flags = READ_ONCE(sqe->msg_flags);
@@ -1321,8 +1312,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -ENOMEM;
if (req->opcode == IORING_OP_SEND_ZC) {
- if (zc->flags & IORING_RECVSEND_FIXED_BUF)
- req->flags |= REQ_F_IMPORT_BUFFER;
ret = io_send_setup(req, sqe);
} else {
if (unlikely(sqe->addr2 || sqe->file_index))
@@ -1846,4 +1835,3 @@ void io_netmsg_cache_free(const void *entry)
io_vec_free(&kmsg->vec);
kfree(kmsg);
}
-#endif
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 7bd92538dccbc7..9a6f6e92d74242 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -112,6 +112,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
if (unlikely(!io_alloc_req(ctx, &notif)))
return NULL;
+ notif->ctx = ctx;
notif->opcode = IORING_OP_NOP;
notif->flags = 0;
notif->file = NULL;
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 489384c0438bd8..6e0882b051f93b 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -333,13 +333,13 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.iopoll = 1,
.prep = io_provide_buffers_prep,
- .issue = io_provide_buffers,
+ .issue = io_manage_buffers_legacy,
},
[IORING_OP_REMOVE_BUFFERS] = {
.audit_skip = 1,
.iopoll = 1,
.prep = io_remove_buffers_prep,
- .issue = io_remove_buffers,
+ .issue = io_manage_buffers_legacy,
},
[IORING_OP_TEE] = {
.needs_file = 1,
@@ -569,6 +569,10 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_prep_writev_fixed,
.issue = io_write,
},
+ [IORING_OP_PIPE] = {
+ .prep = io_pipe_prep,
+ .issue = io_pipe,
+ },
};
const struct io_cold_def io_cold_defs[] = {
@@ -815,6 +819,9 @@ const struct io_cold_def io_cold_defs[] = {
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
+ [IORING_OP_PIPE] = {
+ .name = "PIPE",
+ },
};
const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index e3357dfa14ca42..4dd46116345783 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -6,6 +6,8 @@
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
#include <linux/namei.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/watch_queue.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
@@ -302,3 +304,134 @@ int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags)
io_req_set_res(req, ret, 0);
return IOU_OK;
}
+
+struct io_pipe {
+ struct file *file;
+ int __user *fds;
+ int flags;
+ int file_slot;
+ unsigned long nofile;
+};
+
+int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
+
+ if (sqe->fd || sqe->off || sqe->addr3)
+ return -EINVAL;
+
+ p->fds = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ p->flags = READ_ONCE(sqe->pipe_flags);
+ if (p->flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
+ return -EINVAL;
+
+ p->file_slot = READ_ONCE(sqe->file_index);
+ p->nofile = rlimit(RLIMIT_NOFILE);
+ return 0;
+}
+
+static int io_pipe_fixed(struct io_kiocb *req, struct file **files,
+ unsigned int issue_flags)
+{
+ struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret, fds[2] = { -1, -1 };
+ int slot = p->file_slot;
+
+ if (p->flags & O_CLOEXEC)
+ return -EINVAL;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ ret = __io_fixed_fd_install(ctx, files[0], slot);
+ if (ret < 0)
+ goto err;
+ fds[0] = ret;
+ files[0] = NULL;
+
+ /*
+ * If a specific slot is given, next one will be used for
+ * the write side.
+ */
+ if (slot != IORING_FILE_INDEX_ALLOC)
+ slot++;
+
+ ret = __io_fixed_fd_install(ctx, files[1], slot);
+ if (ret < 0)
+ goto err;
+ fds[1] = ret;
+ files[1] = NULL;
+
+ io_ring_submit_unlock(ctx, issue_flags);
+
+ if (!copy_to_user(p->fds, fds, sizeof(fds)))
+ return 0;
+
+ ret = -EFAULT;
+ io_ring_submit_lock(ctx, issue_flags);
+err:
+ if (fds[0] != -1)
+ io_fixed_fd_remove(ctx, fds[0]);
+ if (fds[1] != -1)
+ io_fixed_fd_remove(ctx, fds[1]);
+ io_ring_submit_unlock(ctx, issue_flags);
+ return ret;
+}
+
+static int io_pipe_fd(struct io_kiocb *req, struct file **files)
+{
+ struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
+ int ret, fds[2] = { -1, -1 };
+
+ ret = __get_unused_fd_flags(p->flags, p->nofile);
+ if (ret < 0)
+ goto err;
+ fds[0] = ret;
+
+ ret = __get_unused_fd_flags(p->flags, p->nofile);
+ if (ret < 0)
+ goto err;
+ fds[1] = ret;
+
+ if (!copy_to_user(p->fds, fds, sizeof(fds))) {
+ fd_install(fds[0], files[0]);
+ fd_install(fds[1], files[1]);
+ return 0;
+ }
+ ret = -EFAULT;
+err:
+ if (fds[0] != -1)
+ put_unused_fd(fds[0]);
+ if (fds[1] != -1)
+ put_unused_fd(fds[1]);
+ return ret;
+}
+
+int io_pipe(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
+ struct file *files[2];
+ int ret;
+
+ ret = create_pipe_files(files, p->flags);
+ if (ret)
+ return ret;
+ files[0]->f_mode |= FMODE_NOWAIT;
+ files[1]->f_mode |= FMODE_NOWAIT;
+
+ if (!!p->file_slot)
+ ret = io_pipe_fixed(req, files, issue_flags);
+ else
+ ret = io_pipe_fd(req, files);
+
+ io_req_set_res(req, ret, 0);
+ if (!ret)
+ return IOU_OK;
+
+ req_set_fail(req);
+ if (files[0])
+ fput(files[0]);
+ if (files[1])
+ fput(files[1]);
+ return ret;
+}
diff --git a/io_uring/openclose.h b/io_uring/openclose.h
index 8a93c98ad0adc6..4ca2a9935abc9c 100644
--- a/io_uring/openclose.h
+++ b/io_uring/openclose.h
@@ -13,5 +13,8 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags);
int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_close(struct io_kiocb *req, unsigned int issue_flags);
+int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_pipe(struct io_kiocb *req, unsigned int issue_flags);
+
int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index f80a77c4973f30..1657d775c8bab2 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -80,10 +80,21 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
return 0;
}
-int io_buffer_validate(struct iovec *iov)
+int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
- unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
+ unsigned long tmp, base = (unsigned long)uaddr;
+ unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);
+ /* arbitrary limit, but we need something */
+ if (ulen > SZ_1G || !ulen)
+ return -EFAULT;
+ if (check_add_overflow(base, acct_len, &tmp))
+ return -EOVERFLOW;
+ return 0;
+}
+
+static int io_buffer_validate(struct iovec *iov)
+{
/*
* Don't impose further limits on the size and buffer
* constraints here, we'll -EINVAL later when IO is
@@ -91,17 +102,9 @@ int io_buffer_validate(struct iovec *iov)
*/
if (!iov->iov_base)
return iov->iov_len ? -EFAULT : 0;
- if (!iov->iov_len)
- return -EFAULT;
-
- /* arbitrary limit, but we need something */
- if (iov->iov_len > SZ_1G)
- return -EFAULT;
- if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
- return -EOVERFLOW;
-
- return 0;
+ return io_validate_user_buf_range((unsigned long)iov->iov_base,
+ iov->iov_len);
}
static void io_release_ubuf(void *priv)
@@ -685,38 +688,34 @@ static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
struct io_imu_folio_data *data)
{
struct page **page_array = *pages, **new_array = NULL;
- int nr_pages_left = *nr_pages, i, j;
- int nr_folios = data->nr_folios;
+ unsigned nr_pages_left = *nr_pages;
+ unsigned nr_folios = data->nr_folios;
+ unsigned i, j;
/* Store head pages only*/
- new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
- GFP_KERNEL);
+ new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
if (!new_array)
return false;
- new_array[0] = compound_head(page_array[0]);
- /*
- * The pages are bound to the folio, it doesn't
- * actually unpin them but drops all but one reference,
- * which is usually put down by io_buffer_unmap().
- * Note, needs a better helper.
- */
- if (data->nr_pages_head > 1)
- unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
-
- j = data->nr_pages_head;
- nr_pages_left -= data->nr_pages_head;
- for (i = 1; i < nr_folios; i++) {
- unsigned int nr_unpin;
-
- new_array[i] = page_array[j];
- nr_unpin = min_t(unsigned int, nr_pages_left - 1,
- data->nr_pages_mid - 1);
- if (nr_unpin)
- unpin_user_pages(&page_array[j+1], nr_unpin);
- j += data->nr_pages_mid;
- nr_pages_left -= data->nr_pages_mid;
+ for (i = 0, j = 0; i < nr_folios; i++) {
+ struct page *p = compound_head(page_array[j]);
+ struct folio *folio = page_folio(p);
+ unsigned int nr;
+
+ WARN_ON_ONCE(i > 0 && p != page_array[j]);
+
+ nr = i ? data->nr_pages_mid : data->nr_pages_head;
+ nr = min(nr, nr_pages_left);
+ /* Drop all but one ref, the entire folio will remain pinned. */
+ if (nr > 1)
+ unpin_user_folio(folio, nr - 1);
+ j += nr;
+ nr_pages_left -= nr;
+ new_array[i] = p;
}
+
+ WARN_ON_ONCE(j != *nr_pages);
+
kvfree(page_array);
*pages = new_array;
*nr_pages = nr_folios;
@@ -1062,8 +1061,6 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
size_t offset;
int ret;
- if (WARN_ON_ONCE(!imu))
- return -EFAULT;
ret = validate_fixed_range(buf_addr, len, imu);
if (unlikely(ret))
return ret;
@@ -1110,13 +1107,19 @@ inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
if (req->flags & REQ_F_BUF_NODE)
return req->buf_node;
+ req->flags |= REQ_F_BUF_NODE;
io_ring_submit_lock(ctx, issue_flags);
node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
- if (node)
- io_req_assign_buf_node(req, node);
+ if (node) {
+ node->refs++;
+ req->buf_node = node;
+ io_ring_submit_unlock(ctx, issue_flags);
+ return node;
+ }
+ req->flags &= ~REQ_F_BUF_NODE;
io_ring_submit_unlock(ctx, issue_flags);
- return node;
+ return NULL;
}
int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index b52242852ff342..0d2138f16322b2 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -83,7 +83,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned size, unsigned type);
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type);
-int io_buffer_validate(struct iovec *iov);
+int io_validate_user_buf_range(u64 uaddr, u64 ulen);
bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
struct io_imu_folio_data *data);
@@ -115,32 +115,6 @@ static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx,
return true;
}
-static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
-{
- if (req->file_node) {
- io_put_rsrc_node(req->ctx, req->file_node);
- req->file_node = NULL;
- }
- if (req->flags & REQ_F_BUF_NODE) {
- io_put_rsrc_node(req->ctx, req->buf_node);
- req->buf_node = NULL;
- }
-}
-
-static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node,
- struct io_rsrc_node *node)
-{
- node->refs++;
- *dst_node = node;
-}
-
-static inline void io_req_assign_buf_node(struct io_kiocb *req,
- struct io_rsrc_node *node)
-{
- io_req_assign_rsrc_node(&req->buf_node, node);
- req->flags |= REQ_F_BUF_NODE;
-}
-
int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index b8389674a747fc..303fdded375851 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -119,7 +119,7 @@ static int __io_import_rw_buffer(int ddir, struct io_kiocb *req,
return io_import_vec(ddir, req, io, buf, sqe_len);
if (io_do_buffer_select(req)) {
- buf = io_buffer_select(req, &sqe_len, issue_flags);
+ buf = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags);
if (!buf)
return -ENOBUFS;
rw->addr = (unsigned long) buf;
@@ -253,16 +253,19 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
int ddir)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ struct io_async_rw *io;
unsigned ioprio;
u64 attr_type_mask;
int ret;
if (io_rw_alloc_async(req))
return -ENOMEM;
+ io = req->async_data;
rw->kiocb.ki_pos = READ_ONCE(sqe->off);
/* used for fixed read/write too - just read unconditionally */
req->buf_index = READ_ONCE(sqe->buf_index);
+ io->buf_group = req->buf_index;
ioprio = READ_ONCE(sqe->ioprio);
if (ioprio) {
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 81d6d9a8cf6937..129a53fe54825d 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -16,6 +16,8 @@ struct io_async_rw {
struct iov_iter iter;
struct iov_iter_state iter_state;
struct iovec fast_iov;
+ unsigned buf_group;
+
/*
* wpq is for buffered io, while meta fields are used with
* direct io
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index adc6e42c14df6c..5b66755579c08f 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -35,8 +35,6 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
data.hash = hash;
data.task = task;
- data.free_work = io_wq_free_work;
- data.do_work = io_wq_submit_work;
/* Do QD, or 4 * CPUS, whatever is smallest */
concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 2a107665230b07..a6ff8c026b1f22 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -35,6 +35,9 @@ struct io_timeout_rem {
bool ltimeout;
};
+static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
+ struct io_kiocb *link);
+
static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{
struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
@@ -218,7 +221,9 @@ void io_disarm_next(struct io_kiocb *req)
struct io_ring_ctx *ctx = req->ctx;
raw_spin_lock_irq(&ctx->timeout_lock);
- link = io_disarm_linked_timeout(req);
+ if (req->link && req->link->opcode == IORING_OP_LINK_TIMEOUT)
+ link = __io_disarm_linked_timeout(req, req->link);
+
raw_spin_unlock_irq(&ctx->timeout_lock);
if (link)
io_req_queue_tw_complete(link, -ECANCELED);
@@ -228,8 +233,8 @@ void io_disarm_next(struct io_kiocb *req)
io_fail_links(req);
}
-struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
- struct io_kiocb *link)
+static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
+ struct io_kiocb *link)
__must_hold(&req->ctx->completion_lock)
__must_hold(&req->ctx->timeout_lock)
{
diff --git a/io_uring/timeout.h b/io_uring/timeout.h
index e91b32448dcf90..2b7c9ad729925c 100644
--- a/io_uring/timeout.h
+++ b/io_uring/timeout.h
@@ -8,19 +8,6 @@ struct io_timeout_data {
u32 flags;
};
-struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
- struct io_kiocb *link);
-
-static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req)
-{
- struct io_kiocb *link = req->link;
-
- if (link && link->opcode == IORING_OP_LINK_TIMEOUT)
- return __io_disarm_linked_timeout(req, link);
-
- return NULL;
-}
-
__cold void io_flush_timeouts(struct io_ring_ctx *ctx);
struct io_cancel_data;
int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 430ed620ddfe01..8a6b0ddef79644 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -3,13 +3,10 @@
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/io_uring/cmd.h>
-#include <linux/io_uring/net.h>
#include <linux/security.h>
#include <linux/nospec.h>
-#include <net/sock.h>
#include <uapi/linux/io_uring.h>
-#include <asm/ioctls.h>
#include "io_uring.h"
#include "alloc_cache.h"
@@ -307,83 +304,3 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
io_req_queue_iowq(req);
}
-
-static inline int io_uring_cmd_getsockopt(struct socket *sock,
- struct io_uring_cmd *cmd,
- unsigned int issue_flags)
-{
- const struct io_uring_sqe *sqe = cmd->sqe;
- bool compat = !!(issue_flags & IO_URING_F_COMPAT);
- int optlen, optname, level, err;
- void __user *optval;
-
- level = READ_ONCE(sqe->level);
- if (level != SOL_SOCKET)
- return -EOPNOTSUPP;
-
- optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
- optname = READ_ONCE(sqe->optname);
- optlen = READ_ONCE(sqe->optlen);
-
- err = do_sock_getsockopt(sock, compat, level, optname,
- USER_SOCKPTR(optval),
- KERNEL_SOCKPTR(&optlen));
- if (err)
- return err;
-
- /* On success, return optlen */
- return optlen;
-}
-
-static inline int io_uring_cmd_setsockopt(struct socket *sock,
- struct io_uring_cmd *cmd,
- unsigned int issue_flags)
-{
- const struct io_uring_sqe *sqe = cmd->sqe;
- bool compat = !!(issue_flags & IO_URING_F_COMPAT);
- int optname, optlen, level;
- void __user *optval;
- sockptr_t optval_s;
-
- optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
- optname = READ_ONCE(sqe->optname);
- optlen = READ_ONCE(sqe->optlen);
- level = READ_ONCE(sqe->level);
- optval_s = USER_SOCKPTR(optval);
-
- return do_sock_setsockopt(sock, compat, level, optname, optval_s,
- optlen);
-}
-
-#if defined(CONFIG_NET)
-int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
-{
- struct socket *sock = cmd->file->private_data;
- struct sock *sk = sock->sk;
- struct proto *prot = READ_ONCE(sk->sk_prot);
- int ret, arg = 0;
-
- if (!prot || !prot->ioctl)
- return -EOPNOTSUPP;
-
- switch (cmd->cmd_op) {
- case SOCKET_URING_OP_SIOCINQ:
- ret = prot->ioctl(sk, SIOCINQ, &arg);
- if (ret)
- return ret;
- return arg;
- case SOCKET_URING_OP_SIOCOUTQ:
- ret = prot->ioctl(sk, SIOCOUTQ, &arg);
- if (ret)
- return ret;
- return arg;
- case SOCKET_URING_OP_GETSOCKOPT:
- return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
- case SOCKET_URING_OP_SETSOCKOPT:
- return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
- default:
- return -EOPNOTSUPP;
- }
-}
-EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
-#endif
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index fe86606b9f304d..9a568d04920470 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -26,29 +26,207 @@
#include "zcrx.h"
#include "rsrc.h"
+#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
+
static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
{
return pp->mp_priv;
}
-#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
+static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
+{
+ struct net_iov_area *owner = net_iov_owner(niov);
-static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
- struct io_zcrx_area *area, int nr_mapped)
+ return container_of(owner, struct io_zcrx_area, nia);
+}
+
+static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
+{
+ struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+ return area->mem.pages[net_iov_idx(niov)];
+}
+
+static void io_release_dmabuf(struct io_zcrx_mem *mem)
+{
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return;
+
+ if (mem->sgt)
+ dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt,
+ DMA_FROM_DEVICE);
+ if (mem->attach)
+ dma_buf_detach(mem->dmabuf, mem->attach);
+ if (mem->dmabuf)
+ dma_buf_put(mem->dmabuf);
+
+ mem->sgt = NULL;
+ mem->attach = NULL;
+ mem->dmabuf = NULL;
+}
+
+static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_mem *mem,
+ struct io_uring_zcrx_area_reg *area_reg)
+{
+ unsigned long off = (unsigned long)area_reg->addr;
+ unsigned long len = (unsigned long)area_reg->len;
+ unsigned long total_size = 0;
+ struct scatterlist *sg;
+ int dmabuf_fd = area_reg->dmabuf_fd;
+ int i, ret;
+
+ if (WARN_ON_ONCE(!ifq->dev))
+ return -EFAULT;
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return -EINVAL;
+
+ mem->is_dmabuf = true;
+ mem->dmabuf = dma_buf_get(dmabuf_fd);
+ if (IS_ERR(mem->dmabuf)) {
+ ret = PTR_ERR(mem->dmabuf);
+ mem->dmabuf = NULL;
+ goto err;
+ }
+
+ mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev);
+ if (IS_ERR(mem->attach)) {
+ ret = PTR_ERR(mem->attach);
+ mem->attach = NULL;
+ goto err;
+ }
+
+ mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE);
+ if (IS_ERR(mem->sgt)) {
+ ret = PTR_ERR(mem->sgt);
+ mem->sgt = NULL;
+ goto err;
+ }
+
+ for_each_sgtable_dma_sg(mem->sgt, sg, i)
+ total_size += sg_dma_len(sg);
+
+ if (total_size < off + len)
+ return -EINVAL;
+
+ mem->dmabuf_offset = off;
+ mem->size = len;
+ return 0;
+err:
+ io_release_dmabuf(mem);
+ return ret;
+}
+
+static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+{
+ unsigned long off = area->mem.dmabuf_offset;
+ struct scatterlist *sg;
+ unsigned i, niov_idx = 0;
+
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return -EINVAL;
+
+ for_each_sgtable_dma_sg(area->mem.sgt, sg, i) {
+ dma_addr_t dma = sg_dma_address(sg);
+ unsigned long sg_len = sg_dma_len(sg);
+ unsigned long sg_off = min(sg_len, off);
+
+ off -= sg_off;
+ sg_len -= sg_off;
+ dma += sg_off;
+
+ while (sg_len && niov_idx < area->nia.num_niovs) {
+ struct net_iov *niov = &area->nia.niovs[niov_idx];
+
+ if (net_mp_niov_set_dma_addr(niov, dma))
+ return 0;
+ sg_len -= PAGE_SIZE;
+ dma += PAGE_SIZE;
+ niov_idx++;
+ }
+ }
+ return niov_idx;
+}
+
+static int io_import_umem(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_mem *mem,
+ struct io_uring_zcrx_area_reg *area_reg)
+{
+ struct page **pages;
+ int nr_pages;
+
+ if (area_reg->dmabuf_fd)
+ return -EINVAL;
+ if (!area_reg->addr)
+ return -EFAULT;
+ pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
+ &nr_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ mem->pages = pages;
+ mem->nr_folios = nr_pages;
+ mem->size = area_reg->len;
+ return 0;
+}
+
+static void io_release_area_mem(struct io_zcrx_mem *mem)
+{
+ if (mem->is_dmabuf) {
+ io_release_dmabuf(mem);
+ return;
+ }
+ if (mem->pages) {
+ unpin_user_pages(mem->pages, mem->nr_folios);
+ kvfree(mem->pages);
+ }
+}
+
+static int io_import_area(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_mem *mem,
+ struct io_uring_zcrx_area_reg *area_reg)
+{
+ int ret;
+
+ ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
+ if (ret)
+ return ret;
+ if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
+ return -EINVAL;
+
+ if (area_reg->flags & IORING_ZCRX_AREA_DMABUF)
+ return io_import_dmabuf(ifq, mem, area_reg);
+ return io_import_umem(ifq, mem, area_reg);
+}
+
+static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_area *area, int nr_mapped)
{
int i;
for (i = 0; i < nr_mapped; i++) {
- struct net_iov *niov = &area->nia.niovs[i];
- dma_addr_t dma;
+ netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]);
+ dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem);
- dma = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
DMA_FROM_DEVICE, IO_DMA_ATTR);
- net_mp_niov_set_dma_addr(niov, 0);
}
}
+static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_area *area, int nr_mapped)
+{
+ int i;
+
+ if (area->mem.is_dmabuf)
+ io_release_dmabuf(&area->mem);
+ else
+ io_zcrx_unmap_umem(ifq, area, nr_mapped);
+
+ for (i = 0; i < area->nia.num_niovs; i++)
+ net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
+}
+
static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
guard(mutex)(&ifq->dma_lock);
@@ -58,20 +236,16 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *are
area->is_mapped = false;
}
-static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
int i;
- guard(mutex)(&ifq->dma_lock);
- if (area->is_mapped)
- return 0;
-
for (i = 0; i < area->nia.num_niovs; i++) {
struct net_iov *niov = &area->nia.niovs[i];
dma_addr_t dma;
- dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE,
- DMA_FROM_DEVICE, IO_DMA_ATTR);
+ dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0,
+ PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR);
if (dma_mapping_error(ifq->dev, dma))
break;
if (net_mp_niov_set_dma_addr(niov, dma)) {
@@ -80,9 +254,24 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
break;
}
}
+ return i;
+}
+
+static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+{
+ unsigned nr;
+
+ guard(mutex)(&ifq->dma_lock);
+ if (area->is_mapped)
+ return 0;
+
+ if (area->mem.is_dmabuf)
+ nr = io_zcrx_map_area_dmabuf(ifq, area);
+ else
+ nr = io_zcrx_map_area_umem(ifq, area);
- if (i != area->nia.num_niovs) {
- __io_zcrx_unmap_area(ifq, area, i);
+ if (nr != area->nia.num_niovs) {
+ __io_zcrx_unmap_area(ifq, area, nr);
return -EINVAL;
}
@@ -118,13 +307,6 @@ struct io_zcrx_args {
static const struct memory_provider_ops io_uring_pp_zc_ops;
-static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
-{
- struct net_iov_area *owner = net_iov_owner(niov);
-
- return container_of(owner, struct io_zcrx_area, nia);
-}
-
static inline atomic_t *io_get_user_counter(struct net_iov *niov)
{
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
@@ -147,17 +329,12 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
atomic_inc(io_get_user_counter(niov));
}
-static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
-{
- struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
-
- return area->pages[net_iov_idx(niov)];
-}
-
static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
struct io_uring_zcrx_ifq_reg *reg,
- struct io_uring_region_desc *rd)
+ struct io_uring_region_desc *rd,
+ u32 id)
{
+ u64 mmap_offset;
size_t off, size;
void *ptr;
int ret;
@@ -167,12 +344,14 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
if (size > rd->size)
return -EINVAL;
- ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd,
- IORING_MAP_OFF_ZCRX_REGION);
+ mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
+ mmap_offset += id << IORING_OFF_PBUF_SHIFT;
+
+ ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset);
if (ret < 0)
return ret;
- ptr = io_region_get_ptr(&ifq->ctx->zcrx_region);
+ ptr = io_region_get_ptr(&ifq->region);
ifq->rq_ring = (struct io_uring *)ptr;
ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
return 0;
@@ -180,7 +359,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
- io_free_region(ifq->ctx, &ifq->ctx->zcrx_region);
+ io_free_region(ifq->ctx, &ifq->region);
ifq->rq_ring = NULL;
ifq->rqes = NULL;
}
@@ -188,53 +367,44 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
static void io_zcrx_free_area(struct io_zcrx_area *area)
{
io_zcrx_unmap_area(area->ifq, area);
+ io_release_area_mem(&area->mem);
kvfree(area->freelist);
kvfree(area->nia.niovs);
kvfree(area->user_refs);
- if (area->pages) {
- unpin_user_pages(area->pages, area->nr_folios);
- kvfree(area->pages);
- }
kfree(area);
}
+#define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF)
+
static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
struct io_zcrx_area **res,
struct io_uring_zcrx_area_reg *area_reg)
{
struct io_zcrx_area *area;
- int i, ret, nr_pages, nr_iovs;
- struct iovec iov;
+ unsigned nr_iovs;
+ int i, ret;
- if (area_reg->flags || area_reg->rq_area_token)
+ if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
return -EINVAL;
- if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
+ if (area_reg->rq_area_token)
return -EINVAL;
- if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
+ if (area_reg->__resv2[0] || area_reg->__resv2[1])
return -EINVAL;
- iov.iov_base = u64_to_user_ptr(area_reg->addr);
- iov.iov_len = area_reg->len;
- ret = io_buffer_validate(&iov);
- if (ret)
- return ret;
-
ret = -ENOMEM;
area = kzalloc(sizeof(*area), GFP_KERNEL);
if (!area)
goto err;
- area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
- &nr_pages);
- if (IS_ERR(area->pages)) {
- ret = PTR_ERR(area->pages);
- area->pages = NULL;
+ ret = io_import_area(ifq, &area->mem, area_reg);
+ if (ret)
goto err;
- }
- area->nr_folios = nr_iovs = nr_pages;
+
+ nr_iovs = area->mem.size >> PAGE_SHIFT;
area->nia.num_niovs = nr_iovs;
+ ret = -ENOMEM;
area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
GFP_KERNEL | __GFP_ZERO);
if (!area->nia.niovs)
@@ -245,9 +415,6 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
if (!area->freelist)
goto err;
- for (i = 0; i < nr_iovs; i++)
- area->freelist[i] = i;
-
area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
GFP_KERNEL | __GFP_ZERO);
if (!area->user_refs)
@@ -341,6 +508,16 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
kfree(ifq);
}
+struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
+ unsigned int id)
+{
+ struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id);
+
+ lockdep_assert_held(&ctx->mmap_lock);
+
+ return ifq ? &ifq->region : NULL;
+}
+
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
{
@@ -350,6 +527,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_region_desc rd;
struct io_zcrx_ifq *ifq;
int ret;
+ u32 id;
/*
* 1. Interface queue allocation.
@@ -362,8 +540,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
ctx->flags & IORING_SETUP_CQE32))
return -EINVAL;
- if (ctx->ifq)
- return -EBUSY;
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
@@ -386,29 +562,37 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
ifq = io_zcrx_ifq_alloc(ctx);
if (!ifq)
return -ENOMEM;
+ ifq->rq_entries = reg.rq_entries;
- ret = io_allocate_rbuf_ring(ifq, &reg, &rd);
- if (ret)
- goto err;
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ /* preallocate id */
+ ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
+ if (ret)
+ goto ifq_free;
+ }
- ret = io_zcrx_create_area(ifq, &ifq->area, &area);
+ ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id);
if (ret)
goto err;
- ifq->rq_entries = reg.rq_entries;
-
- ret = -ENODEV;
ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
&ifq->netdev_tracker, GFP_KERNEL);
- if (!ifq->netdev)
+ if (!ifq->netdev) {
+ ret = -ENODEV;
goto err;
+ }
ifq->dev = ifq->netdev->dev.parent;
- ret = -EOPNOTSUPP;
- if (!ifq->dev)
+ if (!ifq->dev) {
+ ret = -EOPNOTSUPP;
goto err;
+ }
get_device(ifq->dev);
+ ret = io_zcrx_create_area(ifq, &ifq->area, &area);
+ if (ret)
+ goto err;
+
mp_param.mp_ops = &io_uring_pp_zc_ops;
mp_param.mp_priv = ifq;
ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
@@ -419,6 +603,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
reg.offsets.rqes = sizeof(struct io_uring);
reg.offsets.head = offsetof(struct io_uring, head);
reg.offsets.tail = offsetof(struct io_uring, tail);
+ reg.zcrx_id = id;
+
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ /* publish ifq */
+ ret = -ENOMEM;
+ if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
+ goto err;
+ }
if (copy_to_user(arg, &reg, sizeof(reg)) ||
copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
@@ -426,24 +618,34 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
ret = -EFAULT;
goto err;
}
- ctx->ifq = ifq;
return 0;
err:
+ scoped_guard(mutex, &ctx->mmap_lock)
+ xa_erase(&ctx->zcrx_ctxs, id);
+ifq_free:
io_zcrx_ifq_free(ifq);
return ret;
}
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
- struct io_zcrx_ifq *ifq = ctx->ifq;
+ struct io_zcrx_ifq *ifq;
+ unsigned long id;
lockdep_assert_held(&ctx->uring_lock);
- if (!ifq)
- return;
+ while (1) {
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
+ if (ifq)
+ xa_erase(&ctx->zcrx_ctxs, id);
+ }
+ if (!ifq)
+ break;
+ io_zcrx_ifq_free(ifq);
+ }
- ctx->ifq = NULL;
- io_zcrx_ifq_free(ifq);
+ xa_destroy(&ctx->zcrx_ctxs);
}
static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
@@ -500,12 +702,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
+ struct io_zcrx_ifq *ifq;
+ unsigned long index;
+
lockdep_assert_held(&ctx->uring_lock);
- if (!ctx->ifq)
- return;
- io_zcrx_scrub(ctx->ifq);
- io_close_queue(ctx->ifq);
+ xa_for_each(&ctx->zcrx_ctxs, index, ifq) {
+ io_zcrx_scrub(ifq);
+ io_close_queue(ifq);
+ }
}
static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
@@ -742,6 +947,9 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
size_t copied = 0;
int ret = 0;
+ if (area->mem.is_dmabuf)
+ return -EFAULT;
+
while (len) {
size_t copy_size = min_t(size_t, PAGE_SIZE, len);
const int dst_off = 0;
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index f2bc811f022c67..2f5e26389f2218 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -3,10 +3,24 @@
#define IOU_ZC_RX_H
#include <linux/io_uring_types.h>
+#include <linux/dma-buf.h>
#include <linux/socket.h>
#include <net/page_pool/types.h>
#include <net/net_trackers.h>
+struct io_zcrx_mem {
+ unsigned long size;
+ bool is_dmabuf;
+
+ struct page **pages;
+ unsigned long nr_folios;
+
+ struct dma_buf_attachment *attach;
+ struct dma_buf *dmabuf;
+ struct sg_table *sgt;
+ unsigned long dmabuf_offset;
+};
+
struct io_zcrx_area {
struct net_iov_area nia;
struct io_zcrx_ifq *ifq;
@@ -14,13 +28,13 @@ struct io_zcrx_area {
bool is_mapped;
u16 area_id;
- struct page **pages;
- unsigned long nr_folios;
/* freelist */
spinlock_t freelist_lock ____cacheline_aligned_in_smp;
u32 free_count;
u32 *freelist;
+
+ struct io_zcrx_mem mem;
};
struct io_zcrx_ifq {
@@ -39,6 +53,7 @@ struct io_zcrx_ifq {
netdevice_tracker netdev_tracker;
spinlock_t lock;
struct mutex dma_lock;
+ struct io_mapped_region region;
};
#if defined(CONFIG_IO_URING_ZCRX)
@@ -49,6 +64,8 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx);
int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
struct socket *sock, unsigned int flags,
unsigned issue_flags, unsigned int *len);
+struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
+ unsigned int id);
#else
static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
@@ -67,6 +84,11 @@ static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
{
return -EOPNOTSUPP;
}
+static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
+ unsigned int id)
+{
+ return NULL;
+}
#endif
int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);