12 files changed, 811 insertions, 0 deletions
diff --git a/queue-6.12/io_uring-fix-potential-page-leak-in-io_sqe_buffer_register.patch b/queue-6.12/io_uring-fix-potential-page-leak-in-io_sqe_buffer_register.patch
new file mode 100644
index 0000000000..532c694222
--- /dev/null
+++ b/queue-6.12/io_uring-fix-potential-page-leak-in-io_sqe_buffer_register.patch
@@ -0,0 +1,52 @@
+From bb71440639de0757a801ca818d5046c5ce08ced5 Mon Sep 17 00:00:00 2001
+From: Penglei Jiang <superman.xpt@gmail.com>
+Date: Tue, 17 Jun 2025 09:56:44 -0700
+Subject: io_uring: fix potential page leak in io_sqe_buffer_register()
+
+From: Penglei Jiang <superman.xpt@gmail.com>
+
+Commit e1c75831f682eef0f68b35723437146ed86070b1 upstream.
+
+If allocation of the 'imu' fails, then the existing pages aren't
+unpinned in the error path. This is mostly a theoretical issue,
+requiring fault injection to hit.
+
+Move unpin_user_pages() to unified error handling to fix the page leak
+issue.
+
+Fixes: d8c2237d0aa9 ("io_uring: add io_pin_pages() helper")
+Signed-off-by: Penglei Jiang <superman.xpt@gmail.com>
+Link: https://lore.kernel.org/r/20250617165644.79165-1-superman.xpt@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/rsrc.c | 9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+--- a/io_uring/rsrc.c
++++ b/io_uring/rsrc.c
+@@ -983,10 +983,8 @@ static int io_sqe_buffer_register(struct
+ goto done;
+
+ ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
+- if (ret) {
+- unpin_user_pages(pages, nr_pages);
++ if (ret)
+ goto done;
+- }
+
+ size = iov->iov_len;
+ /* store original address for later verification */
+@@ -1010,8 +1008,11 @@ static int io_sqe_buffer_register(struct
+ size -= vec_len;
+ }
+ done:
+- if (ret)
++ if (ret) {
+ kvfree(imu);
++ if (pages)
++ unpin_user_pages(pages, nr_pages);
++ }
+ kvfree(pages);
+ return ret;
+ }
diff --git a/queue-6.12/io_uring-kbuf-flag-partial-buffer-mappings.patch b/queue-6.12/io_uring-kbuf-flag-partial-buffer-mappings.patch
new file mode 100644
index 0000000000..e018ad124d
--- /dev/null
+++ b/queue-6.12/io_uring-kbuf-flag-partial-buffer-mappings.patch
@@ -0,0 +1,142 @@
+From 29305dedb17704599efaa5c3ee6b1bee7dc80fd8 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Thu, 26 Jun 2025 12:17:48 -0600
+Subject: io_uring/kbuf: flag partial buffer mappings
+
+From: Jens Axboe <axboe@kernel.dk>
+
+A previous commit aborted mapping more for a non-incremental ring for
+bundle peeking, but depending on where in the process this peeking
+happened, it would not necessarily prevent a retry by the user. That can
+create gaps in the received/read data.
+
+Add struct buf_sel_arg->partial_map, which can pass this information
+back. The networking side can then map that to internal state and use it
+to gate retry as well.
+
+Since this necessitates a new flag, change io_sr_msg->retry to a
+retry_flags member, and store both the retry and partial map condition
+in there.
+
+Cc: stable@vger.kernel.org
+Fixes: 26ec15e4b0c1 ("io_uring/kbuf: don't truncate end buffer for multiple buffer peeks")
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+(cherry picked from commit 178b8ff66ff827c41b4fa105e9aabb99a0b5c537)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/kbuf.c | 1 +
+ io_uring/kbuf.h | 1 +
+ io_uring/net.c | 23 +++++++++++++++--------
+ 3 files changed, 17 insertions(+), 8 deletions(-)
+
+--- a/io_uring/kbuf.c
++++ b/io_uring/kbuf.c
+@@ -263,6 +263,7 @@ static int io_ring_buffers_peek(struct i
+ if (len > arg->max_len) {
+ len = arg->max_len;
+ if (!(bl->flags & IOBL_INC)) {
++ arg->partial_map = 1;
+ if (iov != arg->iovs)
+ break;
+ buf->len = len;
+--- a/io_uring/kbuf.h
++++ b/io_uring/kbuf.h
+@@ -61,6 +61,7 @@ struct buf_sel_arg {
+ size_t max_len;
+ unsigned short nr_iovs;
+ unsigned short mode;
++ unsigned short partial_map;
+ };
+
+ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
+--- a/io_uring/net.c
++++ b/io_uring/net.c
+@@ -76,13 +76,18 @@ struct io_sr_msg {
+ /* initialised and used only by !msg send variants */
+ u16 addr_len;
+ u16 buf_group;
+- bool retry;
++ unsigned short retry_flags;
+ void __user *addr;
+ void __user *msg_control;
+ /* used only for send zerocopy */
+ struct io_kiocb *notif;
+ };
+
++enum sr_retry_flags {
++ IO_SR_MSG_RETRY = 1,
++ IO_SR_MSG_PARTIAL_MAP = 2,
++};
++
+ /*
+ * Number of times we'll try and do receives if there's more data. If we
+ * exceed this limit, then add us to the back of the queue and retry from
+@@ -204,7 +209,7 @@ static inline void io_mshot_prep_retry(s
+
+ req->flags &= ~REQ_F_BL_EMPTY;
+ sr->done_io = 0;
+- sr->retry = false;
++ sr->retry_flags = 0;
+ sr->len = 0; /* get from the provided buffer */
+ req->buf_index = sr->buf_group;
+ }
+@@ -411,7 +416,7 @@ int io_sendmsg_prep(struct io_kiocb *req
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+
+ sr->done_io = 0;
+- sr->retry = false;
++ sr->retry_flags = 0;
+
+ if (req->opcode == IORING_OP_SEND) {
+ if (READ_ONCE(sqe->__pad3[0]))
+@@ -783,7 +788,7 @@ int io_recvmsg_prep(struct io_kiocb *req
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+
+ sr->done_io = 0;
+- sr->retry = false;
++ sr->retry_flags = 0;
+
+ if (unlikely(sqe->file_index || sqe->addr2))
+ return -EINVAL;
+@@ -856,7 +861,7 @@ static inline bool io_recv_finish(struct
+
+ cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret),
+ issue_flags);
+- if (sr->retry)
++ if (sr->retry_flags & IO_SR_MSG_RETRY)
+ cflags = req->cqe.flags | (cflags & CQE_F_MASK);
+ /* bundle with no more immediate buffers, we're done */
+ if (req->flags & REQ_F_BL_EMPTY)
+@@ -865,12 +870,12 @@ static inline bool io_recv_finish(struct
+ * If more is available AND it was a full transfer, retry and
+ * append to this one
+ */
+- if (!sr->retry && kmsg->msg.msg_inq > 1 && this_ret > 0 &&
++ if (!sr->retry_flags && kmsg->msg.msg_inq > 1 && this_ret > 0 &&
+ !iov_iter_count(&kmsg->msg.msg_iter)) {
+ req->cqe.flags = cflags & ~CQE_F_MASK;
+ sr->len = kmsg->msg.msg_inq;
+ sr->done_io += this_ret;
+- sr->retry = true;
++ sr->retry_flags |= IO_SR_MSG_RETRY;
+ return false;
+ }
+ } else {
+@@ -1123,6 +1128,8 @@ static int io_recv_buf_select(struct io_
+ kmsg->free_iov = arg.iovs;
+ req->flags |= REQ_F_NEED_CLEANUP;
+ }
++ if (arg.partial_map)
++ sr->retry_flags |= IO_SR_MSG_PARTIAL_MAP;
+
+ /* special case 1 vec, can be a fast path */
+ if (ret == 1) {
+@@ -1252,7 +1259,7 @@ int io_send_zc_prep(struct io_kiocb *req
+ struct io_kiocb *notif;
+
+ zc->done_io = 0;
+- zc->retry = false;
++ zc->retry_flags = 0;
+ req->flags |= REQ_F_POLL_NO_LAZY;
+
+ if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
diff --git a/queue-6.12/io_uring-net-always-use-current-transfer-count-for-buffer-put.patch b/queue-6.12/io_uring-net-always-use-current-transfer-count-for-buffer-put.patch
new file mode 100644
index 0000000000..2bd4660ed3
--- /dev/null
+++ b/queue-6.12/io_uring-net-always-use-current-transfer-count-for-buffer-put.patch
@@ -0,0 +1,35 @@
+From b66423f9c952d70f4c8130da3b9fc2be68db52cc Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Fri, 20 Jun 2025 07:41:21 -0600
+Subject: io_uring/net: always use current transfer count for buffer put
+
+From: Jens Axboe <axboe@kernel.dk>
+
+A previous fix corrected the retry condition for when to continue a
+current bundle, but it missed that the current (not the total) transfer
+count also applies to the buffer put. If not, then for incrementally
+consumed buffer rings repeated completions on the same request may end
+up over consuming.
+
+Reported-by: Roy Tang (ErgoniaTrading) <royonia@ergonia.io>
+Cc: stable@vger.kernel.org
+Fixes: 3a08988123c8 ("io_uring/net: only retry recv bundle for a full transfer")
+Link: https://github.com/axboe/liburing/issues/1423
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+(cherry picked from commit 51a4598ad5d9eb6be4ec9ba65bbfdf0ac302eb2e)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/net.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/io_uring/net.c
++++ b/io_uring/net.c
+@@ -854,7 +854,7 @@ static inline bool io_recv_finish(struct
+ if (sr->flags & IORING_RECVSEND_BUNDLE) {
+ size_t this_ret = *ret - sr->done_io;
+
+- cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, this_ret),
++ cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret),
+ issue_flags);
+ if (sr->retry)
+ cflags = req->cqe.flags | (cflags & CQE_F_MASK);
diff --git a/queue-6.12/io_uring-net-improve-recv-bundles.patch b/queue-6.12/io_uring-net-improve-recv-bundles.patch
new file mode 100644
index 0000000000..71ad9f143d
--- /dev/null
+++ b/queue-6.12/io_uring-net-improve-recv-bundles.patch
@@ -0,0 +1,128 @@
+From 3a5ac5f9a18ac9a80cdcee755a88b9ba8db90e3c Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Sat, 8 Feb 2025 10:50:34 -0700
+Subject: io_uring/net: improve recv bundles
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit 7c71a0af81ba72de9b2c501065e4e718aba9a271 upstream.
+
+Current recv bundles are only supported for multishot receives, and
+additionally they also always post at least 2 CQEs if more data is
+available than what a buffer will hold. This happens because the initial
+bundle recv will do a single buffer, and then do the rest of what is in
+the socket as a followup receive. As shown in a test program, if 1k
+buffers are available and 32k is available to receive in the socket,
+you'd get the following completions:
+
+bundle=1, mshot=0
+cqe res 1024
+cqe res 1024
+[...]
+cqe res 1024
+
+bundle=1, mshot=1
+cqe res 1024
+cqe res 31744
+
+where bundle=1 && mshot=0 will post 32 1k completions, and bundle=1 &&
+mshot=1 will post a 1k completion and then a 31k completion.
+
+To support bundle recv without multishot, it's possible to simply retry
+the recv immediately and post a single completion, rather than split it
+into two completions. With the below patch, the same test looks as
+follows:
+
+bundle=1, mshot=0
+cqe res 32768
+
+bundle=1, mshot=1
+cqe res 32768
+
+where mshot=0 works fine for bundles, and both of them post just a
+single 32k completion rather than split it into separate completions.
+Posting fewer completions is always a nice win, and not needing
+multishot for proper bundle efficiency is nice for cases that can't
+necessarily use multishot.
+
+Reported-by: Norman Maurer <norman_maurer@apple.com>
+Link: https://lore.kernel.org/r/184f9f92-a682-4205-a15d-89e18f664502@kernel.dk
+Fixes: 2f9c9515bdfd ("io_uring/net: support bundles for recv")
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/net.c | 18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+--- a/io_uring/net.c
++++ b/io_uring/net.c
+@@ -76,6 +76,7 @@ struct io_sr_msg {
+ /* initialised and used only by !msg send variants */
+ u16 addr_len;
+ u16 buf_group;
++ bool retry;
+ void __user *addr;
+ void __user *msg_control;
+ /* used only for send zerocopy */
+@@ -203,6 +204,7 @@ static inline void io_mshot_prep_retry(s
+
+ req->flags &= ~REQ_F_BL_EMPTY;
+ sr->done_io = 0;
++ sr->retry = false;
+ sr->len = 0; /* get from the provided buffer */
+ req->buf_index = sr->buf_group;
+ }
+@@ -409,6 +411,7 @@ int io_sendmsg_prep(struct io_kiocb *req
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+
+ sr->done_io = 0;
++ sr->retry = false;
+
+ if (req->opcode == IORING_OP_SEND) {
+ if (READ_ONCE(sqe->__pad3[0]))
+@@ -780,6 +783,7 @@ int io_recvmsg_prep(struct io_kiocb *req
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+
+ sr->done_io = 0;
++ sr->retry = false;
+
+ if (unlikely(sqe->file_index || sqe->addr2))
+ return -EINVAL;
+@@ -828,6 +832,9 @@ int io_recvmsg_prep(struct io_kiocb *req
+ return io_recvmsg_prep_setup(req);
+ }
+
++/* bits to clear in old and inherit in new cflags on bundle retry */
++#define CQE_F_MASK (IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE)
++
+ /*
+ * Finishes io_recv and io_recvmsg.
+ *
+@@ -847,9 +854,19 @@ static inline bool io_recv_finish(struct
+ if (sr->flags & IORING_RECVSEND_BUNDLE) {
+ cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret),
+ issue_flags);
++ if (sr->retry)
++ cflags = req->cqe.flags | (cflags & CQE_F_MASK);
+ /* bundle with no more immediate buffers, we're done */
+ if (req->flags & REQ_F_BL_EMPTY)
+ goto finish;
++ /* if more is available, retry and append to this one */
++ if (!sr->retry && kmsg->msg.msg_inq > 0 && *ret > 0) {
++ req->cqe.flags = cflags & ~CQE_F_MASK;
++ sr->len = kmsg->msg.msg_inq;
++ sr->done_io += *ret;
++ sr->retry = true;
++ return false;
++ }
+ } else {
+ cflags |= io_put_kbuf(req, *ret, issue_flags);
+ }
+@@ -1228,6 +1245,7 @@ int io_send_zc_prep(struct io_kiocb *req
+ struct io_kiocb *notif;
+
+ zc->done_io = 0;
++ zc->retry = false;
+ req->flags |= REQ_F_POLL_NO_LAZY;
+
+ if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
diff --git a/queue-6.12/io_uring-net-mark-iov-as-dynamically-allocated-even-for-single-segments.patch b/queue-6.12/io_uring-net-mark-iov-as-dynamically-allocated-even-for-single-segments.patch
new file mode 100644
index 0000000000..6b7645ccae
--- /dev/null
+++ b/queue-6.12/io_uring-net-mark-iov-as-dynamically-allocated-even-for-single-segments.patch
@@ -0,0 +1,49 @@
+From c4e101eab9014e6174d9042c3dc7ff80ce22b889 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Wed, 25 Jun 2025 10:17:06 -0600
+Subject: io_uring/net: mark iov as dynamically allocated even for single segments
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit 9a709b7e98e6fa51600b5f2d24c5068efa6d39de upstream.
+
+A bigger array of vecs could've been allocated, but
+io_ring_buffers_peek() still decided to cap the mapped range depending
+on how much data was available. Hence don't rely on the segment count
+to know if the request should be marked as needing cleanup, always
+check upfront if the iov array is different than the fast_iov array.
+
+Fixes: 26ec15e4b0c1 ("io_uring/kbuf: don't truncate end buffer for multiple buffer peeks")
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/net.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+--- a/io_uring/net.c
++++ b/io_uring/net.c
+@@ -1118,6 +1118,12 @@ static int io_recv_buf_select(struct io_
+ if (unlikely(ret < 0))
+ return ret;
+
++ if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
++ kmsg->free_iov_nr = ret;
++ kmsg->free_iov = arg.iovs;
++ req->flags |= REQ_F_NEED_CLEANUP;
++ }
++
+ /* special case 1 vec, can be a fast path */
+ if (ret == 1) {
+ sr->buf = arg.iovs[0].iov_base;
+@@ -1126,11 +1132,6 @@ static int io_recv_buf_select(struct io_
+ }
+ iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
+ arg.out_len);
+- if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
+- kmsg->free_iov_nr = ret;
+- kmsg->free_iov = arg.iovs;
+- req->flags |= REQ_F_NEED_CLEANUP;
+- }
+ } else {
+ void __user *buf;
+
diff --git a/queue-6.12/io_uring-net-only-consider-msg_inq-if-larger-than-1.patch b/queue-6.12/io_uring-net-only-consider-msg_inq-if-larger-than-1.patch
new file mode 100644
index 0000000000..8c14a50bef
--- /dev/null
+++ b/queue-6.12/io_uring-net-only-consider-msg_inq-if-larger-than-1.patch
@@ -0,0 +1,49 @@
+From 8861bd9b328862a49e1cc613bb70eba48d6ce0c8 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Wed, 28 May 2025 13:45:44 -0600
+Subject: io_uring/net: only consider msg_inq if larger than 1
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit 2c7f023219966777be0687e15b57689894304cd3 upstream.
+
+Currently retry and general validity of msg_inq is gated on it being
+larger than zero, but it's entirely possible for this to be slightly
+inaccurate. In particular, if FIN is received, it'll return 1.
+
+Just use larger than 1 as the check. This covers both the FIN case, and
+at the same time, it doesn't make much sense to retry a recv immediately
+if there's even just a single 1 byte of valid data in the socket.
+
+Leave the SOCK_NONEMPTY flagging when larger than 0 still, as an app may
+use that for the final receive.
+
+Cc: stable@vger.kernel.org
+Reported-by: Christian Mazakas <christian.mazakas@gmail.com>
+Fixes: 7c71a0af81ba ("io_uring/net: improve recv bundles")
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/net.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/io_uring/net.c
++++ b/io_uring/net.c
+@@ -865,7 +865,7 @@ static inline bool io_recv_finish(struct
+ * If more is available AND it was a full transfer, retry and
+ * append to this one
+ */
+- if (!sr->retry && kmsg->msg.msg_inq > 0 && this_ret > 0 &&
++ if (!sr->retry && kmsg->msg.msg_inq > 1 && this_ret > 0 &&
+ !iov_iter_count(&kmsg->msg.msg_iter)) {
+ req->cqe.flags = cflags & ~CQE_F_MASK;
+ sr->len = kmsg->msg.msg_inq;
+@@ -1111,7 +1111,7 @@ static int io_recv_buf_select(struct io_
+ arg.mode |= KBUF_MODE_FREE;
+ }
+
+- if (kmsg->msg.msg_inq > 0)
++ if (kmsg->msg.msg_inq > 1)
+ arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq);
+
+ ret = io_buffers_peek(req, &arg);
diff --git a/queue-6.12/io_uring-net-only-retry-recv-bundle-for-a-full-transfer.patch b/queue-6.12/io_uring-net-only-retry-recv-bundle-for-a-full-transfer.patch
new file mode 100644
index 0000000000..1843e74058
--- /dev/null
+++ b/queue-6.12/io_uring-net-only-retry-recv-bundle-for-a-full-transfer.patch
@@ -0,0 +1,60 @@
+From 010c5e8c46373dfba92fbe264ac0dc407fcf38db Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Wed, 21 May 2025 18:51:49 -0600
+Subject: io_uring/net: only retry recv bundle for a full transfer
+
+From: Jens Axboe <axboe@kernel.dk>
+
+Commit 3a08988123c868dbfdd054541b1090fb891fa49e upstream.
+
+If a shorter than assumed transfer was seen, a partial buffer will have
+been filled. For that case it isn't sane to attempt to fill more into
+the bundle before posting a completion, as that will cause a gap in
+the received data.
+
+Check if the iterator has hit zero and only allow to continue a bundle
+operation if that is the case.
+
+Also ensure that for putting finished buffers, only the current transfer
+is accounted. Otherwise too many buffers may be put for a short transfer.
+
+Link: https://github.com/axboe/liburing/issues/1409
+Cc: stable@vger.kernel.org
+Fixes: 7c71a0af81ba ("io_uring/net: improve recv bundles")
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/net.c | 14 ++++++++++----
+ 1 file changed, 10 insertions(+), 4 deletions(-)
+
+--- a/io_uring/net.c
++++ b/io_uring/net.c
+@@ -852,18 +852,24 @@ static inline bool io_recv_finish(struct
+ cflags |= IORING_CQE_F_SOCK_NONEMPTY;
+
+ if (sr->flags & IORING_RECVSEND_BUNDLE) {
+- cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret),
++ size_t this_ret = *ret - sr->done_io;
++
++ cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, this_ret),
+ issue_flags);
+ if (sr->retry)
+ cflags = req->cqe.flags | (cflags & CQE_F_MASK);
+ /* bundle with no more immediate buffers, we're done */
+ if (req->flags & REQ_F_BL_EMPTY)
+ goto finish;
+- /* if more is available, retry and append to this one */
+- if (!sr->retry && kmsg->msg.msg_inq > 0 && *ret > 0) {
++ /*
++ * If more is available AND it was a full transfer, retry and
++ * append to this one
++ */
++ if (!sr->retry && kmsg->msg.msg_inq > 0 && this_ret > 0 &&
++ !iov_iter_count(&kmsg->msg.msg_iter)) {
+ req->cqe.flags = cflags & ~CQE_F_MASK;
+ sr->len = kmsg->msg.msg_inq;
+- sr->done_io += *ret;
++ sr->done_io += this_ret;
+ sr->retry = true;
+ return false;
+ }
diff --git a/queue-6.12/io_uring-rsrc-don-t-rely-on-user-vaddr-alignment.patch b/queue-6.12/io_uring-rsrc-don-t-rely-on-user-vaddr-alignment.patch
new file mode 100644
index 0000000000..bb1ac22620
--- /dev/null
+++ b/queue-6.12/io_uring-rsrc-don-t-rely-on-user-vaddr-alignment.patch
@@ -0,0 +1,56 @@
+From a2b1e9553839f0d0524f9a68239ca215e87586bd Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Tue, 24 Jun 2025 14:40:34 +0100
+Subject: io_uring/rsrc: don't rely on user vaddr alignment
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+Commit 3a3c6d61577dbb23c09df3e21f6f9eda1ecd634b upstream.
+
+There is no guaranteed alignment for user pointers, however the
+calculation of an offset of the first page into a folio after coalescing
+uses some weird bit mask logic, get rid of it.
+
+Cc: stable@vger.kernel.org
+Reported-by: David Hildenbrand <david@redhat.com>
+Fixes: a8edbb424b139 ("io_uring/rsrc: enable multi-hugepage buffer coalescing")
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/io-uring/e387b4c78b33f231105a601d84eefd8301f57954.1750771718.git.asml.silence@gmail.com/
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/rsrc.c | 5 ++++-
+ io_uring/rsrc.h | 1 +
+ 2 files changed, 5 insertions(+), 1 deletion(-)
+
+--- a/io_uring/rsrc.c
++++ b/io_uring/rsrc.c
+@@ -918,6 +918,7 @@ static bool io_try_coalesce_buffer(struc
+ return false;
+
+ data->folio_shift = folio_shift(folio);
++ data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);
+ /*
+ * Check if pages are contiguous inside a folio, and all folios have
+ * the same page count except for the head and tail.
+@@ -998,7 +999,9 @@ static int io_sqe_buffer_register(struct + if (coalesced) + imu->folio_shift = data.folio_shift; + refcount_set(&imu->refs, 1); +- off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1); ++ off = (unsigned long)iov->iov_base & ~PAGE_MASK; ++ if (coalesced) ++ off += data.first_folio_page_idx << PAGE_SHIFT; + *pimu = imu; + ret = 0; + +--- a/io_uring/rsrc.h ++++ b/io_uring/rsrc.h +@@ -56,6 +56,7 @@ struct io_imu_folio_data { + /* For non-head/tail folios, has to be fully included */ + unsigned int nr_pages_mid; + unsigned int folio_shift; ++ unsigned long first_folio_page_idx; + }; + + void io_rsrc_node_ref_zero(struct io_rsrc_node *node); diff --git a/queue-6.12/io_uring-rsrc-fix-folio-unpinning.patch b/queue-6.12/io_uring-rsrc-fix-folio-unpinning.patch new file mode 100644 index 0000000000..0c0c825f91 --- /dev/null +++ b/queue-6.12/io_uring-rsrc-fix-folio-unpinning.patch @@ -0,0 +1,77 @@ +From e33b8b1df1133d03c7b3581e666430446e017016 Mon Sep 17 00:00:00 2001 +From: Pavel Begunkov <asml.silence@gmail.com> +Date: Tue, 24 Jun 2025 14:40:33 +0100 +Subject: io_uring/rsrc: fix folio unpinning + +From: Pavel Begunkov <asml.silence@gmail.com> + +Commit 5afb4bf9fc62d828647647ec31745083637132e4 upstream. + +syzbot complains about an unmapping failure: + +[ 108.070381][ T14] kernel BUG at mm/gup.c:71! +[ 108.070502][ T14] Internal error: Oops - BUG: 00000000f2000800 [#1] SMP +[ 108.123672][ T14] Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20250221-8.fc42 02/21/2025 +[ 108.127458][ T14] Workqueue: iou_exit io_ring_exit_work +[ 108.174205][ T14] Call trace: +[ 108.175649][ T14] sanity_check_pinned_pages+0x7cc/0x7d0 (P) +[ 108.178138][ T14] unpin_user_page+0x80/0x10c +[ 108.180189][ T14] io_release_ubuf+0x84/0xf8 +[ 108.182196][ T14] io_free_rsrc_node+0x250/0x57c +[ 108.184345][ T14] io_rsrc_data_free+0x148/0x298 +[ 108.186493][ T14] io_sqe_buffers_unregister+0x84/0xa0 +[ 108.188991][ T14] io_ring_ctx_free+0x48/0x480 +[ 108.191057][ T14] io_ring_exit_work+0x764/0x7d8 +[ 108.193207][ T14] process_one_work+0x7e8/0x155c +[ 108.195431][ T14] worker_thread+0x958/0xed8 +[ 108.197561][ T14] kthread+0x5fc/0x75c +[ 108.199362][ T14] ret_from_fork+0x10/0x20 + +We can pin a tail page of a folio, but then io_uring will try to unpin +the head page of the folio. While it should be fine in terms of keeping +the page actually alive, mm folks say it's wrong and triggers a debug +warning. Use unpin_user_folio() instead of unpin_user_page*. 
+
+Cc: stable@vger.kernel.org
+Debugged-by: David Hildenbrand <david@redhat.com>
+Reported-by: syzbot+1d335893772467199ab6@syzkaller.appspotmail.com
+Closes: https://lkml.kernel.org/r/683f1551.050a0220.55ceb.0017.GAE@google.com
+Fixes: a8edbb424b139 ("io_uring/rsrc: enable multi-hugepage buffer coalescing")
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/io-uring/a28b0f87339ac2acf14a645dad1e95bbcbf18acd.1750771718.git.asml.silence@gmail.com/
+[axboe: adapt to current tree, massage commit message]
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/rsrc.c | 13 +++++++++----
+ 1 file changed, 9 insertions(+), 4 deletions(-)
+
+--- a/io_uring/rsrc.c
++++ b/io_uring/rsrc.c
+@@ -119,8 +119,11 @@ static void io_buffer_unmap(struct io_ri
+ if (imu != &dummy_ubuf) {
+ if (!refcount_dec_and_test(&imu->refs))
+ return;
+- for (i = 0; i < imu->nr_bvecs; i++)
+- unpin_user_page(imu->bvec[i].bv_page);
++ for (i = 0; i < imu->nr_bvecs; i++) {
++ struct folio *folio = page_folio(imu->bvec[i].bv_page);
++
++ unpin_user_folio(folio, 1);
++ }
+ if (imu->acct_pages)
+ io_unaccount_mem(ctx, imu->acct_pages);
+ kvfree(imu);
+@@ -1010,8 +1013,10 @@ static int io_sqe_buffer_register(struct
+ done:
+ if (ret) {
+ kvfree(imu);
+- if (pages)
+- unpin_user_pages(pages, nr_pages);
++ if (pages) {
++ for (i = 0; i < nr_pages; i++)
++ unpin_user_folio(page_folio(pages[i]), 1);
++ }
+ }
+ kvfree(pages);
+ return ret;
diff --git a/queue-6.12/mm-vma-reset-vma-iterator-on-commit_merge-oom-failure.patch b/queue-6.12/mm-vma-reset-vma-iterator-on-commit_merge-oom-failure.patch
new file mode 100644
index 0000000000..0fbe96e0af
--- /dev/null
+++ b/queue-6.12/mm-vma-reset-vma-iterator-on-commit_merge-oom-failure.patch
@@ -0,0 +1,109 @@
+From 0cf4b1687a187ba9247c71721d8b064634eda1f7 Mon Sep 17 00:00:00 2001
+From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Date: Fri, 6 Jun 2025 13:50:32 +0100
+Subject: mm/vma: reset VMA iterator on commit_merge() OOM failure
+
+From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+
+commit 0cf4b1687a187ba9247c71721d8b064634eda1f7 upstream.
+
+While an OOM failure in commit_merge() isn't really feasible due to the
+allocation which might fail (a maple tree pre-allocation) being 'too small
+to fail', we do need to handle this case correctly regardless.
+
+In vma_merge_existing_range(), we can theoretically encounter failures
+which result in an OOM error in two ways - firstly dup_anon_vma() might
+fail with an OOM error, and secondly commit_merge() failing, ultimately,
+to pre-allocate a maple tree node.
+
+The abort logic for dup_anon_vma() resets the VMA iterator to the initial
+range, ensuring that any logic looping on this iterator will correctly
+proceed to the next VMA.
+
+However the commit_merge() abort logic does not do the same thing. This
+resulted in a syzbot report occurring because mlockall() iterates through
+VMAs, is tolerant of errors, but ended up with an incorrect previous VMA
+being specified due to incorrect iterator state.
+
+While making this change, it became apparent we are duplicating logic -
+the logic introduced in commit 41e6ddcaa0f1 ("mm/vma: add give_up_on_oom
+option on modify/merge, use in uffd release") duplicates the
+vmg->give_up_on_oom check in both abort branches.
+
+Additionally, we observe that we can perform the anon_dup check safely on
+dup_anon_vma() failure, as this will not be modified should this call
+fail.
+
+Finally, we need to reset the iterator in both cases, so now we can simply
+use the exact same code to abort for both.
+
+We remove the VM_WARN_ON(err != -ENOMEM) as it would be silly for this to
+be otherwise and it allows us to implement the abort check more neatly.
+
+Link: https://lkml.kernel.org/r/20250606125032.164249-1-lorenzo.stoakes@oracle.com
+Fixes: 47b16d0462a4 ("mm: abort vma_modify() on merge out of memory failure")
+Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Reported-by: syzbot+d16409ea9ecc16ed261a@syzkaller.appspotmail.com
+Closes: https://lore.kernel.org/linux-mm/6842cc67.a00a0220.29ac89.003b.GAE@google.com/
+Reviewed-by: Pedro Falcato <pfalcato@suse.de>
+Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
+Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/vma.c | 27 ++++++++-------------------
+ 1 file changed, 8 insertions(+), 19 deletions(-)
+
+--- a/mm/vma.c
++++ b/mm/vma.c
+@@ -836,9 +836,6 @@ static struct vm_area_struct *vma_merge_
+ err = dup_anon_vma(next, vma, &anon_dup);
+ }
+
+- if (err)
+- goto abort;
+-
+ /*
+ * In nearly all cases, we expand vmg->vma. There is one exception -
+ * merge_right where we partially span the VMA. In this case we shrink
+@@ -846,22 +843,11 @@ static struct vm_area_struct *vma_merge_
+ */
+ expanded = !merge_right || merge_will_delete_vma;
+
+- if (commit_merge(vmg, adjust,
+- merge_will_delete_vma ? vma : NULL,
+- merge_will_delete_next ? next : NULL,
+- adj_start, expanded)) {
+- if (anon_dup)
+- unlink_anon_vmas(anon_dup);
+-
+- /*
+- * We've cleaned up any cloned anon_vma's, no VMAs have been
+- * modified, no harm no foul if the user requests that we not
+- * report this and just give up, leaving the VMAs unmerged.
+- */
+- if (!vmg->give_up_on_oom)
+- vmg->state = VMA_MERGE_ERROR_NOMEM;
+- return NULL;
+- }
++ if (err || commit_merge(vmg, adjust,
++ merge_will_delete_vma ? vma : NULL,
++ merge_will_delete_next ? next : NULL,
++ adj_start, expanded))
++ goto abort;
+
+ res = merge_left ? prev : next;
+ khugepaged_enter_vma(res, vmg->flags);
+@@ -873,6 +859,9 @@ abort:
+ vma_iter_set(vmg->vmi, start);
+ vma_iter_load(vmg->vmi);
+
++ if (anon_dup)
++ unlink_anon_vmas(anon_dup);
++
+ /*
+ * This means we have failed to clone anon_vma's correctly, but no
+ * actual changes to VMAs have occurred, so no harm no foul - if the
diff --git a/queue-6.12/net-libwx-fix-tx-l4-checksum.patch b/queue-6.12/net-libwx-fix-tx-l4-checksum.patch
new file mode 100644
index 0000000000..d534c02e46
--- /dev/null
+++ b/queue-6.12/net-libwx-fix-tx-l4-checksum.patch
@@ -0,0 +1,43 @@
+From c7d82913d5f9e97860772ee4051eaa66b56a6273 Mon Sep 17 00:00:00 2001
+From: Jiawen Wu <jiawenwu@trustnetic.com>
+Date: Mon, 24 Mar 2025 18:32:35 +0800
+Subject: net: libwx: fix Tx L4 checksum
+
+From: Jiawen Wu <jiawenwu@trustnetic.com>
+
+commit c7d82913d5f9e97860772ee4051eaa66b56a6273 upstream.
+
+The hardware only supports L4 checksum offload for TCP/UDP/SCTP protocol.
+There was a bug to set Tx checksum flag for the other protocol that results
+in Tx ring hang. Fix to compute software checksum for these packets.
+
+Fixes: 3403960cdf86 ("net: wangxun: libwx add tx offload functions")
+Signed-off-by: Jiawen Wu <jiawenwu@trustnetic.com>
+Link: https://patch.msgid.link/20250324103235.823096-2-jiawenwu@trustnetic.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Wenshan Lan <jetlan9@163.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/wangxun/libwx/wx_lib.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c
++++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c
+@@ -1336,6 +1336,7 @@ static void wx_tx_csum(struct wx_ring *t
+ u8 tun_prot = 0;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
++csum_failed:
+ if (!(first->tx_flags & WX_TX_FLAGS_HW_VLAN) &&
+ !(first->tx_flags & WX_TX_FLAGS_CC))
+ return;
+@@ -1429,7 +1430,8 @@ static void wx_tx_csum(struct wx_ring *t
+ WX_TXD_L4LEN_SHIFT;
+ break;
+ default:
+- break;
++ skb_checksum_help(skb);
++ goto csum_failed;
+ }
+
+ /* update TX checksum flag */
diff --git a/queue-6.12/series b/queue-6.12/series
index 40855055ee..d4c030ed65 100644
--- a/queue-6.12/series
+++ b/queue-6.12/series
@@ -187,3 +187,14 @@ drm-amd-display-fix-mpv-playback-corruption-on-weston.patch
 media-uvcvideo-rollback-non-processed-entities-on-error.patch
 x86-fpu-refactor-xfeature-bitmask-update-code-for-sigframe-xsave.patch
 x86-pkeys-simplify-pkru-update-in-signal-frame.patch
+net-libwx-fix-tx-l4-checksum.patch
+io_uring-fix-potential-page-leak-in-io_sqe_buffer_register.patch
+io_uring-rsrc-fix-folio-unpinning.patch
+io_uring-rsrc-don-t-rely-on-user-vaddr-alignment.patch
+io_uring-net-improve-recv-bundles.patch
+io_uring-net-only-retry-recv-bundle-for-a-full-transfer.patch
+io_uring-net-only-consider-msg_inq-if-larger-than-1.patch
+io_uring-net-always-use-current-transfer-count-for-buffer-put.patch
+io_uring-net-mark-iov-as-dynamically-allocated-even-for-single-segments.patch
+io_uring-kbuf-flag-partial-buffer-mappings.patch
+mm-vma-reset-vma-iterator-on-commit_merge-oom-failure.patch