aboutsummaryrefslogtreecommitdiffstats
diff options
authorJens Axboe <axboe@kernel.dk>2025-04-23 17:28:57 -0600
committerJens Axboe <axboe@kernel.dk>2025-04-23 19:53:11 -0600
commit18429aa532257e5db0ee0020fc6696690baf9e8a (patch)
tree377a5ff0dafd04abe4ec1185d691d2c2543fa259
parenta79be02bba5c31f967885c7f3bf3a756d77d11d9 (diff)
downloadlinux-block-userfaultfd-signal.tar.gz
mm/userfaultfd: prevent busy looping for tasks with signals pending
branch: userfaultfd-signal
userfaultfd may use interruptible sleeps to wait on userspace filling a page fault, which works fine if the task can be reliably put to sleep waiting for that. However, if the task has a normal (ie non-fatal) signal pending, then TASK_INTERRUPTIBLE sleep will simply cause schedule() to be a no-op.

For a task that registers a page with userfaultfd and then proceeds to do a write from it, if that task also has a signal pending then it'll essentially busy loop from do_page_fault() -> handle_userfault() until that fault has been filled. Normally it'd be expected that the task would sleep until that happens. Here's a trace from an application doing just that:

    handle_userfault+0x4b8/0xa00 (P)
    hugetlb_fault+0xe24/0x1060
    handle_mm_fault+0x2bc/0x318
    do_page_fault+0x1e8/0x6f0
    do_translation_fault+0x9c/0xd0
    do_mem_abort+0x44/0xa0
    el1_abort+0x3c/0x68
    el1h_64_sync_handler+0xd4/0x100
    el1h_64_sync+0x6c/0x70
    fault_in_readable+0x74/0x108 (P)
    iomap_file_buffered_write+0x14c/0x438
    blkdev_write_iter+0x1a8/0x340
    vfs_write+0x20c/0x348
    ksys_write+0x64/0x108
    __arm64_sys_write+0x1c/0x38

where the task is looping with 100% CPU time in the above mentioned fault path.

Since it's impossible to handle signals, or other conditions like TIF_NOTIFY_SIGNAL that also prevents interruptible sleeping, from the fault path, use TASK_UNINTERRUPTIBLE with a short timeout even for vmf modes that would normally ask for INTERRUPTIBLE or KILLABLE sleep. Fatal signals will still be handled by the caller, and the timeout is short enough to hopefully not cause any issues. If this is the first invocation of this fault, eg FAULT_FLAG_TRIED isn't set, then the normal sleep mode is used.

Cc: stable@vger.kernel.org
Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization")
Reported-by: Zhiwei Jiang <qq282012236@gmail.com>
Link: https://lore.kernel.org/io-uring/20250422162913.1242057-1-qq282012236@gmail.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r--fs/userfaultfd.c34
1 files changed, 26 insertions, 8 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index d80f943461992f..1016268c7b5113 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -334,15 +334,29 @@ out:
return ret;
}
-static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
+struct userfault_wait {
+ unsigned int task_state;
+ bool timeout;
+};
+
+static struct userfault_wait userfaultfd_get_blocking_state(unsigned int flags)
{
+ /*
+ * If the fault has already been tried AND there's a signal pending
+ * for this task, use TASK_UNINTERRUPTIBLE with a small timeout.
+ * This prevents busy looping where schedule() otherwise does nothing
+ * for TASK_INTERRUPTIBLE when the task has a signal pending.
+ */
+ if ((flags & FAULT_FLAG_TRIED) && signal_pending(current))
+ return (struct userfault_wait) { TASK_UNINTERRUPTIBLE, true };
+
if (flags & FAULT_FLAG_INTERRUPTIBLE)
- return TASK_INTERRUPTIBLE;
+ return (struct userfault_wait) { TASK_INTERRUPTIBLE, false };
if (flags & FAULT_FLAG_KILLABLE)
- return TASK_KILLABLE;
+ return (struct userfault_wait) { TASK_KILLABLE, false };
- return TASK_UNINTERRUPTIBLE;
+ return (struct userfault_wait) { TASK_UNINTERRUPTIBLE, false };
}
/*
@@ -368,7 +382,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
struct userfaultfd_wait_queue uwq;
vm_fault_t ret = VM_FAULT_SIGBUS;
bool must_wait;
- unsigned int blocking_state;
+ struct userfault_wait wait_mode;
/*
* We don't do userfault handling for the final child pid update
@@ -466,7 +480,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
uwq.ctx = ctx;
uwq.waken = false;
- blocking_state = userfaultfd_get_blocking_state(vmf->flags);
+ wait_mode = userfaultfd_get_blocking_state(vmf->flags);
/*
* Take the vma lock now, in order to safely call
@@ -488,7 +502,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
* following the spin_unlock to happen before the list_add in
* __add_wait_queue.
*/
- set_current_state(blocking_state);
+ set_current_state(wait_mode.task_state);
spin_unlock_irq(&ctx->fault_pending_wqh.lock);
if (!is_vm_hugetlb_page(vma))
@@ -501,7 +515,11 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (likely(must_wait && !READ_ONCE(ctx->released))) {
wake_up_poll(&ctx->fd_wqh, EPOLLIN);
- schedule();
+ /* See comment in userfaultfd_get_blocking_state() */
+ if (!wait_mode.timeout)
+ schedule();
+ else
+ schedule_timeout(HZ / 10);
}
__set_current_state(TASK_RUNNING);