aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
authorJens Axboe <axboe@kernel.dk>2026-05-15 09:57:58 -0600
committerJens Axboe <axboe@kernel.dk>2026-05-15 09:57:58 -0600
commit80b4ee34a2e577ddbb0b0c0e22afcff76f68b154 (patch)
treea13f103bd4ba973c2d99ec203abefdddc55ed4b1 /fs
parent25df1ff9dfc5245eef57c523a817ad8b01d431c8 (diff)
parentcfa1539b24aff18ecb71c6334e7270f810d145bb (diff)
downloadlinux-next-history-80b4ee34a2e577ddbb0b0c0e22afcff76f68b154.tar.gz
Merge branch 'for-7.2/io_uring-epoll' into for-next
* for-7.2/io_uring-epoll: (23 commits) io_uring/epoll: disallow adding an epoll file to an epoll context io_uring/epoll: switch to using do_epoll_ctl_file() interface eventpoll: rename struct epoll_filefd to epoll_key eventpoll: add file based control interface eventpoll: export is_file_epoll() eventpoll: pass struct epoll_filefd through ep_find() and ep_insert() eventpoll: hoist CTL_ADD scratch state into struct ep_ctl_ctx eventpoll: use bool for predicate helpers eventpoll: rename epi->next and txlist for clarity eventpoll: wrap EP_UNACTIVE_PTR in typed sentinel helpers eventpoll: extract lock dance from do_epoll_ctl() into ep_ctl_lock() eventpoll: extract ep_deliver_event() from ep_send_events() eventpoll: split ep_clear_and_put() into drain helpers eventpoll: split ep_insert() into alloc + register stages eventpoll: relocate KCMP helpers near compat syscalls eventpoll: rename attach_epitem() to ep_attach_file() eventpoll: drop unused depth argument from epoll_mutex_lock() eventpoll: rename ep_refcount_dec_and_test() to ep_put() eventpoll: document ep_clear_and_put() two-pass pattern eventpoll: refresh epi_fget() / ep_remove_file() comments ...
Diffstat (limited to 'fs')
-rw-r--r--fs/eventpoll.c1241
1 files changed, 803 insertions, 438 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a3090b446af10..a569e98d4a996 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -41,45 +41,170 @@
#include <net/busy_poll.h>
/*
- * LOCKING:
- * There are three level of locking required by epoll :
+ * fs/eventpoll.c - Efficient event polling ("epoll") kernel implementation.
*
- * 1) epnested_mutex (mutex)
- * 2) ep->mtx (mutex)
- * 3) ep->lock (spinlock)
*
- * The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->lock) because we manipulate objects
- * from inside the poll callback, that might be triggered from
- * a wake_up() that in turn might be called from IRQ context.
- * So we can't sleep inside the poll callback and hence we need
- * a spinlock. During the event transfer loop (from kernel to
- * user space) we could end up sleeping due a copy_to_user(), so
- * we need a lock that will allow us to sleep. This lock is a
- * mutex (ep->mtx). It is acquired during the event transfer loop,
- * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
- * The epnested_mutex is acquired when inserting an epoll fd onto another
- * epoll fd. We do this so that we walk the epoll tree and ensure that this
- * insertion does not create a cycle of epoll file descriptors, which
- * could lead to deadlock. We need a global mutex to prevent two
- * simultaneous inserts (A into B and B into A) from racing and
- * constructing a cycle without either insert observing that it is
- * going to.
- * It is necessary to acquire multiple "ep->mtx"es at once in the
- * case when one epoll fd is added to another. In this case, we
- * always acquire the locks in the order of nesting (i.e. after
- * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
- * before e2->mtx). Since we disallow cycles of epoll file
- * descriptors, this ensures that the mutexes are well-ordered. In
- * order to communicate this nesting to lockdep, when walking a tree
- * of epoll file descriptors, we use the current recursion depth as
- * the lockdep subkey.
- * It is possible to drop the "ep->mtx" and to use the global
- * mutex "epnested_mutex" (together with "ep->lock") to have it working,
- * but having "ep->mtx" will make the interface more scalable.
- * Events that require holding "epnested_mutex" are very rare, while for
- * normal operations the epoll private "ep->mtx" will guarantee
- * a better scalability.
+ * Overview
+ * --------
+ *
+ * Each epoll_create(2) returns an anonymous [eventpoll] file whose
+ * ->private_data is a struct eventpoll. Each EPOLL_CTL_ADD installs
+ * a struct epitem linking one (watched file, fd) pair back to that
+ * eventpoll via the watched file's f_op->poll() wait queue(s). When
+ * the watched file signals readiness, ep_poll_callback() fires and
+ * marks the epitem ready. epoll_wait(2) drains the ready list under
+ * ep->mtx, re-queueing items in level-triggered mode.
+ *
+ * epoll instances can watch other epoll instances up to EP_MAX_NESTS
+ * deep; cycles are forbidden and detected at EPOLL_CTL_ADD time.
+ *
+ *
+ * Locking
+ * -------
+ *
+ * Three levels, acquired from outer to inner:
+ *
+ * epnested_mutex (global; rare; taken only for EPOLL_CTL_ADD
+ * loop / path checks)
+ * > ep->mtx (per-eventpoll; sleepable; serializes most ops)
+ * > ep->lock (per-eventpoll; IRQ-safe spinlock)
+ *
+ * file->f_lock (per-file; NOT IRQ-safe; guards f_ep hlist ops;
+ * nested inside ep->mtx, outside ep->lock)
+ *
+ * Rationale:
+ * - ep->lock is a spinlock because ep_poll_callback() is called from
+ * wake_up() which may run in hard-IRQ context. All ep->lock
+ * critical sections use spin_lock_irqsave().
+ * - ep->mtx is a sleepable mutex because the event delivery loop
+ * calls copy_to_user(), and ep_insert() may sleep in
+ * kmem_cache_alloc() and f_op->poll().
+ * - epnested_mutex is global because cycle detection needs a global
+ * view of the epoll topology; a per-object scheme would let two
+ * concurrent inserts (A into B, B into A) construct a cycle
+ * without either observer seeing it.
+ * - Per-ep ep->mtx is preferred for scalability elsewhere. Events
+ * that require epnested_mutex are rare.
+ *
+ * When EPOLL_CTL_ADD nests one eventpoll inside another we acquire
+ * ep->mtx on both: outer first, target second. Since cycles are
+ * forbidden the set of live ep->mtx holds is always a strict chain,
+ * communicated to lockdep via mutex_lock_nested() subclasses derived
+ * from the current recursion depth.
+ *
+ *
+ * Field protection
+ * ----------------
+ *
+ * struct eventpoll:
+ * mtx - self
+ * rbr - ep->mtx
+ * ovflist, rdllist - ep->lock (IRQ-safe)
+ * wq - ep->lock for queue mutation
+ * poll_wait - internal waitqueue spinlock
+ * refs - file->f_lock for adds; ep->mtx for removes;
+ * RCU for readers (hlist_del_rcu + kfree_rcu(ep))
+ * ws - ep->mtx
+ * gen, loop_check_depth - epnested_mutex
+ * file, user - immutable after setup
+ * refcount - atomic (refcount_t)
+ * napi_* - READ_ONCE / WRITE_ONCE
+ *
+ * struct epitem:
+ * rbn / rcu union - rbn: ep->mtx (while epi is linked in ep->rbr).
+ * rcu: written only by kfree_rcu(epi) on the free
+ * path; otherwise untouched by epoll code.
+ * rdllink, next - ep->lock
+ * ffd, ep - immutable after ep_insert()
+ * pwqlist - ep->mtx for writes; POLLFREE clears pwq->whead
+ * via smp_store_release(), see below
+ * fllink - file->f_lock for mutation; hlist_del_rcu +
+ * kfree_rcu(epi) for safe RCU readers
+ * ws - RCU (rcu_assign_pointer /
+ * rcu_dereference_check(mtx))
+ * event - ep->mtx for writes; lockless read in
+ * ep_poll_callback pairs with smp_mb() in
+ * ep_modify()
+ *
+ *
+ * Ready-list state machine
+ * ------------------------
+ *
+ * Readiness is tracked in two lists under ep->lock:
+ *
+ * rdllist - doubly-linked FIFO; the "current" ready list.
+ * ovflist - singly-linked LIFO; used during a scan to catch
+ * events that arrive while rdllist is being iterated
+ * without ep->lock.
+ *
+ * Encoded in ep->ovflist:
+ * EP_UNACTIVE_PTR - no scan active; callback appends to rdllist.
+ * NULL - scan active, no spill yet.
+ * pointer to epi - scan active with spilled items (LIFO).
+ *
+ * Encoded in epi->ovflist_next:
+ * EP_UNACTIVE_PTR - epi is not on ovflist.
+ * otherwise - next epi on ovflist (NULL at tail).
+ *
+ * ep_start_scan() flips "not scanning" to "scanning" and splices
+ * rdllist into a caller-local scan_batch. ep_done_scan() drains ovflist
+ * back to rdllist (list_add head-insert reverses LIFO to FIFO),
+ * flips back to "not scanning", and re-splices any items the caller
+ * left in scan_batch (e.g., level-triggered re-queues).
+ *
+ *
+ * Removal paths
+ * -------------
+ *
+ * Three paths dispose of epitems and/or eventpolls:
+ *
+ * A. ep_remove() - EPOLL_CTL_DEL and ep_insert()
+ * rollback. Caller holds ep->mtx.
+ * B. ep_clear_and_put() - close of the epoll fd itself
+ * (ep_eventpoll_release).
+ * C. eventpoll_release_file() - close of a watched file, invoked
+ * from __fput().
+ *
+ * Coordination:
+ * A and C exclude each other via the watched file's refcount.
+ * A pins the file with epi_fget() before touching file->f_ep or
+ * file->f_lock; if the pin fails, __fput() is in flight and C
+ * will clean this epi up. See the epi_fget() block comment.
+ * A and B both hold ep->mtx serially. B walks the rbtree with
+ * rb_next() captured before ep_remove() erases the current node.
+ * B and C both take ep->mtx; the loser sees fewer entries or an
+ * empty file->f_ep.
+ *
+ * Within every path the internal order is strict:
+ * ep_unregister_pollwait() - drain pwqlist; synchronizes with any
+ * in-flight ep_poll_callback via the
+ * watched wait-queue head's lock.
+ * ep_remove_file() - hlist_del_rcu of epi->fllink and,
+ * if last watcher, clear file->f_ep,
+ * under file->f_lock.
+ * ep_remove_epi() - rb_erase, rdllist unlink (ep->lock),
+ * wakeup_source_unregister,
+ * kfree_rcu(epi).
+ *
+ * kfree_rcu(epi) defers the free past RCU readers in
+ * reverse_path_check_proc(); kfree_rcu(ep) defers past readers in
+ * ep_get_upwards_depth_proc().
+ *
+ *
+ * POLLFREE handshake
+ * ------------------
+ *
+ * When a subsystem tears down a wait-queue head that an epitem is
+ * registered on (binder, signalfd, ...), it wakes the callback with
+ * POLLFREE and must RCU-defer the head's free. The store/load pair:
+ *
+ * ep_poll_callback() POLLFREE branch:
+ * smp_store_release(&pwq->whead, NULL)
+ *
+ * ep_remove_wait_queue():
+ * smp_load_acquire(&pwq->whead)
+ *
+ * See those sites for the full argument.
*/
/* Epoll private bits inside the event mask */
@@ -99,11 +224,6 @@
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
-struct epoll_filefd {
- struct file *file;
- int fd;
-} __packed;
-
/* Wait structure used by the poll hooks */
struct eppoll_entry {
/* List header used to link this structure to the "struct epitem" */
@@ -136,17 +256,19 @@ struct epitem {
struct rcu_head rcu;
};
- /* List header used to link this structure to the eventpoll ready list */
+ /* Link on the owning eventpoll's ready list (ep->rdllist). */
struct list_head rdllink;
/*
- * Works together "struct eventpoll"->ovflist in keeping the
- * single linked chain of items.
+ * Link on the owning eventpoll's scan-overflow list (ep->ovflist),
+ * EP_UNACTIVE_PTR when not linked. See epi_on_ovflist() /
+ * epi_clear_ovflist() and the "Ready-list state machine" section
+ * in the top-of-file banner.
*/
- struct epitem *next;
+ struct epitem *ovflist_next;
/* The file descriptor information this item refers to */
- struct epoll_filefd ffd;
+ struct epoll_key ffd;
/* List containing poll wait queues */
struct eppoll_entry *pwqlist;
@@ -247,13 +369,77 @@ struct ep_pqueue {
/* Maximum number of epoll watched descriptors, per user */
static long max_user_watches __read_mostly;
-/* Used for cycles detection */
+/*
+ * Cycle and path-length checks at EPOLL_CTL_ADD
+ * ---------------------------------------------
+ *
+ * When EPOLL_CTL_ADD creates a link that either targets an eventpoll
+ * file or extends an existing chain of eventpolls, two checks run:
+ *
+ * 1. no cycle is being formed -- ep_loop_check() walks downward
+ * from the candidate target, and ep_get_upwards_depth_proc()
+ * walks upward from the outer ep, both bounded by EP_MAX_NESTS.
+ * 2. no file accumulates more than path_limits[depth] wakeup paths
+ * of a given length -- reverse_path_check().
+ *
+ * Both need a global view of the epoll topology and must be atomic
+ * with the insertion, so the check is serialized by epnested_mutex
+ * and carries its scratch state on a stack-allocated struct
+ * ep_ctl_ctx scoped to one do_epoll_ctl() call. Non-nested inserts
+ * skip this machinery entirely and take only ep->mtx.
+ *
+ * epnested_mutex Serializes the whole check.
+ * loop_check_gen Global monotonic stamp, bumped at the start of
+ * a check and again at the end. ep->gen caches
+ * the value under which ep was last visited by
+ * ep_loop_check_proc() or
+ * ep_get_upwards_depth_proc(); the post-check
+ * bump ensures those cached stamps can no longer
+ * equal loop_check_gen, so the
+ * "ep->gen == loop_check_gen" trigger in
+ * ep_ctl_lock() only fires while another check
+ * is in flight.
+ *
+ * struct ep_ctl_ctx carries the rest (inserting_into, tfile_check_list,
+ * path_count[]) through the walk; see its declaration below.
+ *
+ * Commits fdcfce93073d ("eventpoll: Fix integer overflow in
+ * ep_loop_check_proc()") and f2e467a48287 ("eventpoll: Fix
+ * semi-unbounded recursion") hardened the walk; any refactor must
+ * preserve both bail-outs.
+ */
static DEFINE_MUTEX(epnested_mutex);
-
static u64 loop_check_gen = 0;
-/* Used to check for epoll file descriptor inclusion loops */
-static struct eventpoll *inserting_into;
+#define PATH_ARR_SIZE 5
+
+/*
+ * Per-do_epoll_ctl() scratch for the loop / path checks. Allocated on
+ * the caller's stack; populated by ep_ctl_lock() and the downward
+ * walk; consumed by reverse_path_check(); released by ep_ctl_unlock().
+ * Only valid while the caller holds epnested_mutex.
+ */
+struct ep_ctl_ctx {
+ /*
+ * Outer eventpoll for one ep_loop_check(); if the downward walk
+ * reaches it the insert would form a cycle.
+ */
+ struct eventpoll *inserting_into;
+
+ /*
+ * Singly-linked list of epitems_head objects collected during
+ * ep_loop_check_proc(), then walked by reverse_path_check().
+ * NULL means empty.
+ */
+ struct epitems_head *tfile_check_list;
+
+ /*
+ * Per-depth wakeup-path tally used by reverse_path_check_proc();
+ * reinitialized to zero at the start of each reverse_path_check()
+ * iteration.
+ */
+ int path_count[PATH_ARR_SIZE];
+};
/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __ro_after_init;
@@ -262,14 +448,15 @@ static struct kmem_cache *epi_cache __ro_after_init;
static struct kmem_cache *pwq_cache __ro_after_init;
/*
- * List of files with newly added links, where we may need to limit the number
- * of emanating paths. Protected by the epnested_mutex.
+ * Wrapper anchor for file->f_ep when the watched file is not itself an
+ * eventpoll; for the epoll-watches-epoll case, file->f_ep points at
+ * &watched_ep->refs directly. The ->next field threads
+ * ctx->tfile_check_list during one EPOLL_CTL_ADD path check.
*/
struct epitems_head {
struct hlist_head epitems;
struct epitems_head *next;
};
-static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
static struct kmem_cache *ephead_cache __ro_after_init;
@@ -279,14 +466,14 @@ static inline void free_ephead(struct epitems_head *head)
kmem_cache_free(ephead_cache, head);
}
-static void list_file(struct file *file)
+static void list_file(struct file *file, struct ep_ctl_ctx *ctx)
{
struct epitems_head *head;
head = container_of(file->f_ep, struct epitems_head, epitems);
if (!head->next) {
- head->next = tfile_check_list;
- tfile_check_list = head;
+ head->next = ctx->tfile_check_list;
+ ctx->tfile_check_list = head;
}
}
@@ -334,29 +521,20 @@ static void __init epoll_sysctls_init(void)
static const struct file_operations eventpoll_fops;
-static inline int is_file_epoll(struct file *f)
+bool is_file_epoll(struct file *f)
{
return f->f_op == &eventpoll_fops;
}
-/* Setup the structure that is used as key for the RB tree */
-static inline void ep_set_ffd(struct epoll_filefd *ffd,
- struct file *file, int fd)
-{
- ffd->file = file;
- ffd->fd = fd;
-}
-
/* Compare RB tree keys */
-static inline int ep_cmp_ffd(struct epoll_filefd *p1,
- struct epoll_filefd *p2)
+static inline int ep_cmp_ffd(struct epoll_key *p1, struct epoll_key *p2)
{
return (p1->file > p2->file ? +1:
(p1->file < p2->file ? -1 : p1->fd - p2->fd));
}
-/* Tells us if the item is currently linked */
-static inline int ep_is_linked(struct epitem *epi)
+/* True iff @epi is on its owning ep's ready list. */
+static inline bool ep_is_linked(struct epitem *epi)
{
return !list_empty(&epi->rdllink);
}
@@ -372,18 +550,47 @@ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
return container_of(p, struct eppoll_entry, wait)->base;
}
-/**
- * ep_events_available - Checks if ready events might be available.
- *
- * @ep: Pointer to the eventpoll context.
- *
- * Return: a value different than %zero if ready events are available,
- * or %zero otherwise.
+/*
+ * Ready-list / ovflist state (see "Ready-list state machine" in the
+ * top-of-file banner for the full state machine). EP_UNACTIVE_PTR is
+ * the sentinel; these wrappers name each transition and each test so
+ * call sites do not need to know the sentinel's value.
*/
-static inline int ep_events_available(struct eventpoll *ep)
+
+/* True iff @ep is between ep_enter_scan() and ep_exit_scan(). */
+static inline bool ep_is_scanning(struct eventpoll *ep)
+{
+ return READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
+}
+
+/* Called by ep_start_scan(): divert ep_poll_callback() to ovflist. */
+static inline void ep_enter_scan(struct eventpoll *ep)
+{
+ WRITE_ONCE(ep->ovflist, NULL);
+}
+
+/* Called by ep_done_scan(): redirect ep_poll_callback() back to rdllist. */
+static inline void ep_exit_scan(struct eventpoll *ep)
+{
+ WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
+}
+
+/* True iff @epi is currently linked on its ep's ovflist. */
+static inline bool epi_on_ovflist(const struct epitem *epi)
+{
+ return epi->ovflist_next != EP_UNACTIVE_PTR;
+}
+
+/* Mark @epi as not on any ovflist (init and post-drain). */
+static inline void epi_clear_ovflist(struct epitem *epi)
+{
+ epi->ovflist_next = EP_UNACTIVE_PTR;
+}
+
+/* True iff @ep has ready events that epoll_wait() might harvest. */
+static inline bool ep_events_available(struct eventpoll *ep)
{
- return !list_empty_careful(&ep->rdllist) ||
- READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
+ return !list_empty_careful(&ep->rdllist) || ep_is_scanning(ep);
}
#ifdef CONFIG_NET_RX_BUSY_POLL
@@ -659,10 +866,15 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq)
rcu_read_lock();
/*
- * If it is cleared by POLLFREE, it should be rcu-safe.
- * If we read NULL we need a barrier paired with
- * smp_store_release() in ep_poll_callback(), otherwise
- * we rely on whead->lock.
+ * POLLFREE handshake, acquire side; see "POLLFREE handshake"
+ * at the top of this file.
+ *
+ * A NULL load is paired with the smp_store_release(&whead, NULL)
+ * in ep_poll_callback()'s POLLFREE branch: the teardown is
+ * complete and we must not touch whead again. On a non-NULL load
+ * rcu_read_lock() keeps the waitqueue memory alive (POLLFREE
+ * firers RCU-defer the free) and whead->lock inside
+ * remove_wait_queue() serializes us against the store side.
*/
whead = smp_load_acquire(&pwq->whead);
if (whead)
@@ -723,7 +935,7 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
* ep->mutex needs to be held because we could be hit by
* eventpoll_release_file() and epoll_ctl().
*/
-static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
+static void ep_start_scan(struct eventpoll *ep, struct list_head *scan_batch)
{
/*
* Steal the ready list, and re-init the original one to the
@@ -735,13 +947,13 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
*/
lockdep_assert_irqs_enabled();
spin_lock_irq(&ep->lock);
- list_splice_init(&ep->rdllist, txlist);
- WRITE_ONCE(ep->ovflist, NULL);
+ list_splice_init(&ep->rdllist, scan_batch);
+ ep_enter_scan(ep);
spin_unlock_irq(&ep->lock);
}
static void ep_done_scan(struct eventpoll *ep,
- struct list_head *txlist)
+ struct list_head *scan_batch)
{
struct epitem *epi, *nepi;
@@ -751,34 +963,29 @@ static void ep_done_scan(struct eventpoll *ep,
* other events might have been queued by the poll callback.
* We re-insert them inside the main ready-list here.
*/
- for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
- nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
+ for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; ) {
+ nepi = epi->ovflist_next;
+ epi_clear_ovflist(epi);
/*
- * We need to check if the item is already in the list.
- * During the "sproc" callback execution time, items are
- * queued into ->ovflist but the "txlist" might already
- * contain them, and the list_splice() below takes care of them.
+ * Skip items that the caller already returned via @scan_batch
+ * -- the list_splice() below takes care of those.
*/
if (!ep_is_linked(epi)) {
/*
- * ->ovflist is LIFO, so we have to reverse it in order
- * to keep in FIFO.
+ * ovflist is LIFO; list_add() head-insert here
+ * reverses the iteration order into FIFO.
*/
list_add(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
- /*
- * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
- * releasing the lock, events will be queued in the normal way inside
- * ep->rdllist.
- */
- WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
+ /* Back out of scan mode; callbacks target ep->rdllist again. */
+ ep_exit_scan(ep);
/*
- * Quickly re-inject items left on "txlist".
+ * Quickly re-inject items left on "scan_batch".
*/
- list_splice(txlist, &ep->rdllist);
+ list_splice(scan_batch, &ep->rdllist);
__pm_relax(ep->ws);
if (!list_empty(&ep->rdllist)) {
@@ -795,9 +1002,10 @@ static void ep_get(struct eventpoll *ep)
}
/*
- * Returns true if the event poll can be disposed
+ * Drop a reference to @ep; returns true iff it was the last, in which
+ * case the caller is responsible for ep_free().
*/
-static bool ep_refcount_dec_and_test(struct eventpoll *ep)
+static bool ep_put(struct eventpoll *ep)
{
if (!refcount_dec_and_test(&ep->refcount))
return false;
@@ -817,22 +1025,23 @@ static void ep_free(struct eventpoll *ep)
}
/*
- * The ffd.file pointer may be in the process of being torn down due to
- * being closed, but we may not have finished eventpoll_release() yet.
- *
- * Normally, even with the atomic_long_inc_not_zero, the file may have
- * been free'd and then gotten re-allocated to something else (since
- * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
+ * Pin @epi->ffd.file for operations that require both safe dereference
+ * and exclusion from __fput().
*
- * But for epoll, users hold the ep->mtx mutex, and as such any file in
- * the process of being free'd will block in eventpoll_release_file()
- * and thus the underlying file allocation will not be free'd, and the
- * file re-use cannot happen.
+ * struct file uses SLAB_TYPESAFE_BY_RCU, so a freed slot can be
+ * reassigned at any time. The bare load of epi->ffd.file is safe here
+ * because the caller holds ep->mtx and eventpoll_release_file() blocks
+ * on that mutex while tearing down the epi, so the backing file
+ * allocation cannot be freed and reused under us. An rcu_read_lock()
+ * is therefore unnecessary for the load.
*
- * For the same reason we can avoid a rcu_read_lock() around the
- * operation - 'ffd.file' cannot go away even if the refcount has
- * reached zero (but we must still not call out to ->poll() functions
- * etc).
+ * A successful file_ref_get() additionally blocks __fput() from
+ * starting on this file: once the refcount has reached zero it cannot
+ * come back. ep_remove() relies on that to touch file->f_lock and
+ * file->f_ep without racing eventpoll_release_file() (see commit
+ * a6dc643c6931). A NULL return means __fput() is already in flight;
+ * the caller must bail without touching the file, and
+ * eventpoll_release_file() will clean the epi up from its side.
*/
static struct file *epi_fget(const struct epitem *epi)
{
@@ -858,7 +1067,13 @@ static void ep_remove_file(struct eventpoll *ep, struct epitem *epi,
spin_lock(&file->f_lock);
head = file->f_ep;
if (hlist_is_singular_node(&epi->fllink, head)) {
- /* See eventpoll_release() for details. */
+ /*
+ * Last watcher: publish NULL so the eventpoll_release()
+ * fastpath in include/linux/eventpoll.h can skip the slow
+ * path on a future __fput(). Safe because every f_ep writer
+ * either holds a pin on @file via epi_fget() or is __fput()
+ * itself -- see the comment in eventpoll_release().
+ */
WRITE_ONCE(file->f_ep, NULL);
if (!is_file_epoll(file)) {
struct epitems_head *v;
@@ -919,47 +1134,82 @@ static void ep_remove(struct eventpoll *ep, struct epitem *epi)
ep_remove_file(ep, epi, file);
ep_remove_epi(ep, epi);
- WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
+ WARN_ON_ONCE(ep_put(ep));
}
-static void ep_clear_and_put(struct eventpoll *ep)
+/*
+ * Pass 1 of ep_clear_and_put(): drain every epi's pwqlist.
+ * ep_unregister_pollwait() takes each watched wait-queue head's lock,
+ * which synchronizes with any in-flight ep_poll_callback(); after
+ * this returns no callback can still be about to dereference an epi
+ * on this ep. Must strictly precede ep_drain_tree() -- fusing the
+ * two walks would let a callback queued on epi_i still fire after
+ * epi_{i+k} had already been freed.
+ */
+static void ep_drain_pollwaits(struct eventpoll *ep)
{
- struct rb_node *rbp, *next;
+ struct rb_node *rbp;
struct epitem *epi;
- /* We need to release all tasks waiting for these file */
- if (waitqueue_active(&ep->poll_wait))
- ep_poll_safewake(ep, NULL, 0);
-
- mutex_lock(&ep->mtx);
+ lockdep_assert_held(&ep->mtx);
- /*
- * Walks through the whole tree by unregistering poll callbacks.
- */
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
ep_unregister_pollwait(ep, epi);
cond_resched();
}
+}
+
+/*
+ * Pass 2 of ep_clear_and_put(): ep_remove() every epi. The per-epi
+ * pwqlist is already empty (ep_drain_pollwaits ran), but the rest of
+ * ep_remove() still runs: epi_fget() pin, f_ep clear under f_lock,
+ * rbtree erase, rdllist unlink, kfree_rcu(epi). rb_next() is captured
+ * before each erase so the iteration is stable.
+ *
+ * A concurrent eventpoll_release_file() (removal path C) on a watched
+ * file serializes with us via ep->mtx; ep_remove() transparently
+ * hands off any epi whose file is in __fput() by bailing when
+ * epi_fget() returns NULL, and path C will clean that epi up.
+ */
+static void ep_drain_tree(struct eventpoll *ep)
+{
+ struct rb_node *rbp, *next;
+ struct epitem *epi;
+
+ lockdep_assert_held(&ep->mtx);
- /*
- * Walks through the whole tree and try to free each "struct epitem".
- * Note that ep_remove() will not remove the epitem in case of a
- * racing eventpoll_release_file(); the latter will do the removal.
- * At this point we are sure no poll callbacks will be lingering around.
- * Since we still own a reference to the eventpoll struct, the loop can't
- * dispose it.
- */
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {
next = rb_next(rbp);
epi = rb_entry(rbp, struct epitem, rbn);
ep_remove(ep, epi);
cond_resched();
}
+}
+/*
+ * Removal path B (see "Removal paths" in the top-of-file banner):
+ * close of the epoll fd itself, reached via ep_eventpoll_release().
+ *
+ * Two passes under ep->mtx: first ep_drain_pollwaits() quiesces
+ * in-flight callbacks, then ep_drain_tree() frees the epis. The
+ * ep->refcount is kept > 0 across the walk by the ep file's own
+ * share, which we drop below; ep_free() runs iff we were the last
+ * holder after the tree drained.
+ */
+static void ep_clear_and_put(struct eventpoll *ep)
+{
+ /* Release any threads blocked in poll-on-ep. */
+ if (waitqueue_active(&ep->poll_wait))
+ ep_poll_safewake(ep, NULL, 0);
+
+ mutex_lock(&ep->mtx);
+ ep_drain_pollwaits(ep);
+ ep_drain_tree(ep);
mutex_unlock(&ep->mtx);
- if (ep_refcount_dec_and_test(ep))
+
+ if (ep_put(ep))
ep_free(ep);
}
@@ -999,7 +1249,7 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth
static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
{
struct eventpoll *ep = file->private_data;
- LIST_HEAD(txlist);
+ LIST_HEAD(scan_batch);
struct epitem *epi, *tmp;
poll_table pt;
__poll_t res = 0;
@@ -1014,8 +1264,8 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep
* the ready list.
*/
mutex_lock_nested(&ep->mtx, depth);
- ep_start_scan(ep, &txlist);
- list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
+ ep_start_scan(ep, &scan_batch);
+ list_for_each_entry_safe(epi, tmp, &scan_batch, rdllink) {
if (ep_item_poll(epi, &pt, depth + 1)) {
res = EPOLLIN | EPOLLRDNORM;
break;
@@ -1029,7 +1279,7 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep
list_del_init(&epi->rdllink);
}
}
- ep_done_scan(ep, &txlist);
+ ep_done_scan(ep, &scan_batch);
mutex_unlock(&ep->mtx);
return res;
}
@@ -1138,7 +1388,7 @@ again:
mutex_unlock(&ep->mtx);
- if (ep_refcount_dec_and_test(ep))
+ if (ep_put(ep))
ep_free(ep);
goto again;
}
@@ -1159,7 +1409,7 @@ static int ep_alloc(struct eventpoll **pep)
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
ep->rbr = RB_ROOT_CACHED;
- ep->ovflist = EP_UNACTIVE_PTR;
+ ep->ovflist = EP_UNACTIVE_PTR; /* not scanning */
ep->user = get_current_user();
refcount_set(&ep->refcount, 1);
@@ -1173,17 +1423,15 @@ static int ep_alloc(struct eventpoll **pep)
* are protected by the "mtx" mutex, and ep_find() must be called with
* "mtx" held.
*/
-static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
+static struct epitem *ep_find(struct eventpoll *ep, struct epoll_key *tf)
{
int kcmp;
struct rb_node *rbp;
struct epitem *epi, *epir = NULL;
- struct epoll_filefd ffd;
- ep_set_ffd(&ffd, file, fd);
for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
epi = rb_entry(rbp, struct epitem, rbn);
- kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
+ kcmp = ep_cmp_ffd(tf, &epi->ffd);
if (kcmp > 0)
rbp = rbp->rb_right;
else if (kcmp < 0)
@@ -1197,50 +1445,6 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
return epir;
}
-#ifdef CONFIG_KCMP
-static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
-{
- struct rb_node *rbp;
- struct epitem *epi;
-
- for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
- epi = rb_entry(rbp, struct epitem, rbn);
- if (epi->ffd.fd == tfd) {
- if (toff == 0)
- return epi;
- else
- toff--;
- }
- cond_resched();
- }
-
- return NULL;
-}
-
-struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
- unsigned long toff)
-{
- struct file *file_raw;
- struct eventpoll *ep;
- struct epitem *epi;
-
- if (!is_file_epoll(file))
- return ERR_PTR(-EINVAL);
-
- ep = file->private_data;
-
- mutex_lock(&ep->mtx);
- epi = ep_find_tfd(ep, tfd, toff);
- if (epi)
- file_raw = epi->ffd.file;
- else
- file_raw = ERR_PTR(-ENOENT);
- mutex_unlock(&ep->mtx);
-
- return file_raw;
-}
-#endif /* CONFIG_KCMP */
-
/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
@@ -1283,9 +1487,9 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* semantics). All the events that happen during that period of time are
* chained in ep->ovflist and requeued later on.
*/
- if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (epi->next == EP_UNACTIVE_PTR) {
- epi->next = READ_ONCE(ep->ovflist);
+ if (ep_is_scanning(ep)) {
+ if (!epi_on_ovflist(epi)) {
+ epi->ovflist_next = READ_ONCE(ep->ovflist);
WRITE_ONCE(ep->ovflist, epi);
ep_pm_stay_awake_rcu(epi);
}
@@ -1336,17 +1540,24 @@ out_unlock:
if (pollflags & POLLFREE) {
/*
- * If we race with ep_remove_wait_queue() it can miss
- * ->whead = NULL and do another remove_wait_queue() after
- * us, so we can't use __remove_wait_queue().
+ * POLLFREE handshake, release side; see "POLLFREE handshake"
+ * at the top of this file.
+ *
+ * Unlink our wait entry with list_del_init rather than
+ * __remove_wait_queue: a concurrent ep_remove_wait_queue()
+ * that already loaded a non-NULL whead may still call
+ * remove_wait_queue() after us, and list_del_init() tolerates
+ * the second delete.
+ *
+ * smp_store_release(&whead, NULL) publishes the teardown to
+ * ep_remove_wait_queue()'s smp_load_acquire(). Before this
+ * store, a racing ep_clear_and_put() / ep_remove() reaches
+ * ep_remove_wait_queue() which sees whead != NULL and takes
+ * whead->lock -- the same lock held by our caller, so it
+ * serializes behind us. Once whead is zeroed, nothing else
+ * protects ep / epi / wait.
*/
list_del_init(&wait->entry);
- /*
- * ->whead != NULL protects us from the race with
- * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()
- * takes whead->lock held by the caller. Once we nullify it,
- * nothing protects ep/epi or even wait.
- */
smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
}
@@ -1407,41 +1618,40 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
-#define PATH_ARR_SIZE 5
/*
- * These are the number paths of length 1 to 5, that we are allowing to emanate
- * from a single file of interest. For example, we allow 1000 paths of length
- * 1, to emanate from each file of interest. This essentially represents the
- * potential wakeup paths, which need to be limited in order to avoid massive
- * uncontrolled wakeup storms. The common use case should be a single ep which
- * is connected to n file sources. In this case each file source has 1 path
- * of length 1. Thus, the numbers below should be more than sufficient. These
- * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
- * and delete can't add additional paths. Protected by the epnested_mutex.
+ * Upper bound on wakeup paths emanating from any one watched file,
+ * indexed by path depth (1..PATH_ARR_SIZE). For example, we allow
+ * 1000 paths of length 1 from each watched file. These caps limit
+ * the wakeup amplification that can be built from epoll-watches-
+ * epoll topologies without rejecting reasonable usage.
+ *
+ * Enforced at EPOLL_CTL_ADD; CTL_MOD and CTL_DEL cannot add paths.
+ * The running tallies live in ctx->path_count[] and are protected by
+ * epnested_mutex.
*/
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
-static int path_count[PATH_ARR_SIZE];
-static int path_count_inc(int nests)
+static int path_count_inc(struct ep_ctl_ctx *ctx, int nests)
{
/* Allow an arbitrary number of depth 1 paths */
if (nests == 0)
return 0;
- if (++path_count[nests] > path_limits[nests])
+ if (++ctx->path_count[nests] > path_limits[nests])
return -1;
return 0;
}
-static void path_count_init(void)
+static void path_count_init(struct ep_ctl_ctx *ctx)
{
int i;
for (i = 0; i < PATH_ARR_SIZE; i++)
- path_count[i] = 0;
+ ctx->path_count[i] = 0;
}
-static int reverse_path_check_proc(struct hlist_head *refs, int depth)
+static int reverse_path_check_proc(struct ep_ctl_ctx *ctx,
+ struct hlist_head *refs, int depth)
{
int error = 0;
struct epitem *epi;
@@ -1453,9 +1663,9 @@ static int reverse_path_check_proc(struct hlist_head *refs, int depth)
hlist_for_each_entry_rcu(epi, refs, fllink) {
struct hlist_head *refs = &epi->ep->refs;
if (hlist_empty(refs))
- error = path_count_inc(depth);
+ error = path_count_inc(ctx, depth);
else
- error = reverse_path_check_proc(refs, depth + 1);
+ error = reverse_path_check_proc(ctx, refs, depth + 1);
if (error != 0)
break;
}
@@ -1463,24 +1673,23 @@ static int reverse_path_check_proc(struct hlist_head *refs, int depth)
}
/**
- * reverse_path_check - The tfile_check_list is list of epitem_head, which have
- * links that are proposed to be newly added. We need to
- * make sure that those added links don't add too many
- * paths such that we will spend all our time waking up
- * eventpoll objects.
+ * reverse_path_check - ctx->tfile_check_list is a list of epitems_head
+ * anchoring files with newly proposed links; make
+ * sure those links don't push any path-length bucket
+ * over its limit in path_limits[].
*
* Return: %zero if the proposed links don't create too many paths,
* %-1 otherwise.
*/
-static int reverse_path_check(void)
+static int reverse_path_check(struct ep_ctl_ctx *ctx)
{
struct epitems_head *p;
- for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
+ for (p = ctx->tfile_check_list; p; p = p->next) {
int error;
- path_count_init();
+ path_count_init(ctx);
rcu_read_lock();
- error = reverse_path_check_proc(&p->epitems, 0);
+ error = reverse_path_check_proc(ctx, &p->epitems, 0);
rcu_read_unlock();
if (error)
return error;
@@ -1526,7 +1735,7 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
wakeup_source_unregister(ws);
}
-static int attach_epitem(struct file *file, struct epitem *epi)
+static int ep_attach_file(struct file *file, struct epitem *epi)
{
struct epitems_head *to_free = NULL;
struct hlist_head *head = NULL;
@@ -1561,69 +1770,115 @@ allocate:
}
/*
- * Must be called with "mtx" held.
+ * Charge the user's epoll_watches quota, allocate a fresh epitem for
+ * @tf, and initialize its fields. The returned item is not yet linked
+ * into any data structure; the caller must install it via
+ * ep_register_epitem() (which takes over on success) or kmem_cache_free()
+ * it and decrement epoll_watches on its own.
+ *
+ * Returns ERR_PTR(-ENOSPC) if the quota is exceeded, ERR_PTR(-ENOMEM)
+ * if the slab allocation fails.
*/
-static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
- struct file *tfile, int fd, int full_check)
+static struct epitem *ep_alloc_epitem(struct eventpoll *ep,
+ const struct epoll_event *event,
+ struct epoll_key *tf)
{
- int error, pwake = 0;
- __poll_t revents;
struct epitem *epi;
- struct ep_pqueue epq;
- struct eventpoll *tep = NULL;
-
- if (is_file_epoll(tfile))
- tep = tfile->private_data;
-
- lockdep_assert_irqs_enabled();
if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
max_user_watches) >= 0))
- return -ENOSPC;
+ return ERR_PTR(-ENOSPC);
percpu_counter_inc(&ep->user->epoll_watches);
- if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
+ epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL);
+ if (unlikely(!epi)) {
percpu_counter_dec(&ep->user->epoll_watches);
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
- /* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
epi->ep = ep;
- ep_set_ffd(&epi->ffd, tfile, fd);
+ epi->ffd = *tf;
epi->event = *event;
- epi->next = EP_UNACTIVE_PTR;
+ epi_clear_ovflist(epi);
+
+ return epi;
+}
+
+/*
+ * Install @epi into its target file's f_ep hlist and into @ep's rbtree,
+ * taking one additional reference on @ep for the lifetime of the item.
+ *
+ * If @tep is non-NULL, the target file is itself an eventpoll; we hold
+ * tep->mtx at subclass 1 across the attach + rbtree insert to serialize
+ * with the target side. RB tree ops are protected by @ep->mtx, which
+ * the caller already holds.
+ *
+ * On failure the epi is freed and the epoll_watches counter decremented,
+ * matching ep_alloc_epitem()'s allocation. After this returns
+ * successfully, ep_insert()'s later error paths use ep_remove() for
+ * unwind; that cannot drop @ep's refcount to zero because the ep file
+ * itself still holds the original reference.
+ */
+static int ep_register_epitem(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
+ struct epitem *epi, struct eventpoll *tep,
+ int full_check)
+{
+ struct file *tfile = epi->ffd.file;
+ int error;
if (tep)
mutex_lock_nested(&tep->mtx, 1);
- /* Add the current item to the list of active epoll hook for this file */
- if (unlikely(attach_epitem(tfile, epi) < 0)) {
+
+ error = ep_attach_file(tfile, epi);
+ if (unlikely(error)) {
if (tep)
mutex_unlock(&tep->mtx);
kmem_cache_free(epi_cache, epi);
percpu_counter_dec(&ep->user->epoll_watches);
- return -ENOMEM;
+ return error;
}
if (full_check && !tep)
- list_file(tfile);
+ list_file(tfile, ctx);
- /*
- * Add the current item to the RB tree. All RB tree operations are
- * protected by "mtx", and ep_insert() is called with "mtx" held.
- */
ep_rbtree_insert(ep, epi);
+
if (tep)
mutex_unlock(&tep->mtx);
- /*
- * ep_remove() calls in the later error paths can't lead to
- * ep_free() as the ep file itself still holds an ep reference.
- */
ep_get(ep);
+ return 0;
+}
+
+/*
+ * Must be called with "mtx" held.
+ */
+static int ep_insert(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
+ const struct epoll_event *event, struct epoll_key *tf,
+ int full_check)
+{
+ int error, pwake = 0;
+ __poll_t revents;
+ struct epitem *epi;
+ struct ep_pqueue epq;
+ struct eventpoll *tep = NULL;
- /* now check if we've created too many backpaths */
- if (unlikely(full_check && reverse_path_check())) {
+ if (is_file_epoll(tf->file))
+ tep = tf->file->private_data;
+
+ lockdep_assert_irqs_enabled();
+
+ epi = ep_alloc_epitem(ep, event, tf);
+ if (IS_ERR(epi))
+ return PTR_ERR(epi);
+
+ error = ep_register_epitem(ctx, ep, epi, tep, full_check);
+ if (error)
+ return error;
+
+ /* Reject the insert if the new link would create too many back-paths. */
+ if (unlikely(full_check && reverse_path_check(ctx))) {
ep_remove(ep, epi);
return -EINVAL;
}
@@ -1649,28 +1904,21 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
*/
revents = ep_item_poll(epi, &epq.pt, 1);
- /*
- * We have to check if something went wrong during the poll wait queue
- * install process. Namely an allocation for a wait queue failed due
- * high memory pressure.
- */
+ /* ep_ptable_queue_proc() signals allocation failure by clearing epq.epi. */
if (unlikely(!epq.epi)) {
ep_remove(ep, epi);
return -ENOMEM;
}
- /* We have to drop the new item inside our item list to keep track of it */
+ /* Drop the new item onto the ready list if it is already ready. */
spin_lock_irq(&ep->lock);
- /* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
- /* If the file is already "ready" we drop it inside the ready list */
if (revents && !ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
- /* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
@@ -1762,11 +2010,87 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
return 0;
}
+/*
+ * Attempt to deliver one event for @epi into @*uevents.
+ *
+ * Returns 1 if an event was delivered (with *uevents advanced to the
+ * next slot), 0 if the re-poll reported no caller-requested events
+ * (@epi drops out of the ready list; a future callback will re-add
+ * it), or -EFAULT if copy_to_user() faulted (in which case @epi is
+ * re-inserted at the head of @scan_batch so ep_done_scan() merges it
+ * back to rdllist for the next attempt).
+ *
+ * PM bookkeeping and level-triggered re-queue are handled here.
+ * Caller holds ep->mtx and the scan is active.
+ */
+static int ep_deliver_event(struct eventpoll *ep, struct epitem *epi,
+ poll_table *pt,
+ struct epoll_event __user **uevents,
+ struct list_head *scan_batch)
+{
+ struct epoll_event __user *next;
+ struct wakeup_source *ws;
+ __poll_t revents;
+
+ /*
+ * Activate ep->ws before deactivating epi->ws to prevent
+ * triggering auto-suspend here (in case we reactivate epi->ws
+ * below). Rearranging to delay the deactivation would let
+ * epi->ws drift out of sync with ep_is_linked().
+ */
+ ws = ep_wakeup_source(epi);
+ if (ws) {
+ if (ws->active)
+ __pm_stay_awake(ep->ws);
+ __pm_relax(ws);
+ }
+
+ list_del_init(&epi->rdllink);
+
+ /*
+ * Re-poll under ep->mtx so userspace cannot change the item
+ * out from under us. If no caller-requested events remain,
+ * @epi stays off the ready list; the poll callback will
+ * re-queue it when events next appear.
+ */
+ revents = ep_item_poll(epi, pt, 1);
+ if (!revents)
+ return 0;
+
+ next = epoll_put_uevent(revents, epi->event.data, *uevents);
+ if (!next) {
+ /*
+ * copy_to_user() faulted: put the item back so
+ * ep_done_scan() splices it onto rdllist for the next
+ * attempt.
+ */
+ list_add(&epi->rdllink, scan_batch);
+ ep_pm_stay_awake(epi);
+ return -EFAULT;
+ }
+ *uevents = next;
+
+ if (epi->event.events & EPOLLONESHOT) {
+ epi->event.events &= EP_PRIVATE_BITS;
+ } else if (!(epi->event.events & EPOLLET)) {
+ /*
+ * Level-triggered: re-queue so the next epoll_wait()
+ * rechecks availability. We are the sole writer to
+ * rdllist here -- epoll_ctl() callers are locked out
+ * by ep->mtx, and the poll callback queues to ovflist
+ * during scans.
+ */
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ ep_pm_stay_awake(epi);
+ }
+ return 1;
+}
+
static int ep_send_events(struct eventpoll *ep,
struct epoll_event __user *events, int maxevents)
{
struct epitem *epi, *tmp;
- LIST_HEAD(txlist);
+ LIST_HEAD(scan_batch);
poll_table pt;
int res = 0;
@@ -1781,74 +2105,28 @@ static int ep_send_events(struct eventpoll *ep,
init_poll_funcptr(&pt, NULL);
mutex_lock(&ep->mtx);
- ep_start_scan(ep, &txlist);
+ ep_start_scan(ep, &scan_batch);
/*
- * We can loop without lock because we are passed a task private list.
- * Items cannot vanish during the loop we are holding ep->mtx.
+ * We can loop without lock because we are passed a task-private
+ * scan_batch; items cannot vanish while we hold ep->mtx.
*/
- list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
- struct wakeup_source *ws;
- __poll_t revents;
+ list_for_each_entry_safe(epi, tmp, &scan_batch, rdllink) {
+ int delivered;
if (res >= maxevents)
break;
- /*
- * Activate ep->ws before deactivating epi->ws to prevent
- * triggering auto-suspend here (in case we reactive epi->ws
- * below).
- *
- * This could be rearranged to delay the deactivation of epi->ws
- * instead, but then epi->ws would temporarily be out of sync
- * with ep_is_linked().
- */
- ws = ep_wakeup_source(epi);
- if (ws) {
- if (ws->active)
- __pm_stay_awake(ep->ws);
- __pm_relax(ws);
- }
-
- list_del_init(&epi->rdllink);
-
- /*
- * If the event mask intersect the caller-requested one,
- * deliver the event to userspace. Again, we are holding ep->mtx,
- * so no operations coming from userspace can change the item.
- */
- revents = ep_item_poll(epi, &pt, 1);
- if (!revents)
- continue;
-
- events = epoll_put_uevent(revents, epi->event.data, events);
- if (!events) {
- list_add(&epi->rdllink, &txlist);
- ep_pm_stay_awake(epi);
+ delivered = ep_deliver_event(ep, epi, &pt, &events, &scan_batch);
+ if (delivered < 0) {
if (!res)
- res = -EFAULT;
+ res = delivered;
break;
}
- res++;
- if (epi->event.events & EPOLLONESHOT)
- epi->event.events &= EP_PRIVATE_BITS;
- else if (!(epi->event.events & EPOLLET)) {
- /*
- * If this file has been added with Level
- * Trigger mode, we need to insert back inside
- * the ready list, so that the next call to
- * epoll_wait() will check again the events
- * availability. At this point, no one can insert
- * into ep->rdllist besides us. The epoll_ctl()
- * callers are locked out by
- * ep_send_events() holding "mtx" and the
- * poll callback will queue them in ep->ovflist.
- */
- list_add_tail(&epi->rdllink, &ep->rdllist);
- ep_pm_stay_awake(epi);
- }
+ res += delivered;
}
- ep_done_scan(ep, &txlist);
+
+ ep_done_scan(ep, &scan_batch);
mutex_unlock(&ep->mtx);
return res;
@@ -1938,7 +2216,8 @@ static int ep_schedule_timeout(ktime_t *to)
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, struct timespec64 *timeout)
{
- int res, eavail, timed_out = 0;
+ int res, timed_out = 0;
+ bool eavail;
u64 slack = 0;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
@@ -2036,7 +2315,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* If timed out and still on the wait queue, recheck eavail
* carefully under lock, below.
*/
- eavail = 1;
+ eavail = true;
if (!list_empty_careful(&wait.entry)) {
spin_lock_irq(&ep->lock);
@@ -2066,7 +2345,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* Return: depth of the subtree, or a value bigger than EP_MAX_NESTS if we found
* a loop or went too deep.
*/
-static int ep_loop_check_proc(struct eventpoll *ep, int depth)
+static int ep_loop_check_proc(struct ep_ctl_ctx *ctx,
+ struct eventpoll *ep, int depth)
{
int result = 0;
struct rb_node *rbp;
@@ -2082,22 +2362,23 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
if (unlikely(is_file_epoll(epi->ffd.file))) {
struct eventpoll *ep_tovisit;
ep_tovisit = epi->ffd.file->private_data;
- if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
+ if (ep_tovisit == ctx->inserting_into ||
+ depth > EP_MAX_NESTS)
result = EP_MAX_NESTS+1;
else
- result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
+ result = max(result,
+ ep_loop_check_proc(ctx, ep_tovisit,
+ depth + 1) + 1);
if (result > EP_MAX_NESTS)
break;
} else {
/*
- * If we've reached a file that is not associated with
- * an ep, then we need to check if the newly added
- * links are going to add too many wakeup paths. We do
- * this by adding it to the tfile_check_list, if it's
- * not already there, and calling reverse_path_check()
- * during ep_insert().
+ * A non-epoll leaf. Queue it for the companion
+ * reverse_path_check() that runs after this walk so
+ * any new links we propose don't add too many wakeup
+ * paths.
*/
- list_file(epi->ffd.file);
+ list_file(epi->ffd.file, ctx);
}
}
ep->loop_check_depth = result;
@@ -2126,22 +2407,24 @@ static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth)
* into another epoll file (represented by @ep) does not create
* closed loops or too deep chains.
*
- * @ep: Pointer to the epoll we are inserting into.
- * @to: Pointer to the epoll to be inserted.
+ * @ctx: Per-CTL_ADD scratch context.
+ * @ep: Pointer to the epoll we are inserting into.
+ * @to: Pointer to the epoll to be inserted.
*
* Return: %zero if adding the epoll @to inside the epoll @from
* does not violate the constraints, or %-1 otherwise.
*/
-static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
+static int ep_loop_check(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
+ struct eventpoll *to)
{
int depth, upwards_depth;
- inserting_into = ep;
+ ctx->inserting_into = ep;
/*
* Check how deep down we can get from @to, and whether it is possible
* to loop up to @ep.
*/
- depth = ep_loop_check_proc(to, 0);
+ depth = ep_loop_check_proc(ctx, to, 0);
if (depth > EP_MAX_NESTS)
return -1;
/* Check how far up we can go from @ep. */
@@ -2152,12 +2435,12 @@ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0;
}
-static void clear_tfile_check_list(void)
+static void clear_tfile_check_list(struct ep_ctl_ctx *ctx)
{
rcu_read_lock();
- while (tfile_check_list != EP_UNACTIVE_PTR) {
- struct epitems_head *head = tfile_check_list;
- tfile_check_list = head->next;
+ while (ctx->tfile_check_list) {
+ struct epitems_head *head = ctx->tfile_check_list;
+ ctx->tfile_check_list = head->next;
unlist_file(head);
}
rcu_read_unlock();
@@ -2223,38 +2506,105 @@ static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
}
#endif
-static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
- bool nonblock)
+static inline int epoll_mutex_lock(struct mutex *mutex, bool nonblock)
{
if (!nonblock) {
- mutex_lock_nested(mutex, depth);
+ mutex_lock(mutex);
return 0;
}
- if (mutex_trylock(mutex))
+ return mutex_trylock(mutex) ? 0 : -EAGAIN;
+}
+
+/*
+ * Acquire the locks required for do_epoll_ctl() on @ep for @op.
+ *
+ * Always takes ep->mtx. For EPOLL_CTL_ADD, additionally runs the
+ * loop / path check under epnested_mutex when the topology can
+ * change: @ep is already watched (epfile->f_ep non-NULL), @ep was
+ * recently loop-checked (ep->gen == loop_check_gen), or @tfile is
+ * itself an eventpoll.
+ *
+ * Return value encodes both outcome and lock state:
+ *
+ * 0 success; ep->mtx held.
+ * 1 success; ep->mtx held AND the full check ran under
+ * epnested_mutex (which is also still held). The value
+ * doubles as the @full_check argument to ep_insert().
+ * -errno failure; no locks held.
+ *
+ * The caller releases what was taken with ep_ctl_unlock(ep, ret).
+ *
+ * Holding epnested_mutex on add is what prevents two racing
+ * EPOLL_CTL_ADDs on different eps from building a cycle without
+ * either walker observing it.
+ */
+static int ep_ctl_lock(struct ep_ctl_ctx *ctx, struct eventpoll *ep, int op,
+ struct file *epfile, struct file *tfile, bool nonblock)
+{
+ struct eventpoll *tep;
+ int error;
+
+ error = epoll_mutex_lock(&ep->mtx, nonblock);
+ if (error)
+ return error;
+
+ if (op != EPOLL_CTL_ADD)
+ return 0;
+ if (!READ_ONCE(epfile->f_ep) && ep->gen != loop_check_gen &&
+ !is_file_epoll(tfile))
return 0;
- return -EAGAIN;
+
+ /* Full check needed: drop ep->mtx so we can take epnested_mutex. */
+ mutex_unlock(&ep->mtx);
+ error = epoll_mutex_lock(&epnested_mutex, nonblock);
+ if (error)
+ return error;
+
+ loop_check_gen++;
+
+ if (is_file_epoll(tfile)) {
+ tep = tfile->private_data;
+ if (ep_loop_check(ctx, ep, tep) != 0) {
+ error = -ELOOP;
+ goto err_unlock_nested;
+ }
+ }
+
+ error = epoll_mutex_lock(&ep->mtx, nonblock);
+ if (error)
+ goto err_unlock_nested;
+
+ return 1;
+
+err_unlock_nested:
+ clear_tfile_check_list(ctx);
+ loop_check_gen++;
+ mutex_unlock(&epnested_mutex);
+ return error;
}
-int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
- bool nonblock)
+static void ep_ctl_unlock(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
+ int full_check)
+{
+ mutex_unlock(&ep->mtx);
+ if (full_check) {
+ clear_tfile_check_list(ctx);
+ loop_check_gen++;
+ mutex_unlock(&epnested_mutex);
+ }
+}
+
+int do_epoll_ctl_file(struct file *f, int op, struct epoll_key *tf,
+ struct epoll_event *epds, bool nonblock)
{
int error;
- int full_check = 0;
+ int full_check;
struct eventpoll *ep;
struct epitem *epi;
- struct eventpoll *tep = NULL;
-
- CLASS(fd, f)(epfd);
- if (fd_empty(f))
- return -EBADF;
-
- /* Get the "struct file *" for the target file */
- CLASS(fd, tf)(fd);
- if (fd_empty(tf))
- return -EBADF;
+ struct ep_ctl_ctx ctx = { };
/* The target file descriptor must support poll */
- if (!file_can_poll(fd_file(tf)))
+ if (!file_can_poll(tf->file))
return -EPERM;
/* Check if EPOLLWAKEUP is allowed */
@@ -2262,85 +2612,43 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
ep_take_care_of_epollwakeup(epds);
/*
- * We have to check that the file structure underneath the file descriptor
- * the user passed to us _is_ an eventpoll file. And also we do not permit
+ * The @f file must itself be an eventpoll, and we do not permit
* adding an epoll file descriptor inside itself.
*/
- error = -EINVAL;
- if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
- goto error_tgt_fput;
+ if (f == tf->file || !is_file_epoll(f))
+ return -EINVAL;
/*
* epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
* so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
- * Also, we do not currently supported nested exclusive wakeups.
+ * Also, nested exclusive wakeups are not supported.
*/
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
- goto error_tgt_fput;
- if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
+ return -EINVAL;
+ if (op == EPOLL_CTL_ADD && (is_file_epoll(tf->file) ||
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
- goto error_tgt_fput;
+ return -EINVAL;
}
- /*
- * At this point it is safe to assume that the "private_data" contains
- * our own data structure.
- */
- ep = fd_file(f)->private_data;
+ ep = f->private_data;
- /*
- * When we insert an epoll file descriptor inside another epoll file
- * descriptor, there is the chance of creating closed loops, which are
- * better be handled here, than in more critical paths. While we are
- * checking for loops we also determine the list of files reachable
- * and hang them on the tfile_check_list, so we can check that we
- * haven't created too many possible wakeup paths.
- *
- * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
- * the epoll file descriptor is attaching directly to a wakeup source,
- * unless the epoll file descriptor is nested. The purpose of taking the
- * 'epnested_mutex' on add is to prevent complex toplogies such as loops and
- * deep wakeup paths from forming in parallel through multiple
- * EPOLL_CTL_ADD operations.
- */
- error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
- if (error)
- goto error_tgt_fput;
- if (op == EPOLL_CTL_ADD) {
- if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
- is_file_epoll(fd_file(tf))) {
- mutex_unlock(&ep->mtx);
- error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
- if (error)
- goto error_tgt_fput;
- loop_check_gen++;
- full_check = 1;
- if (is_file_epoll(fd_file(tf))) {
- tep = fd_file(tf)->private_data;
- error = -ELOOP;
- if (ep_loop_check(ep, tep) != 0)
- goto error_tgt_fput;
- }
- error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
- if (error)
- goto error_tgt_fput;
- }
- }
+ full_check = ep_ctl_lock(&ctx, ep, op, f, tf->file, nonblock);
+ if (full_check < 0)
+ return full_check;
/*
- * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
- * above, we can be sure to be able to use the item looked up by
- * ep_find() till we release the mutex.
+ * Look the target up in ep's RB tree. We hold ep->mtx, so the
+ * item stays valid until we release.
*/
- epi = ep_find(ep, fd_file(tf), fd);
+ epi = ep_find(ep, tf);
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds->events |= EPOLLERR | EPOLLHUP;
- error = ep_insert(ep, epds, fd_file(tf), fd, full_check);
+ error = ep_insert(&ctx, ep, epds, tf, full_check);
} else
error = -EEXIST;
break;
@@ -2366,17 +2674,30 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
error = -ENOENT;
break;
}
- mutex_unlock(&ep->mtx);
-error_tgt_fput:
- if (full_check) {
- clear_tfile_check_list();
- loop_check_gen++;
- mutex_unlock(&epnested_mutex);
- }
+ ep_ctl_unlock(&ctx, ep, full_check);
return error;
}
+int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
+ bool nonblock)
+{
+ struct epoll_key efd;
+
+ CLASS(fd, f)(epfd);
+ if (fd_empty(f))
+ return -EBADF;
+
+ /* Get the "struct file *" for the target file */
+ CLASS(fd, tf)(fd);
+ if (fd_empty(tf))
+ return -EBADF;
+
+ efd.file = fd_file(tf);
+ efd.fd = fd;
+ return do_epoll_ctl_file(fd_file(f), op, &efd, epds, nonblock);
+}
+
/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of
@@ -2527,6 +2848,50 @@ SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
sigmask, sigsetsize);
}
+#ifdef CONFIG_KCMP
+static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
+{
+ struct rb_node *rbp;
+ struct epitem *epi;
+
+ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ epi = rb_entry(rbp, struct epitem, rbn);
+ if (epi->ffd.fd == tfd) {
+ if (toff == 0)
+ return epi;
+ else
+ toff--;
+ }
+ cond_resched();
+ }
+
+ return NULL;
+}
+
+struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
+ unsigned long toff)
+{
+ struct file *file_raw;
+ struct eventpoll *ep;
+ struct epitem *epi;
+
+ if (!is_file_epoll(file))
+ return ERR_PTR(-EINVAL);
+
+ ep = file->private_data;
+
+ mutex_lock(&ep->mtx);
+ epi = ep_find_tfd(ep, tfd, toff);
+ if (epi)
+ file_raw = epi->ffd.file;
+ else
+ file_raw = ERR_PTR(-ENOENT);
+ mutex_unlock(&ep->mtx);
+
+ return file_raw;
+}
+#endif /* CONFIG_KCMP */
+
#ifdef CONFIG_COMPAT
static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
int maxevents, struct timespec64 *timeout,