next-20260522/vfs-brauner

# Conflicts: # fs/fuse/dev.c
author: Mark Brown <broonie@kernel.org> 2026-05-29 14:59:13 +0100
committer: Mark Brown <broonie@kernel.org> 2026-05-29 14:59:13 +0100
commit: d692b8845cb3cfa4d005a5a360a26f692b09d178 (patch)
tree: 01330ed4686ce3e141e8bf069b7bc11bf770f6f2 /fs
parent: 98e9aea604a182a40f8fb4b77ac426b3fdf00031 (diff)
parent: 4bd540bd9a0d7a2e8403a139e9f7631b06a57e89 (diff)
download: linux-next-history-d692b8845cb3cfa4d005a5a360a26f692b09d178.tar.gz
48 files changed, 1388 insertions, 744 deletions
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a0caf6ace8601..44a3f69d275f9 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -227,11 +227,6 @@ static inline bool affs_validblock(struct super_block *sb, int block)
 	       block < AFFS_SB(sb)->s_partition_size);
 }
 
-static inline void
-affs_set_blocksize(struct super_block *sb, int size)
-{
-	sb_set_blocksize(sb, size);
-}
 static inline struct buffer_head *
 affs_bread(struct super_block *sb, int block)
 {
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 079f36e1ddec1..b232251aa7bbd 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -358,7 +358,8 @@ static int affs_fill_super(struct super_block *sb, struct fs_context *fc)
 	size = bdev_nr_sectors(sb->s_bdev);
 	pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
 
-	affs_set_blocksize(sb, PAGE_SIZE);
+	if (!sb_set_blocksize(sb, PAGE_SIZE))
+		return -EINVAL;
 	/* Try to find root block. Its location depends on the block size. */
 
 	i = bdev_logical_block_size(sb->s_bdev);
@@ -374,7 +375,8 @@ static int affs_fill_super(struct super_block *sb, struct fs_context *fc)
 		if (ctx->root_block < 0)
 			sbi->s_root_block = (ctx->reserved + size - 1) / 2;
 		pr_debug("setting blocksize to %d\n", blocksize);
-		affs_set_blocksize(sb, blocksize);
+		if (!sb_set_blocksize(sb, blocksize))
+			return -EINVAL;
 		sbi->s_partition_size = size;
 
 		/* The root block location that was calculated above is not
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index c12caae9a9679..ee0cbae521b9f 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -860,7 +860,8 @@ befs_fill_super(struct super_block *sb, struct fs_context *fc)
 	 */
 	sb->s_magic = BEFS_SUPER_MAGIC;
 	/* Set real blocksize of fs */
-	sb_set_blocksize(sb, (ulong) befs_sb->block_size);
+	if (!sb_set_blocksize(sb, (ulong) befs_sb->block_size))
+		goto unacquire_priv_sbp;
 	sb->s_op = &befs_sops;
 	sb->s_export_op = &befs_export_operations;
 	sb->s_time_min = 0;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 19e49c8cf7505..9c3e90390824c 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -346,7 +346,8 @@ static int bfs_fill_super(struct super_block *s, struct fs_context *fc)
 	s->s_time_min = 0;
 	s->s_time_max = U32_MAX;
 
-	sb_set_blocksize(s, BFS_BSIZE);
+	if (!sb_set_blocksize(s, BFS_BSIZE))
+		goto out;
 
 	sbh = sb_bread(s, 0);
 	if (!sbh)
diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c
index e4e51a1d0de28..606319dd69e80 100644
--- a/fs/bpf_fs_kfuncs.c
+++ b/fs/bpf_fs_kfuncs.c
@@ -100,7 +100,7 @@ static bool match_security_bpf_prefix(const char *name__str)
 
 static int bpf_xattr_read_permission(const char *name, struct inode *inode)
 {
-	if (WARN_ON(!inode))
+	if (!inode)
 		return -EINVAL;
 
 	/* Allow reading xattr with user. and security.bpf. prefix */
@@ -170,7 +170,7 @@ __bpf_kfunc_end_defs();
 
 static int bpf_xattr_write_permission(const char *name, struct inode *inode)
 {
-	if (WARN_ON(!inode))
+	if (!inode)
 		return -EINVAL;
 
 	/* Only allow setting and removing security.bpf. xattrs */
@@ -289,6 +289,9 @@ __bpf_kfunc int bpf_set_dentry_xattr(struct dentry *dentry, const char *name__st
 	struct inode *inode = d_inode(dentry);
 	int ret;
 
+	if (!inode)
+		return -EINVAL;
+
 	inode_lock(inode);
 	ret = bpf_set_dentry_xattr_locked(dentry, name__str, value_p, flags);
 	inode_unlock(inode);
@@ -314,6 +317,9 @@ __bpf_kfunc int bpf_remove_dentry_xattr(struct dentry *dentry, const char *name_
 	struct inode *inode = d_inode(dentry);
 	int ret;
 
+	if (!inode)
+		return -EINVAL;
+
 	inode_lock(inode);
 	ret = bpf_remove_dentry_xattr_locked(dentry, name__str);
 	inode_unlock(inode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 61b5594c4206f..799a7409950e9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4570,7 +4570,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root)
 
 	inode = btrfs_find_first_inode(root, min_ino);
 	while (inode) {
-		if (icount_read(&inode->vfs_inode) > 1)
+		if (icount_read_once(&inode->vfs_inode) > 1)
 			d_prune_aliases(&inode->vfs_inode);
 
 		min_ino = btrfs_ino(inode) + 1;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d54d71669176b..0ad42e1cc3058 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -996,6 +996,10 @@ retry:
 			ceph_init_inode_acls(newino, &as_ctx);
 			file->f_mode |= FMODE_CREATED;
 		}
+		if ((flags & __O_REGULAR) && !d_is_reg(dentry)) {
+			err = -EFTYPE;
+			goto out_req;
+		}
 		err = finish_open(file, dentry, ceph_open);
 	}
 out_req:
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index ed17e0023705e..0edb6a2515012 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2267,7 +2267,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
 			int count;
 			dput(dentry);
 			d_prune_aliases(inode);
-			count = icount_read(inode);
+			count = icount_read_once(inode);
 			if (count == 1)
 				(*remaining)--;
 			doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n",
diff --git a/fs/coredump.c b/fs/coredump.c
index bb6fdb1f458e9..e68a76ff92a38 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -395,8 +395,7 @@ static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm,
 							  cred->gid));
 				break;
 			case 'd':
-				err = cn_printf(cn, "%d",
-					__get_dumpable(cprm->mm_flags));
+				err = cn_printf(cn, "%d", cprm->dumpable);
 				break;
 			/* signal that caused the coredump */
 			case 's':
@@ -869,11 +868,11 @@ static inline void coredump_sock_shutdown(struct file *file) { }
 static inline bool coredump_socket(struct core_name *cn, struct coredump_params *cprm) { return false; }
 #endif
 
-/* cprm->mm_flags contains a stable snapshot of dumpability flags. */
+/* cprm->dumpable is the snapshot of task dumpability at dump start. */
 static inline bool coredump_force_suid_safe(const struct coredump_params *cprm)
 {
 	/* Require nonrelative corefile path and be extra careful. */
-	return __get_dumpable(cprm->mm_flags) == SUID_DUMP_ROOT;
+	return cprm->dumpable == TASK_DUMPABLE_ROOT;
 }
 
 static bool coredump_file(struct core_name *cn, struct coredump_params *cprm,
@@ -1085,7 +1084,7 @@ static inline bool coredump_skip(const struct coredump_params *cprm,
 		return true;
 	if (!binfmt->core_dump)
 		return true;
-	if (!__get_dumpable(cprm->mm_flags))
+	if (cprm->dumpable == TASK_DUMPABLE_OFF)
 		return true;
 	return false;
 }
@@ -1170,14 +1169,9 @@ void vfs_coredump(const kernel_siginfo_t *siginfo)
 	struct coredump_params cprm = {
 		.siginfo = siginfo,
 		.limit = rlimit(RLIMIT_CORE),
-		/*
-		 * We must use the same mm->flags while dumping core to avoid
-		 * inconsistency of bit flags, since this flag is not protected
-		 * by any locks.
-		 *
-		 * Note that we only care about MMF_DUMP* flags.
-		 */
-		.mm_flags = __mm_flags_get_dumpable(mm),
+		/* Snapshot MMF_DUMP_FILTER_* (unlocked) and dumpable for the dump. */
+		.mm_flags = __mm_flags_get_word(mm),
+		.dumpable = task_exec_state_get_dumpable(current),
 		.vma_meta = NULL,
 		.cpu = raw_smp_processor_id(),
 	};
@@ -1419,7 +1413,7 @@ EXPORT_SYMBOL(dump_align);
 
 void validate_coredump_safety(void)
 {
-	if (suid_dumpable == SUID_DUMP_ROOT &&
+	if (suid_dumpable == TASK_DUMPABLE_ROOT &&
 	    core_pattern[0] != '/' && core_pattern[0] != '|' && core_pattern[0] != '@') {
 
 		coredump_report_failure("Unsafe core_pattern used with fs.suid_dumpable=2: "
@@ -1488,7 +1482,8 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write,
 		return -EINVAL;
 	}
 
-	validate_coredump_safety();
+	if (strncmp(old_core_pattern, core_pattern, CORENAME_MAX_SIZE))
+		validate_coredump_safety();
 	return error;
 }
 
diff --git a/fs/dcache.c b/fs/dcache.c
index 2c61aeea41f45..d6f5053132053 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -426,9 +426,16 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry)
 		this_cpu_inc(nr_dentry_negative);
 }
 
+#define DENTRY_WARN_ONCE(condition, dentry) \
+	WARN_ONCE((condition), "dentry=%p d_flags=0x%x\n", (dentry), (dentry)->d_flags)
+#define D_FLAG_VERIFY(dentry, x) \
+	DENTRY_WARN_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x), (dentry))
+
 static void dentry_free(struct dentry *dentry)
 {
-	WARN_ON(d_really_is_positive(dentry));
+	DENTRY_WARN_ONCE(d_really_is_positive(dentry), dentry);
+	DENTRY_WARN_ONCE(dentry->d_lockref.count >= 0, dentry);
+	D_FLAG_VERIFY(dentry, 0);
 	if (unlikely(dname_external(dentry))) {
 		struct external_name *p = external_name(dentry);
 		if (likely(atomic_dec_and_test(&p->count))) {
@@ -495,7 +502,6 @@ static void dentry_unlink_inode(struct dentry * dentry)
  * These helper functions make sure we always follow the
  * rules. d_lock must be held by the caller.
  */
-#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
 static void d_lru_add(struct dentry *dentry)
 {
 	D_FLAG_VERIFY(dentry, 0);
@@ -1820,10 +1826,10 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 		name = &slash_name;
 		dname = dentry->d_shortname.string;
 	} else if (name->len > DNAME_INLINE_LEN-1) {
-		size_t size = offsetof(struct external_name, name[1]);
-		struct external_name *p = kmalloc(size + name->len,
-						  GFP_KERNEL_ACCOUNT |
-						  __GFP_RECLAIMABLE);
+		struct external_name *p;
+
+		p = kmalloc_flex(*p, name, name->len + 1,
+				 GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
 		if (!p) {
 			kmem_cache_free(dentry_cache, dentry); 
 			return NULL;
@@ -2100,6 +2106,10 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
 	__d_instantiate(entry, inode);
 	spin_unlock(&entry->d_lock);
 	WARN_ON(!(inode_state_read(inode) & I_NEW));
+	/*
+	 * Paired with igrab_from_hash()
+	 */
+	smp_wmb();
 	inode_state_clear(inode, I_NEW | I_CREATING);
 	inode_wake_up_bit(inode, __I_NEW);
 	spin_unlock(&inode->i_lock);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a3090b446af10..a569e98d4a996 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -41,45 +41,170 @@
 #include <net/busy_poll.h>
 
 /*
- * LOCKING:
- * There are three level of locking required by epoll :
+ * fs/eventpoll.c - Efficient event polling ("epoll") kernel implementation.
  *
- * 1) epnested_mutex (mutex)
- * 2) ep->mtx (mutex)
- * 3) ep->lock (spinlock)
  *
- * The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->lock) because we manipulate objects
- * from inside the poll callback, that might be triggered from
- * a wake_up() that in turn might be called from IRQ context.
- * So we can't sleep inside the poll callback and hence we need
- * a spinlock. During the event transfer loop (from kernel to
- * user space) we could end up sleeping due a copy_to_user(), so
- * we need a lock that will allow us to sleep. This lock is a
- * mutex (ep->mtx). It is acquired during the event transfer loop,
- * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
- * The epnested_mutex is acquired when inserting an epoll fd onto another
- * epoll fd. We do this so that we walk the epoll tree and ensure that this
- * insertion does not create a cycle of epoll file descriptors, which
- * could lead to deadlock. We need a global mutex to prevent two
- * simultaneous inserts (A into B and B into A) from racing and
- * constructing a cycle without either insert observing that it is
- * going to.
- * It is necessary to acquire multiple "ep->mtx"es at once in the
- * case when one epoll fd is added to another. In this case, we
- * always acquire the locks in the order of nesting (i.e. after
- * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
- * before e2->mtx). Since we disallow cycles of epoll file
- * descriptors, this ensures that the mutexes are well-ordered. In
- * order to communicate this nesting to lockdep, when walking a tree
- * of epoll file descriptors, we use the current recursion depth as
- * the lockdep subkey.
- * It is possible to drop the "ep->mtx" and to use the global
- * mutex "epnested_mutex" (together with "ep->lock") to have it working,
- * but having "ep->mtx" will make the interface more scalable.
- * Events that require holding "epnested_mutex" are very rare, while for
- * normal operations the epoll private "ep->mtx" will guarantee
- * a better scalability.
+ * Overview
+ * --------
+ *
+ * Each epoll_create(2) returns an anonymous [eventpoll] file whose
+ * ->private_data is a struct eventpoll. Each EPOLL_CTL_ADD installs
+ * a struct epitem linking one (watched file, fd) pair back to that
+ * eventpoll via the watched file's f_op->poll() wait queue(s). When
+ * the watched file signals readiness, ep_poll_callback() fires and
+ * marks the epitem ready. epoll_wait(2) drains the ready list under
+ * ep->mtx, re-queueing items in level-triggered mode.
+ *
+ * epoll instances can watch other epoll instances up to EP_MAX_NESTS
+ * deep; cycles are forbidden and detected at EPOLL_CTL_ADD time.
+ *
+ *
+ * Locking
+ * -------
+ *
+ * Three levels, acquired from outer to inner:
+ *
+ *   epnested_mutex   (global; rare; taken only for EPOLL_CTL_ADD
+ *                     loop / path checks)
+ *     > ep->mtx     (per-eventpoll; sleepable; serializes most ops)
+ *       > ep->lock  (per-eventpoll; IRQ-safe spinlock)
+ *
+ *   file->f_lock    (per-file; NOT IRQ-safe; guards f_ep hlist ops;
+ *                    nested inside ep->mtx, outside ep->lock)
+ *
+ * Rationale:
+ *   - ep->lock is a spinlock because ep_poll_callback() is called from
+ *     wake_up() which may run in hard-IRQ context. All ep->lock
+ *     critical sections use spin_lock_irqsave().
+ *   - ep->mtx is a sleepable mutex because the event delivery loop
+ *     calls copy_to_user(), and ep_insert() may sleep in
+ *     kmem_cache_alloc() and f_op->poll().
+ *   - epnested_mutex is global because cycle detection needs a global
+ *     view of the epoll topology; a per-object scheme would let two
+ *     concurrent inserts (A into B, B into A) construct a cycle
+ *     without either insert observing it.
+ *   - Per-ep ep->mtx is preferred for scalability elsewhere. Events
+ *     that require epnested_mutex are rare.
+ *
+ * When EPOLL_CTL_ADD nests one eventpoll inside another we acquire
+ * ep->mtx on both: outer first, target second. Since cycles are
+ * forbidden the set of live ep->mtx holds is always a strict chain,
+ * communicated to lockdep via mutex_lock_nested() subclasses derived
+ * from the current recursion depth.
+ *
+ *
+ * Field protection
+ * ----------------
+ *
+ * struct eventpoll:
+ *   mtx              - self
+ *   rbr              - ep->mtx
+ *   ovflist, rdllist - ep->lock (IRQ-safe)
+ *   wq               - ep->lock for queue mutation
+ *   poll_wait        - internal waitqueue spinlock
+ *   refs             - file->f_lock for adds; ep->mtx for removes;
+ *                      RCU for readers (hlist_del_rcu + kfree_rcu(ep))
+ *   ws               - ep->mtx
+ *   gen, loop_check_depth - epnested_mutex
+ *   file, user       - immutable after setup
+ *   refcount         - atomic (refcount_t)
+ *   napi_*           - READ_ONCE / WRITE_ONCE
+ *
+ * struct epitem:
+ *   rbn / rcu union  - rbn: ep->mtx (while epi is linked in ep->rbr).
+ *                      rcu: written only by kfree_rcu(epi) on the free
+ *                      path; otherwise untouched by epoll code.
+ *   rdllink, ovflist_next - ep->lock
+ *   ffd, ep          - immutable after ep_insert()
+ *   pwqlist          - ep->mtx for writes; POLLFREE clears pwq->whead
+ *                      via smp_store_release(), see below
+ *   fllink           - file->f_lock for mutation; hlist_del_rcu +
+ *                      kfree_rcu(epi) for safe RCU readers
+ *   ws               - RCU (rcu_assign_pointer /
+ *                      rcu_dereference_check(mtx))
+ *   event            - ep->mtx for writes; lockless read in
+ *                      ep_poll_callback pairs with smp_mb() in
+ *                      ep_modify()
+ *
+ *
+ * Ready-list state machine
+ * ------------------------
+ *
+ * Readiness is tracked in two lists under ep->lock:
+ *
+ *   rdllist   - doubly-linked FIFO; the "current" ready list.
+ *   ovflist   - singly-linked LIFO; used during a scan to catch
+ *               events that arrive while rdllist is being iterated
+ *               without ep->lock.
+ *
+ * Encoded in ep->ovflist:
+ *   EP_UNACTIVE_PTR - no scan active; callback appends to rdllist.
+ *   NULL            - scan active, no spill yet.
+ *   pointer to epi  - scan active with spilled items (LIFO).
+ *
+ * Encoded in epi->ovflist_next:
+ *   EP_UNACTIVE_PTR - epi is not on ovflist.
+ *   otherwise       - next epi on ovflist (NULL at tail).
+ *
+ * ep_start_scan() flips "not scanning" to "scanning" and splices
+ * rdllist into a caller-local scan_batch. ep_done_scan() drains ovflist
+ * back to rdllist (list_add head-insert reverses LIFO to FIFO),
+ * flips back to "not scanning", and re-splices any items the caller
+ * left in scan_batch (e.g., level-triggered re-queues).
+ *
+ *
+ * Removal paths
+ * -------------
+ *
+ * Three paths dispose of epitems and/or eventpolls:
+ *
+ *   A. ep_remove()              - EPOLL_CTL_DEL and ep_insert()
+ *                                 rollback. Caller holds ep->mtx.
+ *   B. ep_clear_and_put()       - close of the epoll fd itself
+ *                                 (ep_eventpoll_release).
+ *   C. eventpoll_release_file() - close of a watched file, invoked
+ *                                 from __fput().
+ *
+ * Coordination:
+ *   A and C exclude each other via the watched file's refcount.
+ *   A pins the file with epi_fget() before touching file->f_ep or
+ *   file->f_lock; if the pin fails, __fput() is in flight and C
+ *   will clean this epi up. See the epi_fget() block comment.
+ *   A and B are serialized by ep->mtx. B walks the rbtree with
+ *   rb_next() captured before ep_remove() erases the current node.
+ *   B and C both take ep->mtx; the loser sees fewer entries or an
+ *   empty file->f_ep.
+ *
+ * Within every path the internal order is strict:
+ *   ep_unregister_pollwait()  - drain pwqlist; synchronizes with any
+ *                                in-flight ep_poll_callback via the
+ *                                watched wait-queue head's lock.
+ *   ep_remove_file()          - hlist_del_rcu of epi->fllink and,
+ *                                if last watcher, clear file->f_ep,
+ *                                under file->f_lock.
+ *   ep_remove_epi()           - rb_erase, rdllist unlink (ep->lock),
+ *                                wakeup_source_unregister,
+ *                                kfree_rcu(epi).
+ *
+ * kfree_rcu(epi) defers the free past RCU readers in
+ * reverse_path_check_proc(); kfree_rcu(ep) defers past readers in
+ * ep_get_upwards_depth_proc().
+ *
+ *
+ * POLLFREE handshake
+ * ------------------
+ *
+ * When a subsystem tears down a wait-queue head that an epitem is
+ * registered on (binder, signalfd, ...), it wakes the callback with
+ * POLLFREE and must RCU-defer the head's free. The store/load pair:
+ *
+ *   ep_poll_callback() POLLFREE branch:
+ *     smp_store_release(&pwq->whead, NULL)
+ *
+ *   ep_remove_wait_queue():
+ *     smp_load_acquire(&pwq->whead)
+ *
+ * See those sites for the full argument.
  */
 
 /* Epoll private bits inside the event mask */
@@ -99,11 +224,6 @@
 
 #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
 
-struct epoll_filefd {
-	struct file *file;
-	int fd;
-} __packed;
-
 /* Wait structure used by the poll hooks */
 struct eppoll_entry {
 	/* List header used to link this structure to the "struct epitem" */
@@ -136,17 +256,19 @@ struct epitem {
 		struct rcu_head rcu;
 	};
 
-	/* List header used to link this structure to the eventpoll ready list */
+	/* Link on the owning eventpoll's ready list (ep->rdllist). */
 	struct list_head rdllink;
 
 	/*
-	 * Works together "struct eventpoll"->ovflist in keeping the
-	 * single linked chain of items.
+	 * Link on the owning eventpoll's scan-overflow list (ep->ovflist),
+	 * EP_UNACTIVE_PTR when not linked. See epi_on_ovflist() /
+	 * epi_clear_ovflist() and the "Ready-list state machine" section
+	 * in the top-of-file banner.
 	 */
-	struct epitem *next;
+	struct epitem *ovflist_next;
 
 	/* The file descriptor information this item refers to */
-	struct epoll_filefd ffd;
+	struct epoll_key ffd;
 
 	/* List containing poll wait queues */
 	struct eppoll_entry *pwqlist;
@@ -247,13 +369,77 @@ struct ep_pqueue {
 /* Maximum number of epoll watched descriptors, per user */
 static long max_user_watches __read_mostly;
 
-/* Used for cycles detection */
+/*
+ * Cycle and path-length checks at EPOLL_CTL_ADD
+ * ---------------------------------------------
+ *
+ * When EPOLL_CTL_ADD creates a link that either targets an eventpoll
+ * file or extends an existing chain of eventpolls, two checks run:
+ *
+ *   1. no cycle is being formed -- ep_loop_check() walks downward
+ *      from the candidate target, and ep_get_upwards_depth_proc()
+ *      walks upward from the outer ep, both bounded by EP_MAX_NESTS.
+ *   2. no file accumulates more than path_limits[depth] wakeup paths
+ *      of a given length -- reverse_path_check().
+ *
+ * Both need a global view of the epoll topology and must be atomic
+ * with the insertion, so the check is serialized by epnested_mutex
+ * and carries its scratch state on a stack-allocated struct
+ * ep_ctl_ctx scoped to one do_epoll_ctl() call. Non-nested inserts
+ * skip this machinery entirely and take only ep->mtx.
+ *
+ *   epnested_mutex     Serializes the whole check.
+ *   loop_check_gen     Global monotonic stamp, bumped at the start of
+ *                      a check and again at the end. ep->gen caches
+ *                      the value under which ep was last visited by
+ *                      ep_loop_check_proc() or
+ *                      ep_get_upwards_depth_proc(); the post-check
+ *                      bump ensures those cached stamps can no longer
+ *                      equal loop_check_gen, so the
+ *                      "ep->gen == loop_check_gen" trigger in
+ *                      ep_ctl_lock() only fires while another check
+ *                      is in flight.
+ *
+ * struct ep_ctl_ctx carries the rest (inserting_into, tfile_check_list,
+ * path_count[]) through the walk; see its declaration below.
+ *
+ * Commits fdcfce93073d ("eventpoll: Fix integer overflow in
+ * ep_loop_check_proc()") and f2e467a48287 ("eventpoll: Fix
+ * semi-unbounded recursion") hardened the walk; any refactor must
+ * preserve both bail-outs.
+ */
 static DEFINE_MUTEX(epnested_mutex);
-
 static u64 loop_check_gen = 0;
 
-/* Used to check for epoll file descriptor inclusion loops */
-static struct eventpoll *inserting_into;
+#define PATH_ARR_SIZE 5
+
+/*
+ * Per-do_epoll_ctl() scratch for the loop / path checks. Allocated on
+ * the caller's stack; populated by ep_ctl_lock() and the downward
+ * walk; consumed by reverse_path_check(); released by ep_ctl_unlock().
+ * Only valid while the caller holds epnested_mutex.
+ */
+struct ep_ctl_ctx {
+	/*
+	 * Outer eventpoll for one ep_loop_check(); if the downward walk
+	 * reaches it the insert would form a cycle.
+	 */
+	struct eventpoll *inserting_into;
+
+	/*
+	 * Singly-linked list of epitems_head objects collected during
+	 * ep_loop_check_proc(), then walked by reverse_path_check().
+	 * NULL means empty.
+	 */
+	struct epitems_head *tfile_check_list;
+
+	/*
+	 * Per-depth wakeup-path tally used by reverse_path_check_proc();
+	 * reinitialized to zero at the start of each reverse_path_check()
+	 * iteration.
+	 */
+	int path_count[PATH_ARR_SIZE];
+};
 
 /* Slab cache used to allocate "struct epitem" */
 static struct kmem_cache *epi_cache __ro_after_init;
@@ -262,14 +448,15 @@ static struct kmem_cache *epi_cache __ro_after_init;
 static struct kmem_cache *pwq_cache __ro_after_init;
 
 /*
- * List of files with newly added links, where we may need to limit the number
- * of emanating paths. Protected by the epnested_mutex.
+ * Wrapper anchor for file->f_ep when the watched file is not itself an
+ * eventpoll; for the epoll-watches-epoll case, file->f_ep points at
+ * &watched_ep->refs directly. The ->next field threads
+ * ctx->tfile_check_list during one EPOLL_CTL_ADD path check.
  */
 struct epitems_head {
 	struct hlist_head epitems;
 	struct epitems_head *next;
 };
-static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
 
 static struct kmem_cache *ephead_cache __ro_after_init;
 
@@ -279,14 +466,14 @@ static inline void free_ephead(struct epitems_head *head)
 		kmem_cache_free(ephead_cache, head);
 }
 
-static void list_file(struct file *file)
+static void list_file(struct file *file, struct ep_ctl_ctx *ctx)
 {
 	struct epitems_head *head;
 
 	head = container_of(file->f_ep, struct epitems_head, epitems);
 	if (!head->next) {
-		head->next = tfile_check_list;
-		tfile_check_list = head;
+		head->next = ctx->tfile_check_list;
+		ctx->tfile_check_list = head;
 	}
 }
 
@@ -334,29 +521,20 @@ static void __init epoll_sysctls_init(void)
 
 static const struct file_operations eventpoll_fops;
 
-static inline int is_file_epoll(struct file *f)
+bool is_file_epoll(struct file *f)
 {
 	return f->f_op == &eventpoll_fops;
 }
 
-/* Setup the structure that is used as key for the RB tree */
-static inline void ep_set_ffd(struct epoll_filefd *ffd,
-			      struct file *file, int fd)
-{
-	ffd->file = file;
-	ffd->fd = fd;
-}
-
 /* Compare RB tree keys */
-static inline int ep_cmp_ffd(struct epoll_filefd *p1,
-			     struct epoll_filefd *p2)
+static inline int ep_cmp_ffd(struct epoll_key *p1, struct epoll_key *p2)
 {
 	return (p1->file > p2->file ? +1:
 	        (p1->file < p2->file ? -1 : p1->fd - p2->fd));
 }
 
-/* Tells us if the item is currently linked */
-static inline int ep_is_linked(struct epitem *epi)
+/* True iff @epi is on its owning ep's ready list. */
+static inline bool ep_is_linked(struct epitem *epi)
 {
 	return !list_empty(&epi->rdllink);
 }
@@ -372,18 +550,47 @@ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
 	return container_of(p, struct eppoll_entry, wait)->base;
 }
 
-/**
- * ep_events_available - Checks if ready events might be available.
- *
- * @ep: Pointer to the eventpoll context.
- *
- * Return: a value different than %zero if ready events are available,
- *          or %zero otherwise.
+/*
+ * Ready-list / ovflist state (see "Ready-list state machine" in the
+ * top-of-file banner for the full state machine). EP_UNACTIVE_PTR is
+ * the sentinel; these wrappers name each transition and each test so
+ * call sites do not need to know the sentinel's value.
  */
-static inline int ep_events_available(struct eventpoll *ep)
+
+/* True iff @ep is between ep_enter_scan() and ep_exit_scan(). */
+static inline bool ep_is_scanning(struct eventpoll *ep)
+{
+	return READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
+}
+
+/* Called by ep_start_scan(): divert ep_poll_callback() to ovflist. */
+static inline void ep_enter_scan(struct eventpoll *ep)
+{
+	WRITE_ONCE(ep->ovflist, NULL);
+}
+
+/* Called by ep_done_scan(): redirect ep_poll_callback() back to rdllist. */
+static inline void ep_exit_scan(struct eventpoll *ep)
+{
+	WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
+}
+
+/* True iff @epi is currently linked on its ep's ovflist. */
+static inline bool epi_on_ovflist(const struct epitem *epi)
+{
+	return epi->ovflist_next != EP_UNACTIVE_PTR;
+}
+
+/* Mark @epi as not on any ovflist (init and post-drain). */
+static inline void epi_clear_ovflist(struct epitem *epi)
+{
+	epi->ovflist_next = EP_UNACTIVE_PTR;
+}
+
+/* True iff @ep has ready events that epoll_wait() might harvest. */
+static inline bool ep_events_available(struct eventpoll *ep)
 {
-	return !list_empty_careful(&ep->rdllist) ||
-		READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
+	return !list_empty_careful(&ep->rdllist) || ep_is_scanning(ep);
 }
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
@@ -659,10 +866,15 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq)
 
 	rcu_read_lock();
 	/*
-	 * If it is cleared by POLLFREE, it should be rcu-safe.
-	 * If we read NULL we need a barrier paired with
-	 * smp_store_release() in ep_poll_callback(), otherwise
-	 * we rely on whead->lock.
+	 * POLLFREE handshake, acquire side; see "POLLFREE handshake"
+	 * at the top of this file.
+	 *
+	 * A NULL load is paired with the smp_store_release(&whead, NULL)
+	 * in ep_poll_callback()'s POLLFREE branch: the teardown is
+	 * complete and we must not touch whead again. On a non-NULL load
+	 * rcu_read_lock() keeps the waitqueue memory alive (POLLFREE
+	 * firers RCU-defer the free) and whead->lock inside
+	 * remove_wait_queue() serializes us against the store side.
 	 */
 	whead = smp_load_acquire(&pwq->whead);
 	if (whead)
@@ -723,7 +935,7 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
  * ep->mutex needs to be held because we could be hit by
  * eventpoll_release_file() and epoll_ctl().
  */
-static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
+static void ep_start_scan(struct eventpoll *ep, struct list_head *scan_batch)
 {
 	/*
 	 * Steal the ready list, and re-init the original one to the
@@ -735,13 +947,13 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
 	 */
 	lockdep_assert_irqs_enabled();
 	spin_lock_irq(&ep->lock);
-	list_splice_init(&ep->rdllist, txlist);
-	WRITE_ONCE(ep->ovflist, NULL);
+	list_splice_init(&ep->rdllist, scan_batch);
+	ep_enter_scan(ep);
 	spin_unlock_irq(&ep->lock);
 }
 
 static void ep_done_scan(struct eventpoll *ep,
-			 struct list_head *txlist)
+			 struct list_head *scan_batch)
 {
 	struct epitem *epi, *nepi;
 
@@ -751,34 +963,29 @@ static void ep_done_scan(struct eventpoll *ep,
 	 * other events might have been queued by the poll callback.
 	 * We re-insert them inside the main ready-list here.
 	 */
-	for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
-	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
+	for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; ) {
+		nepi = epi->ovflist_next;
+		epi_clear_ovflist(epi);
 		/*
-		 * We need to check if the item is already in the list.
-		 * During the "sproc" callback execution time, items are
-		 * queued into ->ovflist but the "txlist" might already
-		 * contain them, and the list_splice() below takes care of them.
+		 * Skip items that the caller already returned via @scan_batch
+		 * -- the list_splice() below takes care of those.
 		 */
 		if (!ep_is_linked(epi)) {
 			/*
-			 * ->ovflist is LIFO, so we have to reverse it in order
-			 * to keep in FIFO.
+			 * ovflist is LIFO; list_add() head-insert here
+			 * reverses the iteration order into FIFO.
 			 */
 			list_add(&epi->rdllink, &ep->rdllist);
 			ep_pm_stay_awake(epi);
 		}
 	}
-	/*
-	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
-	 * releasing the lock, events will be queued in the normal way inside
-	 * ep->rdllist.
-	 */
-	WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
+	/* Back out of scan mode; callbacks target ep->rdllist again. */
+	ep_exit_scan(ep);
 
 	/*
-	 * Quickly re-inject items left on "txlist".
+	 * Quickly re-inject items left on "scan_batch".
 	 */
-	list_splice(txlist, &ep->rdllist);
+	list_splice(scan_batch, &ep->rdllist);
 	__pm_relax(ep->ws);
 
 	if (!list_empty(&ep->rdllist)) {
@@ -795,9 +1002,10 @@ static void ep_get(struct eventpoll *ep)
 }
 
 /*
- * Returns true if the event poll can be disposed
+ * Drop a reference to @ep; returns true iff it was the last, in which
+ * case the caller is responsible for ep_free().
  */
-static bool ep_refcount_dec_and_test(struct eventpoll *ep)
+static bool ep_put(struct eventpoll *ep)
 {
 	if (!refcount_dec_and_test(&ep->refcount))
 		return false;
@@ -817,22 +1025,23 @@ static void ep_free(struct eventpoll *ep)
 }
 
 /*
- * The ffd.file pointer may be in the process of being torn down due to
- * being closed, but we may not have finished eventpoll_release() yet.
- *
- * Normally, even with the atomic_long_inc_not_zero, the file may have
- * been free'd and then gotten re-allocated to something else (since
- * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
+ * Pin @epi->ffd.file for operations that require both safe dereference
+ * and exclusion from __fput().
  *
- * But for epoll, users hold the ep->mtx mutex, and as such any file in
- * the process of being free'd will block in eventpoll_release_file()
- * and thus the underlying file allocation will not be free'd, and the
- * file re-use cannot happen.
+ * struct file uses SLAB_TYPESAFE_BY_RCU, so a freed slot can be
+ * reassigned at any time. The bare load of epi->ffd.file is safe here
+ * because the caller holds ep->mtx and eventpoll_release_file() blocks
+ * on that mutex while tearing down the epi, so the backing file
+ * allocation cannot be freed and reused under us. An rcu_read_lock()
+ * is therefore unnecessary for the load.
  *
- * For the same reason we can avoid a rcu_read_lock() around the
- * operation - 'ffd.file' cannot go away even if the refcount has
- * reached zero (but we must still not call out to ->poll() functions
- * etc).
+ * A successful file_ref_get() additionally blocks __fput() from
+ * starting on this file: once the refcount has reached zero it cannot
+ * come back. ep_remove() relies on that to touch file->f_lock and
+ * file->f_ep without racing eventpoll_release_file() (see commit
+ * a6dc643c6931). A NULL return means __fput() is already in flight;
+ * the caller must bail without touching the file, and
+ * eventpoll_release_file() will clean the epi up from its side.
  */
 static struct file *epi_fget(const struct epitem *epi)
 {
@@ -858,7 +1067,13 @@ static void ep_remove_file(struct eventpoll *ep, struct epitem *epi,
 	spin_lock(&file->f_lock);
 	head = file->f_ep;
 	if (hlist_is_singular_node(&epi->fllink, head)) {
-		/* See eventpoll_release() for details. */
+		/*
+		 * Last watcher: publish NULL so the eventpoll_release()
+		 * fastpath in include/linux/eventpoll.h can skip the slow
+		 * path on a future __fput(). Safe because every f_ep writer
+		 * either holds a pin on @file via epi_fget() or is __fput()
+		 * itself -- see the comment in eventpoll_release().
+		 */
 		WRITE_ONCE(file->f_ep, NULL);
 		if (!is_file_epoll(file)) {
 			struct epitems_head *v;
@@ -919,47 +1134,82 @@ static void ep_remove(struct eventpoll *ep, struct epitem *epi)
 
 	ep_remove_file(ep, epi, file);
 	ep_remove_epi(ep, epi);
-	WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
+	WARN_ON_ONCE(ep_put(ep));
 }
 
-static void ep_clear_and_put(struct eventpoll *ep)
+/*
+ * Pass 1 of ep_clear_and_put(): drain every epi's pwqlist.
+ * ep_unregister_pollwait() takes each watched wait-queue head's lock,
+ * which synchronizes with any in-flight ep_poll_callback(); after
+ * this returns no callback can still be about to dereference an epi
+ * on this ep. Must strictly precede ep_drain_tree() -- fusing the
+ * two walks would let a callback queued on epi_i still fire after
+ * epi_{i+k} had already been freed.
+ */
+static void ep_drain_pollwaits(struct eventpoll *ep)
 {
-	struct rb_node *rbp, *next;
+	struct rb_node *rbp;
 	struct epitem *epi;
 
-	/* We need to release all tasks waiting for these file */
-	if (waitqueue_active(&ep->poll_wait))
-		ep_poll_safewake(ep, NULL, 0);
-
-	mutex_lock(&ep->mtx);
+	lockdep_assert_held(&ep->mtx);
 
-	/*
-	 * Walks through the whole tree by unregistering poll callbacks.
-	 */
 	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
 		epi = rb_entry(rbp, struct epitem, rbn);
 
 		ep_unregister_pollwait(ep, epi);
 		cond_resched();
 	}
+}
+
+/*
+ * Pass 2 of ep_clear_and_put(): ep_remove() every epi. The per-epi
+ * pwqlist is already empty (ep_drain_pollwaits ran), but the rest of
+ * ep_remove() still runs: epi_fget() pin, f_ep clear under f_lock,
+ * rbtree erase, rdllist unlink, kfree_rcu(epi). rb_next() is captured
+ * before each erase so the iteration is stable.
+ *
+ * A concurrent eventpoll_release_file() (removal path C) on a watched
+ * file serializes with us via ep->mtx; ep_remove() transparently
+ * hands off any epi whose file is in __fput() by bailing when
+ * epi_fget() returns NULL, and path C will clean that epi up.
+ */
+static void ep_drain_tree(struct eventpoll *ep)
+{
+	struct rb_node *rbp, *next;
+	struct epitem *epi;
+
+	lockdep_assert_held(&ep->mtx);
 
-	/*
-	 * Walks through the whole tree and try to free each "struct epitem".
-	 * Note that ep_remove() will not remove the epitem in case of a
-	 * racing eventpoll_release_file(); the latter will do the removal.
-	 * At this point we are sure no poll callbacks will be lingering around.
-	 * Since we still own a reference to the eventpoll struct, the loop can't
-	 * dispose it.
-	 */
 	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {
 		next = rb_next(rbp);
 		epi = rb_entry(rbp, struct epitem, rbn);
 		ep_remove(ep, epi);
 		cond_resched();
 	}
+}
 
+/*
+ * Removal path B (see "Removal paths" in the top-of-file banner):
+ * close of the epoll fd itself, reached via ep_eventpoll_release().
+ *
+ * Two passes under ep->mtx: first ep_drain_pollwaits() quiesces
+ * in-flight callbacks, then ep_drain_tree() frees the epis. The
+ * ep->refcount is kept > 0 across the walk by the ep file's own
+ * share, which we drop below; ep_free() runs iff we were the last
+ * holder after the tree drained.
+ */
+static void ep_clear_and_put(struct eventpoll *ep)
+{
+	/* Release any threads blocked in poll-on-ep. */
+	if (waitqueue_active(&ep->poll_wait))
+		ep_poll_safewake(ep, NULL, 0);
+
+	mutex_lock(&ep->mtx);
+	ep_drain_pollwaits(ep);
+	ep_drain_tree(ep);
 	mutex_unlock(&ep->mtx);
-	if (ep_refcount_dec_and_test(ep))
+
+	if (ep_put(ep))
 		ep_free(ep);
 }
 
@@ -999,7 +1249,7 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth
 static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
 {
 	struct eventpoll *ep = file->private_data;
-	LIST_HEAD(txlist);
+	LIST_HEAD(scan_batch);
 	struct epitem *epi, *tmp;
 	poll_table pt;
 	__poll_t res = 0;
@@ -1014,8 +1264,8 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep
 	 * the ready list.
 	 */
 	mutex_lock_nested(&ep->mtx, depth);
-	ep_start_scan(ep, &txlist);
-	list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
+	ep_start_scan(ep, &scan_batch);
+	list_for_each_entry_safe(epi, tmp, &scan_batch, rdllink) {
 		if (ep_item_poll(epi, &pt, depth + 1)) {
 			res = EPOLLIN | EPOLLRDNORM;
 			break;
@@ -1029,7 +1279,7 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep
 			list_del_init(&epi->rdllink);
 		}
 	}
-	ep_done_scan(ep, &txlist);
+	ep_done_scan(ep, &scan_batch);
 	mutex_unlock(&ep->mtx);
 	return res;
 }
@@ -1138,7 +1388,7 @@ again:
 
 		mutex_unlock(&ep->mtx);
 
-		if (ep_refcount_dec_and_test(ep))
+		if (ep_put(ep))
 			ep_free(ep);
 		goto again;
 	}
@@ -1159,7 +1409,7 @@ static int ep_alloc(struct eventpoll **pep)
 	init_waitqueue_head(&ep->poll_wait);
 	INIT_LIST_HEAD(&ep->rdllist);
 	ep->rbr = RB_ROOT_CACHED;
-	ep->ovflist = EP_UNACTIVE_PTR;
+	ep->ovflist = EP_UNACTIVE_PTR;	/* not scanning */
 	ep->user = get_current_user();
 	refcount_set(&ep->refcount, 1);
 
@@ -1173,17 +1423,15 @@ static int ep_alloc(struct eventpoll **pep)
  * are protected by the "mtx" mutex, and ep_find() must be called with
  * "mtx" held.
  */
-static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
+static struct epitem *ep_find(struct eventpoll *ep, struct epoll_key *tf)
 {
 	int kcmp;
 	struct rb_node *rbp;
 	struct epitem *epi, *epir = NULL;
-	struct epoll_filefd ffd;
 
-	ep_set_ffd(&ffd, file, fd);
 	for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
 		epi = rb_entry(rbp, struct epitem, rbn);
-		kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
+		kcmp = ep_cmp_ffd(tf, &epi->ffd);
 		if (kcmp > 0)
 			rbp = rbp->rb_right;
 		else if (kcmp < 0)
@@ -1197,50 +1445,6 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 	return epir;
 }
 
-#ifdef CONFIG_KCMP
-static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
-{
-	struct rb_node *rbp;
-	struct epitem *epi;
-
-	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
-		epi = rb_entry(rbp, struct epitem, rbn);
-		if (epi->ffd.fd == tfd) {
-			if (toff == 0)
-				return epi;
-			else
-				toff--;
-		}
-		cond_resched();
-	}
-
-	return NULL;
-}
-
-struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
-				     unsigned long toff)
-{
-	struct file *file_raw;
-	struct eventpoll *ep;
-	struct epitem *epi;
-
-	if (!is_file_epoll(file))
-		return ERR_PTR(-EINVAL);
-
-	ep = file->private_data;
-
-	mutex_lock(&ep->mtx);
-	epi = ep_find_tfd(ep, tfd, toff);
-	if (epi)
-		file_raw = epi->ffd.file;
-	else
-		file_raw = ERR_PTR(-ENOENT);
-	mutex_unlock(&ep->mtx);
-
-	return file_raw;
-}
-#endif /* CONFIG_KCMP */
-
 /*
  * This is the callback that is passed to the wait queue wakeup
  * mechanism. It is called by the stored file descriptors when they
@@ -1283,9 +1487,9 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	 * semantics). All the events that happen during that period of time are
 	 * chained in ep->ovflist and requeued later on.
 	 */
-	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
-		if (epi->next == EP_UNACTIVE_PTR) {
-			epi->next = READ_ONCE(ep->ovflist);
+	if (ep_is_scanning(ep)) {
+		if (!epi_on_ovflist(epi)) {
+			epi->ovflist_next = READ_ONCE(ep->ovflist);
 			WRITE_ONCE(ep->ovflist, epi);
 			ep_pm_stay_awake_rcu(epi);
 		}
@@ -1336,17 +1540,24 @@ out_unlock:
 
 	if (pollflags & POLLFREE) {
 		/*
-		 * If we race with ep_remove_wait_queue() it can miss
-		 * ->whead = NULL and do another remove_wait_queue() after
-		 * us, so we can't use __remove_wait_queue().
+		 * POLLFREE handshake, release side; see "POLLFREE handshake"
+		 * at the top of this file.
+		 *
+		 * Unlink our wait entry with list_del_init rather than
+		 * __remove_wait_queue: a concurrent ep_remove_wait_queue()
+		 * that already loaded a non-NULL whead may still call
+		 * remove_wait_queue() after us, and list_del_init() tolerates
+		 * the second delete.
+		 *
+		 * smp_store_release(&whead, NULL) publishes the teardown to
+		 * ep_remove_wait_queue()'s smp_load_acquire(). Before this
+		 * store, a racing ep_clear_and_put() / ep_remove() reaches
+		 * ep_remove_wait_queue() which sees whead != NULL and takes
+		 * whead->lock -- the same lock held by our caller, so it
+		 * serializes behind us. Once whead is zeroed, nothing else
+		 * protects ep / epi / wait.
 		 */
 		list_del_init(&wait->entry);
-		/*
-		 * ->whead != NULL protects us from the race with
-		 * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()
-		 * takes whead->lock held by the caller. Once we nullify it,
-		 * nothing protects ep/epi or even wait.
-		 */
 		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
 	}
 
@@ -1407,41 +1618,40 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
 
 
 
-#define PATH_ARR_SIZE 5
 /*
- * These are the number paths of length 1 to 5, that we are allowing to emanate
- * from a single file of interest. For example, we allow 1000 paths of length
- * 1, to emanate from each file of interest. This essentially represents the
- * potential wakeup paths, which need to be limited in order to avoid massive
- * uncontrolled wakeup storms. The common use case should be a single ep which
- * is connected to n file sources. In this case each file source has 1 path
- * of length 1. Thus, the numbers below should be more than sufficient. These
- * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
- * and delete can't add additional paths. Protected by the epnested_mutex.
+ * Upper bound on wakeup paths emanating from any one watched file,
+ * indexed by nesting level (path length - 1): e.g. at most 500 paths
+ * of length 2. Level 0 is exempt in path_count_inc(). These caps limit
+ * the wakeup amplification that can be built from epoll-watches-
+ * epoll topologies without rejecting reasonable usage.
+ *
+ * Enforced at EPOLL_CTL_ADD; CTL_MOD and CTL_DEL cannot add paths.
+ * The running tallies live in ctx->path_count[] and are protected by
+ * epnested_mutex.
  */
 static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
-static int path_count[PATH_ARR_SIZE];
 
-static int path_count_inc(int nests)
+static int path_count_inc(struct ep_ctl_ctx *ctx, int nests)
 {
 	/* Allow an arbitrary number of depth 1 paths */
 	if (nests == 0)
 		return 0;
 
-	if (++path_count[nests] > path_limits[nests])
+	if (++ctx->path_count[nests] > path_limits[nests])
 		return -1;
 	return 0;
 }
 
-static void path_count_init(void)
+static void path_count_init(struct ep_ctl_ctx *ctx)
 {
 	int i;
 
 	for (i = 0; i < PATH_ARR_SIZE; i++)
-		path_count[i] = 0;
+		ctx->path_count[i] = 0;
 }
 
-static int reverse_path_check_proc(struct hlist_head *refs, int depth)
+static int reverse_path_check_proc(struct ep_ctl_ctx *ctx,
+				   struct hlist_head *refs, int depth)
 {
 	int error = 0;
 	struct epitem *epi;
@@ -1453,9 +1663,9 @@ static int reverse_path_check_proc(struct hlist_head *refs, int depth)
 	hlist_for_each_entry_rcu(epi, refs, fllink) {
 		struct hlist_head *refs = &epi->ep->refs;
 		if (hlist_empty(refs))
-			error = path_count_inc(depth);
+			error = path_count_inc(ctx, depth);
 		else
-			error = reverse_path_check_proc(refs, depth + 1);
+			error = reverse_path_check_proc(ctx, refs, depth + 1);
 		if (error != 0)
 			break;
 	}
@@ -1463,24 +1673,23 @@ static int reverse_path_check_proc(struct hlist_head *refs, int depth)
 }
 
 /**
- * reverse_path_check - The tfile_check_list is list of epitem_head, which have
- *                      links that are proposed to be newly added. We need to
- *                      make sure that those added links don't add too many
- *                      paths such that we will spend all our time waking up
- *                      eventpoll objects.
+ * reverse_path_check - check newly proposed links against path_limits[]
+ * @ctx: Per-CTL_ADD scratch context; ctx->tfile_check_list is a list of
+ *       epitems_head anchoring files with newly proposed links, none of
+ *       which may push a path-length bucket over its path_limits[] cap.
  *
  * Return: %zero if the proposed links don't create too many paths,
  *	    %-1 otherwise.
  */
-static int reverse_path_check(void)
+static int reverse_path_check(struct ep_ctl_ctx *ctx)
 {
 	struct epitems_head *p;
 
-	for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
+	for (p = ctx->tfile_check_list; p; p = p->next) {
 		int error;
-		path_count_init();
+		path_count_init(ctx);
 		rcu_read_lock();
-		error = reverse_path_check_proc(&p->epitems, 0);
+		error = reverse_path_check_proc(ctx, &p->epitems, 0);
 		rcu_read_unlock();
 		if (error)
 			return error;
@@ -1526,7 +1735,7 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
 	wakeup_source_unregister(ws);
 }
 
-static int attach_epitem(struct file *file, struct epitem *epi)
+static int ep_attach_file(struct file *file, struct epitem *epi)
 {
 	struct epitems_head *to_free = NULL;
 	struct hlist_head *head = NULL;
@@ -1561,69 +1770,115 @@ allocate:
 }
 
 /*
- * Must be called with "mtx" held.
+ * Charge the user's epoll_watches quota, allocate a fresh epitem for
+ * @tf, and initialize its fields. The returned item is not yet linked
+ * into any data structure; the caller hands it to ep_register_epitem()
+ * (which frees and uncharges it itself on failure), or else must
+ * kmem_cache_free() it and decrement epoll_watches on its own.
+ *
+ * Returns ERR_PTR(-ENOSPC) if the quota is exceeded, ERR_PTR(-ENOMEM)
+ * if the slab allocation fails.
  */
-static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
-		     struct file *tfile, int fd, int full_check)
+static struct epitem *ep_alloc_epitem(struct eventpoll *ep,
+				      const struct epoll_event *event,
+				      struct epoll_key *tf)
 {
-	int error, pwake = 0;
-	__poll_t revents;
 	struct epitem *epi;
-	struct ep_pqueue epq;
-	struct eventpoll *tep = NULL;
-
-	if (is_file_epoll(tfile))
-		tep = tfile->private_data;
-
-	lockdep_assert_irqs_enabled();
 
 	if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
 					    max_user_watches) >= 0))
-		return -ENOSPC;
+		return ERR_PTR(-ENOSPC);
 	percpu_counter_inc(&ep->user->epoll_watches);
 
-	if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
+	epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL);
+	if (unlikely(!epi)) {
 		percpu_counter_dec(&ep->user->epoll_watches);
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
 
-	/* Item initialization follow here ... */
 	INIT_LIST_HEAD(&epi->rdllink);
 	epi->ep = ep;
-	ep_set_ffd(&epi->ffd, tfile, fd);
+	epi->ffd = *tf;
 	epi->event = *event;
-	epi->next = EP_UNACTIVE_PTR;
+	epi_clear_ovflist(epi);
+
+	return epi;
+}
+
+/*
+ * Install @epi into its target file's f_ep hlist and into @ep's rbtree,
+ * taking one additional reference on @ep for the lifetime of the item.
+ *
+ * If @tep is non-NULL, the target file is itself an eventpoll; we hold
+ * tep->mtx at subclass 1 across the attach + rbtree insert to serialize
+ * with the target side. RB tree ops are protected by @ep->mtx, which
+ * the caller already holds.
+ *
+ * On failure the epi is freed and the epoll_watches counter decremented,
+ * matching ep_alloc_epitem()'s allocation. After this returns
+ * successfully, ep_insert()'s later error paths use ep_remove() for
+ * unwind; that cannot drop @ep's refcount to zero because the ep file
+ * itself still holds the original reference.
+ */
+static int ep_register_epitem(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
+			      struct epitem *epi, struct eventpoll *tep,
+			      int full_check)
+{
+	struct file *tfile = epi->ffd.file;
+	int error;
 
 	if (tep)
 		mutex_lock_nested(&tep->mtx, 1);
-	/* Add the current item to the list of active epoll hook for this file */
-	if (unlikely(attach_epitem(tfile, epi) < 0)) {
+
+	error = ep_attach_file(tfile, epi);
+	if (unlikely(error)) {
 		if (tep)
 			mutex_unlock(&tep->mtx);
 		kmem_cache_free(epi_cache, epi);
 		percpu_counter_dec(&ep->user->epoll_watches);
-		return -ENOMEM;
+		return error;
 	}
 
 	if (full_check && !tep)
-		list_file(tfile);
+		list_file(tfile, ctx);
 
-	/*
-	 * Add the current item to the RB tree. All RB tree operations are
-	 * protected by "mtx", and ep_insert() is called with "mtx" held.
-	 */
 	ep_rbtree_insert(ep, epi);
+
 	if (tep)
 		mutex_unlock(&tep->mtx);
 
-	/*
-	 * ep_remove() calls in the later error paths can't lead to
-	 * ep_free() as the ep file itself still holds an ep reference.
-	 */
 	ep_get(ep);
+	return 0;
+}
+
+/*
+ * Must be called with "mtx" held.
+ */
+static int ep_insert(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
+		     const struct epoll_event *event, struct epoll_key *tf,
+		     int full_check)
+{
+	int error, pwake = 0;
+	__poll_t revents;
+	struct epitem *epi;
+	struct ep_pqueue epq;
+	struct eventpoll *tep = NULL;
 
-	/* now check if we've created too many backpaths */
-	if (unlikely(full_check && reverse_path_check())) {
+	if (is_file_epoll(tf->file))
+		tep = tf->file->private_data;
+
+	lockdep_assert_irqs_enabled();
+
+	epi = ep_alloc_epitem(ep, event, tf);
+	if (IS_ERR(epi))
+		return PTR_ERR(epi);
+
+	error = ep_register_epitem(ctx, ep, epi, tep, full_check);
+	if (error)
+		return error;
+
+	/* Reject the insert if the new link would create too many back-paths. */
+	if (unlikely(full_check && reverse_path_check(ctx))) {
 		ep_remove(ep, epi);
 		return -EINVAL;
 	}
@@ -1649,28 +1904,21 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 	 */
 	revents = ep_item_poll(epi, &epq.pt, 1);
 
-	/*
-	 * We have to check if something went wrong during the poll wait queue
-	 * install process. Namely an allocation for a wait queue failed due
-	 * high memory pressure.
-	 */
+	/* ep_ptable_queue_proc() signals allocation failure by clearing epq.epi. */
 	if (unlikely(!epq.epi)) {
 		ep_remove(ep, epi);
 		return -ENOMEM;
 	}
 
-	/* We have to drop the new item inside our item list to keep track of it */
+	/* Drop the new item onto the ready list if it is already ready. */
 	spin_lock_irq(&ep->lock);
 
-	/* record NAPI ID of new item if present */
 	ep_set_busy_poll_napi_id(epi);
 
-	/* If the file is already "ready" we drop it inside the ready list */
 	if (revents && !ep_is_linked(epi)) {
 		list_add_tail(&epi->rdllink, &ep->rdllist);
 		ep_pm_stay_awake(epi);
 
-		/* Notify waiting tasks that events are available */
 		if (waitqueue_active(&ep->wq))
 			wake_up(&ep->wq);
 		if (waitqueue_active(&ep->poll_wait))
@@ -1762,11 +2010,87 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 	return 0;
 }
 
+/*
+ * Attempt to deliver one event for @epi into @*uevents.
+ *
+ * Returns 1 if an event was delivered (with *uevents advanced to the
+ * next slot), 0 if the re-poll reported no caller-requested events
+ * (@epi drops out of the ready list; a future callback will re-add
+ * it), or -EFAULT if copy_to_user() faulted (in which case @epi is
+ * re-inserted at the head of @scan_batch so ep_done_scan() merges it
+ * back to rdllist for the next attempt).
+ *
+ * PM bookkeeping and level-triggered re-queue are handled here.
+ * Caller holds ep->mtx and the scan is active.
+ */
+static int ep_deliver_event(struct eventpoll *ep, struct epitem *epi,
+			    poll_table *pt,
+			    struct epoll_event __user **uevents,
+			    struct list_head *scan_batch)
+{
+	struct epoll_event __user *next;
+	struct wakeup_source *ws;
+	__poll_t revents;
+
+	/*
+	 * Activate ep->ws before deactivating epi->ws to prevent
+	 * triggering auto-suspend here (in case we reactivate epi->ws
+	 * below).  Rearranging to delay the deactivation would let
+	 * epi->ws drift out of sync with ep_is_linked().
+	 */
+	ws = ep_wakeup_source(epi);
+	if (ws) {
+		if (ws->active)
+			__pm_stay_awake(ep->ws);
+		__pm_relax(ws);
+	}
+
+	list_del_init(&epi->rdllink);
+
+	/*
+	 * Re-poll under ep->mtx so userspace cannot change the item
+	 * out from under us. If no caller-requested events remain,
+	 * @epi stays off the ready list; the poll callback will
+	 * re-queue it when events next appear.
+	 */
+	revents = ep_item_poll(epi, pt, 1);
+	if (!revents)
+		return 0;
+
+	next = epoll_put_uevent(revents, epi->event.data, *uevents);
+	if (!next) {
+		/*
+		 * copy_to_user() faulted: put the item back so
+		 * ep_done_scan() splices it onto rdllist for the next
+		 * attempt.
+		 */
+		list_add(&epi->rdllink, scan_batch);
+		ep_pm_stay_awake(epi);
+		return -EFAULT;
+	}
+	*uevents = next;
+
+	if (epi->event.events & EPOLLONESHOT) {
+		epi->event.events &= EP_PRIVATE_BITS;
+	} else if (!(epi->event.events & EPOLLET)) {
+		/*
+		 * Level-triggered: re-queue so the next epoll_wait()
+		 * rechecks availability. We are the sole writer to
+		 * rdllist here -- epoll_ctl() callers are locked out
+		 * by ep->mtx, and the poll callback queues to ovflist
+		 * during scans.
+		 */
+		list_add_tail(&epi->rdllink, &ep->rdllist);
+		ep_pm_stay_awake(epi);
+	}
+	return 1;
+}
+
 static int ep_send_events(struct eventpoll *ep,
 			  struct epoll_event __user *events, int maxevents)
 {
 	struct epitem *epi, *tmp;
-	LIST_HEAD(txlist);
+	LIST_HEAD(scan_batch);
 	poll_table pt;
 	int res = 0;
 
@@ -1781,74 +2105,28 @@ static int ep_send_events(struct eventpoll *ep,
 	init_poll_funcptr(&pt, NULL);
 
 	mutex_lock(&ep->mtx);
-	ep_start_scan(ep, &txlist);
+	ep_start_scan(ep, &scan_batch);
 
 	/*
-	 * We can loop without lock because we are passed a task private list.
-	 * Items cannot vanish during the loop we are holding ep->mtx.
+	 * We can loop without lock because we are passed a task-private
+	 * scan_batch; items cannot vanish while we hold ep->mtx.
 	 */
-	list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
-		struct wakeup_source *ws;
-		__poll_t revents;
+	list_for_each_entry_safe(epi, tmp, &scan_batch, rdllink) {
+		int delivered;
 
 		if (res >= maxevents)
 			break;
 
-		/*
-		 * Activate ep->ws before deactivating epi->ws to prevent
-		 * triggering auto-suspend here (in case we reactive epi->ws
-		 * below).
-		 *
-		 * This could be rearranged to delay the deactivation of epi->ws
-		 * instead, but then epi->ws would temporarily be out of sync
-		 * with ep_is_linked().
-		 */
-		ws = ep_wakeup_source(epi);
-		if (ws) {
-			if (ws->active)
-				__pm_stay_awake(ep->ws);
-			__pm_relax(ws);
-		}
-
-		list_del_init(&epi->rdllink);
-
-		/*
-		 * If the event mask intersect the caller-requested one,
-		 * deliver the event to userspace. Again, we are holding ep->mtx,
-		 * so no operations coming from userspace can change the item.
-		 */
-		revents = ep_item_poll(epi, &pt, 1);
-		if (!revents)
-			continue;
-
-		events = epoll_put_uevent(revents, epi->event.data, events);
-		if (!events) {
-			list_add(&epi->rdllink, &txlist);
-			ep_pm_stay_awake(epi);
+		delivered = ep_deliver_event(ep, epi, &pt, &events, &scan_batch);
+		if (delivered < 0) {
 			if (!res)
-				res = -EFAULT;
+				res = delivered;
 			break;
 		}
-		res++;
-		if (epi->event.events & EPOLLONESHOT)
-			epi->event.events &= EP_PRIVATE_BITS;
-		else if (!(epi->event.events & EPOLLET)) {
-			/*
-			 * If this file has been added with Level
-			 * Trigger mode, we need to insert back inside
-			 * the ready list, so that the next call to
-			 * epoll_wait() will check again the events
-			 * availability. At this point, no one can insert
-			 * into ep->rdllist besides us. The epoll_ctl()
-			 * callers are locked out by
-			 * ep_send_events() holding "mtx" and the
-			 * poll callback will queue them in ep->ovflist.
-			 */
-			list_add_tail(&epi->rdllink, &ep->rdllist);
-			ep_pm_stay_awake(epi);
-		}
+		res += delivered;
 	}
-	ep_done_scan(ep, &txlist);
+
+	ep_done_scan(ep, &scan_batch);
 	mutex_unlock(&ep->mtx);
 
 	return res;
@@ -1938,7 +2216,8 @@ static int ep_schedule_timeout(ktime_t *to)
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, struct timespec64 *timeout)
 {
-	int res, eavail, timed_out = 0;
+	int res, timed_out = 0;
+	bool eavail;
 	u64 slack = 0;
 	wait_queue_entry_t wait;
 	ktime_t expires, *to = NULL;
@@ -2036,7 +2315,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		 * If timed out and still on the wait queue, recheck eavail
 		 * carefully under lock, below.
 		 */
-		eavail = 1;
+		eavail = true;
 
 		if (!list_empty_careful(&wait.entry)) {
 			spin_lock_irq(&ep->lock);
@@ -2066,7 +2345,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
  * Return: depth of the subtree, or a value bigger than EP_MAX_NESTS if we found
  * a loop or went too deep.
  */
-static int ep_loop_check_proc(struct eventpoll *ep, int depth)
+static int ep_loop_check_proc(struct ep_ctl_ctx *ctx,
+			      struct eventpoll *ep, int depth)
 {
 	int result = 0;
 	struct rb_node *rbp;
@@ -2082,22 +2362,23 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
 		if (unlikely(is_file_epoll(epi->ffd.file))) {
 			struct eventpoll *ep_tovisit;
 			ep_tovisit = epi->ffd.file->private_data;
-			if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
+			if (ep_tovisit == ctx->inserting_into ||
+			    depth > EP_MAX_NESTS)
 				result = EP_MAX_NESTS+1;
 			else
-				result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
+				result = max(result,
+					     ep_loop_check_proc(ctx, ep_tovisit,
+								depth + 1) + 1);
 			if (result > EP_MAX_NESTS)
 				break;
 		} else {
 			/*
-			 * If we've reached a file that is not associated with
-			 * an ep, then we need to check if the newly added
-			 * links are going to add too many wakeup paths. We do
-			 * this by adding it to the tfile_check_list, if it's
-			 * not already there, and calling reverse_path_check()
-			 * during ep_insert().
+			 * A non-epoll leaf. Queue it for the companion
+			 * reverse_path_check() that runs after this walk so
+			 * any new links we propose don't add too many wakeup
+			 * paths.
 			 */
-			list_file(epi->ffd.file);
+			list_file(epi->ffd.file, ctx);
 		}
 	}
 	ep->loop_check_depth = result;
@@ -2126,22 +2407,24 @@ static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth)
  *                 into another epoll file (represented by @ep) does not create
  *                 closed loops or too deep chains.
  *
- * @ep: Pointer to the epoll we are inserting into.
- * @to: Pointer to the epoll to be inserted.
+ * @ctx: Per-CTL_ADD scratch context.
+ * @ep:  Pointer to the epoll we are inserting into.
+ * @to:  Pointer to the epoll to be inserted.
  *
  * Return: %zero if adding the epoll @to inside the epoll @from
  * does not violate the constraints, or %-1 otherwise.
  */
-static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
+static int ep_loop_check(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
+			 struct eventpoll *to)
 {
 	int depth, upwards_depth;
 
-	inserting_into = ep;
+	ctx->inserting_into = ep;
 	/*
 	 * Check how deep down we can get from @to, and whether it is possible
 	 * to loop up to @ep.
 	 */
-	depth = ep_loop_check_proc(to, 0);
+	depth = ep_loop_check_proc(ctx, to, 0);
 	if (depth > EP_MAX_NESTS)
 		return -1;
 	/* Check how far up we can go from @ep. */
@@ -2152,12 +2435,12 @@ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
 	return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0;
 }
 
-static void clear_tfile_check_list(void)
+static void clear_tfile_check_list(struct ep_ctl_ctx *ctx)
 {
 	rcu_read_lock();
-	while (tfile_check_list != EP_UNACTIVE_PTR) {
-		struct epitems_head *head = tfile_check_list;
-		tfile_check_list = head->next;
+	while (ctx->tfile_check_list) {
+		struct epitems_head *head = ctx->tfile_check_list;
+		ctx->tfile_check_list = head->next;
 		unlist_file(head);
 	}
 	rcu_read_unlock();
@@ -2223,38 +2506,105 @@ static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
 }
 #endif
 
-static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
-				   bool nonblock)
+static inline int epoll_mutex_lock(struct mutex *mutex, bool nonblock)
 {
 	if (!nonblock) {
-		mutex_lock_nested(mutex, depth);
+		mutex_lock(mutex);
 		return 0;
 	}
-	if (mutex_trylock(mutex))
+	return mutex_trylock(mutex) ? 0 : -EAGAIN;
+}
+
+/*
+ * Acquire the locks required for do_epoll_ctl() on @ep for @op.
+ *
+ * Always takes ep->mtx. For EPOLL_CTL_ADD, additionally runs the
+ * loop / path check under epnested_mutex when the topology can
+ * change: @ep is already watched (epfile->f_ep non-NULL), @ep was
+ * recently loop-checked (ep->gen == loop_check_gen), or @tfile is
+ * itself an eventpoll.
+ *
+ * Return value encodes both outcome and lock state:
+ *
+ *   0        success; ep->mtx held.
+ *   1        success; ep->mtx held AND the full check ran under
+ *            epnested_mutex (which is also still held). The value
+ *            doubles as the @full_check argument to ep_insert().
+ *   -errno   failure; no locks held.
+ *
+ * The caller releases what was taken with ep_ctl_unlock(ctx, ep, ret).
+ *
+ * Holding epnested_mutex on add is what prevents two racing
+ * EPOLL_CTL_ADDs on different eps from building a cycle without
+ * either walker observing it.
+ */
+static int ep_ctl_lock(struct ep_ctl_ctx *ctx, struct eventpoll *ep, int op,
+		       struct file *epfile, struct file *tfile, bool nonblock)
+{
+	struct eventpoll *tep;
+	int error;
+
+	error = epoll_mutex_lock(&ep->mtx, nonblock);
+	if (error)
+		return error;
+
+	if (op != EPOLL_CTL_ADD)
+		return 0;
+	if (!READ_ONCE(epfile->f_ep) && ep->gen != loop_check_gen &&
+	    !is_file_epoll(tfile))
 		return 0;
-	return -EAGAIN;
+
+	/* Full check needed: drop ep->mtx so we can take epnested_mutex. */
+	mutex_unlock(&ep->mtx);
+	error = epoll_mutex_lock(&epnested_mutex, nonblock);
+	if (error)
+		return error;
+
+	loop_check_gen++;
+
+	if (is_file_epoll(tfile)) {
+		tep = tfile->private_data;
+		if (ep_loop_check(ctx, ep, tep) != 0) {
+			error = -ELOOP;
+			goto err_unlock_nested;
+		}
+	}
+
+	error = epoll_mutex_lock(&ep->mtx, nonblock);
+	if (error)
+		goto err_unlock_nested;
+
+	return 1;
+
+err_unlock_nested:
+	clear_tfile_check_list(ctx);
+	loop_check_gen++;
+	mutex_unlock(&epnested_mutex);
+	return error;
 }
 
-int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
-		 bool nonblock)
+/* Undo ep_ctl_lock(): always drops ep->mtx; @full_check also ends the nested check. */
+static void ep_ctl_unlock(struct ep_ctl_ctx *ctx, struct eventpoll *ep, int full_check)
+{
+	mutex_unlock(&ep->mtx);
+	if (!full_check)
+		return;
+	clear_tfile_check_list(ctx);
+	loop_check_gen++;
+	mutex_unlock(&epnested_mutex);
+}
+
+int do_epoll_ctl_file(struct file *f, int op, struct epoll_key *tf,
+		      struct epoll_event *epds, bool nonblock)
 {
 	int error;
-	int full_check = 0;
+	int full_check;
 	struct eventpoll *ep;
 	struct epitem *epi;
-	struct eventpoll *tep = NULL;
-
-	CLASS(fd, f)(epfd);
-	if (fd_empty(f))
-		return -EBADF;
-
-	/* Get the "struct file *" for the target file */
-	CLASS(fd, tf)(fd);
-	if (fd_empty(tf))
-		return -EBADF;
+	struct ep_ctl_ctx ctx = { };
 
 	/* The target file descriptor must support poll */
-	if (!file_can_poll(fd_file(tf)))
+	if (!file_can_poll(tf->file))
 		return -EPERM;
 
 	/* Check if EPOLLWAKEUP is allowed */
@@ -2262,85 +2612,43 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 		ep_take_care_of_epollwakeup(epds);
 
 	/*
-	 * We have to check that the file structure underneath the file descriptor
-	 * the user passed to us _is_ an eventpoll file. And also we do not permit
+	 * The @f file must itself be an eventpoll, and we do not permit
 	 * adding an epoll file descriptor inside itself.
 	 */
-	error = -EINVAL;
-	if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
-		goto error_tgt_fput;
+	if (f == tf->file || !is_file_epoll(f))
+		return -EINVAL;
 
 	/*
 	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
 	 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
-	 * Also, we do not currently supported nested exclusive wakeups.
+	 * Also, nested exclusive wakeups are not supported.
 	 */
 	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
 		if (op == EPOLL_CTL_MOD)
-			goto error_tgt_fput;
-		if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
+			return -EINVAL;
+		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf->file) ||
 				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
-			goto error_tgt_fput;
+			return -EINVAL;
 	}
 
-	/*
-	 * At this point it is safe to assume that the "private_data" contains
-	 * our own data structure.
-	 */
-	ep = fd_file(f)->private_data;
+	ep = f->private_data;
 
-	/*
-	 * When we insert an epoll file descriptor inside another epoll file
-	 * descriptor, there is the chance of creating closed loops, which are
-	 * better be handled here, than in more critical paths. While we are
-	 * checking for loops we also determine the list of files reachable
-	 * and hang them on the tfile_check_list, so we can check that we
-	 * haven't created too many possible wakeup paths.
-	 *
-	 * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
-	 * the epoll file descriptor is attaching directly to a wakeup source,
-	 * unless the epoll file descriptor is nested. The purpose of taking the
-	 * 'epnested_mutex' on add is to prevent complex toplogies such as loops and
-	 * deep wakeup paths from forming in parallel through multiple
-	 * EPOLL_CTL_ADD operations.
-	 */
-	error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
-	if (error)
-		goto error_tgt_fput;
-	if (op == EPOLL_CTL_ADD) {
-		if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
-		    is_file_epoll(fd_file(tf))) {
-			mutex_unlock(&ep->mtx);
-			error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
-			if (error)
-				goto error_tgt_fput;
-			loop_check_gen++;
-			full_check = 1;
-			if (is_file_epoll(fd_file(tf))) {
-				tep = fd_file(tf)->private_data;
-				error = -ELOOP;
-				if (ep_loop_check(ep, tep) != 0)
-					goto error_tgt_fput;
-			}
-			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
-			if (error)
-				goto error_tgt_fput;
-		}
-	}
+	full_check = ep_ctl_lock(&ctx, ep, op, f, tf->file, nonblock);
+	if (full_check < 0)
+		return full_check;
 
 	/*
-	 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
-	 * above, we can be sure to be able to use the item looked up by
-	 * ep_find() till we release the mutex.
+	 * Look the target up in ep's RB tree. We hold ep->mtx, so the
+	 * item stays valid until we release.
 	 */
-	epi = ep_find(ep, fd_file(tf), fd);
+	epi = ep_find(ep, tf);
 
 	error = -EINVAL;
 	switch (op) {
 	case EPOLL_CTL_ADD:
 		if (!epi) {
 			epds->events |= EPOLLERR | EPOLLHUP;
-			error = ep_insert(ep, epds, fd_file(tf), fd, full_check);
+			error = ep_insert(&ctx, ep, epds, tf, full_check);
 		} else
 			error = -EEXIST;
 		break;
@@ -2366,17 +2674,30 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 			error = -ENOENT;
 		break;
 	}
-	mutex_unlock(&ep->mtx);
 
-error_tgt_fput:
-	if (full_check) {
-		clear_tfile_check_list();
-		loop_check_gen++;
-		mutex_unlock(&epnested_mutex);
-	}
+	ep_ctl_unlock(&ctx, ep, full_check);
 	return error;
 }
 
+/* fd-based front end: resolve @epfd and @fd, then hand off to do_epoll_ctl_file(). */
+int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
+		 bool nonblock)
+{
+	CLASS(fd, f)(epfd);
+	if (fd_empty(f))
+		return -EBADF;
+
+	/* The target descriptor must resolve to an open file as well. */
+	CLASS(fd, tf)(fd);
+	if (fd_empty(tf))
+		return -EBADF;
+
+	/* The key carries both the target file and the fd number it came from. */
+	struct epoll_key key = { .file = fd_file(tf), .fd = fd };
+
+	return do_epoll_ctl_file(fd_file(f), op, &key, epds, nonblock);
+}
+
 /*
  * The following function implements the controller interface for
  * the eventpoll file that enables the insertion/removal/change of
@@ -2527,6 +2848,50 @@ SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
 			      sigmask, sigsetsize);
 }
 
+#ifdef CONFIG_KCMP
+/*
+ * Return the @toff-th (0-based) item of @ep whose registered fd is @tfd, or NULL.
+ */
+static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
+{
+	struct rb_node *node = rb_first_cached(&ep->rbr);
+
+	while (node) {
+		struct epitem *item = rb_entry(node, struct epitem, rbn);
+
+		if (item->ffd.fd == tfd && toff-- == 0)
+			return item;
+		cond_resched();
+		node = rb_next(node);
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up the @toff-th entry registered under fd @tfd in the epoll @file.
+ * The pointer is returned as-is (no file reference is taken here); ERR_PTR() on failure.
+ */
+struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
+				     unsigned long toff)
+{
+	struct file *target = ERR_PTR(-ENOENT);
+	struct eventpoll *ep;
+	struct epitem *item;
+
+	if (!is_file_epoll(file))
+		return ERR_PTR(-EINVAL);
+
+	ep = file->private_data;
+	mutex_lock(&ep->mtx);
+	item = ep_find_tfd(ep, tfd, toff);
+	if (item)
+		target = item->ffd.file;
+	mutex_unlock(&ep->mtx);
+	return target;
+}
+#endif /* CONFIG_KCMP */
+
 #ifdef CONFIG_COMPAT
 static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
 				 int maxevents, struct timespec64 *timeout,
diff --git a/fs/exec.c b/fs/exec.c
index ba12b4c466f6d..894added369dd 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
+#include <linux/sched/exec_state.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/sched/task.h>
@@ -263,6 +264,9 @@ static int bprm_mm_init(struct linux_binprm *bprm)
 	if (!mm)
 		goto err;
 
+	/* Staged for would_dump() narrowing; consumed by begin_new_exec(). */
+	bprm->user_ns = get_user_ns(current_user_ns());
+
 	/* Save current stack limit for all calculations made during exec. */
 	task_lock(current->group_leader);
 	bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
@@ -834,12 +838,17 @@ EXPORT_SYMBOL(read_code);
  * On success, this function returns with exec_update_lock
  * held for writing.
  */
-static int exec_mmap(struct mm_struct *mm)
+static int exec_mmap(struct mm_struct *mm, struct user_namespace *user_ns)
 {
+	struct task_exec_state *exec_state __free(put_task_exec_state) = NULL;
 	struct task_struct *tsk;
 	struct mm_struct *old_mm, *active_mm;
 	int ret;
 
+	exec_state = alloc_task_exec_state(user_ns);
+	if (!exec_state)
+		return -ENOMEM;
+
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
 	old_mm = current->mm;
@@ -870,6 +879,7 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->active_mm = mm;
 	tsk->mm = mm;
 	mm_init_cid(mm, tsk);
+	exec_state = task_exec_state_replace(tsk, exec_state);
 	/*
 	 * This prevents preemption while active_mm is being loaded and
 	 * it and mm are being updated, which could cause problems for
@@ -1145,7 +1155,7 @@ int begin_new_exec(struct linux_binprm * bprm)
 	 * Release all of the old mmap stuff
 	 */
 	acct_arg_size(bprm, 0);
-	retval = exec_mmap(bprm->mm);
+	retval = exec_mmap(bprm->mm, bprm->user_ns);
 	if (retval)
 		goto out;
 
@@ -1210,9 +1220,9 @@ int begin_new_exec(struct linux_binprm * bprm)
 	if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
 	    !(uid_eq(current_euid(), current_uid()) &&
 	      gid_eq(current_egid(), current_gid())))
-		set_dumpable(current->mm, suid_dumpable);
+		task_exec_state_set_dumpable(suid_dumpable);
 	else
-		set_dumpable(current->mm, SUID_DUMP_USER);
+		task_exec_state_set_dumpable(TASK_DUMPABLE_OWNER);
 
 	perf_event_exec();
 
@@ -1261,7 +1271,7 @@ int begin_new_exec(struct linux_binprm * bprm)
 	 * wait until new credentials are committed
 	 * by commit_creds() above
 	 */
-	if (get_dumpable(me->mm) != SUID_DUMP_USER)
+	if (task_exec_state_get_dumpable(me) != TASK_DUMPABLE_OWNER)
 		perf_event_exit_task(me);
 	/*
 	 * cred_guard_mutex must be held at least to this point to prevent
@@ -1298,14 +1308,14 @@ void would_dump(struct linux_binprm *bprm, struct file *file)
 		struct user_namespace *old, *user_ns;
 		bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
 
-		/* Ensure mm->user_ns contains the executable */
-		user_ns = old = bprm->mm->user_ns;
+		/* Ensure bprm->user_ns contains the executable. */
+		user_ns = old = bprm->user_ns;
 		while ((user_ns != &init_user_ns) &&
 		       !privileged_wrt_inode_uidgid(user_ns, idmap, inode))
 			user_ns = user_ns->parent;
 
 		if (old != user_ns) {
-			bprm->mm->user_ns = get_user_ns(user_ns);
+			bprm->user_ns = get_user_ns(user_ns);
 			put_user_ns(old);
 		}
 	}
@@ -1375,6 +1385,8 @@ static void free_bprm(struct linux_binprm *bprm)
 		acct_arg_size(bprm, 0);
 		mmput(bprm->mm);
 	}
+	if (bprm->user_ns)
+		put_user_ns(bprm->user_ns);
 	free_arg_pages(bprm);
 	if (bprm->cred) {
 		/* in case exec fails before de_thread() succeeds */
@@ -1905,17 +1917,6 @@ void set_binfmt(struct linux_binfmt *new)
 }
 EXPORT_SYMBOL(set_binfmt);
 
-/*
- * set_dumpable stores three-value SUID_DUMP_* into mm->flags.
- */
-void set_dumpable(struct mm_struct *mm, int value)
-{
-	if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
-		return;
-
-	__mm_flags_set_mask_dumpable(mm, value);
-}
-
 static inline struct user_arg_ptr native_arg(const char __user *const __user *p)
 {
 	return (struct user_arg_ptr){.ptr.native = p};
@@ -1975,9 +1976,11 @@ COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
 static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos)
 {
-	int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	int error, old = READ_ONCE(suid_dumpable);
+
+	error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
-	if (!error && write)
+	if (!error && write && (old != READ_ONCE(suid_dumpable)))
 		validate_coredump_safety();
 	return error;
 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3fd8f00998521..8c80d50875167 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -252,10 +252,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 		       "nonexistent device\n", __func__, __LINE__);
 		return;
 	}
-	if (icount_read(inode) > 1) {
+	if (icount_read_once(inode) > 1) {
 		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%llu: count=%d",
 			 __func__, __LINE__, inode->i_ino,
-			 icount_read(inode));
+			 icount_read_once(inode));
 		return;
 	}
 	if (inode->i_nlink) {
diff --git a/fs/fcntl.c b/fs/fcntl.c
index beab8080badf6..b3ea135b74d8b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -1169,10 +1169,10 @@ static int __init fcntl_init(void)
 	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
 	 * is defined as O_NONBLOCK on some platforms and not on others.
 	 */
-	BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
+	BUILD_BUG_ON(22 - 1 /* for O_RDONLY being 0 */ !=
 		HWEIGHT32(
 			(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
-			__FMODE_EXEC));
+			__FMODE_EXEC | __O_REGULAR));
 
 	fasync_cache = kmem_cache_create("fasync_cache",
 					 sizeof(struct fasync_struct), 0,
diff --git a/fs/file.c b/fs/file.c
index 2c81c0b162d05..e5c75b22e0c7c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -544,24 +544,23 @@ struct files_struct init_files = {
 static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
 {
 	unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
-	unsigned int maxbit = maxfd / BITS_PER_LONG;
-	unsigned int bitbit = start / BITS_PER_LONG;
+	unsigned int max_fds_words = maxfd / BITS_PER_LONG;
+	unsigned int fds_word_idx = start / BITS_PER_LONG;
 	unsigned int bit;
 
 	/*
 	 * Try to avoid looking at the second level bitmap
 	 */
-	bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG,
+	bit = find_next_zero_bit(&fdt->open_fds[fds_word_idx], BITS_PER_LONG,
 				 start & (BITS_PER_LONG - 1));
 	if (bit < BITS_PER_LONG)
-		return bit + bitbit * BITS_PER_LONG;
+		return bit + (fds_word_idx * BITS_PER_LONG);
 
-	bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
-	if (bitbit >= maxfd)
+	bit = BITS_PER_LONG *
+		find_next_zero_bit(fdt->full_fds_bits, max_fds_words, fds_word_idx + 1);
+	if (bit >= maxfd)
 		return maxfd;
-	if (bitbit > start)
-		start = bitbit;
-	return find_next_zero_bit(fdt->open_fds, maxfd, start);
+	return find_next_zero_bit(fdt->open_fds, maxfd, bit);
 }
 
 /*
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 0c7d2b7ac26c8..771fc31a69b8f 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -17,22 +17,49 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/fs_parser.h>
+#include <linux/rculist.h>
 
 /*
- * Handling of filesystem drivers list.
- * Rules:
- *	Inclusion to/removals from/scanning of list are protected by spinlock.
- *	During the unload module must call unregister_filesystem().
- *	We can access the fields of list element if:
- *		1) spinlock is held or
- *		2) we hold the reference to the module.
- *	The latter can be guaranteed by call of try_module_get(); if it
- *	returned 0 we must skip the element, otherwise we got the reference.
- *	Once the reference is obtained we can drop the spinlock.
+ * Read-mostly filesystem drivers list.
+ *
+ * Readers walk under rcu_read_lock(); writers take file_systems_lock
+ * and publish via _rcu hlist primitives.  unregister_filesystem()
+ * synchronize_rcu()s after unlock so the embedded file_system_type
+ * can't go away under a reader.  To keep using a filesystem after
+ * the RCU section ends, take a module reference via try_module_get().
  */
+static HLIST_HEAD(file_systems);
+static DEFINE_SPINLOCK(file_systems_lock);
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Cache a stringified version of the filesystem list.
+ *
+ * The fs list gets queried a lot by userspace because of libselinux, including
+ * rather surprising programs (would you guess *sed* is on the list?). In order
+ * to reduce the overhead we cache the resulting string, which normally hangs
+ * around below 512 bytes in size.
+ *
+ * As the list almost never changes, its creation is not particularly optimized
+ * to keep things simple.
+ *
+ * We sort it out on read in order to not introduce a failure point for fs
+ * registration (in principle we may be unable to alloc memory for the list).
+ */
+struct file_systems_string {
+	struct rcu_head rcu;
+	unsigned long gen;
+	size_t len;
+	char string[];
+};
 
-static struct file_system_type *file_systems;
-static DEFINE_RWLOCK(file_systems_lock);
+static unsigned long file_systems_gen;
+static struct file_systems_string __rcu *file_systems_string;
+
+static void invalidate_filesystems_string(void);
+#else
+static inline void invalidate_filesystems_string(void) { }
+#endif
 
 /* WARNING: This can be used only if we _already_ own a reference */
 struct file_system_type *get_filesystem(struct file_system_type *fs)
@@ -46,14 +73,15 @@ void put_filesystem(struct file_system_type *fs)
 	module_put(fs->owner);
 }
 
-static struct file_system_type **find_filesystem(const char *name, unsigned len)
+static struct file_system_type *find_filesystem(const char *name, unsigned len)
 {
-	struct file_system_type **p;
-	for (p = &file_systems; *p; p = &(*p)->next)
-		if (strncmp((*p)->name, name, len) == 0 &&
-		    !(*p)->name[len])
-			break;
-	return p;
+	struct file_system_type *fs;
+
+	hlist_for_each_entry_rcu(fs, &file_systems, list,
+				 lockdep_is_held(&file_systems_lock))
+		if (strncmp(fs->name, name, len) == 0 && !fs->name[len])
+			return fs;
+	return NULL;
 }
 
 /**
@@ -64,33 +92,27 @@ static struct file_system_type **find_filesystem(const char *name, unsigned len)
  *	is aware of for mount and other syscalls. Returns 0 on success,
  *	or a negative errno code on an error.
  *
- *	The &struct file_system_type that is passed is linked into the kernel 
+ *	The &struct file_system_type that is passed is linked into the kernel
  *	structures and must not be freed until the file system has been
  *	unregistered.
  */
- 
-int register_filesystem(struct file_system_type * fs)
+int register_filesystem(struct file_system_type *fs)
 {
-	int res = 0;
-	struct file_system_type ** p;
-
 	if (fs->parameters &&
 	    !fs_validate_description(fs->name, fs->parameters))
 		return -EINVAL;
 
 	BUG_ON(strchr(fs->name, '.'));
-	if (fs->next)
+	if (!hlist_unhashed_lockless(&fs->list))
 		return -EBUSY;
-	write_lock(&file_systems_lock);
-	p = find_filesystem(fs->name, strlen(fs->name));
-	if (*p)
-		res = -EBUSY;
-	else
-		*p = fs;
-	write_unlock(&file_systems_lock);
-	return res;
-}
 
+	guard(spinlock)(&file_systems_lock);
+	if (find_filesystem(fs->name, strlen(fs->name)))
+		return -EBUSY;
+	hlist_add_tail_rcu(&fs->list, &file_systems);
+	invalidate_filesystems_string();
+	return 0;
+}
 EXPORT_SYMBOL(register_filesystem);
 
 /**
@@ -100,94 +122,79 @@ EXPORT_SYMBOL(register_filesystem);
  *	Remove a file system that was previously successfully registered
  *	with the kernel. An error is returned if the file system is not found.
  *	Zero is returned on a success.
- *	
+ *
  *	Once this function has returned the &struct file_system_type structure
  *	may be freed or reused.
  */
- 
-int unregister_filesystem(struct file_system_type * fs)
+int unregister_filesystem(struct file_system_type *fs)
 {
-	struct file_system_type ** tmp;
-
-	write_lock(&file_systems_lock);
-	tmp = &file_systems;
-	while (*tmp) {
-		if (fs == *tmp) {
-			*tmp = fs->next;
-			fs->next = NULL;
-			write_unlock(&file_systems_lock);
-			synchronize_rcu();
-			return 0;
-		}
-		tmp = &(*tmp)->next;
+	scoped_guard(spinlock, &file_systems_lock) {
+		if (hlist_unhashed(&fs->list))
+			return -EINVAL;
+		hlist_del_init_rcu(&fs->list);
+		invalidate_filesystems_string();
 	}
-	write_unlock(&file_systems_lock);
-
-	return -EINVAL;
+	synchronize_rcu();
+	return 0;
 }
-
 EXPORT_SYMBOL(unregister_filesystem);
 
 #ifdef CONFIG_SYSFS_SYSCALL
-static int fs_index(const char __user * __name)
+static int fs_index(const char __user *__name)
 {
-	struct file_system_type * tmp;
+	struct file_system_type *p;
 	char *name __free(kfree) = strndup_user(__name, PATH_MAX);
-	int err, index;
+	int index = 0;
 
 	if (IS_ERR(name))
 		return PTR_ERR(name);
 
-	err = -EINVAL;
-	read_lock(&file_systems_lock);
-	for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
-		if (strcmp(tmp->name, name) == 0) {
-			err = index;
-			break;
-		}
+	guard(rcu)();
+	hlist_for_each_entry_rcu(p, &file_systems, list) {
+		if (strcmp(p->name, name) == 0)
+			return index;
+		index++;
 	}
-	read_unlock(&file_systems_lock);
-	return err;
+	return -EINVAL;
 }
 
-static int fs_name(unsigned int index, char __user * buf)
+static int fs_name(unsigned int index, char __user *buf)
 {
-	struct file_system_type * tmp;
-	int len, res = -EINVAL;
+	struct file_system_type *p, *found = NULL;
+	int len, res;
 
-	read_lock(&file_systems_lock);
-	for (tmp = file_systems; tmp; tmp = tmp->next, index--) {
-		if (index == 0) {
-			if (try_module_get(tmp->owner))
-				res = 0;
+	scoped_guard(rcu) {
+		hlist_for_each_entry_rcu(p, &file_systems, list) {
+			if (index--)
+				continue;
+			if (try_module_get(p->owner))
+				found = p;
 			break;
 		}
 	}
-	read_unlock(&file_systems_lock);
-	if (res)
-		return res;
+	if (!found)
+		return -EINVAL;
 
 	/* OK, we got the reference, so we can safely block */
-	len = strlen(tmp->name) + 1;
-	res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0;
-	put_filesystem(tmp);
+	len = strlen(found->name) + 1;
+	res = copy_to_user(buf, found->name, len) ? -EFAULT : 0;
+	put_filesystem(found);
 	return res;
 }
 
 static int fs_maxindex(void)
 {
-	struct file_system_type * tmp;
-	int index;
+	struct file_system_type *p;
+	int index = 0;
 
-	read_lock(&file_systems_lock);
-	for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
-		;
-	read_unlock(&file_systems_lock);
+	guard(rcu)();
+	hlist_for_each_entry_rcu(p, &file_systems, list)
+		index++;
 	return index;
 }
 
 /*
- * Whee.. Weird sysv syscall. 
+ * Whee.. Weird sysv syscall.
  */
 SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
 {
@@ -216,8 +223,8 @@ int __init list_bdev_fs_names(char *buf, size_t size)
 	size_t len;
 	int count = 0;
 
-	read_lock(&file_systems_lock);
-	for (p = file_systems; p; p = p->next) {
+	guard(rcu)();
+	hlist_for_each_entry_rcu(p, &file_systems, list) {
 		if (!(p->fs_flags & FS_REQUIRES_DEV))
 			continue;
 		len = strlen(p->name) + 1;
@@ -230,30 +237,145 @@ int __init list_bdev_fs_names(char *buf, size_t size)
 		size -= len;
 		count++;
 	}
-	read_unlock(&file_systems_lock);
 	return count;
 }
 
 #ifdef CONFIG_PROC_FS
-static int filesystems_proc_show(struct seq_file *m, void *v)
+static void invalidate_filesystems_string(void)
 {
-	struct file_system_type * tmp;
+	struct file_systems_string *old;
+
+	lockdep_assert_held_write(&file_systems_lock);
+	file_systems_gen++;
+	old = rcu_replace_pointer(file_systems_string, NULL,
+			   lockdep_is_held(&file_systems_lock));
+	if (old)
+		kfree_rcu(old, rcu);
+}
 
-	read_lock(&file_systems_lock);
-	tmp = file_systems;
-	while (tmp) {
+/*
+ * Rebuild the cached listing and publish it; returns 0, -ENOMEM or -EINVAL.
+ */
+static __cold noinline int regen_filesystems_string(void)
+{
+	struct file_system_type *p;
+	struct file_systems_string *old, *new;
+	size_t newlen, usedlen;
+	unsigned long gen;
+
+retry:
+	newlen = 0;
+
+	/* pre-calc space for each fs */
+	spin_lock(&file_systems_lock);
+	gen = file_systems_gen;
+	hlist_for_each_entry_rcu(p, &file_systems, list) {
+		if (!(p->fs_flags & FS_REQUIRES_DEV))
+			newlen += strlen("nodev");
+		newlen += strlen("\t") + strlen(p->name) +  strlen("\n");
+	}
+	spin_unlock(&file_systems_lock);
+
+	new = kmalloc(offsetof(struct file_systems_string, string) + newlen + 1,
+		      GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	new->gen = gen;
+	new->len = newlen;
+	new->string[newlen] = '\0';
+
+	spin_lock(&file_systems_lock);
+	old = rcu_dereference_protected(file_systems_string,
+					lockdep_is_held(&file_systems_lock));
+
+	/* Did someone beat us to it?  Drop the lock before bailing out. */
+	if (old && old->gen == file_systems_gen) {
+		spin_unlock(&file_systems_lock);
+		kfree(new);
+		return 0;
+	}
+
+	/* Did the list change in the meantime?  Unlock: retry retakes the lock. */
+	if (gen != file_systems_gen) {
+		spin_unlock(&file_systems_lock);
+		kfree(new);
+		goto retry;
+	}
+
+	/*
+	 * Populate the string.
+	 *
+	 * We know we have just enough space because we calculated the right
+	 * size the previous time we had the lock and confirmed the list has
+	 * not changed after reacquiring it.
+	 */
+	usedlen = 0;
+	hlist_for_each_entry_rcu(p, &file_systems, list) {
+		usedlen += sprintf(&new->string[usedlen], "%s\t%s\n",
+				   (p->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+				   p->name);
+	}
+
+	if (WARN_ON_ONCE(new->len != strlen(new->string))) {
+		/*
+		 * Should never happen of course, keep this in case someone changes string
+		 * generation above and messes it up.  @new was never published, so
+		 * free it directly; @old, if any, is still published and must not be freed.
+		 */
+		spin_unlock(&file_systems_lock);
+		kfree(new);
+		return -EINVAL;
+	}
+
+	/* Publish; pairs with rcu_dereference() in filesystems_proc_show(). */
+	rcu_assign_pointer(file_systems_string, new);
+	spin_unlock(&file_systems_lock);
+	if (old)
+		kfree_rcu(old, rcu);
+	return 0;
+}
+
+static __cold noinline int filesystems_proc_show_fallback(struct seq_file *m, void *v)
+{
+	struct file_system_type *p;
+
+	guard(rcu)();
+	hlist_for_each_entry_rcu(p, &file_systems, list) {
 		seq_printf(m, "%s\t%s\n",
-			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
-			tmp->name);
-		tmp = tmp->next;
+			   (p->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+			   p->name);
 	}
-	read_unlock(&file_systems_lock);
 	return 0;
 }
 
+/* Emit the cached /proc/filesystems text, rebuilding it (or falling back) on a miss. */
+static int filesystems_proc_show(struct seq_file *m, void *v)
+{
+	for (;;) {
+		scoped_guard(rcu) {
+			struct file_systems_string *cached;
+
+			cached = rcu_dereference(file_systems_string);
+			if (likely(cached)) {
+				seq_write(m, cached->string, cached->len);
+				return 0;
+			}
+		}
+		/* Cache miss: rebuild outside the RCU section, then look again. */
+		if (unlikely(regen_filesystems_string()))
+			return filesystems_proc_show_fallback(m, v);
+	}
+}
+
 static int __init proc_filesystems_init(void)
 {
-	proc_create_single("filesystems", 0, NULL, filesystems_proc_show);
+	struct proc_dir_entry *pde;
+
+	pde = proc_create_single("filesystems", 0, NULL, filesystems_proc_show);
+	if (!pde)
+		return -ENOMEM;
+	proc_make_permanent(pde);
 	return 0;
 }
 module_init(proc_filesystems_init);
@@ -263,11 +385,10 @@ static struct file_system_type *__get_fs_type(const char *name, int len)
 {
 	struct file_system_type *fs;
 
-	read_lock(&file_systems_lock);
-	fs = *(find_filesystem(name, len));
+	guard(rcu)();
+	fs = find_filesystem(name, len);
 	if (fs && !try_module_get(fs->owner))
 		fs = NULL;
-	read_unlock(&file_systems_lock);
 	return fs;
 }
 
@@ -291,5 +412,4 @@ struct file_system_type *get_fs_type(const char *name)
 	}
 	return fs;
 }
-
 EXPORT_SYMBOL(get_fs_type);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e9bf4879c07f7..e9895dea0da49 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -738,6 +738,13 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
 	error = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
+		if (file && (file->f_flags & __O_REGULAR) &&
+		    !S_ISREG(inode->i_mode)) {
+			iput(inode);
+			inode = NULL;
+			error = -EFTYPE;
+			goto fail_gunlock;
+		}
 		if (S_ISDIR(inode->i_mode)) {
 			iput(inode);
 			inode = NULL;
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 0e932cc8be1b2..1b4fcf760aadc 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -184,7 +184,7 @@ void hpfs_write_inode(struct inode *i)
 	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
 	struct inode *parent;
 	if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return;
-	if (hpfs_inode->i_rddir_off && !icount_read(i)) {
+	if (hpfs_inode->i_rddir_off && !icount_read_once(i)) {
 		if (*hpfs_inode->i_rddir_off)
 			pr_err("write_inode: some position still there\n");
 		kfree(hpfs_inode->i_rddir_off);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index c16d5d4caeade..8fbdbf0806276 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -523,7 +523,8 @@ static int hpfs_fill_super(struct super_block *s, struct fs_context *fc)
 	hpfs_lock(s);
 
 	/*sbi->sb_mounting = 1;*/
-	sb_set_blocksize(s, 512);
+	if (!sb_set_blocksize(s, 512))
+		goto bail0;
 	sbi->sb_fs_size = -1;
 	if (!(bootblock = hpfs_map_sector(s, 0, &bh0, 0))) goto bail1;
 	if (!(superblock = hpfs_map_sector(s, 16, &bh1, 1))) goto bail2;
diff --git a/fs/inode.c b/fs/inode.c
index 62c579a0cf7df..acf206beb2e03 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -53,11 +53,7 @@
  *   inode->i_lock
  *
  * inode_hash_lock
- *   inode->i_sb->s_inode_list_lock
  *   inode->i_lock
- *
- * iunique_lock
- *   inode_hash_lock
  */
 
 static unsigned int i_hash_mask __ro_after_init;
@@ -518,15 +514,6 @@ static void init_once(void *foo)
 	inode_init_once(inode);
 }
 
-/*
- * get additional reference to inode; caller must already hold one.
- */
-void ihold(struct inode *inode)
-{
-	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
-}
-EXPORT_SYMBOL(ihold);
-
 struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
 					    struct inode *inode, u32 bit)
 {
@@ -902,7 +889,7 @@ void evict_inodes(struct super_block *sb)
 again:
 	spin_lock(&sb->s_inode_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (icount_read(inode))
+		if (icount_read_once(inode))
 			continue;
 
 		spin_lock(&inode->i_lock);
@@ -1032,6 +1019,7 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
 }
 
 static void __wait_on_freeing_inode(struct inode *inode, bool hash_locked, bool rcu_locked);
+static bool igrab_from_hash(struct inode *inode);
 
 /*
  * Called with the inode lock held.
@@ -1056,6 +1044,11 @@ repeat:
 			continue;
 		if (!test(inode, data))
 			continue;
+		if (igrab_from_hash(inode)) {
+			rcu_read_unlock();
+			*isnew = false;
+			return inode;
+		}
 		spin_lock(&inode->i_lock);
 		if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode, hash_locked, true);
@@ -1098,6 +1091,11 @@ repeat:
 			continue;
 		if (inode->i_sb != sb)
 			continue;
+		if (igrab_from_hash(inode)) {
+			rcu_read_unlock();
+			*isnew = false;
+			return inode;
+		}
 		spin_lock(&inode->i_lock);
 		if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode, hash_locked, true);
@@ -1215,6 +1213,10 @@ void unlock_new_inode(struct inode *inode)
 	lockdep_annotate_inode_mutex_key(inode);
 	spin_lock(&inode->i_lock);
 	WARN_ON(!(inode_state_read(inode) & I_NEW));
+	/*
+	 * Paired with igrab_from_hash()
+	 */
+	smp_wmb();
 	inode_state_clear(inode, I_NEW | I_CREATING);
 	inode_wake_up_bit(inode, __I_NEW);
 	spin_unlock(&inode->i_lock);
@@ -1226,6 +1228,10 @@ void discard_new_inode(struct inode *inode)
 	lockdep_annotate_inode_mutex_key(inode);
 	spin_lock(&inode->i_lock);
 	WARN_ON(!(inode_state_read(inode) & I_NEW));
+	/*
+	 * Paired with igrab_from_hash()
+	 */
+	smp_wmb();
 	inode_state_clear(inode, I_NEW);
 	inode_wake_up_bit(inode, __I_NEW);
 	spin_unlock(&inode->i_lock);
@@ -1572,8 +1578,27 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
 }
 EXPORT_SYMBOL(iunique);
 
+/**
+ * ihold - get another reference on an inode the caller already has one on
+ * @inode:	inode to operate on; complains if ->i_count was not at least 1
+ */
+void ihold(struct inode *inode)
+{
+	VFS_BUG_ON_INODE(icount_read_once(inode) < 1, inode);
+	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
+}
+EXPORT_SYMBOL(ihold);
+
 struct inode *igrab(struct inode *inode)
 {
+	/*
+	 * Read commentary above igrab_from_hash() for an explanation why this works.
+	 */
+	if (atomic_add_unless(&inode->i_count, 1, 0)) {
+		VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE), inode);
+		return inode;
+	}
+
 	spin_lock(&inode->i_lock);
 	if (!(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) {
 		__iget(inode);
@@ -1591,6 +1616,43 @@ struct inode *igrab(struct inode *inode)
 }
 EXPORT_SYMBOL(igrab);
 
+/*
+ * igrab_from_hash - special inode refcount acquire primitive for the inode hash
+ *
+ * It provides lockless refcount acquire in the common case of no problematic
+ * flags being set and the count being > 0. Returns true if the ref was taken.
+ *
+ * There are 4 state flags to worry about and the routine makes sure to not bump the
+ * ref if any of them is present.
+ *
+ * I_NEW and I_CREATING can only legally get set *before* the inode becomes visible
+ * during lookup. Thus if the flags are not spotted, they are guaranteed to not be
+ * a factor. However, we need an acquire fence before returning the inode just
+ * in case we raced against clearing the state to make sure our consumer picks up
+ * any other changes made prior. atomic_add_unless provides a full fence, which
+ * takes care of it.
+ *
+ * I_FREEING and I_WILL_FREE can only legally get set if ->i_count == 0 and it is
+ * illegal to bump the ref if either is present. Consequently if atomic_add_unless
+ * managed to replace a non-0 value with a bigger one, we have a guarantee neither
+ * of these flags is set. Note this means explicit checking of these flags below
+ * is not necessary; it is only done because it does not cost anything on top of the
+ * load which already needs to be done to handle the other flags.
+ */
+static bool igrab_from_hash(struct inode *inode)
+{
+	if (inode_state_read_once(inode) & (I_NEW | I_CREATING | I_FREEING | I_WILL_FREE))
+		return false;
+	/*
+	 * Full fence on success, paired with smp_wmb() in routines clearing I_NEW
+	 */
+	if (atomic_add_unless(&inode->i_count, 1, 0)) {
+		VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE), inode);
+		return true;
+	}
+	return false;
+}
+
 /**
  * ilookup5_nowait - search for an inode in the inode cache
  * @sb:		super block of file system to search
@@ -1920,7 +1982,7 @@ static void iput_final(struct inode *inode)
 	int drop;
 
 	WARN_ON(inode_state_read(inode) & I_NEW);
-	VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode);
+	VFS_BUG_ON_INODE(icount_read(inode) != 0, inode);
 
 	if (op->drop_inode)
 		drop = op->drop_inode(inode);
@@ -1939,7 +2001,7 @@ static void iput_final(struct inode *inode)
 	 * Re-check ->i_count in case the ->drop_inode() hooks played games.
 	 * Note we only execute this if the verdict was to drop the inode.
 	 */
-	VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode);
+	VFS_BUG_ON_INODE(icount_read(inode) != 0, inode);
 
 	if (drop) {
 		inode_state_set(inode, I_FREEING);
@@ -1983,7 +2045,7 @@ retry:
 	 * equal to one, then two CPUs racing to further drop it can both
 	 * conclude it's fine.
 	 */
-	VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 1, inode);
+	VFS_BUG_ON_INODE(icount_read_once(inode) < 1, inode);
 
 	if (atomic_add_unless(&inode->i_count, -1, 1))
 		return;
@@ -2017,7 +2079,7 @@ EXPORT_SYMBOL(iput);
 void iput_not_last(struct inode *inode)
 {
 	VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_CLEAR), inode);
-	VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 2, inode);
+	VFS_BUG_ON_INODE(icount_read_once(inode) < 2, inode);
 
 	WARN_ON(atomic_sub_return(1, &inode->i_count) == 0);
 }
@@ -3046,7 +3108,7 @@ void dump_inode(struct inode *inode, const char *reason)
 	}
 
 	state = inode_state_read_once(inode);
-	count = atomic_read(&inode->i_count);
+	count = icount_read_once(inode);
 
 	if (!sb ||
 	    get_kernel_nofault(s_type, &sb->s_type) || !s_type ||
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index efee53717f1cd..337836a0a1704 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -818,7 +818,8 @@ root_found:
 	 * entries.  By forcing the blocksize in this way, we ensure
 	 * that we will never be required to do this.
 	 */
-	sb_set_blocksize(s, orig_zonesize);
+	if (!sb_set_blocksize(s, orig_zonesize))
+		goto out_freesbi;
 
 	sbi->s_nls_iocharset = NULL;
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 61575f7397aea..8180d83d33fed 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -491,7 +491,8 @@ static int jfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	/*
 	 * Initialize blocksize to 4K.
 	 */
-	sb_set_blocksize(sb, PSIZE);
+	if (!sb_set_blocksize(sb, PSIZE))
+		goto out_unload;
 
 	/*
 	 * Set method vectors.
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 9c6bac2489077..03a69b13950db 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -292,7 +292,8 @@ static int minix_fill_super(struct super_block *s, struct fs_context *fc)
 		sbi->s_namelen = 60;
 		sbi->s_version = MINIX_V3;
 		sbi->s_mount_state = MINIX_VALID_FS;
-		sb_set_blocksize(s, m3s->s_blocksize);
+		if (!sb_set_blocksize(s, m3s->s_blocksize))
+			goto out;
 		s->s_max_links = MINIX2_LINK_MAX;
 	} else
 		goto out_no_fs;
diff --git a/fs/mount.h b/fs/mount.h
index e0816c11a1989..5df134d56d475 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -25,6 +25,7 @@ struct mnt_namespace {
 	__u32			n_fsnotify_mask;
 	struct fsnotify_mark_connector __rcu *n_fsnotify_marks;
 #endif
+	struct hlist_head	mnt_visible_mounts; /* FS_USERNS_MOUNT_RESTRICTED mounts */
 	unsigned int		nr_mounts; /* # of mounts in the namespace */
 	unsigned int		pending_mounts;
 	refcount_t		passive; /* number references not pinning @mounts */
@@ -90,6 +91,7 @@ struct mount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	struct hlist_head mnt_pins;
 	struct hlist_head mnt_stuck_children;
+	struct hlist_node mnt_ns_visible; /* link in ns->mnt_visible_mounts */
 	struct mount *overmount;	/* mounted on ->mnt_root */
 } __randomize_layout;
 
@@ -207,6 +209,8 @@ static inline void move_from_ns(struct mount *mnt)
 		ns->mnt_first_node = rb_next(&mnt->mnt_node);
 	rb_erase(&mnt->mnt_node, &ns->mounts);
 	RB_CLEAR_NODE(&mnt->mnt_node);
+	if (!hlist_unhashed(&mnt->mnt_ns_visible))
+		hlist_del_init(&mnt->mnt_ns_visible);
 }
 
 bool has_locked_children(struct mount *mnt, struct dentry *dentry);
diff --git a/fs/namei.c b/fs/namei.c
index 3a3a2e5e77a0f..a59ea10b481d6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2955,15 +2955,16 @@ void end_dirop(struct dentry *de)
 EXPORT_SYMBOL(end_dirop);
 
 /* does lookup, returns the object with parent locked */
-static struct dentry *__start_removing_path(int dfd, struct filename *name,
-					   struct path *path)
+struct dentry *start_removing_path(const char *name, struct path *path)
 {
+	CLASS(filename_kernel, filename)(name);
 	struct path parent_path __free(path_put) = {};
 	struct dentry *d;
 	struct qstr last;
 	int type, error;
 
-	error = filename_parentat(dfd, name, 0, &parent_path, &last, &type);
+	error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last,
+			&type);
 	if (error)
 		return ERR_PTR(error);
 	if (unlikely(type != LAST_NORM))
@@ -3023,21 +3024,6 @@ struct dentry *kern_path_parent(const char *name, struct path *path)
 	return d;
 }
 
-struct dentry *start_removing_path(const char *name, struct path *path)
-{
-	CLASS(filename_kernel, filename)(name);
-	return __start_removing_path(AT_FDCWD, filename, path);
-}
-
-struct dentry *start_removing_user_path_at(int dfd,
-					   const char __user *name,
-					   struct path *path)
-{
-	CLASS(filename, filename)(name);
-	return __start_removing_path(dfd, filename, path);
-}
-EXPORT_SYMBOL(start_removing_user_path_at);
-
 int kern_path(const char *name, unsigned int flags, struct path *path)
 {
 	CLASS(filename_kernel, filename)(name);
@@ -3617,7 +3603,6 @@ int path_pts(struct path *path)
 	 */
 	struct dentry *parent = dget_parent(path->dentry);
 	struct dentry *child;
-	struct qstr this = QSTR_INIT("pts", 3);
 
 	if (unlikely(!path_connected(path->mnt, parent))) {
 		dput(parent);
@@ -3625,7 +3610,7 @@ int path_pts(struct path *path)
 	}
 	dput(path->dentry);
 	path->dentry = parent;
-	child = d_hash_and_lookup(parent, &this);
+	child = d_hash_and_lookup(parent, &QSTR("pts"));
 	if (IS_ERR_OR_NULL(child))
 		return -ENOENT;
 
@@ -4679,6 +4664,10 @@ static int do_open(struct nameidata *nd,
 		if (unlikely(error))
 			return error;
 	}
+
+	if ((open_flag & __O_REGULAR) && !d_is_reg(nd->path.dentry))
+		return -EFTYPE;
+
 	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
 		return -ENOTDIR;
 
diff --git a/fs/namespace.c b/fs/namespace.c
index fe919abd2f011..9a66a806a9b8a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -321,6 +321,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_HLIST_NODE(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
 		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
+		INIT_HLIST_NODE(&mnt->mnt_ns_visible);
 		RB_CLEAR_NODE(&mnt->mnt_node);
 		mnt->mnt.mnt_idmap = &nop_mnt_idmap;
 	}
@@ -1098,6 +1099,10 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
 	rb_link_node(&mnt->mnt_node, parent, link);
 	rb_insert_color(&mnt->mnt_node, &ns->mounts);
 
+	if ((mnt->mnt.mnt_sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED) &&
+	    mnt->mnt.mnt_root == mnt->mnt.mnt_sb->s_root)
+		hlist_add_head(&mnt->mnt_ns_visible, &ns->mnt_visible_mounts);
+
 	mnt_notify_add(mnt);
 }
 
@@ -6340,20 +6345,26 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
 				int *new_mnt_flags)
 {
 	int new_flags = *new_mnt_flags;
-	struct mount *mnt, *n;
+	struct mount *mnt;
+
+	/* Don't acquire namespace semaphore without a good reason. */
+	if (hlist_empty(&ns->mnt_visible_mounts))
+		return false;
 
 	guard(namespace_shared)();
-	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
+	hlist_for_each_entry(mnt, &ns->mnt_visible_mounts, mnt_ns_visible) {
+		const struct super_block *sb_visible = mnt->mnt.mnt_sb;
 		struct mount *child;
 		int mnt_flags;
 
-		if (mnt->mnt.mnt_sb->s_type != sb->s_type)
+		if (sb_visible->s_type != sb->s_type)
 			continue;
 
-		/* This mount is not fully visible if it's root directory
-		 * is not the root directory of the filesystem.
+		/*
+		 * Restricted variants are not compatible with anything, even
+		 * other restricted variants.
 		 */
-		if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
+		if (sb_visible->s_iflags & SB_I_RESTRICTED_VARIANT)
 			continue;
 
 		/* A local view of the mount flags */
@@ -6405,16 +6416,23 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags
 		return false;
 
 	/* Can this filesystem be too revealing? */
-	s_iflags = sb->s_iflags;
-	if (!(s_iflags & SB_I_USERNS_VISIBLE))
+	if (!(sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED))
 		return false;
 
+	s_iflags = sb->s_iflags;
 	if ((s_iflags & required_iflags) != required_iflags) {
 		WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
 			  required_iflags);
 		return true;
 	}
 
+	/*
+	 * Restricted variants don't need an already visible mount because they
+	 * don't expose the full filesystem view.
+	 */
+	if (s_iflags & SB_I_RESTRICTED_VARIANT)
+		return false;
+
 	return !mnt_already_visible(ns, sb, new_mnt_flags);
 }
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e9ce1883288c5..1b9c368fb1338 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2194,6 +2194,10 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 			break;
 		case -EISDIR:
 		case -ENOTDIR:
+			if (open_flags & __O_REGULAR) {
+				err = -EFTYPE;
+				break;
+			}
 			goto no_open;
 		case -ELOOP:
 			if (!(open_flags & O_NOFOLLOW))
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 170d32c217ae4..6227df9ae6f1d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -609,7 +609,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		inode->i_sb->s_id,
 		(unsigned long long)NFS_FILEID(inode),
 		nfs_display_fhandle_hash(fh),
-		icount_read(inode));
+		icount_read_once(inode));
 
 out:
 	return inode;
@@ -2270,7 +2270,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	dfprintk(VFS, "NFS: %s(%s/%llu fh_crc=0x%08x ct=%d info=0x%llx)\n",
 			__func__, inode->i_sb->s_id, inode->i_ino,
 			nfs_display_fhandle_hash(NFS_FH(inode)),
-			icount_read(inode), fattr->valid);
+			icount_read_once(inode), fattr->valid);
 
 	if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) {
 		/* Only a mounted-on-fileid? Just exit */
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 004f599375597..3305fe406cb22 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -1174,7 +1174,10 @@ read_boot:
 	rec->total = cpu_to_le32(sbi->record_size);
 	((struct ATTRIB *)Add2Ptr(rec, ao))->type = ATTR_END;
 
-	sb_set_blocksize(sb, min_t(u32, sbi->cluster_size, PAGE_SIZE));
+	if (!sb_set_blocksize(sb, min_t(u32, sbi->cluster_size, PAGE_SIZE))) {
+		err = -EINVAL;
+		goto out;
+	}
 
 	sbi->block_mask = sb->s_blocksize - 1;
 	sbi->blocks_per_cluster = sbi->cluster_size >> sb->s_blocksize_bits;
@@ -1225,7 +1228,8 @@ out:
 			/*
 			 * Try alternative boot (last sector)
 			 */
-			sb_set_blocksize(sb, block_size);
+			if (!sb_set_blocksize(sb, block_size))
+				return -EINVAL;
 			hint = "Alternative boot";
 			dev_size = dev_size0; /* restore original size. */
 			goto read_boot;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b875f01c97564..4870e680c4e5a 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1224,7 +1224,6 @@ static struct file_system_type ocfs2_fs_type = {
 	.name           = "ocfs2",
 	.kill_sb        = kill_block_super,
 	.fs_flags       = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
-	.next           = NULL,
 	.init_fs_context = ocfs2_init_fs_context,
 	.parameters	= ocfs2_param_spec,
 };
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 834cae1e62233..1d915ef72119f 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -478,7 +478,8 @@ static int omfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_time_min = 0;
 	sb->s_time_max = U64_MAX / MSEC_PER_SEC;
 
-	sb_set_blocksize(sb, 0x200);
+	if (!sb_set_blocksize(sb, 0x200))
+		goto end;
 
 	bh = sb_bread(sb, 0);
 	if (!bh)
@@ -530,7 +531,8 @@ static int omfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	 * Use sys_blocksize as the fs block since it is smaller than a
 	 * page while the fs blocksize can be larger.
 	 */
-	sb_set_blocksize(sb, sbi->s_sys_blocksize);
+	if (!sb_set_blocksize(sb, sbi->s_sys_blocksize))
+		goto out_brelse_bh;
 
 	/*
 	 * ...and the difference goes into a shift.  sys_blocksize is always
diff --git a/fs/open.c b/fs/open.c
index 681d405bc61eb..5458668a68e11 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -960,7 +960,7 @@ static int do_dentry_open(struct file *f,
 	if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
 		f->f_mode |= FMODE_CAN_ODIRECT;
 
-	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | __O_REGULAR);
 	f->f_iocb_flags = iocb_flags(f);
 
 	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
@@ -1158,7 +1158,7 @@ struct file *kernel_file_open(const struct path *path, int flags,
 EXPORT_SYMBOL_GPL(kernel_file_open);
 
 #define WILL_CREATE(flags)	(flags & (O_CREAT | __O_TMPFILE))
-#define O_PATH_FLAGS		(O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
+#define O_PATH_FLAGS		(O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC | O_EMPTYPATH)
 
 inline struct open_how build_open_how(int flags, umode_t mode)
 {
@@ -1184,7 +1184,15 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
 	int acc_mode = ACC_MODE(flags);
 
 	BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
-			 "struct open_flags doesn't yet handle flags > 32 bits");
+			 "VALID_OPEN_FLAGS must fit in 32 bits");
+	/* The whole point: OPENAT2_REGULAR must be unrepresentable in int. */
+	BUILD_BUG_ON_MSG(!upper_32_bits(OPENAT2_REGULAR),
+			 "OPENAT2_REGULAR must live in the upper 32 bits of open_how::flags");
+	/* Prevent a future bit collision between UAPI and internal carrier. */
+	BUILD_BUG_ON_MSG(OPENAT2_REGULAR & VALID_OPEN_FLAGS,
+			 "OPENAT2_REGULAR must not alias any open()/openat() flag");
+	BUILD_BUG_ON_MSG(__O_REGULAR & VALID_OPENAT2_FLAGS,
+			 "__O_REGULAR must not alias any user-visible flag");
 
 	/*
 	 * Strip flags that aren't relevant in determining struct open_flags.
@@ -1196,7 +1204,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
 	 * values before calling build_open_flags(), but openat2(2) checks all
 	 * of its arguments.
 	 */
-	if (flags & ~VALID_OPEN_FLAGS)
+	if (flags & ~VALID_OPENAT2_FLAGS)
 		return -EINVAL;
 	if (how->resolve & ~VALID_RESOLVE_FLAGS)
 		return -EINVAL;
@@ -1236,6 +1244,14 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
 		if (!(acc_mode & MAY_WRITE))
 			return -EINVAL;
 	}
+	/*
+	 * Asking to open a directory and a regular file at the same time is
+	 * contradictory.
+	 */
+	if ((flags & (O_DIRECTORY | OPENAT2_REGULAR)) ==
+	    (O_DIRECTORY | OPENAT2_REGULAR))
+		return -EINVAL;
+
 	if (flags & O_PATH) {
 		/* O_PATH only permits certain other flags to be set. */
 		if (flags & ~O_PATH_FLAGS)
@@ -1252,6 +1268,19 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
 	if (flags & __O_SYNC)
 		flags |= O_DSYNC;
 
+	/*
+	 * Translate the upper-32-bit UAPI bit OPENAT2_REGULAR into the
+	 * kernel-internal lower-32-bit __O_REGULAR carrier so the bit
+	 * survives the assignment to op->open_flag (an int) below and the
+	 * subsequent flow through f->f_flags (unsigned int) and the
+	 * i_op->atomic_open() callback (unsigned). do_dentry_open() strips
+	 * __O_REGULAR before the file becomes visible to userspace.
+	 */
+	if (flags & OPENAT2_REGULAR) {
+		flags &= ~OPENAT2_REGULAR;
+		flags |= __O_REGULAR;
+	}
+
 	op->open_flag = flags;
 
 	/* O_TRUNC implies we need access checks for write permissions */
@@ -1279,6 +1308,8 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
 		lookup_flags |= LOOKUP_DIRECTORY;
 	if (!(flags & O_NOFOLLOW))
 		lookup_flags |= LOOKUP_FOLLOW;
+	if (flags & O_EMPTYPATH)
+		lookup_flags |= LOOKUP_EMPTY;
 
 	if (how->resolve & RESOLVE_NO_XDEV)
 		lookup_flags |= LOOKUP_NO_XDEV;
@@ -1360,7 +1391,7 @@ static int do_sys_openat2(int dfd, const char __user *filename,
 	if (unlikely(err))
 		return err;
 
-	CLASS(filename, name)(filename);
+	CLASS(filename_flags, name)(filename, op.lookup_flags);
 	return FD_ADD(how->flags, do_file_open(dfd, name, &op));
 }
 
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 1cce4f34a0512..b2ff950a096e9 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -338,14 +338,14 @@ static inline bool pid_in_current_pidns(const struct pid *pid)
 	return false;
 }
 
-static __u32 pidfs_coredump_mask(unsigned long mm_flags)
+static __u32 pidfs_coredump_mask(enum task_dumpable dumpable)
 {
-	switch (__get_dumpable(mm_flags)) {
-	case SUID_DUMP_USER:
+	switch (dumpable) {
+	case TASK_DUMPABLE_OWNER:
 		return PIDFD_COREDUMP_USER;
-	case SUID_DUMP_ROOT:
+	case TASK_DUMPABLE_ROOT:
 		return PIDFD_COREDUMP_ROOT;
-	case SUID_DUMP_DISABLE:
+	case TASK_DUMPABLE_OFF:
 		return PIDFD_COREDUMP_SKIP;
 	default:
 		WARN_ON_ONCE(true);
@@ -433,14 +433,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 		return -ESRCH;
 
 	if ((mask & PIDFD_INFO_COREDUMP) && !kinfo.coredump_mask) {
-		guard(task_lock)(task);
-		if (task->mm) {
-			unsigned long flags = __mm_flags_get_dumpable(task->mm);
-
-			kinfo.coredump_mask = pidfs_coredump_mask(flags);
-			kinfo.mask |= PIDFD_INFO_COREDUMP;
-			/* No coredump actually took place, so no coredump signal. */
-		}
+		kinfo.coredump_mask = pidfs_coredump_mask(task_exec_state_get_dumpable(task));
+		kinfo.mask |= PIDFD_INFO_COREDUMP;
+		/* No coredump actually took place, so no coredump signal. */
 	}
 
 	/* Unconditionally return identifiers and credentials, the rest only on request */
@@ -779,7 +774,7 @@ void pidfs_coredump(const struct coredump_params *cprm)
 	VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
 
 	/* Note how we were coredumped and that we coredumped. */
-	attr->coredump_mask = pidfs_coredump_mask(cprm->mm_flags) |
+	attr->coredump_mask = pidfs_coredump_mask(cprm->dumpable) |
 			      PIDFD_COREDUMPED;
 	/* If coredumping is set to skip we should never end up here. */
 	VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP);
diff --git a/fs/pipe.c b/fs/pipe.c
index 9841648c9cf3e..e37c79935ecb1 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -664,7 +664,8 @@ pipe_poll(struct file *filp, poll_table *wait)
 	union pipe_index idx;
 
 	/* Epoll has some historical nasty semantics, this enables them */
-	WRITE_ONCE(pipe->poll_usage, true);
+	if (unlikely(!READ_ONCE(pipe->poll_usage)))
+		WRITE_ONCE(pipe->poll_usage, true);
 
 	/*
 	 * Reading pipe state only -- no need for acquiring the semaphore.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d9acfa89c894b..65f56136ec3f3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -91,6 +91,7 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/exec_state.h>
 #include <linux/sched/stat.h>
 #include <linux/posix-timers.h>
 #include <linux/time_namespace.h>
@@ -1893,7 +1894,6 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
 	cred = __task_cred(task);
 	uid = cred->euid;
 	gid = cred->egid;
-	rcu_read_unlock();
 
 	/*
 	 * Before the /proc/pid/status file was created the only way to read
@@ -1903,29 +1903,22 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
 	 * made this apply to all per process world readable and executable
 	 * directories.
 	 */
-	if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
-		struct mm_struct *mm;
-		task_lock(task);
-		mm = task->mm;
-		/* Make non-dumpable tasks owned by some root */
-		if (mm) {
-			if (get_dumpable(mm) != SUID_DUMP_USER) {
-				struct user_namespace *user_ns = mm->user_ns;
+	if (mode != (S_IFDIR | S_IRUGO | S_IXUGO)) {
+		struct task_exec_state *exec_state;
 
-				uid = make_kuid(user_ns, 0);
-				if (!uid_valid(uid))
-					uid = GLOBAL_ROOT_UID;
+		exec_state = task_exec_state_rcu(task);
+		if (READ_ONCE(exec_state->dumpable) != TASK_DUMPABLE_OWNER) {
+			uid = make_kuid(exec_state->user_ns, 0);
+			if (!uid_valid(uid))
+				uid = GLOBAL_ROOT_UID;
 
-				gid = make_kgid(user_ns, 0);
-				if (!gid_valid(gid))
-					gid = GLOBAL_ROOT_GID;
-			}
-		} else {
-			uid = GLOBAL_ROOT_UID;
-			gid = GLOBAL_ROOT_GID;
+			gid = make_kgid(exec_state->user_ns, 0);
+			if (!gid_valid(gid))
+				gid = GLOBAL_ROOT_GID;
 		}
-		task_unlock(task);
 	}
+	rcu_read_unlock();
+
 	*ruid = uid;
 	*rgid = gid;
 }
@@ -2965,7 +2958,7 @@ static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
 	ret = 0;
 	mm = get_task_mm(task);
 	if (mm) {
-		unsigned long flags = __mm_flags_get_dumpable(mm);
+		unsigned long flags = __mm_flags_get_word(mm);
 
 		len = snprintf(buffer, sizeof(buffer), "%08lx\n",
 			       ((flags & MMF_DUMP_FILTER_MASK) >>
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8bb81e58c9d8c..c6ae076e1fa03 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -841,3 +841,13 @@ ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size,
 	kfree(buf);
 	return ret == 0 ? size : ret;
 }
+
+/*
+ * Mark @pde as permanent; a NULL @pde is ignored. Not exported to modules:
+ * modules' /proc files aren't permanent because modules aren't permanent.
+ */
+void impl_proc_make_permanent(struct proc_dir_entry *pde)
+{
+	if (pde)
+		pde_make_permanent(pde);
+}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 64dc44832808a..1edbabbdbc5d7 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -79,8 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
 	return pde->flags & PROC_ENTRY_PERMANENT;
 }
 
+/* This is for builtin code, not even for modules which are compiled in. */
 static inline void pde_make_permanent(struct proc_dir_entry *pde)
 {
+	/* PROC_ENTRY_PERMANENT must be non-zero or the |= below is a no-op. */
+	static_assert(PROC_ENTRY_PERMANENT != 0);
 	pde->flags |= PROC_ENTRY_PERMANENT;
 }
 
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 184cddeb8215c..00cc385bce212 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -23,6 +23,7 @@
 #include <linux/uidgid.h>
 #include <net/net_namespace.h>
 #include <linux/seq_file.h>
+#include <linux/security.h>
 
 #include "internal.h"
 
@@ -270,6 +271,7 @@ static struct net *get_proc_task_net(struct inode *dir)
 	struct task_struct *task;
 	struct nsproxy *ns;
 	struct net *net = NULL;
+	struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb);
 
 	rcu_read_lock();
 	task = pid_task(proc_pid(dir), PIDTYPE_PID);
@@ -282,6 +284,12 @@ static struct net *get_proc_task_net(struct inode *dir)
 	}
 	rcu_read_unlock();
 
+	if (net && (fs_info->pidonly == PROC_PIDONLY_ON) &&
+	    security_capable(fs_info->mounter_cred, net->user_ns, CAP_NET_ADMIN, CAP_OPT_NONE) < 0) {
+		put_net(net);
+		net = NULL;
+	}
+
 	return net;
 }
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 0f91005594710..99adddfeb4a44 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -223,12 +223,17 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	return 0;
 }
 
-static void proc_apply_options(struct proc_fs_info *fs_info,
+static int proc_apply_options(struct proc_fs_info *fs_info,
 			       struct fs_context *fc,
 			       struct user_namespace *user_ns)
 {
 	struct proc_fs_context *ctx = fc->fs_private;
 
+	if ((ctx->mask & (1 << Opt_subset)) &&
+	    fc->purpose == FS_CONTEXT_FOR_RECONFIGURE &&
+	    ctx->pidonly != fs_info->pidonly)
+		return invalf(fc, "proc: subset=pid cannot be changed\n");
+
 	if (ctx->mask & (1 << Opt_gid))
 		fs_info->pid_gid = make_kgid(user_ns, ctx->gid);
 	if (ctx->mask & (1 << Opt_hidepid))
@@ -240,6 +245,7 @@ static void proc_apply_options(struct proc_fs_info *fs_info,
 		put_pid_ns(fs_info->pid_ns);
 		fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
 	}
+	return 0;
 }
 
 static int proc_fill_super(struct super_block *s, struct fs_context *fc)
@@ -254,10 +260,13 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
 		return -ENOMEM;
 
 	fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
-	proc_apply_options(fs_info, fc, current_user_ns());
+	fs_info->mounter_cred = get_cred(fc->cred);
+	ret = proc_apply_options(fs_info, fc, current_user_ns());
+	if (ret)
+		return ret;
 
 	/* User space would break if executables or devices appear on proc */
-	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
+	s->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
 	s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
@@ -266,6 +275,9 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
 	s->s_time_gran = 1;
 	s->s_fs_info = fs_info;
 
+	if (fs_info->pidonly == PROC_PIDONLY_ON)
+		s->s_iflags |= SB_I_RESTRICTED_VARIANT;
+
 	/*
 	 * procfs isn't actually a stacking filesystem; however, there is
 	 * too much magic going on inside it to permit stacking things on
@@ -303,8 +315,7 @@ static int proc_reconfigure(struct fs_context *fc)
 
 	sync_filesystem(sb);
 
-	proc_apply_options(fs_info, fc, current_user_ns());
-	return 0;
+	return proc_apply_options(fs_info, fc, current_user_ns());
 }
 
 static int proc_get_tree(struct fs_context *fc)
@@ -350,6 +361,7 @@ static void proc_kill_sb(struct super_block *sb)
 	kill_anon_super(sb);
 	if (fs_info) {
 		put_pid_ns(fs_info->pid_ns);
+		put_cred(fs_info->mounter_cred);
 		kfree_rcu(fs_info, rcu);
 	}
 }
@@ -359,7 +371,7 @@ static struct file_system_type proc_fs_type = {
 	.init_fs_context	= proc_init_fs_context,
 	.parameters		= proc_fs_parameters,
 	.kill_sb		= proc_kill_sb,
-	.fs_flags		= FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM,
+	.fs_flags		= FS_USERNS_MOUNT | FS_USERNS_MOUNT_RESTRICTED | FS_DISALLOW_NOTIFY_PERM,
 };
 
 void __init proc_root_init(void)
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 4deb0eeadbdef..42fcd500fad21 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -202,7 +202,8 @@ static int qnx4_fill_super(struct super_block *s, struct fs_context *fc)
 		return -ENOMEM;
 	s->s_fs_info = qs;
 
-	sb_set_blocksize(s, QNX4_BLOCK_SIZE);
+	if (!sb_set_blocksize(s, QNX4_BLOCK_SIZE))
+		return -EINVAL;
 
 	s->s_op = &qnx4_sops;
 	s->s_magic = QNX4_SUPER_MAGIC;
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index e4295a5b55b34..88a4a1787ff04 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -241,6 +241,12 @@ static int __cifs_do_create(struct inode *dir, struct dentry *direntry,
 				goto cifs_create_get_file_info;
 			}
 
+			if ((oflags & __O_REGULAR) && !S_ISREG(newinode->i_mode)) {
+				CIFSSMBClose(xid, tcon, fid->netfid);
+				iput(newinode);
+				return -EFTYPE;
+			}
+
 			if (S_ISDIR(newinode->i_mode)) {
 				CIFSSMBClose(xid, tcon, fid->netfid);
 				iput(newinode);
@@ -458,9 +464,15 @@ cifs_create_set_dentry:
 		goto out_err;
 	}
 
-	if (newinode && S_ISDIR(newinode->i_mode)) {
-		rc = -EISDIR;
-		goto out_err;
+	if (newinode) {
+		if ((oflags & __O_REGULAR) && !S_ISREG(newinode->i_mode)) {
+			rc = -EFTYPE;
+			goto out_err;
+		}
+		if (S_ISDIR(newinode->i_mode)) {
+			rc = -EISDIR;
+			goto out_err;
+		}
 	}
 
 	*inode = newinode;
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 826d36ed13ec9..56b0f109e41b4 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -2873,7 +2873,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
 	}
 
 	cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n",
-		 full_path, inode, icount_read(inode),
+		 full_path, inode, icount_read_once(inode),
 		 dentry, cifs_get_time(dentry), jiffies);
 
 again:
diff --git a/fs/super.c b/fs/super.c
index 378e81efe643b..5d46a0d5b6169 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -882,7 +882,6 @@ void drop_super_exclusive(struct super_block *sb)
 	super_unlock_excl(sb);
 	put_super(sb);
 }
-EXPORT_SYMBOL(drop_super_exclusive);
 
 enum super_iter_flags_t {
 	SUPER_ITER_EXCL		= (1U << 0),
diff --git a/fs/sync.c b/fs/sync.c
index 942a60cfedfbf..4a84dd837b863 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -266,8 +266,7 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
 
 	i_mode = file_inode(file)->i_mode;
 	ret = -ESPIPE;
-	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
-			!S_ISLNK(i_mode))
+	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode))
 		goto out;
 
 	mapping = file->f_mapping;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index b199e8ff79b1f..88c10823fcafa 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -23,20 +23,6 @@
 static struct kernfs_root *sysfs_root;
 struct kernfs_node *sysfs_root_kn;
 
-static int sysfs_get_tree(struct fs_context *fc)
-{
-	struct kernfs_fs_context *kfc = fc->fs_private;
-	int ret;
-
-	ret = kernfs_get_tree(fc);
-	if (ret)
-		return ret;
-
-	if (kfc->new_sb_created)
-		fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
-	return 0;
-}
-
 static void sysfs_fs_context_free(struct fs_context *fc)
 {
 	struct kernfs_fs_context *kfc = fc->fs_private;
@@ -49,7 +35,7 @@ static void sysfs_fs_context_free(struct fs_context *fc)
 
 static const struct fs_context_operations sysfs_fs_context_ops = {
 	.free		= sysfs_fs_context_free,
-	.get_tree	= sysfs_get_tree,
+	.get_tree	= kernfs_get_tree,
 };
 
 static int sysfs_init_fs_context(struct fs_context *fc)
@@ -93,7 +79,7 @@ static struct file_system_type sysfs_fs_type = {
 	.name			= "sysfs",
 	.init_fs_context	= sysfs_init_fs_context,
 	.kill_sb		= sysfs_kill_sb,
-	.fs_flags		= FS_USERNS_MOUNT,
+	.fs_flags		= FS_USERNS_MOUNT | FS_USERNS_MOUNT_RESTRICTED,
 };
 
 int __init sysfs_init(void)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 9a77d8b64ffa7..38972786817e7 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -358,7 +358,7 @@ static void ubifs_evict_inode(struct inode *inode)
 		goto out;
 
 	dbg_gen("inode %llu, mode %#x", inode->i_ino, (int)inode->i_mode);
-	ubifs_assert(c, !icount_read(inode));
+	ubifs_assert(c, !icount_read_once(inode));
 
 	truncate_inode_pages_final(&inode->i_data);
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9978ac1422fc4..ddf2707c88945 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1040,7 +1040,7 @@ xfs_itruncate_extents_flags(
 	int			error = 0;
 
 	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
-	if (icount_read(VFS_I(ip)))
+	if (icount_read_once(VFS_I(ip)))
 		xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
 	if (whichfork == XFS_DATA_FORK)
 		ASSERT(new_size <= XFS_ISIZE(ip));
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 1c098cfc5c00d..f87c738d84b24 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1158,7 +1158,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
 	TP_fast_assign(
 		__entry->dev = VFS_I(ip)->i_sb->s_dev;
 		__entry->ino = ip->i_ino;
-		__entry->count = icount_read(VFS_I(ip));
+		__entry->count = icount_read_once(VFS_I(ip));
 		__entry->pincount = atomic_read(&ip->i_pincount);
 		__entry->iflags = ip->i_flags;
 		__entry->caller_ip = caller_ip;
author	Mark Brown <broonie@kernel.org>	2026-05-29 14:59:13 +0100
committer	Mark Brown <broonie@kernel.org>	2026-05-29 14:59:13 +0100
commit	d692b8845cb3cfa4d005a5a360a26f692b09d178 (patch)
tree	01330ed4686ce3e141e8bf069b7bc11bf770f6f2 /fs
parent	98e9aea604a182a40f8fb4b77ac426b3fdf00031 (diff)
parent	4bd540bd9a0d7a2e8403a139e9f7631b06a57e89 (diff)
download	linux-next-history-d692b8845cb3cfa4d005a5a360a26f692b09d178.tar.gz