diff options
| author | Mark Brown <broonie@kernel.org> | 2026-05-29 14:59:13 +0100 |
|---|---|---|
| committer | Mark Brown <broonie@kernel.org> | 2026-05-29 14:59:13 +0100 |
| commit | d692b8845cb3cfa4d005a5a360a26f692b09d178 (patch) | |
| tree | 01330ed4686ce3e141e8bf069b7bc11bf770f6f2 /fs | |
| parent | 98e9aea604a182a40f8fb4b77ac426b3fdf00031 (diff) | |
| parent | 4bd540bd9a0d7a2e8403a139e9f7631b06a57e89 (diff) | |
| download | linux-next-history-d692b8845cb3cfa4d005a5a360a26f692b09d178.tar.gz | |
next-20260522/vfs-brauner
# Conflicts:
# fs/fuse/dev.c
Diffstat (limited to 'fs')
48 files changed, 1388 insertions, 744 deletions
diff --git a/fs/affs/affs.h b/fs/affs/affs.h index a0caf6ace8601..44a3f69d275f9 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -227,11 +227,6 @@ static inline bool affs_validblock(struct super_block *sb, int block) block < AFFS_SB(sb)->s_partition_size); } -static inline void -affs_set_blocksize(struct super_block *sb, int size) -{ - sb_set_blocksize(sb, size); -} static inline struct buffer_head * affs_bread(struct super_block *sb, int block) { diff --git a/fs/affs/super.c b/fs/affs/super.c index 079f36e1ddec1..b232251aa7bbd 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -358,7 +358,8 @@ static int affs_fill_super(struct super_block *sb, struct fs_context *fc) size = bdev_nr_sectors(sb->s_bdev); pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size); - affs_set_blocksize(sb, PAGE_SIZE); + if (!sb_set_blocksize(sb, PAGE_SIZE)) + return -EINVAL; /* Try to find root block. Its location depends on the block size. */ i = bdev_logical_block_size(sb->s_bdev); @@ -374,7 +375,8 @@ static int affs_fill_super(struct super_block *sb, struct fs_context *fc) if (ctx->root_block < 0) sbi->s_root_block = (ctx->reserved + size - 1) / 2; pr_debug("setting blocksize to %d\n", blocksize); - affs_set_blocksize(sb, blocksize); + if (!sb_set_blocksize(sb, blocksize)) + return -EINVAL; sbi->s_partition_size = size; /* The root block location that was calculated above is not diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index c12caae9a9679..ee0cbae521b9f 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -860,7 +860,8 @@ befs_fill_super(struct super_block *sb, struct fs_context *fc) */ sb->s_magic = BEFS_SUPER_MAGIC; /* Set real blocksize of fs */ - sb_set_blocksize(sb, (ulong) befs_sb->block_size); + if (!sb_set_blocksize(sb, (ulong) befs_sb->block_size)) + goto unacquire_priv_sbp; sb->s_op = &befs_sops; sb->s_export_op = &befs_export_operations; sb->s_time_min = 0; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 19e49c8cf7505..9c3e90390824c 100644 --- 
a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -346,7 +346,8 @@ static int bfs_fill_super(struct super_block *s, struct fs_context *fc) s->s_time_min = 0; s->s_time_max = U32_MAX; - sb_set_blocksize(s, BFS_BSIZE); + if (!sb_set_blocksize(s, BFS_BSIZE)) + goto out; sbh = sb_bread(s, 0); if (!sbh) diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index e4e51a1d0de28..606319dd69e80 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -100,7 +100,7 @@ static bool match_security_bpf_prefix(const char *name__str) static int bpf_xattr_read_permission(const char *name, struct inode *inode) { - if (WARN_ON(!inode)) + if (!inode) return -EINVAL; /* Allow reading xattr with user. and security.bpf. prefix */ @@ -170,7 +170,7 @@ __bpf_kfunc_end_defs(); static int bpf_xattr_write_permission(const char *name, struct inode *inode) { - if (WARN_ON(!inode)) + if (!inode) return -EINVAL; /* Only allow setting and removing security.bpf. xattrs */ @@ -289,6 +289,9 @@ __bpf_kfunc int bpf_set_dentry_xattr(struct dentry *dentry, const char *name__st struct inode *inode = d_inode(dentry); int ret; + if (!inode) + return -EINVAL; + inode_lock(inode); ret = bpf_set_dentry_xattr_locked(dentry, name__str, value_p, flags); inode_unlock(inode); @@ -314,6 +317,9 @@ __bpf_kfunc int bpf_remove_dentry_xattr(struct dentry *dentry, const char *name_ struct inode *inode = d_inode(dentry); int ret; + if (!inode) + return -EINVAL; + inode_lock(inode); ret = bpf_remove_dentry_xattr_locked(dentry, name__str); inode_unlock(inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 61b5594c4206f..799a7409950e9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4570,7 +4570,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root) inode = btrfs_find_first_inode(root, min_ino); while (inode) { - if (icount_read(&inode->vfs_inode) > 1) + if (icount_read_once(&inode->vfs_inode) > 1) d_prune_aliases(&inode->vfs_inode); min_ino = btrfs_ino(inode) + 1; diff --git a/fs/ceph/file.c b/fs/ceph/file.c 
index d54d71669176b..0ad42e1cc3058 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -996,6 +996,10 @@ retry: ceph_init_inode_acls(newino, &as_ctx); file->f_mode |= FMODE_CREATED; } + if ((flags & __O_REGULAR) && !d_is_reg(dentry)) { + err = -EFTYPE; + goto out_req; + } err = finish_open(file, dentry, ceph_open); } out_req: diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ed17e0023705e..0edb6a2515012 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2267,7 +2267,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg) int count; dput(dentry); d_prune_aliases(inode); - count = icount_read(inode); + count = icount_read_once(inode); if (count == 1) (*remaining)--; doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n", diff --git a/fs/coredump.c b/fs/coredump.c index bb6fdb1f458e9..e68a76ff92a38 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -395,8 +395,7 @@ static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm, cred->gid)); break; case 'd': - err = cn_printf(cn, "%d", - __get_dumpable(cprm->mm_flags)); + err = cn_printf(cn, "%d", cprm->dumpable); break; /* signal that caused the coredump */ case 's': @@ -869,11 +868,11 @@ static inline void coredump_sock_shutdown(struct file *file) { } static inline bool coredump_socket(struct core_name *cn, struct coredump_params *cprm) { return false; } #endif -/* cprm->mm_flags contains a stable snapshot of dumpability flags. */ +/* cprm->dumpable is the snapshot of task dumpability at dump start. */ static inline bool coredump_force_suid_safe(const struct coredump_params *cprm) { /* Require nonrelative corefile path and be extra careful. 
*/ - return __get_dumpable(cprm->mm_flags) == SUID_DUMP_ROOT; + return cprm->dumpable == TASK_DUMPABLE_ROOT; } static bool coredump_file(struct core_name *cn, struct coredump_params *cprm, @@ -1085,7 +1084,7 @@ static inline bool coredump_skip(const struct coredump_params *cprm, return true; if (!binfmt->core_dump) return true; - if (!__get_dumpable(cprm->mm_flags)) + if (cprm->dumpable == TASK_DUMPABLE_OFF) return true; return false; } @@ -1170,14 +1169,9 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) struct coredump_params cprm = { .siginfo = siginfo, .limit = rlimit(RLIMIT_CORE), - /* - * We must use the same mm->flags while dumping core to avoid - * inconsistency of bit flags, since this flag is not protected - * by any locks. - * - * Note that we only care about MMF_DUMP* flags. - */ - .mm_flags = __mm_flags_get_dumpable(mm), + /* Snapshot MMF_DUMP_FILTER_* (unlocked) and dumpable for the dump. */ + .mm_flags = __mm_flags_get_word(mm), + .dumpable = task_exec_state_get_dumpable(current), .vma_meta = NULL, .cpu = raw_smp_processor_id(), }; @@ -1419,7 +1413,7 @@ EXPORT_SYMBOL(dump_align); void validate_coredump_safety(void) { - if (suid_dumpable == SUID_DUMP_ROOT && + if (suid_dumpable == TASK_DUMPABLE_ROOT && core_pattern[0] != '/' && core_pattern[0] != '|' && core_pattern[0] != '@') { coredump_report_failure("Unsafe core_pattern used with fs.suid_dumpable=2: " @@ -1488,7 +1482,8 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write, return -EINVAL; } - validate_coredump_safety(); + if (strncmp(old_core_pattern, core_pattern, CORENAME_MAX_SIZE)) + validate_coredump_safety(); return error; } diff --git a/fs/dcache.c b/fs/dcache.c index 2c61aeea41f45..d6f5053132053 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -426,9 +426,16 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) this_cpu_inc(nr_dentry_negative); } +#define DENTRY_WARN_ONCE(condition, dentry) \ + WARN_ONCE((condition), "dentry=%p d_flags=0x%x\n", 
(dentry), (dentry)->d_flags) +#define D_FLAG_VERIFY(dentry, x) \ + DENTRY_WARN_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x), (dentry)) + static void dentry_free(struct dentry *dentry) { - WARN_ON(d_really_is_positive(dentry)); + DENTRY_WARN_ONCE(d_really_is_positive(dentry), dentry); + DENTRY_WARN_ONCE(dentry->d_lockref.count >= 0, dentry); + D_FLAG_VERIFY(dentry, 0); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->count))) { @@ -495,7 +502,6 @@ static void dentry_unlink_inode(struct dentry * dentry) * These helper functions make sure we always follow the * rules. d_lock must be held by the caller. */ -#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) static void d_lru_add(struct dentry *dentry) { D_FLAG_VERIFY(dentry, 0); @@ -1820,10 +1826,10 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) name = &slash_name; dname = dentry->d_shortname.string; } else if (name->len > DNAME_INLINE_LEN-1) { - size_t size = offsetof(struct external_name, name[1]); - struct external_name *p = kmalloc(size + name->len, - GFP_KERNEL_ACCOUNT | - __GFP_RECLAIMABLE); + struct external_name *p; + + p = kmalloc_flex(*p, name, name->len + 1, + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; @@ -2100,6 +2106,10 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode) __d_instantiate(entry, inode); spin_unlock(&entry->d_lock); WARN_ON(!(inode_state_read(inode) & I_NEW)); + /* + * Paired with igrab_from_hash() + */ + smp_wmb(); inode_state_clear(inode, I_NEW | I_CREATING); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index a3090b446af10..a569e98d4a996 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -41,45 +41,170 @@ #include <net/busy_poll.h> /* - * LOCKING: - * 
There are three level of locking required by epoll : + * fs/eventpoll.c - Efficient event polling ("epoll") kernel implementation. * - * 1) epnested_mutex (mutex) - * 2) ep->mtx (mutex) - * 3) ep->lock (spinlock) * - * The acquire order is the one listed above, from 1 to 3. - * We need a spinlock (ep->lock) because we manipulate objects - * from inside the poll callback, that might be triggered from - * a wake_up() that in turn might be called from IRQ context. - * So we can't sleep inside the poll callback and hence we need - * a spinlock. During the event transfer loop (from kernel to - * user space) we could end up sleeping due a copy_to_user(), so - * we need a lock that will allow us to sleep. This lock is a - * mutex (ep->mtx). It is acquired during the event transfer loop, - * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). - * The epnested_mutex is acquired when inserting an epoll fd onto another - * epoll fd. We do this so that we walk the epoll tree and ensure that this - * insertion does not create a cycle of epoll file descriptors, which - * could lead to deadlock. We need a global mutex to prevent two - * simultaneous inserts (A into B and B into A) from racing and - * constructing a cycle without either insert observing that it is - * going to. - * It is necessary to acquire multiple "ep->mtx"es at once in the - * case when one epoll fd is added to another. In this case, we - * always acquire the locks in the order of nesting (i.e. after - * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired - * before e2->mtx). Since we disallow cycles of epoll file - * descriptors, this ensures that the mutexes are well-ordered. In - * order to communicate this nesting to lockdep, when walking a tree - * of epoll file descriptors, we use the current recursion depth as - * the lockdep subkey. 
- * It is possible to drop the "ep->mtx" and to use the global - * mutex "epnested_mutex" (together with "ep->lock") to have it working, - * but having "ep->mtx" will make the interface more scalable. - * Events that require holding "epnested_mutex" are very rare, while for - * normal operations the epoll private "ep->mtx" will guarantee - * a better scalability. + * Overview + * -------- + * + * Each epoll_create(2) returns an anonymous [eventpoll] file whose + * ->private_data is a struct eventpoll. Each EPOLL_CTL_ADD installs + * a struct epitem linking one (watched file, fd) pair back to that + * eventpoll via the watched file's f_op->poll() wait queue(s). When + * the watched file signals readiness, ep_poll_callback() fires and + * marks the epitem ready. epoll_wait(2) drains the ready list under + * ep->mtx, re-queueing items in level-triggered mode. + * + * epoll instances can watch other epoll instances up to EP_MAX_NESTS + * deep; cycles are forbidden and detected at EPOLL_CTL_ADD time. + * + * + * Locking + * ------- + * + * Three levels, acquired from outer to inner: + * + * epnested_mutex (global; rare; taken only for EPOLL_CTL_ADD + * loop / path checks) + * > ep->mtx (per-eventpoll; sleepable; serializes most ops) + * > ep->lock (per-eventpoll; IRQ-safe spinlock) + * + * file->f_lock (per-file; NOT IRQ-safe; guards f_ep hlist ops; + * nested inside ep->mtx, outside ep->lock) + * + * Rationale: + * - ep->lock is a spinlock because ep_poll_callback() is called from + * wake_up() which may run in hard-IRQ context. All ep->lock + * critical sections use spin_lock_irqsave(). + * - ep->mtx is a sleepable mutex because the event delivery loop + * calls copy_to_user(), and ep_insert() may sleep in + * kmem_cache_alloc() and f_op->poll(). 
+ * - epnested_mutex is global because cycle detection needs a global + * view of the epoll topology; a per-object scheme would let two + * concurrent inserts (A into B, B into A) construct a cycle + * without either observer seeing it. + * - Per-ep ep->mtx is preferred for scalability elsewhere. Events + * that require epnested_mutex are rare. + * + * When EPOLL_CTL_ADD nests one eventpoll inside another we acquire + * ep->mtx on both: outer first, target second. Since cycles are + * forbidden the set of live ep->mtx holds is always a strict chain, + * communicated to lockdep via mutex_lock_nested() subclasses derived + * from the current recursion depth. + * + * + * Field protection + * ---------------- + * + * struct eventpoll: + * mtx - self + * rbr - ep->mtx + * ovflist, rdllist - ep->lock (IRQ-safe) + * wq - ep->lock for queue mutation + * poll_wait - internal waitqueue spinlock + * refs - file->f_lock for adds; ep->mtx for removes; + * RCU for readers (hlist_del_rcu + kfree_rcu(ep)) + * ws - ep->mtx + * gen, loop_check_depth - epnested_mutex + * file, user - immutable after setup + * refcount - atomic (refcount_t) + * napi_* - READ_ONCE / WRITE_ONCE + * + * struct epitem: + * rbn / rcu union - rbn: ep->mtx (while epi is linked in ep->rbr). + * rcu: written only by kfree_rcu(epi) on the free + * path; otherwise untouched by epoll code. + * rdllink, ovflist_next - ep->lock + * ffd, ep - immutable after ep_insert() + * pwqlist - ep->mtx for writes; POLLFREE clears pwq->whead + * via smp_store_release(), see below + * fllink - file->f_lock for mutation; hlist_del_rcu + + * kfree_rcu(epi) for safe RCU readers + * ws - RCU (rcu_assign_pointer / + * rcu_dereference_check(mtx)) + * event - ep->mtx for writes; lockless read in + * ep_poll_callback pairs with smp_mb() in + * ep_modify() + * + * + * Ready-list state machine + * ------------------------ + * + * Readiness is tracked in two lists under ep->lock: + * + * rdllist - doubly-linked FIFO; the "current" ready list. 
+ * ovflist - singly-linked LIFO; used during a scan to catch + * events that arrive while rdllist is being iterated + * without ep->lock. + * + * Encoded in ep->ovflist: + * EP_UNACTIVE_PTR - no scan active; callback appends to rdllist. + * NULL - scan active, no spill yet. + * pointer to epi - scan active with spilled items (LIFO). + * + * Encoded in epi->ovflist_next: + * EP_UNACTIVE_PTR - epi is not on ovflist. + * otherwise - next epi on ovflist (NULL at tail). + * + * ep_start_scan() flips "not scanning" to "scanning" and splices + * rdllist into a caller-local scan_batch. ep_done_scan() drains ovflist + * back to rdllist (list_add head-insert reverses LIFO to FIFO), + * flips back to "not scanning", and re-splices any items the caller + * left in scan_batch (e.g., level-triggered re-queues). + * + * + * Removal paths + * ------------- + * + * Three paths dispose of epitems and/or eventpolls: + * + * A. ep_remove() - EPOLL_CTL_DEL and ep_insert() + * rollback. Caller holds ep->mtx. + * B. ep_clear_and_put() - close of the epoll fd itself + * (ep_eventpoll_release). + * C. eventpoll_release_file() - close of a watched file, invoked + * from __fput(). + * + * Coordination: + * A and C exclude each other via the watched file's refcount. + * A pins the file with epi_fget() before touching file->f_ep or + * file->f_lock; if the pin fails, __fput() is in flight and C + * will clean this epi up. See the epi_fget() block comment. + * A and B both hold ep->mtx serially. B walks the rbtree with + * rb_next() captured before ep_remove() erases the current node. + * B and C both take ep->mtx; the loser sees fewer entries or an + * empty file->f_ep. + * + * Within every path the internal order is strict: + * ep_unregister_pollwait() - drain pwqlist; synchronizes with any + * in-flight ep_poll_callback via the + * watched wait-queue head's lock. + * ep_remove_file() - hlist_del_rcu of epi->fllink and, + * if last watcher, clear file->f_ep, + * under file->f_lock. 
+ * ep_remove_epi() - rb_erase, rdllist unlink (ep->lock), + * wakeup_source_unregister, + * kfree_rcu(epi). + * + * kfree_rcu(epi) defers the free past RCU readers in + * reverse_path_check_proc(); kfree_rcu(ep) defers past readers in + * ep_get_upwards_depth_proc(). + * + * + * POLLFREE handshake + * ------------------ + * + * When a subsystem tears down a wait-queue head that an epitem is + * registered on (binder, signalfd, ...), it wakes the callback with + * POLLFREE and must RCU-defer the head's free. The store/load pair: + * + * ep_poll_callback() POLLFREE branch: + * smp_store_release(&pwq->whead, NULL) + * + * ep_remove_wait_queue(): + * smp_load_acquire(&pwq->whead) + * + * See those sites for the full argument. */ /* Epoll private bits inside the event mask */ @@ -99,11 +224,6 @@ #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) -struct epoll_filefd { - struct file *file; - int fd; -} __packed; - /* Wait structure used by the poll hooks */ struct eppoll_entry { /* List header used to link this structure to the "struct epitem" */ @@ -136,17 +256,19 @@ struct epitem { struct rcu_head rcu; }; - /* List header used to link this structure to the eventpoll ready list */ + /* Link on the owning eventpoll's ready list (ep->rdllist). */ struct list_head rdllink; /* - * Works together "struct eventpoll"->ovflist in keeping the - * single linked chain of items. + * Link on the owning eventpoll's scan-overflow list (ep->ovflist), + * EP_UNACTIVE_PTR when not linked. See epi_on_ovflist() / + * epi_clear_ovflist() and the "Ready-list state machine" section + * in the top-of-file banner. 
*/ - struct epitem *next; + struct epitem *ovflist_next; /* The file descriptor information this item refers to */ - struct epoll_filefd ffd; + struct epoll_key ffd; /* List containing poll wait queues */ struct eppoll_entry *pwqlist; @@ -247,13 +369,77 @@ struct ep_pqueue { /* Maximum number of epoll watched descriptors, per user */ static long max_user_watches __read_mostly; -/* Used for cycles detection */ +/* + * Cycle and path-length checks at EPOLL_CTL_ADD + * --------------------------------------------- + * + * When EPOLL_CTL_ADD creates a link that either targets an eventpoll + * file or extends an existing chain of eventpolls, two checks run: + * + * 1. no cycle is being formed -- ep_loop_check() walks downward + * from the candidate target, and ep_get_upwards_depth_proc() + * walks upward from the outer ep, both bounded by EP_MAX_NESTS. + * 2. no file accumulates more than path_limits[depth] wakeup paths + * of a given length -- reverse_path_check(). + * + * Both need a global view of the epoll topology and must be atomic + * with the insertion, so the check is serialized by epnested_mutex + * and carries its scratch state on a stack-allocated struct + * ep_ctl_ctx scoped to one do_epoll_ctl() call. Non-nested inserts + * skip this machinery entirely and take only ep->mtx. + * + * epnested_mutex Serializes the whole check. + * loop_check_gen Global monotonic stamp, bumped at the start of + * a check and again at the end. ep->gen caches + * the value under which ep was last visited by + * ep_loop_check_proc() or + * ep_get_upwards_depth_proc(); the post-check + * bump ensures those cached stamps can no longer + * equal loop_check_gen, so the + * "ep->gen == loop_check_gen" trigger in + * ep_ctl_lock() only fires while another check + * is in flight. + * + * struct ep_ctl_ctx carries the rest (inserting_into, tfile_check_list, + * path_count[]) through the walk; see its declaration below. 
+ * + * Commits fdcfce93073d ("eventpoll: Fix integer overflow in + * ep_loop_check_proc()") and f2e467a48287 ("eventpoll: Fix + * semi-unbounded recursion") hardened the walk; any refactor must + * preserve both bail-outs. + */ static DEFINE_MUTEX(epnested_mutex); - static u64 loop_check_gen = 0; -/* Used to check for epoll file descriptor inclusion loops */ -static struct eventpoll *inserting_into; +#define PATH_ARR_SIZE 5 + +/* + * Per-do_epoll_ctl() scratch for the loop / path checks. Allocated on + * the caller's stack; populated by ep_ctl_lock() and the downward + * walk; consumed by reverse_path_check(); released by ep_ctl_unlock(). + * Only valid while the caller holds epnested_mutex. + */ +struct ep_ctl_ctx { + /* + * Outer eventpoll for one ep_loop_check(); if the downward walk + * reaches it the insert would form a cycle. + */ + struct eventpoll *inserting_into; + + /* + * Singly-linked list of epitems_head objects collected during + * ep_loop_check_proc(), then walked by reverse_path_check(). + * NULL means empty. + */ + struct epitems_head *tfile_check_list; + + /* + * Per-depth wakeup-path tally used by reverse_path_check_proc(); + * reinitialized to zero at the start of each reverse_path_check() + * iteration. + */ + int path_count[PATH_ARR_SIZE]; +}; /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __ro_after_init; @@ -262,14 +448,15 @@ static struct kmem_cache *epi_cache __ro_after_init; static struct kmem_cache *pwq_cache __ro_after_init; /* - * List of files with newly added links, where we may need to limit the number - * of emanating paths. Protected by the epnested_mutex. + * Wrapper anchor for file->f_ep when the watched file is not itself an + * eventpoll; for the epoll-watches-epoll case, file->f_ep points at + * &watched_ep->refs directly. The ->next field threads + * ctx->tfile_check_list during one EPOLL_CTL_ADD path check. 
*/ struct epitems_head { struct hlist_head epitems; struct epitems_head *next; }; -static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; static struct kmem_cache *ephead_cache __ro_after_init; @@ -279,14 +466,14 @@ static inline void free_ephead(struct epitems_head *head) kmem_cache_free(ephead_cache, head); } -static void list_file(struct file *file) +static void list_file(struct file *file, struct ep_ctl_ctx *ctx) { struct epitems_head *head; head = container_of(file->f_ep, struct epitems_head, epitems); if (!head->next) { - head->next = tfile_check_list; - tfile_check_list = head; + head->next = ctx->tfile_check_list; + ctx->tfile_check_list = head; } } @@ -334,29 +521,20 @@ static void __init epoll_sysctls_init(void) static const struct file_operations eventpoll_fops; -static inline int is_file_epoll(struct file *f) +bool is_file_epoll(struct file *f) { return f->f_op == &eventpoll_fops; } -/* Setup the structure that is used as key for the RB tree */ -static inline void ep_set_ffd(struct epoll_filefd *ffd, - struct file *file, int fd) -{ - ffd->file = file; - ffd->fd = fd; -} - /* Compare RB tree keys */ -static inline int ep_cmp_ffd(struct epoll_filefd *p1, - struct epoll_filefd *p2) +static inline int ep_cmp_ffd(struct epoll_key *p1, struct epoll_key *p2) { return (p1->file > p2->file ? +1: (p1->file < p2->file ? -1 : p1->fd - p2->fd)); } -/* Tells us if the item is currently linked */ -static inline int ep_is_linked(struct epitem *epi) +/* True iff @epi is on its owning ep's ready list. */ +static inline bool ep_is_linked(struct epitem *epi) { return !list_empty(&epi->rdllink); } @@ -372,18 +550,47 @@ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) return container_of(p, struct eppoll_entry, wait)->base; } -/** - * ep_events_available - Checks if ready events might be available. - * - * @ep: Pointer to the eventpoll context. - * - * Return: a value different than %zero if ready events are available, - * or %zero otherwise. 
+/* + * Ready-list / ovflist state (see "Ready-list state machine" in the + * top-of-file banner for the full state machine). EP_UNACTIVE_PTR is + * the sentinel; these wrappers name each transition and each test so + * call sites do not need to know the sentinel's value. */ -static inline int ep_events_available(struct eventpoll *ep) + +/* True iff @ep is between ep_enter_scan() and ep_exit_scan(). */ +static inline bool ep_is_scanning(struct eventpoll *ep) +{ + return READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; +} + +/* Called by ep_start_scan(): divert ep_poll_callback() to ovflist. */ +static inline void ep_enter_scan(struct eventpoll *ep) +{ + WRITE_ONCE(ep->ovflist, NULL); +} + +/* Called by ep_done_scan(): redirect ep_poll_callback() back to rdllist. */ +static inline void ep_exit_scan(struct eventpoll *ep) +{ + WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); +} + +/* True iff @epi is currently linked on its ep's ovflist. */ +static inline bool epi_on_ovflist(const struct epitem *epi) +{ + return epi->ovflist_next != EP_UNACTIVE_PTR; +} + +/* Mark @epi as not on any ovflist (init and post-drain). */ +static inline void epi_clear_ovflist(struct epitem *epi) +{ + epi->ovflist_next = EP_UNACTIVE_PTR; +} + +/* True iff @ep has ready events that epoll_wait() might harvest. */ +static inline bool ep_events_available(struct eventpoll *ep) { - return !list_empty_careful(&ep->rdllist) || - READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; + return !list_empty_careful(&ep->rdllist) || ep_is_scanning(ep); } #ifdef CONFIG_NET_RX_BUSY_POLL @@ -659,10 +866,15 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq) rcu_read_lock(); /* - * If it is cleared by POLLFREE, it should be rcu-safe. - * If we read NULL we need a barrier paired with - * smp_store_release() in ep_poll_callback(), otherwise - * we rely on whead->lock. + * POLLFREE handshake, acquire side; see "POLLFREE handshake" + * at the top of this file. 
+ * + * A NULL load is paired with the smp_store_release(&whead, NULL) + * in ep_poll_callback()'s POLLFREE branch: the teardown is + * complete and we must not touch whead again. On a non-NULL load + * rcu_read_lock() keeps the waitqueue memory alive (POLLFREE + * firers RCU-defer the free) and whead->lock inside + * remove_wait_queue() serializes us against the store side. */ whead = smp_load_acquire(&pwq->whead); if (whead) @@ -723,7 +935,7 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi) * ep->mutex needs to be held because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ -static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) +static void ep_start_scan(struct eventpoll *ep, struct list_head *scan_batch) { /* * Steal the ready list, and re-init the original one to the @@ -735,13 +947,13 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) */ lockdep_assert_irqs_enabled(); spin_lock_irq(&ep->lock); - list_splice_init(&ep->rdllist, txlist); - WRITE_ONCE(ep->ovflist, NULL); + list_splice_init(&ep->rdllist, scan_batch); + ep_enter_scan(ep); spin_unlock_irq(&ep->lock); } static void ep_done_scan(struct eventpoll *ep, - struct list_head *txlist) + struct list_head *scan_batch) { struct epitem *epi, *nepi; @@ -751,34 +963,29 @@ static void ep_done_scan(struct eventpoll *ep, * other events might have been queued by the poll callback. * We re-insert them inside the main ready-list here. */ - for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; - nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { + for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; ) { + nepi = epi->ovflist_next; + epi_clear_ovflist(epi); /* - * We need to check if the item is already in the list. - * During the "sproc" callback execution time, items are - * queued into ->ovflist but the "txlist" might already - * contain them, and the list_splice() below takes care of them. 
+ * Skip items that the caller already returned via @scan_batch + * -- the list_splice() below takes care of those. */ if (!ep_is_linked(epi)) { /* - * ->ovflist is LIFO, so we have to reverse it in order - * to keep in FIFO. + * ovflist is LIFO; list_add() head-insert here + * reverses the iteration order into FIFO. */ list_add(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } - /* - * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after - * releasing the lock, events will be queued in the normal way inside - * ep->rdllist. - */ - WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); + /* Back out of scan mode; callbacks target ep->rdllist again. */ + ep_exit_scan(ep); /* - * Quickly re-inject items left on "txlist". + * Quickly re-inject items left on "scan_batch". */ - list_splice(txlist, &ep->rdllist); + list_splice(scan_batch, &ep->rdllist); __pm_relax(ep->ws); if (!list_empty(&ep->rdllist)) { @@ -795,9 +1002,10 @@ static void ep_get(struct eventpoll *ep) } /* - * Returns true if the event poll can be disposed + * Drop a reference to @ep; returns true iff it was the last, in which + * case the caller is responsible for ep_free(). */ -static bool ep_refcount_dec_and_test(struct eventpoll *ep) +static bool ep_put(struct eventpoll *ep) { if (!refcount_dec_and_test(&ep->refcount)) return false; @@ -817,22 +1025,23 @@ static void ep_free(struct eventpoll *ep) } /* - * The ffd.file pointer may be in the process of being torn down due to - * being closed, but we may not have finished eventpoll_release() yet. - * - * Normally, even with the atomic_long_inc_not_zero, the file may have - * been free'd and then gotten re-allocated to something else (since - * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU). + * Pin @epi->ffd.file for operations that require both safe dereference + * and exclusion from __fput(). 
* - * But for epoll, users hold the ep->mtx mutex, and as such any file in - * the process of being free'd will block in eventpoll_release_file() - * and thus the underlying file allocation will not be free'd, and the - * file re-use cannot happen. + * struct file uses SLAB_TYPESAFE_BY_RCU, so a freed slot can be + * reassigned at any time. The bare load of epi->ffd.file is safe here + * because the caller holds ep->mtx and eventpoll_release_file() blocks + * on that mutex while tearing down the epi, so the backing file + * allocation cannot be freed and reused under us. An rcu_read_lock() + * is therefore unnecessary for the load. * - * For the same reason we can avoid a rcu_read_lock() around the - * operation - 'ffd.file' cannot go away even if the refcount has - * reached zero (but we must still not call out to ->poll() functions - * etc). + * A successful file_ref_get() additionally blocks __fput() from + * starting on this file: once the refcount has reached zero it cannot + * come back. ep_remove() relies on that to touch file->f_lock and + * file->f_ep without racing eventpoll_release_file() (see commit + * a6dc643c6931). A NULL return means __fput() is already in flight; + * the caller must bail without touching the file, and + * eventpoll_release_file() will clean the epi up from its side. */ static struct file *epi_fget(const struct epitem *epi) { @@ -858,7 +1067,13 @@ static void ep_remove_file(struct eventpoll *ep, struct epitem *epi, spin_lock(&file->f_lock); head = file->f_ep; if (hlist_is_singular_node(&epi->fllink, head)) { - /* See eventpoll_release() for details. */ + /* + * Last watcher: publish NULL so the eventpoll_release() + * fastpath in include/linux/eventpoll.h can skip the slow + * path on a future __fput(). Safe because every f_ep writer + * either holds a pin on @file via epi_fget() or is __fput() + * itself -- see the comment in eventpoll_release(). 
+ */ WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { struct epitems_head *v; @@ -919,47 +1134,82 @@ static void ep_remove(struct eventpoll *ep, struct epitem *epi) ep_remove_file(ep, epi, file); ep_remove_epi(ep, epi); - WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); + WARN_ON_ONCE(ep_put(ep)); } -static void ep_clear_and_put(struct eventpoll *ep) +/* + * Pass 1 of ep_clear_and_put(): drain every epi's pwqlist. + * ep_unregister_pollwait() takes each watched wait-queue head's lock, + * which synchronizes with any in-flight ep_poll_callback(); after + * this returns no callback can still be about to dereference an epi + * on this ep. Must strictly precede ep_drain_tree() -- fusing the + * two walks would let a callback queued on epi_i still fire after + * epi_{i+k} had already been freed. + */ +static void ep_drain_pollwaits(struct eventpoll *ep) { - struct rb_node *rbp, *next; + struct rb_node *rbp; struct epitem *epi; - /* We need to release all tasks waiting for these file */ - if (waitqueue_active(&ep->poll_wait)) - ep_poll_safewake(ep, NULL, 0); - - mutex_lock(&ep->mtx); + lockdep_assert_held(&ep->mtx); - /* - * Walks through the whole tree by unregistering poll callbacks. - */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); cond_resched(); } +} + +/* + * Pass 2 of ep_clear_and_put(): ep_remove() every epi. The per-epi + * pwqlist is already empty (ep_drain_pollwaits ran), but the rest of + * ep_remove() still runs: epi_fget() pin, f_ep clear under f_lock, + * rbtree erase, rdllist unlink, kfree_rcu(epi). rb_next() is captured + * before each erase so the iteration is stable. + * + * A concurrent eventpoll_release_file() (removal path C) on a watched + * file serializes with us via ep->mtx; ep_remove() transparently + * hands off any epi whose file is in __fput() by bailing when + * epi_fget() returns NULL, and path C will clean that epi up. 
+ */ +static void ep_drain_tree(struct eventpoll *ep) +{ + struct rb_node *rbp, *next; + struct epitem *epi; + + lockdep_assert_held(&ep->mtx); - /* - * Walks through the whole tree and try to free each "struct epitem". - * Note that ep_remove() will not remove the epitem in case of a - * racing eventpoll_release_file(); the latter will do the removal. - * At this point we are sure no poll callbacks will be lingering around. - * Since we still own a reference to the eventpoll struct, the loop can't - * dispose it. - */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) { next = rb_next(rbp); epi = rb_entry(rbp, struct epitem, rbn); ep_remove(ep, epi); cond_resched(); } +} +/* + * Removal path B (see "Removal paths" in the top-of-file banner): + * close of the epoll fd itself, reached via ep_eventpoll_release(). + * + * Two passes under ep->mtx: first ep_drain_pollwaits() quiesces + * in-flight callbacks, then ep_drain_tree() frees the epis. The + * ep->refcount is kept > 0 across the walk by the ep file's own + * share, which we drop below; ep_free() runs iff we were the last + * holder after the tree drained. + */ +static void ep_clear_and_put(struct eventpoll *ep) +{ + /* Release any threads blocked in poll-on-ep. */ + if (waitqueue_active(&ep->poll_wait)) + ep_poll_safewake(ep, NULL, 0); + + mutex_lock(&ep->mtx); + ep_drain_pollwaits(ep); + ep_drain_tree(ep); mutex_unlock(&ep->mtx); - if (ep_refcount_dec_and_test(ep)) + + if (ep_put(ep)) ep_free(ep); } @@ -999,7 +1249,7 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth) { struct eventpoll *ep = file->private_data; - LIST_HEAD(txlist); + LIST_HEAD(scan_batch); struct epitem *epi, *tmp; poll_table pt; __poll_t res = 0; @@ -1014,8 +1264,8 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep * the ready list. 
*/ mutex_lock_nested(&ep->mtx, depth); - ep_start_scan(ep, &txlist); - list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { + ep_start_scan(ep, &scan_batch); + list_for_each_entry_safe(epi, tmp, &scan_batch, rdllink) { if (ep_item_poll(epi, &pt, depth + 1)) { res = EPOLLIN | EPOLLRDNORM; break; @@ -1029,7 +1279,7 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep list_del_init(&epi->rdllink); } } - ep_done_scan(ep, &txlist); + ep_done_scan(ep, &scan_batch); mutex_unlock(&ep->mtx); return res; } @@ -1138,7 +1388,7 @@ again: mutex_unlock(&ep->mtx); - if (ep_refcount_dec_and_test(ep)) + if (ep_put(ep)) ep_free(ep); goto again; } @@ -1159,7 +1409,7 @@ static int ep_alloc(struct eventpoll **pep) init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT_CACHED; - ep->ovflist = EP_UNACTIVE_PTR; + ep->ovflist = EP_UNACTIVE_PTR; /* not scanning */ ep->user = get_current_user(); refcount_set(&ep->refcount, 1); @@ -1173,17 +1423,15 @@ static int ep_alloc(struct eventpoll **pep) * are protected by the "mtx" mutex, and ep_find() must be called with * "mtx" held. 
*/ -static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) +static struct epitem *ep_find(struct eventpoll *ep, struct epoll_key *tf) { int kcmp; struct rb_node *rbp; struct epitem *epi, *epir = NULL; - struct epoll_filefd ffd; - ep_set_ffd(&ffd, file, fd); for (rbp = ep->rbr.rb_root.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); - kcmp = ep_cmp_ffd(&ffd, &epi->ffd); + kcmp = ep_cmp_ffd(tf, &epi->ffd); if (kcmp > 0) rbp = rbp->rb_right; else if (kcmp < 0) @@ -1197,50 +1445,6 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) return epir; } -#ifdef CONFIG_KCMP -static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) -{ - struct rb_node *rbp; - struct epitem *epi; - - for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { - epi = rb_entry(rbp, struct epitem, rbn); - if (epi->ffd.fd == tfd) { - if (toff == 0) - return epi; - else - toff--; - } - cond_resched(); - } - - return NULL; -} - -struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, - unsigned long toff) -{ - struct file *file_raw; - struct eventpoll *ep; - struct epitem *epi; - - if (!is_file_epoll(file)) - return ERR_PTR(-EINVAL); - - ep = file->private_data; - - mutex_lock(&ep->mtx); - epi = ep_find_tfd(ep, tfd, toff); - if (epi) - file_raw = epi->ffd.file; - else - file_raw = ERR_PTR(-ENOENT); - mutex_unlock(&ep->mtx); - - return file_raw; -} -#endif /* CONFIG_KCMP */ - /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they @@ -1283,9 +1487,9 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. 
*/ - if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { - if (epi->next == EP_UNACTIVE_PTR) { - epi->next = READ_ONCE(ep->ovflist); + if (ep_is_scanning(ep)) { + if (!epi_on_ovflist(epi)) { + epi->ovflist_next = READ_ONCE(ep->ovflist); WRITE_ONCE(ep->ovflist, epi); ep_pm_stay_awake_rcu(epi); } @@ -1336,17 +1540,24 @@ out_unlock: if (pollflags & POLLFREE) { /* - * If we race with ep_remove_wait_queue() it can miss - * ->whead = NULL and do another remove_wait_queue() after - * us, so we can't use __remove_wait_queue(). + * POLLFREE handshake, release side; see "POLLFREE handshake" + * at the top of this file. + * + * Unlink our wait entry with list_del_init rather than + * __remove_wait_queue: a concurrent ep_remove_wait_queue() + * that already loaded a non-NULL whead may still call + * remove_wait_queue() after us, and list_del_init() tolerates + * the second delete. + * + * smp_store_release(&whead, NULL) publishes the teardown to + * ep_remove_wait_queue()'s smp_load_acquire(). Before this + * store, a racing ep_clear_and_put() / ep_remove() reaches + * ep_remove_wait_queue() which sees whead != NULL and takes + * whead->lock -- the same lock held by our caller, so it + * serializes behind us. Once whead is zeroed, nothing else + * protects ep / epi / wait. */ list_del_init(&wait->entry); - /* - * ->whead != NULL protects us from the race with - * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue() - * takes whead->lock held by the caller. Once we nullify it, - * nothing protects ep/epi or even wait. - */ smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); } @@ -1407,41 +1618,40 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) -#define PATH_ARR_SIZE 5 /* - * These are the number paths of length 1 to 5, that we are allowing to emanate - * from a single file of interest. For example, we allow 1000 paths of length - * 1, to emanate from each file of interest. 
This essentially represents the - * potential wakeup paths, which need to be limited in order to avoid massive - * uncontrolled wakeup storms. The common use case should be a single ep which - * is connected to n file sources. In this case each file source has 1 path - * of length 1. Thus, the numbers below should be more than sufficient. These - * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify - * and delete can't add additional paths. Protected by the epnested_mutex. + * Upper bound on wakeup paths emanating from any one watched file, + * indexed by path depth (1..PATH_ARR_SIZE). For example, we allow + * 1000 paths of length 1 from each watched file. These caps limit + * the wakeup amplification that can be built from epoll-watches- + * epoll topologies without rejecting reasonable usage. + * + * Enforced at EPOLL_CTL_ADD; CTL_MOD and CTL_DEL cannot add paths. + * The running tallies live in ctx->path_count[] and are protected by + * epnested_mutex. */ static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; -static int path_count[PATH_ARR_SIZE]; -static int path_count_inc(int nests) +static int path_count_inc(struct ep_ctl_ctx *ctx, int nests) { /* Allow an arbitrary number of depth 1 paths */ if (nests == 0) return 0; - if (++path_count[nests] > path_limits[nests]) + if (++ctx->path_count[nests] > path_limits[nests]) return -1; return 0; } -static void path_count_init(void) +static void path_count_init(struct ep_ctl_ctx *ctx) { int i; for (i = 0; i < PATH_ARR_SIZE; i++) - path_count[i] = 0; + ctx->path_count[i] = 0; } -static int reverse_path_check_proc(struct hlist_head *refs, int depth) +static int reverse_path_check_proc(struct ep_ctl_ctx *ctx, + struct hlist_head *refs, int depth) { int error = 0; struct epitem *epi; @@ -1453,9 +1663,9 @@ static int reverse_path_check_proc(struct hlist_head *refs, int depth) hlist_for_each_entry_rcu(epi, refs, fllink) { struct hlist_head *refs = &epi->ep->refs; if 
(hlist_empty(refs)) - error = path_count_inc(depth); + error = path_count_inc(ctx, depth); else - error = reverse_path_check_proc(refs, depth + 1); + error = reverse_path_check_proc(ctx, refs, depth + 1); if (error != 0) break; } @@ -1463,24 +1673,23 @@ static int reverse_path_check_proc(struct hlist_head *refs, int depth) } /** - * reverse_path_check - The tfile_check_list is list of epitem_head, which have - * links that are proposed to be newly added. We need to - * make sure that those added links don't add too many - * paths such that we will spend all our time waking up - * eventpoll objects. + * reverse_path_check - ctx->tfile_check_list is a list of epitems_head + * anchoring files with newly proposed links; make + * sure those links don't push any path-length bucket + * over its limit in path_limits[]. * * Return: %zero if the proposed links don't create too many paths, * %-1 otherwise. */ -static int reverse_path_check(void) +static int reverse_path_check(struct ep_ctl_ctx *ctx) { struct epitems_head *p; - for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) { + for (p = ctx->tfile_check_list; p; p = p->next) { int error; - path_count_init(); + path_count_init(ctx); rcu_read_lock(); - error = reverse_path_check_proc(&p->epitems, 0); + error = reverse_path_check_proc(ctx, &p->epitems, 0); rcu_read_unlock(); if (error) return error; @@ -1526,7 +1735,7 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi) wakeup_source_unregister(ws); } -static int attach_epitem(struct file *file, struct epitem *epi) +static int ep_attach_file(struct file *file, struct epitem *epi) { struct epitems_head *to_free = NULL; struct hlist_head *head = NULL; @@ -1561,69 +1770,115 @@ allocate: } /* - * Must be called with "mtx" held. + * Charge the user's epoll_watches quota, allocate a fresh epitem for + * @tf, and initialize its fields. 
The returned item is not yet linked + * into any data structure; the caller must install it via + * ep_register_epitem() (which takes over on success) or kmem_cache_free() + * it and decrement epoll_watches on its own. + * + * Returns ERR_PTR(-ENOSPC) if the quota is exceeded, ERR_PTR(-ENOMEM) + * if the slab allocation fails. */ -static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, - struct file *tfile, int fd, int full_check) +static struct epitem *ep_alloc_epitem(struct eventpoll *ep, + const struct epoll_event *event, + struct epoll_key *tf) { - int error, pwake = 0; - __poll_t revents; struct epitem *epi; - struct ep_pqueue epq; - struct eventpoll *tep = NULL; - - if (is_file_epoll(tfile)) - tep = tfile->private_data; - - lockdep_assert_irqs_enabled(); if (unlikely(percpu_counter_compare(&ep->user->epoll_watches, max_user_watches) >= 0)) - return -ENOSPC; + return ERR_PTR(-ENOSPC); percpu_counter_inc(&ep->user->epoll_watches); - if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) { + epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL); + if (unlikely(!epi)) { percpu_counter_dec(&ep->user->epoll_watches); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } - /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); epi->ep = ep; - ep_set_ffd(&epi->ffd, tfile, fd); + epi->ffd = *tf; epi->event = *event; - epi->next = EP_UNACTIVE_PTR; + epi_clear_ovflist(epi); + + return epi; +} + +/* + * Install @epi into its target file's f_ep hlist and into @ep's rbtree, + * taking one additional reference on @ep for the lifetime of the item. + * + * If @tep is non-NULL, the target file is itself an eventpoll; we hold + * tep->mtx at subclass 1 across the attach + rbtree insert to serialize + * with the target side. RB tree ops are protected by @ep->mtx, which + * the caller already holds. + * + * On failure the epi is freed and the epoll_watches counter decremented, + * matching ep_alloc_epitem()'s allocation. 
After this returns + * successfully, ep_insert()'s later error paths use ep_remove() for + * unwind; that cannot drop @ep's refcount to zero because the ep file + * itself still holds the original reference. + */ +static int ep_register_epitem(struct ep_ctl_ctx *ctx, struct eventpoll *ep, + struct epitem *epi, struct eventpoll *tep, + int full_check) +{ + struct file *tfile = epi->ffd.file; + int error; if (tep) mutex_lock_nested(&tep->mtx, 1); - /* Add the current item to the list of active epoll hook for this file */ - if (unlikely(attach_epitem(tfile, epi) < 0)) { + + error = ep_attach_file(tfile, epi); + if (unlikely(error)) { if (tep) mutex_unlock(&tep->mtx); kmem_cache_free(epi_cache, epi); percpu_counter_dec(&ep->user->epoll_watches); - return -ENOMEM; + return error; } if (full_check && !tep) - list_file(tfile); + list_file(tfile, ctx); - /* - * Add the current item to the RB tree. All RB tree operations are - * protected by "mtx", and ep_insert() is called with "mtx" held. - */ ep_rbtree_insert(ep, epi); + if (tep) mutex_unlock(&tep->mtx); - /* - * ep_remove() calls in the later error paths can't lead to - * ep_free() as the ep file itself still holds an ep reference. - */ ep_get(ep); + return 0; +} + +/* + * Must be called with "mtx" held. + */ +static int ep_insert(struct ep_ctl_ctx *ctx, struct eventpoll *ep, + const struct epoll_event *event, struct epoll_key *tf, + int full_check) +{ + int error, pwake = 0; + __poll_t revents; + struct epitem *epi; + struct ep_pqueue epq; + struct eventpoll *tep = NULL; - /* now check if we've created too many backpaths */ - if (unlikely(full_check && reverse_path_check())) { + if (is_file_epoll(tf->file)) + tep = tf->file->private_data; + + lockdep_assert_irqs_enabled(); + + epi = ep_alloc_epitem(ep, event, tf); + if (IS_ERR(epi)) + return PTR_ERR(epi); + + error = ep_register_epitem(ctx, ep, epi, tep, full_check); + if (error) + return error; + + /* Reject the insert if the new link would create too many back-paths. 
*/ + if (unlikely(full_check && reverse_path_check(ctx))) { ep_remove(ep, epi); return -EINVAL; } @@ -1649,28 +1904,21 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, */ revents = ep_item_poll(epi, &epq.pt, 1); - /* - * We have to check if something went wrong during the poll wait queue - * install process. Namely an allocation for a wait queue failed due - * high memory pressure. - */ + /* ep_ptable_queue_proc() signals allocation failure by clearing epq.epi. */ if (unlikely(!epq.epi)) { ep_remove(ep, epi); return -ENOMEM; } - /* We have to drop the new item inside our item list to keep track of it */ + /* Drop the new item onto the ready list if it is already ready. */ spin_lock_irq(&ep->lock); - /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); - /* If the file is already "ready" we drop it inside the ready list */ if (revents && !ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); - /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) @@ -1762,11 +2010,87 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, return 0; } +/* + * Attempt to deliver one event for @epi into @*uevents. + * + * Returns 1 if an event was delivered (with *uevents advanced to the + * next slot), 0 if the re-poll reported no caller-requested events + * (@epi drops out of the ready list; a future callback will re-add + * it), or -EFAULT if copy_to_user() faulted (in which case @epi is + * re-inserted at the head of @scan_batch so ep_done_scan() merges it + * back to rdllist for the next attempt). + * + * PM bookkeeping and level-triggered re-queue are handled here. + * Caller holds ep->mtx and the scan is active. 
+ */ +static int ep_deliver_event(struct eventpoll *ep, struct epitem *epi, + poll_table *pt, + struct epoll_event __user **uevents, + struct list_head *scan_batch) +{ + struct epoll_event __user *next; + struct wakeup_source *ws; + __poll_t revents; + + /* + * Activate ep->ws before deactivating epi->ws to prevent + * triggering auto-suspend here (in case we reactivate epi->ws + * below). Rearranging to delay the deactivation would let + * epi->ws drift out of sync with ep_is_linked(). + */ + ws = ep_wakeup_source(epi); + if (ws) { + if (ws->active) + __pm_stay_awake(ep->ws); + __pm_relax(ws); + } + + list_del_init(&epi->rdllink); + + /* + * Re-poll under ep->mtx so userspace cannot change the item + * out from under us. If no caller-requested events remain, + * @epi stays off the ready list; the poll callback will + * re-queue it when events next appear. + */ + revents = ep_item_poll(epi, pt, 1); + if (!revents) + return 0; + + next = epoll_put_uevent(revents, epi->event.data, *uevents); + if (!next) { + /* + * copy_to_user() faulted: put the item back so + * ep_done_scan() splices it onto rdllist for the next + * attempt. + */ + list_add(&epi->rdllink, scan_batch); + ep_pm_stay_awake(epi); + return -EFAULT; + } + *uevents = next; + + if (epi->event.events & EPOLLONESHOT) { + epi->event.events &= EP_PRIVATE_BITS; + } else if (!(epi->event.events & EPOLLET)) { + /* + * Level-triggered: re-queue so the next epoll_wait() + * rechecks availability. We are the sole writer to + * rdllist here -- epoll_ctl() callers are locked out + * by ep->mtx, and the poll callback queues to ovflist + * during scans. 
+ */ + list_add_tail(&epi->rdllink, &ep->rdllist); + ep_pm_stay_awake(epi); + } + return 1; +} + static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { struct epitem *epi, *tmp; - LIST_HEAD(txlist); + LIST_HEAD(scan_batch); poll_table pt; int res = 0; @@ -1781,74 +2105,28 @@ static int ep_send_events(struct eventpoll *ep, init_poll_funcptr(&pt, NULL); mutex_lock(&ep->mtx); - ep_start_scan(ep, &txlist); + ep_start_scan(ep, &scan_batch); /* - * We can loop without lock because we are passed a task private list. - * Items cannot vanish during the loop we are holding ep->mtx. + * We can loop without lock because we are passed a task-private + * scan_batch; items cannot vanish while we hold ep->mtx. */ - list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { - struct wakeup_source *ws; - __poll_t revents; + list_for_each_entry_safe(epi, tmp, &scan_batch, rdllink) { + int delivered; if (res >= maxevents) break; - /* - * Activate ep->ws before deactivating epi->ws to prevent - * triggering auto-suspend here (in case we reactive epi->ws - * below). - * - * This could be rearranged to delay the deactivation of epi->ws - * instead, but then epi->ws would temporarily be out of sync - * with ep_is_linked(). - */ - ws = ep_wakeup_source(epi); - if (ws) { - if (ws->active) - __pm_stay_awake(ep->ws); - __pm_relax(ws); - } - - list_del_init(&epi->rdllink); - - /* - * If the event mask intersect the caller-requested one, - * deliver the event to userspace. Again, we are holding ep->mtx, - * so no operations coming from userspace can change the item. 
- */ - revents = ep_item_poll(epi, &pt, 1); - if (!revents) - continue; - - events = epoll_put_uevent(revents, epi->event.data, events); - if (!events) { - list_add(&epi->rdllink, &txlist); - ep_pm_stay_awake(epi); + delivered = ep_deliver_event(ep, epi, &pt, &events, &scan_batch); + if (delivered < 0) { if (!res) - res = -EFAULT; + res = delivered; break; } - res++; - if (epi->event.events & EPOLLONESHOT) - epi->event.events &= EP_PRIVATE_BITS; - else if (!(epi->event.events & EPOLLET)) { - /* - * If this file has been added with Level - * Trigger mode, we need to insert back inside - * the ready list, so that the next call to - * epoll_wait() will check again the events - * availability. At this point, no one can insert - * into ep->rdllist besides us. The epoll_ctl() - * callers are locked out by - * ep_send_events() holding "mtx" and the - * poll callback will queue them in ep->ovflist. - */ - list_add_tail(&epi->rdllink, &ep->rdllist); - ep_pm_stay_awake(epi); - } + res += delivered; } - ep_done_scan(ep, &txlist); + + ep_done_scan(ep, &scan_batch); mutex_unlock(&ep->mtx); return res; @@ -1938,7 +2216,8 @@ static int ep_schedule_timeout(ktime_t *to) static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout) { - int res, eavail, timed_out = 0; + int res, timed_out = 0; + bool eavail; u64 slack = 0; wait_queue_entry_t wait; ktime_t expires, *to = NULL; @@ -2036,7 +2315,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, * If timed out and still on the wait queue, recheck eavail * carefully under lock, below. */ - eavail = 1; + eavail = true; if (!list_empty_careful(&wait.entry)) { spin_lock_irq(&ep->lock); @@ -2066,7 +2345,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, * Return: depth of the subtree, or a value bigger than EP_MAX_NESTS if we found * a loop or went too deep. 
*/ -static int ep_loop_check_proc(struct eventpoll *ep, int depth) +static int ep_loop_check_proc(struct ep_ctl_ctx *ctx, + struct eventpoll *ep, int depth) { int result = 0; struct rb_node *rbp; @@ -2082,22 +2362,23 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) if (unlikely(is_file_epoll(epi->ffd.file))) { struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; - if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) + if (ep_tovisit == ctx->inserting_into || + depth > EP_MAX_NESTS) result = EP_MAX_NESTS+1; else - result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1); + result = max(result, + ep_loop_check_proc(ctx, ep_tovisit, + depth + 1) + 1); if (result > EP_MAX_NESTS) break; } else { /* - * If we've reached a file that is not associated with - * an ep, then we need to check if the newly added - * links are going to add too many wakeup paths. We do - * this by adding it to the tfile_check_list, if it's - * not already there, and calling reverse_path_check() - * during ep_insert(). + * A non-epoll leaf. Queue it for the companion + * reverse_path_check() that runs after this walk so + * any new links we propose don't add too many wakeup + * paths. */ - list_file(epi->ffd.file); + list_file(epi->ffd.file, ctx); } } ep->loop_check_depth = result; @@ -2126,22 +2407,24 @@ static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth) * into another epoll file (represented by @ep) does not create * closed loops or too deep chains. * - * @ep: Pointer to the epoll we are inserting into. - * @to: Pointer to the epoll to be inserted. + * @ctx: Per-CTL_ADD scratch context. + * @ep: Pointer to the epoll we are inserting into. + * @to: Pointer to the epoll to be inserted. * * Return: %zero if adding the epoll @to inside the epoll @from * does not violate the constraints, or %-1 otherwise. 
*/ -static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) +static int ep_loop_check(struct ep_ctl_ctx *ctx, struct eventpoll *ep, + struct eventpoll *to) { int depth, upwards_depth; - inserting_into = ep; + ctx->inserting_into = ep; /* * Check how deep down we can get from @to, and whether it is possible * to loop up to @ep. */ - depth = ep_loop_check_proc(to, 0); + depth = ep_loop_check_proc(ctx, to, 0); if (depth > EP_MAX_NESTS) return -1; /* Check how far up we can go from @ep. */ @@ -2152,12 +2435,12 @@ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0; } -static void clear_tfile_check_list(void) +static void clear_tfile_check_list(struct ep_ctl_ctx *ctx) { rcu_read_lock(); - while (tfile_check_list != EP_UNACTIVE_PTR) { - struct epitems_head *head = tfile_check_list; - tfile_check_list = head->next; + while (ctx->tfile_check_list) { + struct epitems_head *head = ctx->tfile_check_list; + ctx->tfile_check_list = head->next; unlist_file(head); } rcu_read_unlock(); @@ -2223,38 +2506,105 @@ static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) } #endif -static inline int epoll_mutex_lock(struct mutex *mutex, int depth, - bool nonblock) +static inline int epoll_mutex_lock(struct mutex *mutex, bool nonblock) { if (!nonblock) { - mutex_lock_nested(mutex, depth); + mutex_lock(mutex); return 0; } - if (mutex_trylock(mutex)) + return mutex_trylock(mutex) ? 0 : -EAGAIN; +} + +/* + * Acquire the locks required for do_epoll_ctl() on @ep for @op. + * + * Always takes ep->mtx. For EPOLL_CTL_ADD, additionally runs the + * loop / path check under epnested_mutex when the topology can + * change: @ep is already watched (epfile->f_ep non-NULL), @ep was + * recently loop-checked (ep->gen == loop_check_gen), or @tfile is + * itself an eventpoll. + * + * Return value encodes both outcome and lock state: + * + * 0 success; ep->mtx held. 
+ * 1 success; ep->mtx held AND the full check ran under + * epnested_mutex (which is also still held). The value + * doubles as the @full_check argument to ep_insert(). + * -errno failure; no locks held. + * + * The caller releases what was taken with ep_ctl_unlock(ep, ret). + * + * Holding epnested_mutex on add is what prevents two racing + * EPOLL_CTL_ADDs on different eps from building a cycle without + * either walker observing it. + */ +static int ep_ctl_lock(struct ep_ctl_ctx *ctx, struct eventpoll *ep, int op, + struct file *epfile, struct file *tfile, bool nonblock) +{ + struct eventpoll *tep; + int error; + + error = epoll_mutex_lock(&ep->mtx, nonblock); + if (error) + return error; + + if (op != EPOLL_CTL_ADD) + return 0; + if (!READ_ONCE(epfile->f_ep) && ep->gen != loop_check_gen && + !is_file_epoll(tfile)) return 0; - return -EAGAIN; + + /* Full check needed: drop ep->mtx so we can take epnested_mutex. */ + mutex_unlock(&ep->mtx); + error = epoll_mutex_lock(&epnested_mutex, nonblock); + if (error) + return error; + + loop_check_gen++; + + if (is_file_epoll(tfile)) { + tep = tfile->private_data; + if (ep_loop_check(ctx, ep, tep) != 0) { + error = -ELOOP; + goto err_unlock_nested; + } + } + + error = epoll_mutex_lock(&ep->mtx, nonblock); + if (error) + goto err_unlock_nested; + + return 1; + +err_unlock_nested: + clear_tfile_check_list(ctx); + loop_check_gen++; + mutex_unlock(&epnested_mutex); + return error; } -int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, - bool nonblock) +static void ep_ctl_unlock(struct ep_ctl_ctx *ctx, struct eventpoll *ep, + int full_check) +{ + mutex_unlock(&ep->mtx); + if (full_check) { + clear_tfile_check_list(ctx); + loop_check_gen++; + mutex_unlock(&epnested_mutex); + } +} + +int do_epoll_ctl_file(struct file *f, int op, struct epoll_key *tf, + struct epoll_event *epds, bool nonblock) { int error; - int full_check = 0; + int full_check; struct eventpoll *ep; struct epitem *epi; - struct eventpoll *tep = 
NULL; - - CLASS(fd, f)(epfd); - if (fd_empty(f)) - return -EBADF; - - /* Get the "struct file *" for the target file */ - CLASS(fd, tf)(fd); - if (fd_empty(tf)) - return -EBADF; + struct ep_ctl_ctx ctx = { }; /* The target file descriptor must support poll */ - if (!file_can_poll(fd_file(tf))) + if (!file_can_poll(tf->file)) return -EPERM; /* Check if EPOLLWAKEUP is allowed */ @@ -2262,85 +2612,43 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, ep_take_care_of_epollwakeup(epds); /* - * We have to check that the file structure underneath the file descriptor - * the user passed to us _is_ an eventpoll file. And also we do not permit + * The @f file must itself be an eventpoll, and we do not permit * adding an epoll file descriptor inside itself. */ - error = -EINVAL; - if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f))) - goto error_tgt_fput; + if (f == tf->file || !is_file_epoll(f)) + return -EINVAL; /* * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. - * Also, we do not currently supported nested exclusive wakeups. + * Also, nested exclusive wakeups are not supported. */ if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) - goto error_tgt_fput; - if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) || + return -EINVAL; + if (op == EPOLL_CTL_ADD && (is_file_epoll(tf->file) || (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) - goto error_tgt_fput; + return -EINVAL; } - /* - * At this point it is safe to assume that the "private_data" contains - * our own data structure. - */ - ep = fd_file(f)->private_data; + ep = f->private_data; - /* - * When we insert an epoll file descriptor inside another epoll file - * descriptor, there is the chance of creating closed loops, which are - * better be handled here, than in more critical paths. 
While we are - * checking for loops we also determine the list of files reachable - * and hang them on the tfile_check_list, so we can check that we - * haven't created too many possible wakeup paths. - * - * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when - * the epoll file descriptor is attaching directly to a wakeup source, - * unless the epoll file descriptor is nested. The purpose of taking the - * 'epnested_mutex' on add is to prevent complex toplogies such as loops and - * deep wakeup paths from forming in parallel through multiple - * EPOLL_CTL_ADD operations. - */ - error = epoll_mutex_lock(&ep->mtx, 0, nonblock); - if (error) - goto error_tgt_fput; - if (op == EPOLL_CTL_ADD) { - if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen || - is_file_epoll(fd_file(tf))) { - mutex_unlock(&ep->mtx); - error = epoll_mutex_lock(&epnested_mutex, 0, nonblock); - if (error) - goto error_tgt_fput; - loop_check_gen++; - full_check = 1; - if (is_file_epoll(fd_file(tf))) { - tep = fd_file(tf)->private_data; - error = -ELOOP; - if (ep_loop_check(ep, tep) != 0) - goto error_tgt_fput; - } - error = epoll_mutex_lock(&ep->mtx, 0, nonblock); - if (error) - goto error_tgt_fput; - } - } + full_check = ep_ctl_lock(&ctx, ep, op, f, tf->file, nonblock); + if (full_check < 0) + return full_check; /* - * Try to lookup the file inside our RB tree. Since we grabbed "mtx" - * above, we can be sure to be able to use the item looked up by - * ep_find() till we release the mutex. + * Look the target up in ep's RB tree. We hold ep->mtx, so the + * item stays valid until we release. 
*/ - epi = ep_find(ep, fd_file(tf), fd); + epi = ep_find(ep, tf); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds->events |= EPOLLERR | EPOLLHUP; - error = ep_insert(ep, epds, fd_file(tf), fd, full_check); + error = ep_insert(&ctx, ep, epds, tf, full_check); } else error = -EEXIST; break; @@ -2366,17 +2674,30 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, error = -ENOENT; break; } - mutex_unlock(&ep->mtx); -error_tgt_fput: - if (full_check) { - clear_tfile_check_list(); - loop_check_gen++; - mutex_unlock(&epnested_mutex); - } + ep_ctl_unlock(&ctx, ep, full_check); return error; } +int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, + bool nonblock) +{ + struct epoll_key efd; + + CLASS(fd, f)(epfd); + if (fd_empty(f)) + return -EBADF; + + /* Get the "struct file *" for the target file */ + CLASS(fd, tf)(fd); + if (fd_empty(tf)) + return -EBADF; + + efd.file = fd_file(tf); + efd.fd = fd; + return do_epoll_ctl_file(fd_file(f), op, &efd, epds, nonblock); +} + /* * The following function implements the controller interface for * the eventpoll file that enables the insertion/removal/change of @@ -2527,6 +2848,50 @@ SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, sigmask, sigsetsize); } +#ifdef CONFIG_KCMP +static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) +{ + struct rb_node *rbp; + struct epitem *epi; + + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + if (epi->ffd.fd == tfd) { + if (toff == 0) + return epi; + else + toff--; + } + cond_resched(); + } + + return NULL; +} + +struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, + unsigned long toff) +{ + struct file *file_raw; + struct eventpoll *ep; + struct epitem *epi; + + if (!is_file_epoll(file)) + return ERR_PTR(-EINVAL); + + ep = file->private_data; + + mutex_lock(&ep->mtx); + epi = ep_find_tfd(ep, tfd, toff); + if 
(epi) + file_raw = epi->ffd.file; + else + file_raw = ERR_PTR(-ENOENT); + mutex_unlock(&ep->mtx); + + return file_raw; +} +#endif /* CONFIG_KCMP */ + #ifdef CONFIG_COMPAT static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout, diff --git a/fs/exec.c b/fs/exec.c index ba12b4c466f6d..894added369dd 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -35,6 +35,7 @@ #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> +#include <linux/sched/exec_state.h> #include <linux/sched/signal.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> @@ -263,6 +264,9 @@ static int bprm_mm_init(struct linux_binprm *bprm) if (!mm) goto err; + /* Staged for would_dump() narrowing; consumed by begin_new_exec(). */ + bprm->user_ns = get_user_ns(current_user_ns()); + /* Save current stack limit for all calculations made during exec. */ task_lock(current->group_leader); bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; @@ -834,12 +838,17 @@ EXPORT_SYMBOL(read_code); * On success, this function returns with exec_update_lock * held for writing. 
*/ -static int exec_mmap(struct mm_struct *mm) +static int exec_mmap(struct mm_struct *mm, struct user_namespace *user_ns) { + struct task_exec_state *exec_state __free(put_task_exec_state) = NULL; struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; int ret; + exec_state = alloc_task_exec_state(user_ns); + if (!exec_state) + return -ENOMEM; + /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; @@ -870,6 +879,7 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; tsk->mm = mm; mm_init_cid(mm, tsk); + exec_state = task_exec_state_replace(tsk, exec_state); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for @@ -1145,7 +1155,7 @@ int begin_new_exec(struct linux_binprm * bprm) * Release all of the old mmap stuff */ acct_arg_size(bprm, 0); - retval = exec_mmap(bprm->mm); + retval = exec_mmap(bprm->mm, bprm->user_ns); if (retval) goto out; @@ -1210,9 +1220,9 @@ int begin_new_exec(struct linux_binprm * bprm) if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || !(uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))) - set_dumpable(current->mm, suid_dumpable); + task_exec_state_set_dumpable(suid_dumpable); else - set_dumpable(current->mm, SUID_DUMP_USER); + task_exec_state_set_dumpable(TASK_DUMPABLE_OWNER); perf_event_exec(); @@ -1261,7 +1271,7 @@ int begin_new_exec(struct linux_binprm * bprm) * wait until new credentials are committed * by commit_creds() above */ - if (get_dumpable(me->mm) != SUID_DUMP_USER) + if (task_exec_state_get_dumpable(me) != TASK_DUMPABLE_OWNER) perf_event_exit_task(me); /* * cred_guard_mutex must be held at least to this point to prevent @@ -1298,14 +1308,14 @@ void would_dump(struct linux_binprm *bprm, struct file *file) struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; - /* Ensure mm->user_ns contains the executable */ - 
user_ns = old = bprm->mm->user_ns; + /* Ensure bprm->user_ns contains the executable. */ + user_ns = old = bprm->user_ns; while ((user_ns != &init_user_ns) && !privileged_wrt_inode_uidgid(user_ns, idmap, inode)) user_ns = user_ns->parent; if (old != user_ns) { - bprm->mm->user_ns = get_user_ns(user_ns); + bprm->user_ns = get_user_ns(user_ns); put_user_ns(old); } } @@ -1375,6 +1385,8 @@ static void free_bprm(struct linux_binprm *bprm) acct_arg_size(bprm, 0); mmput(bprm->mm); } + if (bprm->user_ns) + put_user_ns(bprm->user_ns); free_arg_pages(bprm); if (bprm->cred) { /* in case exec fails before de_thread() succeeds */ @@ -1905,17 +1917,6 @@ void set_binfmt(struct linux_binfmt *new) } EXPORT_SYMBOL(set_binfmt); -/* - * set_dumpable stores three-value SUID_DUMP_* into mm->flags. - */ -void set_dumpable(struct mm_struct *mm, int value) -{ - if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) - return; - - __mm_flags_set_mask_dumpable(mm, value); -} - static inline struct user_arg_ptr native_arg(const char __user *const __user *p) { return (struct user_arg_ptr){.ptr.native = p}; @@ -1975,9 +1976,11 @@ COMPAT_SYSCALL_DEFINE5(execveat, int, fd, static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + int error, old = READ_ONCE(suid_dumpable); + + error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!error && write) + if (!error && write && (old != READ_ONCE(suid_dumpable))) validate_coredump_safety(); return error; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3fd8f00998521..8c80d50875167 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -252,10 +252,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) "nonexistent device\n", __func__, __LINE__); return; } - if (icount_read(inode) > 1) { + if (icount_read_once(inode) > 1) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%llu: count=%d", __func__, 
__LINE__, inode->i_ino, - icount_read(inode)); + icount_read_once(inode)); return; } if (inode->i_nlink) { diff --git a/fs/fcntl.c b/fs/fcntl.c index beab8080badf6..b3ea135b74d8b 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -1169,10 +1169,10 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != + BUILD_BUG_ON(22 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | - __FMODE_EXEC)); + __FMODE_EXEC | __O_REGULAR)); fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, diff --git a/fs/file.c b/fs/file.c index 2c81c0b162d05..e5c75b22e0c7c 100644 --- a/fs/file.c +++ b/fs/file.c @@ -544,24 +544,23 @@ struct files_struct init_files = { static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ - unsigned int maxbit = maxfd / BITS_PER_LONG; - unsigned int bitbit = start / BITS_PER_LONG; + unsigned int max_fds_words = maxfd / BITS_PER_LONG; + unsigned int fds_word_idx = start / BITS_PER_LONG; unsigned int bit; /* * Try to avoid looking at the second level bitmap */ - bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG, + bit = find_next_zero_bit(&fdt->open_fds[fds_word_idx], BITS_PER_LONG, start & (BITS_PER_LONG - 1)); if (bit < BITS_PER_LONG) - return bit + bitbit * BITS_PER_LONG; + return bit + (fds_word_idx * BITS_PER_LONG); - bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; - if (bitbit >= maxfd) + bit = BITS_PER_LONG * + find_next_zero_bit(fdt->full_fds_bits, max_fds_words, fds_word_idx + 1); + if (bit >= maxfd) return maxfd; - if (bitbit > start) - start = bitbit; - return find_next_zero_bit(fdt->open_fds, maxfd, start); + return find_next_zero_bit(fdt->open_fds, maxfd, bit); } /* diff --git 
a/fs/filesystems.c b/fs/filesystems.c index 0c7d2b7ac26c8..771fc31a69b8f 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -17,22 +17,49 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs_parser.h> +#include <linux/rculist.h> /* - * Handling of filesystem drivers list. - * Rules: - * Inclusion to/removals from/scanning of list are protected by spinlock. - * During the unload module must call unregister_filesystem(). - * We can access the fields of list element if: - * 1) spinlock is held or - * 2) we hold the reference to the module. - * The latter can be guaranteed by call of try_module_get(); if it - * returned 0 we must skip the element, otherwise we got the reference. - * Once the reference is obtained we can drop the spinlock. + * Read-mostly filesystem drivers list. + * + * Readers walk under rcu_read_lock(); writers take file_systems_lock + * and publish via _rcu hlist primitives. unregister_filesystem() + * synchronize_rcu()s after unlock so the embedded file_system_type + * can't go away under a reader. To keep using a filesystem after + * the RCU section ends, take a module reference via try_module_get(). */ +static HLIST_HEAD(file_systems); +static DEFINE_SPINLOCK(file_systems_lock); + +#ifdef CONFIG_PROC_FS +/* + * Cache a stringified version of the filesystem list. + * + * The fs list gets queried a lot by userspace because of libselinux, including + * rather surprising programs (would you guess *sed* is on the list?). In order + * to reduce the overhead we cache the resulting string, which normally hangs + * around below 512 bytes in size. + * + * As the list almost never changes, its creation is not particularly optimized + * to keep things simple. + * + * We sort it out on read in order to not introduce a failure point for fs + * registration (in principle we may be unable to alloc memory for the list). 
+ */ +struct file_systems_string { + struct rcu_head rcu; + unsigned long gen; + size_t len; + char string[]; +}; -static struct file_system_type *file_systems; -static DEFINE_RWLOCK(file_systems_lock); +static unsigned long file_systems_gen; +static struct file_systems_string __rcu *file_systems_string; + +static void invalidate_filesystems_string(void); +#else +static inline void invalidate_filesystems_string(void) { } +#endif /* WARNING: This can be used only if we _already_ own a reference */ struct file_system_type *get_filesystem(struct file_system_type *fs) @@ -46,14 +73,15 @@ void put_filesystem(struct file_system_type *fs) module_put(fs->owner); } -static struct file_system_type **find_filesystem(const char *name, unsigned len) +static struct file_system_type *find_filesystem(const char *name, unsigned len) { - struct file_system_type **p; - for (p = &file_systems; *p; p = &(*p)->next) - if (strncmp((*p)->name, name, len) == 0 && - !(*p)->name[len]) - break; - return p; + struct file_system_type *fs; + + hlist_for_each_entry_rcu(fs, &file_systems, list, + lockdep_is_held(&file_systems_lock)) + if (strncmp(fs->name, name, len) == 0 && !fs->name[len]) + return fs; + return NULL; } /** @@ -64,33 +92,27 @@ static struct file_system_type **find_filesystem(const char *name, unsigned len) * is aware of for mount and other syscalls. Returns 0 on success, * or a negative errno code on an error. * - * The &struct file_system_type that is passed is linked into the kernel + * The &struct file_system_type that is passed is linked into the kernel * structures and must not be freed until the file system has been * unregistered. 
*/ - -int register_filesystem(struct file_system_type * fs) +int register_filesystem(struct file_system_type *fs) { - int res = 0; - struct file_system_type ** p; - if (fs->parameters && !fs_validate_description(fs->name, fs->parameters)) return -EINVAL; BUG_ON(strchr(fs->name, '.')); - if (fs->next) + if (!hlist_unhashed_lockless(&fs->list)) return -EBUSY; - write_lock(&file_systems_lock); - p = find_filesystem(fs->name, strlen(fs->name)); - if (*p) - res = -EBUSY; - else - *p = fs; - write_unlock(&file_systems_lock); - return res; -} + guard(spinlock)(&file_systems_lock); + if (find_filesystem(fs->name, strlen(fs->name))) + return -EBUSY; + hlist_add_tail_rcu(&fs->list, &file_systems); + invalidate_filesystems_string(); + return 0; +} EXPORT_SYMBOL(register_filesystem); /** @@ -100,94 +122,79 @@ EXPORT_SYMBOL(register_filesystem); * Remove a file system that was previously successfully registered * with the kernel. An error is returned if the file system is not found. * Zero is returned on a success. - * + * * Once this function has returned the &struct file_system_type structure * may be freed or reused. 
*/ - -int unregister_filesystem(struct file_system_type * fs) +int unregister_filesystem(struct file_system_type *fs) { - struct file_system_type ** tmp; - - write_lock(&file_systems_lock); - tmp = &file_systems; - while (*tmp) { - if (fs == *tmp) { - *tmp = fs->next; - fs->next = NULL; - write_unlock(&file_systems_lock); - synchronize_rcu(); - return 0; - } - tmp = &(*tmp)->next; + scoped_guard(spinlock, &file_systems_lock) { + if (hlist_unhashed(&fs->list)) + return -EINVAL; + hlist_del_init_rcu(&fs->list); + invalidate_filesystems_string(); } - write_unlock(&file_systems_lock); - - return -EINVAL; + synchronize_rcu(); + return 0; } - EXPORT_SYMBOL(unregister_filesystem); #ifdef CONFIG_SYSFS_SYSCALL -static int fs_index(const char __user * __name) +static int fs_index(const char __user *__name) { - struct file_system_type * tmp; + struct file_system_type *p; char *name __free(kfree) = strndup_user(__name, PATH_MAX); - int err, index; + int index = 0; if (IS_ERR(name)) return PTR_ERR(name); - err = -EINVAL; - read_lock(&file_systems_lock); - for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { - if (strcmp(tmp->name, name) == 0) { - err = index; - break; - } + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) { + if (strcmp(p->name, name) == 0) + return index; + index++; } - read_unlock(&file_systems_lock); - return err; + return -EINVAL; } -static int fs_name(unsigned int index, char __user * buf) +static int fs_name(unsigned int index, char __user *buf) { - struct file_system_type * tmp; - int len, res = -EINVAL; + struct file_system_type *p, *found = NULL; + int len, res; - read_lock(&file_systems_lock); - for (tmp = file_systems; tmp; tmp = tmp->next, index--) { - if (index == 0) { - if (try_module_get(tmp->owner)) - res = 0; + scoped_guard(rcu) { + hlist_for_each_entry_rcu(p, &file_systems, list) { + if (index--) + continue; + if (try_module_get(p->owner)) + found = p; break; } } - read_unlock(&file_systems_lock); - if (res) - 
return res; + if (!found) + return -EINVAL; /* OK, we got the reference, so we can safely block */ - len = strlen(tmp->name) + 1; - res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0; - put_filesystem(tmp); + len = strlen(found->name) + 1; + res = copy_to_user(buf, found->name, len) ? -EFAULT : 0; + put_filesystem(found); return res; } static int fs_maxindex(void) { - struct file_system_type * tmp; - int index; + struct file_system_type *p; + int index = 0; - read_lock(&file_systems_lock); - for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) - ; - read_unlock(&file_systems_lock); + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) + index++; return index; } /* - * Whee.. Weird sysv syscall. + * Whee.. Weird sysv syscall. */ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) { @@ -216,8 +223,8 @@ int __init list_bdev_fs_names(char *buf, size_t size) size_t len; int count = 0; - read_lock(&file_systems_lock); - for (p = file_systems; p; p = p->next) { + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) { if (!(p->fs_flags & FS_REQUIRES_DEV)) continue; len = strlen(p->name) + 1; @@ -230,30 +237,145 @@ int __init list_bdev_fs_names(char *buf, size_t size) size -= len; count++; } - read_unlock(&file_systems_lock); return count; } #ifdef CONFIG_PROC_FS -static int filesystems_proc_show(struct seq_file *m, void *v) +static void invalidate_filesystems_string(void) { - struct file_system_type * tmp; + struct file_systems_string *old; + + lockdep_assert_held_write(&file_systems_lock); + file_systems_gen++; + old = rcu_replace_pointer(file_systems_string, NULL, + lockdep_is_held(&file_systems_lock)); + if (old) + kfree_rcu(old, rcu); +} - read_lock(&file_systems_lock); - tmp = file_systems; - while (tmp) { +static __cold noinline int regen_filesystems_string(void) +{ + struct file_system_type *p; + struct file_systems_string *old, *new; + size_t newlen, usedlen; + unsigned long gen; + +retry: + 
newlen = 0; + + /* pre-calc space for each fs */ + spin_lock(&file_systems_lock); + gen = file_systems_gen; + hlist_for_each_entry_rcu(p, &file_systems, list) { + if (!(p->fs_flags & FS_REQUIRES_DEV)) + newlen += strlen("nodev"); + newlen += strlen("\t") + strlen(p->name) + strlen("\n"); + } + spin_unlock(&file_systems_lock); + + new = kmalloc(offsetof(struct file_systems_string, string) + newlen + 1, + GFP_KERNEL); + if (!new) + return -ENOMEM; + + new->gen = gen; + new->len = newlen; + new->string[newlen] = '\0'; + + spin_lock(&file_systems_lock); + old = file_systems_string; + + /* + * Did someone beat us to it? + */ + if (old && old->gen == file_systems_gen) { + kfree(new); + return 0; + } + + /* + * Did the list change in the meantime? + */ + if (gen != file_systems_gen) { + kfree(new); + goto retry; + } + + /* + * Populate the string. + * + * We know we have just enough space because we calculated the right + * size the previous time we had the lock and confirmed the list has + * not changed after reacquiring it. + */ + usedlen = 0; + hlist_for_each_entry_rcu(p, &file_systems, list) { + usedlen += sprintf(&new->string[usedlen], "%s\t%s\n", + (p->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + p->name); + } + + if (WARN_ON_ONCE(new->len != strlen(new->string))) { + /* + * Should never happen of course, keep this in case someone changes string + * generation above and messes it up. + */ + spin_unlock(&file_systems_lock); + if (old) + kfree_rcu(old, rcu); + return -EINVAL; + } + + /* + * Paired with consume fence in READ_ONCE() in filesystems_proc_show() + */ + smp_store_release(&file_systems_string, new); + spin_unlock(&file_systems_lock); + if (old) + kfree_rcu(old, rcu); + return 0; +} + +static __cold noinline int filesystems_proc_show_fallback(struct seq_file *m, void *v) +{ + struct file_system_type *p; + + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) { seq_printf(m, "%s\t%s\n", - (tmp->fs_flags & FS_REQUIRES_DEV) ? 
"" : "nodev", - tmp->name); - tmp = tmp->next; + (p->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + p->name); } - read_unlock(&file_systems_lock); return 0; } +static int filesystems_proc_show(struct seq_file *m, void *v) +{ + struct file_systems_string *fss; + + for (;;) { + scoped_guard(rcu) { + fss = rcu_dereference(file_systems_string); + if (likely(fss)) { + seq_write(m, fss->string, fss->len); + return 0; + } + } + + int err = regen_filesystems_string(); + if (unlikely(err)) + return filesystems_proc_show_fallback(m, v); + } +} + static int __init proc_filesystems_init(void) { - proc_create_single("filesystems", 0, NULL, filesystems_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("filesystems", 0, NULL, filesystems_proc_show); + if (!pde) + return -ENOMEM; + proc_make_permanent(pde); return 0; } module_init(proc_filesystems_init); @@ -263,11 +385,10 @@ static struct file_system_type *__get_fs_type(const char *name, int len) { struct file_system_type *fs; - read_lock(&file_systems_lock); - fs = *(find_filesystem(name, len)); + guard(rcu)(); + fs = find_filesystem(name, len); if (fs && !try_module_get(fs->owner)) fs = NULL; - read_unlock(&file_systems_lock); return fs; } @@ -291,5 +412,4 @@ struct file_system_type *get_fs_type(const char *name) } return fs; } - EXPORT_SYMBOL(get_fs_type); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e9bf4879c07f7..e9895dea0da49 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -738,6 +738,13 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl); error = PTR_ERR(inode); if (!IS_ERR(inode)) { + if (file && (file->f_flags & __O_REGULAR) && + !S_ISREG(inode->i_mode)) { + iput(inode); + inode = NULL; + error = -EFTYPE; + goto fail_gunlock; + } if (S_ISDIR(inode->i_mode)) { iput(inode); inode = NULL; diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 0e932cc8be1b2..1b4fcf760aadc 100644 --- a/fs/hpfs/inode.c 
+++ b/fs/hpfs/inode.c @@ -184,7 +184,7 @@ void hpfs_write_inode(struct inode *i) struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct inode *parent; if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; - if (hpfs_inode->i_rddir_off && !icount_read(i)) { + if (hpfs_inode->i_rddir_off && !icount_read_once(i)) { if (*hpfs_inode->i_rddir_off) pr_err("write_inode: some position still there\n"); kfree(hpfs_inode->i_rddir_off); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index c16d5d4caeade..8fbdbf0806276 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -523,7 +523,8 @@ static int hpfs_fill_super(struct super_block *s, struct fs_context *fc) hpfs_lock(s); /*sbi->sb_mounting = 1;*/ - sb_set_blocksize(s, 512); + if (!sb_set_blocksize(s, 512)) + goto bail0; sbi->sb_fs_size = -1; if (!(bootblock = hpfs_map_sector(s, 0, &bh0, 0))) goto bail1; if (!(superblock = hpfs_map_sector(s, 16, &bh1, 1))) goto bail2; diff --git a/fs/inode.c b/fs/inode.c index 62c579a0cf7df..acf206beb2e03 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -53,11 +53,7 @@ * inode->i_lock * * inode_hash_lock - * inode->i_sb->s_inode_list_lock * inode->i_lock - * - * iunique_lock - * inode_hash_lock */ static unsigned int i_hash_mask __ro_after_init; @@ -518,15 +514,6 @@ static void init_once(void *foo) inode_init_once(inode); } -/* - * get additional reference to inode; caller must already hold one. 
- */ -void ihold(struct inode *inode) -{ - WARN_ON(atomic_inc_return(&inode->i_count) < 2); -} -EXPORT_SYMBOL(ihold); - struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit) { @@ -902,7 +889,7 @@ void evict_inodes(struct super_block *sb) again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (icount_read(inode)) + if (icount_read_once(inode)) continue; spin_lock(&inode->i_lock); @@ -1032,6 +1019,7 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) } static void __wait_on_freeing_inode(struct inode *inode, bool hash_locked, bool rcu_locked); +static bool igrab_from_hash(struct inode *inode); /* * Called with the inode lock held. @@ -1056,6 +1044,11 @@ repeat: continue; if (!test(inode, data)) continue; + if (igrab_from_hash(inode)) { + rcu_read_unlock(); + *isnew = false; + return inode; + } spin_lock(&inode->i_lock); if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { __wait_on_freeing_inode(inode, hash_locked, true); @@ -1098,6 +1091,11 @@ repeat: continue; if (inode->i_sb != sb) continue; + if (igrab_from_hash(inode)) { + rcu_read_unlock(); + *isnew = false; + return inode; + } spin_lock(&inode->i_lock); if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { __wait_on_freeing_inode(inode, hash_locked, true); @@ -1215,6 +1213,10 @@ void unlock_new_inode(struct inode *inode) lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode_state_read(inode) & I_NEW)); + /* + * Paired with igrab_from_hash() + */ + smp_wmb(); inode_state_clear(inode, I_NEW | I_CREATING); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); @@ -1226,6 +1228,10 @@ void discard_new_inode(struct inode *inode) lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode_state_read(inode) & I_NEW)); + /* + * Paired with igrab_from_hash() + */ + smp_wmb(); inode_state_clear(inode, I_NEW); 
inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); @@ -1572,8 +1578,27 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) } EXPORT_SYMBOL(iunique); +/** + * ihold - get a reference on the inode, provided you already have one + * @inode: inode to operate on + */ +void ihold(struct inode *inode) +{ + VFS_BUG_ON_INODE(icount_read_once(inode) < 1, inode); + WARN_ON(atomic_inc_return(&inode->i_count) < 2); +} +EXPORT_SYMBOL(ihold); + struct inode *igrab(struct inode *inode) { + /* + * Read commentary above igrab_from_hash() for an explanation why this works. + */ + if (atomic_add_unless(&inode->i_count, 1, 0)) { + VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE), inode); + return inode; + } + spin_lock(&inode->i_lock); if (!(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) { __iget(inode); @@ -1591,6 +1616,43 @@ struct inode *igrab(struct inode *inode) } EXPORT_SYMBOL(igrab); +/* + * igrab_from_hash - special inode refcount acquire primitive for the inode hash + * + * It provides lockless refcount acquire in the common case of no problematic + * flags being set and the count being > 0. + * + * There are 4 state flags to worry about and the routine makes sure to not bump the + * ref if any of them is present. + * + * I_NEW and I_CREATING can only legally get set *before* the inode becomes visible + * during lookup. Thus if the flags are not spotted, they are guaranteed to not be + * a factor. However, we need an acquire fence before returning the inode just + * in case we raced against clearing the state to make sure our consumer picks up + * any other changes made prior. atomic_add_unless provides a full fence, which + * takes care of it. + * + * I_FREEING and I_WILL_FREE can only legally get set if ->i_count == 0 and it is + * illegal to bump the ref if either is present. Consequently if atomic_add_unless + * managed to replace a non-0 value with a bigger one, we have a guarantee neither + * of these flags is set. 
Note this means explicitly checking of these flags below + * is not necessary, it is only done because it does not cost anything on top of the + * load which already needs to be done to handle the other flags. + */ +static bool igrab_from_hash(struct inode *inode) +{ + if (inode_state_read_once(inode) & (I_NEW | I_CREATING | I_FREEING | I_WILL_FREE)) + return false; + /* + * Paired with routines clearing I_NEW + */ + if (atomic_add_unless(&inode->i_count, 1, 0)) { + VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE), inode); + return true; + } + return false; +} + /** * ilookup5_nowait - search for an inode in the inode cache * @sb: super block of file system to search @@ -1920,7 +1982,7 @@ static void iput_final(struct inode *inode) int drop; WARN_ON(inode_state_read(inode) & I_NEW); - VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode); + VFS_BUG_ON_INODE(icount_read(inode) != 0, inode); if (op->drop_inode) drop = op->drop_inode(inode); @@ -1939,7 +2001,7 @@ static void iput_final(struct inode *inode) * Re-check ->i_count in case the ->drop_inode() hooks played games. * Note we only execute this if the verdict was to drop the inode. */ - VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode); + VFS_BUG_ON_INODE(icount_read(inode) != 0, inode); if (drop) { inode_state_set(inode, I_FREEING); @@ -1983,7 +2045,7 @@ retry: * equal to one, then two CPUs racing to further drop it can both * conclude it's fine. 
*/ - VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 1, inode); + VFS_BUG_ON_INODE(icount_read_once(inode) < 1, inode); if (atomic_add_unless(&inode->i_count, -1, 1)) return; @@ -2017,7 +2079,7 @@ EXPORT_SYMBOL(iput); void iput_not_last(struct inode *inode) { VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_CLEAR), inode); - VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 2, inode); + VFS_BUG_ON_INODE(icount_read_once(inode) < 2, inode); WARN_ON(atomic_sub_return(1, &inode->i_count) == 0); } @@ -3046,7 +3108,7 @@ void dump_inode(struct inode *inode, const char *reason) } state = inode_state_read_once(inode); - count = atomic_read(&inode->i_count); + count = icount_read_once(inode); if (!sb || get_kernel_nofault(s_type, &sb->s_type) || !s_type || diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index efee53717f1cd..337836a0a1704 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -818,7 +818,8 @@ root_found: * entries. By forcing the blocksize in this way, we ensure * that we will never be required to do this. */ - sb_set_blocksize(s, orig_zonesize); + if (!sb_set_blocksize(s, orig_zonesize)) + goto out_freesbi; sbi->s_nls_iocharset = NULL; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 61575f7397aea..8180d83d33fed 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -491,7 +491,8 @@ static int jfs_fill_super(struct super_block *sb, struct fs_context *fc) /* * Initialize blocksize to 4K. */ - sb_set_blocksize(sb, PSIZE); + if (!sb_set_blocksize(sb, PSIZE)) + goto out_unload; /* * Set method vectors. 
diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 9c6bac2489077..03a69b13950db 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -292,7 +292,8 @@ static int minix_fill_super(struct super_block *s, struct fs_context *fc) sbi->s_namelen = 60; sbi->s_version = MINIX_V3; sbi->s_mount_state = MINIX_VALID_FS; - sb_set_blocksize(s, m3s->s_blocksize); + if (!sb_set_blocksize(s, m3s->s_blocksize)) + goto out; s->s_max_links = MINIX2_LINK_MAX; } else goto out_no_fs; diff --git a/fs/mount.h b/fs/mount.h index e0816c11a1989..5df134d56d475 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -25,6 +25,7 @@ struct mnt_namespace { __u32 n_fsnotify_mask; struct fsnotify_mark_connector __rcu *n_fsnotify_marks; #endif + struct hlist_head mnt_visible_mounts; /* SB_I_USERNS_VISIBLE mounts */ unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; refcount_t passive; /* number references not pinning @mounts */ @@ -90,6 +91,7 @@ struct mount { int mnt_expiry_mark; /* true if marked for expiry */ struct hlist_head mnt_pins; struct hlist_head mnt_stuck_children; + struct hlist_node mnt_ns_visible; /* link in ns->mnt_visible_mounts */ struct mount *overmount; /* mounted on ->mnt_root */ } __randomize_layout; @@ -207,6 +209,8 @@ static inline void move_from_ns(struct mount *mnt) ns->mnt_first_node = rb_next(&mnt->mnt_node); rb_erase(&mnt->mnt_node, &ns->mounts); RB_CLEAR_NODE(&mnt->mnt_node); + if (!hlist_unhashed(&mnt->mnt_ns_visible)) + hlist_del_init(&mnt->mnt_ns_visible); } bool has_locked_children(struct mount *mnt, struct dentry *dentry); diff --git a/fs/namei.c b/fs/namei.c index 3a3a2e5e77a0f..a59ea10b481d6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2955,15 +2955,16 @@ void end_dirop(struct dentry *de) EXPORT_SYMBOL(end_dirop); /* does lookup, returns the object with parent locked */ -static struct dentry *__start_removing_path(int dfd, struct filename *name, - struct path *path) +struct dentry *start_removing_path(const char *name, struct path 
*path) { + CLASS(filename_kernel, filename)(name); struct path parent_path __free(path_put) = {}; struct dentry *d; struct qstr last; int type, error; - error = filename_parentat(dfd, name, 0, &parent_path, &last, &type); + error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, + &type); if (error) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) @@ -3023,21 +3024,6 @@ struct dentry *kern_path_parent(const char *name, struct path *path) return d; } -struct dentry *start_removing_path(const char *name, struct path *path) -{ - CLASS(filename_kernel, filename)(name); - return __start_removing_path(AT_FDCWD, filename, path); -} - -struct dentry *start_removing_user_path_at(int dfd, - const char __user *name, - struct path *path) -{ - CLASS(filename, filename)(name); - return __start_removing_path(dfd, filename, path); -} -EXPORT_SYMBOL(start_removing_user_path_at); - int kern_path(const char *name, unsigned int flags, struct path *path) { CLASS(filename_kernel, filename)(name); @@ -3617,7 +3603,6 @@ int path_pts(struct path *path) */ struct dentry *parent = dget_parent(path->dentry); struct dentry *child; - struct qstr this = QSTR_INIT("pts", 3); if (unlikely(!path_connected(path->mnt, parent))) { dput(parent); @@ -3625,7 +3610,7 @@ int path_pts(struct path *path) } dput(path->dentry); path->dentry = parent; - child = d_hash_and_lookup(parent, &this); + child = d_hash_and_lookup(parent, &QSTR("pts")); if (IS_ERR_OR_NULL(child)) return -ENOENT; @@ -4679,6 +4664,10 @@ static int do_open(struct nameidata *nd, if (unlikely(error)) return error; } + + if ((open_flag & __O_REGULAR) && !d_is_reg(nd->path.dentry)) + return -EFTYPE; + if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) return -ENOTDIR; diff --git a/fs/namespace.c b/fs/namespace.c index fe919abd2f011..9a66a806a9b8a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -321,6 +321,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_slave); 
INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); + INIT_HLIST_NODE(&mnt->mnt_ns_visible); RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; } @@ -1098,6 +1099,10 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); + if ((mnt->mnt.mnt_sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED) && + mnt->mnt.mnt_root == mnt->mnt.mnt_sb->s_root) + hlist_add_head(&mnt->mnt_ns_visible, &ns->mnt_visible_mounts); + mnt_notify_add(mnt); } @@ -6340,20 +6345,26 @@ static bool mnt_already_visible(struct mnt_namespace *ns, int *new_mnt_flags) { int new_flags = *new_mnt_flags; - struct mount *mnt, *n; + struct mount *mnt; + + /* Don't acquire namespace semaphore without a good reason. */ + if (hlist_empty(&ns->mnt_visible_mounts)) + return false; guard(namespace_shared)(); - rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { + hlist_for_each_entry(mnt, &ns->mnt_visible_mounts, mnt_ns_visible) { + const struct super_block *sb_visible = mnt->mnt.mnt_sb; struct mount *child; int mnt_flags; - if (mnt->mnt.mnt_sb->s_type != sb->s_type) + if (sb_visible->s_type != sb->s_type) continue; - /* This mount is not fully visible if it's root directory - * is not the root directory of the filesystem. + /* + * Restricted variants are not compatible with anything, even + * other restricted variants. */ - if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) + if (sb_visible->s_iflags & SB_I_RESTRICTED_VARIANT) continue; /* A local view of the mount flags */ @@ -6405,16 +6416,23 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags return false; /* Can this filesystem be too revealing? 
*/ - s_iflags = sb->s_iflags; - if (!(s_iflags & SB_I_USERNS_VISIBLE)) + if (!(sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED)) return false; + s_iflags = sb->s_iflags; if ((s_iflags & required_iflags) != required_iflags) { WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n", required_iflags); return true; } + /* + * Restricted variants don't need an already visible mount because they + * don't expose the full filesystem view. + */ + if (s_iflags & SB_I_RESTRICTED_VARIANT) + return false; + return !mnt_already_visible(ns, sb, new_mnt_flags); } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e9ce1883288c5..1b9c368fb1338 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2194,6 +2194,10 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, break; case -EISDIR: case -ENOTDIR: + if (open_flags & __O_REGULAR) { + err = -EFTYPE; + break; + } goto no_open; case -ELOOP: if (!(open_flags & O_NOFOLLOW)) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 170d32c217ae4..6227df9ae6f1d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -609,7 +609,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), - icount_read(inode)); + icount_read_once(inode)); out: return inode; @@ -2270,7 +2270,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dfprintk(VFS, "NFS: %s(%s/%llu fh_crc=0x%08x ct=%d info=0x%llx)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), - icount_read(inode), fattr->valid); + icount_read_once(inode), fattr->valid); if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? 
Just exit */ diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 004f599375597..3305fe406cb22 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -1174,7 +1174,10 @@ read_boot: rec->total = cpu_to_le32(sbi->record_size); ((struct ATTRIB *)Add2Ptr(rec, ao))->type = ATTR_END; - sb_set_blocksize(sb, min_t(u32, sbi->cluster_size, PAGE_SIZE)); + if (!sb_set_blocksize(sb, min_t(u32, sbi->cluster_size, PAGE_SIZE))) { + err = -EINVAL; + goto out; + } sbi->block_mask = sb->s_blocksize - 1; sbi->blocks_per_cluster = sbi->cluster_size >> sb->s_blocksize_bits; @@ -1225,7 +1228,8 @@ out: /* * Try alternative boot (last sector) */ - sb_set_blocksize(sb, block_size); + if (!sb_set_blocksize(sb, block_size)) + return -EINVAL; hint = "Alternative boot"; dev_size = dev_size0; /* restore original size. */ goto read_boot; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index b875f01c97564..4870e680c4e5a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1224,7 +1224,6 @@ static struct file_system_type ocfs2_fs_type = { .name = "ocfs2", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, - .next = NULL, .init_fs_context = ocfs2_init_fs_context, .parameters = ocfs2_param_spec, }; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 834cae1e62233..1d915ef72119f 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -478,7 +478,8 @@ static int omfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_time_min = 0; sb->s_time_max = U64_MAX / MSEC_PER_SEC; - sb_set_blocksize(sb, 0x200); + if (!sb_set_blocksize(sb, 0x200)) + goto end; bh = sb_bread(sb, 0); if (!bh) @@ -530,7 +531,8 @@ static int omfs_fill_super(struct super_block *sb, struct fs_context *fc) * Use sys_blocksize as the fs block since it is smaller than a * page while the fs blocksize can be larger. */ - sb_set_blocksize(sb, sbi->s_sys_blocksize); + if (!sb_set_blocksize(sb, sbi->s_sys_blocksize)) + goto out_brelse_bh; /* * ...and the difference goes into a shift. 
sys_blocksize is always diff --git a/fs/open.c b/fs/open.c index 681d405bc61eb..5458668a68e11 100644 --- a/fs/open.c +++ b/fs/open.c @@ -960,7 +960,7 @@ static int do_dentry_open(struct file *f, if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) f->f_mode |= FMODE_CAN_ODIRECT; - f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | __O_REGULAR); f->f_iocb_flags = iocb_flags(f); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); @@ -1158,7 +1158,7 @@ struct file *kernel_file_open(const struct path *path, int flags, EXPORT_SYMBOL_GPL(kernel_file_open); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) -#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) +#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC | O_EMPTYPATH) inline struct open_how build_open_how(int flags, umode_t mode) { @@ -1184,7 +1184,15 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) int acc_mode = ACC_MODE(flags); BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS), - "struct open_flags doesn't yet handle flags > 32 bits"); + "VALID_OPEN_FLAGS must fit in 32 bits"); + /* The whole point: OPENAT2_REGULAR must be unrepresentable in int. */ + BUILD_BUG_ON_MSG(!upper_32_bits(OPENAT2_REGULAR), + "OPENAT2_REGULAR must live in the upper 32 bits of open_how::flags"); + /* Prevent a future bit collision between UAPI and internal carrier. */ + BUILD_BUG_ON_MSG(OPENAT2_REGULAR & VALID_OPEN_FLAGS, + "OPENAT2_REGULAR must not alias any open()/openat() flag"); + BUILD_BUG_ON_MSG(__O_REGULAR & VALID_OPENAT2_FLAGS, + "__O_REGULAR must not alias any user-visible flag"); /* * Strip flags that aren't relevant in determining struct open_flags. @@ -1196,7 +1204,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) * values before calling build_open_flags(), but openat2(2) checks all * of its arguments. 
*/ - if (flags & ~VALID_OPEN_FLAGS) + if (flags & ~VALID_OPENAT2_FLAGS) return -EINVAL; if (how->resolve & ~VALID_RESOLVE_FLAGS) return -EINVAL; @@ -1236,6 +1244,14 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) if (!(acc_mode & MAY_WRITE)) return -EINVAL; } + /* + * Asking to open a directory and a regular file at the same time is + * contradictory. + */ + if ((flags & (O_DIRECTORY | OPENAT2_REGULAR)) == + (O_DIRECTORY | OPENAT2_REGULAR)) + return -EINVAL; + if (flags & O_PATH) { /* O_PATH only permits certain other flags to be set. */ if (flags & ~O_PATH_FLAGS) @@ -1252,6 +1268,19 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) if (flags & __O_SYNC) flags |= O_DSYNC; + /* + * Translate the upper-32-bit UAPI bit OPENAT2_REGULAR into the + * kernel-internal lower-32-bit __O_REGULAR carrier so the bit + * survives the assignment to op->open_flag (an int) below and the + * subsequent flow through f->f_flags (unsigned int) and the + * i_op->atomic_open() callback (unsigned). do_dentry_open() strips + * __O_REGULAR before the file becomes visible to userspace. 
+ */ + if (flags & OPENAT2_REGULAR) { + flags &= ~OPENAT2_REGULAR; + flags |= __O_REGULAR; + } + op->open_flag = flags; /* O_TRUNC implies we need access checks for write permissions */ @@ -1279,6 +1308,8 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + if (flags & O_EMPTYPATH) + lookup_flags |= LOOKUP_EMPTY; if (how->resolve & RESOLVE_NO_XDEV) lookup_flags |= LOOKUP_NO_XDEV; @@ -1360,7 +1391,7 @@ static int do_sys_openat2(int dfd, const char __user *filename, if (unlikely(err)) return err; - CLASS(filename, name)(filename); + CLASS(filename_flags, name)(filename, op.lookup_flags); return FD_ADD(how->flags, do_file_open(dfd, name, &op)); } diff --git a/fs/pidfs.c b/fs/pidfs.c index 1cce4f34a0512..b2ff950a096e9 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -338,14 +338,14 @@ static inline bool pid_in_current_pidns(const struct pid *pid) return false; } -static __u32 pidfs_coredump_mask(unsigned long mm_flags) +static __u32 pidfs_coredump_mask(enum task_dumpable dumpable) { - switch (__get_dumpable(mm_flags)) { - case SUID_DUMP_USER: + switch (dumpable) { + case TASK_DUMPABLE_OWNER: return PIDFD_COREDUMP_USER; - case SUID_DUMP_ROOT: + case TASK_DUMPABLE_ROOT: return PIDFD_COREDUMP_ROOT; - case SUID_DUMP_DISABLE: + case TASK_DUMPABLE_OFF: return PIDFD_COREDUMP_SKIP; default: WARN_ON_ONCE(true); @@ -433,14 +433,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) return -ESRCH; if ((mask & PIDFD_INFO_COREDUMP) && !kinfo.coredump_mask) { - guard(task_lock)(task); - if (task->mm) { - unsigned long flags = __mm_flags_get_dumpable(task->mm); - - kinfo.coredump_mask = pidfs_coredump_mask(flags); - kinfo.mask |= PIDFD_INFO_COREDUMP; - /* No coredump actually took place, so no coredump signal. 
*/ - } + kinfo.coredump_mask = pidfs_coredump_mask(task_exec_state_get_dumpable(task)); + kinfo.mask |= PIDFD_INFO_COREDUMP; + /* No coredump actually took place, so no coredump signal. */ } /* Unconditionally return identifiers and credentials, the rest only on request */ @@ -779,7 +774,7 @@ void pidfs_coredump(const struct coredump_params *cprm) VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD); /* Note how we were coredumped and that we coredumped. */ - attr->coredump_mask = pidfs_coredump_mask(cprm->mm_flags) | + attr->coredump_mask = pidfs_coredump_mask(cprm->dumpable) | PIDFD_COREDUMPED; /* If coredumping is set to skip we should never end up here. */ VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP); diff --git a/fs/pipe.c b/fs/pipe.c index 9841648c9cf3e..e37c79935ecb1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -664,7 +664,8 @@ pipe_poll(struct file *filp, poll_table *wait) union pipe_index idx; /* Epoll has some historical nasty semantics, this enables them */ - WRITE_ONCE(pipe->poll_usage, true); + if (unlikely(!READ_ONCE(pipe->poll_usage))) + WRITE_ONCE(pipe->poll_usage, true); /* * Reading pipe state only -- no need for acquiring the semaphore. diff --git a/fs/proc/base.c b/fs/proc/base.c index d9acfa89c894b..65f56136ec3f3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -91,6 +91,7 @@ #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/debug.h> +#include <linux/sched/exec_state.h> #include <linux/sched/stat.h> #include <linux/posix-timers.h> #include <linux/time_namespace.h> @@ -1893,7 +1894,6 @@ void task_dump_owner(struct task_struct *task, umode_t mode, cred = __task_cred(task); uid = cred->euid; gid = cred->egid; - rcu_read_unlock(); /* * Before the /proc/pid/status file was created the only way to read @@ -1903,29 +1903,22 @@ void task_dump_owner(struct task_struct *task, umode_t mode, * made this apply to all per process world readable and executable * directories. 
*/ - if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) { - struct mm_struct *mm; - task_lock(task); - mm = task->mm; - /* Make non-dumpable tasks owned by some root */ - if (mm) { - if (get_dumpable(mm) != SUID_DUMP_USER) { - struct user_namespace *user_ns = mm->user_ns; + if (mode != (S_IFDIR | S_IRUGO | S_IXUGO)) { + struct task_exec_state *exec_state; - uid = make_kuid(user_ns, 0); - if (!uid_valid(uid)) - uid = GLOBAL_ROOT_UID; + exec_state = task_exec_state_rcu(task); + if (READ_ONCE(exec_state->dumpable) != TASK_DUMPABLE_OWNER) { + uid = make_kuid(exec_state->user_ns, 0); + if (!uid_valid(uid)) + uid = GLOBAL_ROOT_UID; - gid = make_kgid(user_ns, 0); - if (!gid_valid(gid)) - gid = GLOBAL_ROOT_GID; - } - } else { - uid = GLOBAL_ROOT_UID; - gid = GLOBAL_ROOT_GID; + gid = make_kgid(exec_state->user_ns, 0); + if (!gid_valid(gid)) + gid = GLOBAL_ROOT_GID; } - task_unlock(task); } + rcu_read_unlock(); + *ruid = uid; *rgid = gid; } @@ -2965,7 +2958,7 @@ static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, ret = 0; mm = get_task_mm(task); if (mm) { - unsigned long flags = __mm_flags_get_dumpable(mm); + unsigned long flags = __mm_flags_get_word(mm); len = snprintf(buffer, sizeof(buffer), "%08lx\n", ((flags & MMF_DUMP_FILTER_MASK) >> diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 8bb81e58c9d8c..c6ae076e1fa03 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -841,3 +841,13 @@ ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size, kfree(buf); return ret == 0 ? size : ret; } + +/* + * Not exported to modules: + * modules' /proc files aren't permanent because modules aren't permanent. 
+ */ +void impl_proc_make_permanent(struct proc_dir_entry *pde) +{ + if (pde) + pde_make_permanent(pde); +} diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 64dc44832808a..1edbabbdbc5d7 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -79,8 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde) return pde->flags & PROC_ENTRY_PERMANENT; } +/* This is for builtin code, not even for modules which are compiled in. */ static inline void pde_make_permanent(struct proc_dir_entry *pde) { + /* Ensure magic flag does something. */ + static_assert(PROC_ENTRY_PERMANENT != 0); pde->flags |= PROC_ENTRY_PERMANENT; } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 184cddeb8215c..00cc385bce212 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -23,6 +23,7 @@ #include <linux/uidgid.h> #include <net/net_namespace.h> #include <linux/seq_file.h> +#include <linux/security.h> #include "internal.h" @@ -270,6 +271,7 @@ static struct net *get_proc_task_net(struct inode *dir) struct task_struct *task; struct nsproxy *ns; struct net *net = NULL; + struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb); rcu_read_lock(); task = pid_task(proc_pid(dir), PIDTYPE_PID); @@ -282,6 +284,12 @@ static struct net *get_proc_task_net(struct inode *dir) } rcu_read_unlock(); + if (net && (fs_info->pidonly == PROC_PIDONLY_ON) && + security_capable(fs_info->mounter_cred, net->user_ns, CAP_NET_ADMIN, CAP_OPT_NONE) < 0) { + put_net(net); + net = NULL; + } + return net; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 0f91005594710..99adddfeb4a44 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -223,12 +223,17 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } -static void proc_apply_options(struct proc_fs_info *fs_info, +static int proc_apply_options(struct proc_fs_info *fs_info, struct fs_context *fc, struct user_namespace *user_ns) { struct proc_fs_context *ctx = fc->fs_private; + if ((ctx->mask 
& (1 << Opt_subset)) && + fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && + ctx->pidonly != fs_info->pidonly) + return invalf(fc, "proc: subset=pid cannot be changed\n"); + if (ctx->mask & (1 << Opt_gid)) fs_info->pid_gid = make_kgid(user_ns, ctx->gid); if (ctx->mask & (1 << Opt_hidepid)) @@ -240,6 +245,7 @@ static void proc_apply_options(struct proc_fs_info *fs_info, put_pid_ns(fs_info->pid_ns); fs_info->pid_ns = get_pid_ns(ctx->pid_ns); } + return 0; } static int proc_fill_super(struct super_block *s, struct fs_context *fc) @@ -254,10 +260,13 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) return -ENOMEM; fs_info->pid_ns = get_pid_ns(ctx->pid_ns); - proc_apply_options(fs_info, fc, current_user_ns()); + fs_info->mounter_cred = get_cred(fc->cred); + ret = proc_apply_options(fs_info, fc, current_user_ns()); + if (ret) + return ret; /* User space would break if executables or devices appear on proc */ - s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; + s->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; @@ -266,6 +275,9 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) s->s_time_gran = 1; s->s_fs_info = fs_info; + if (fs_info->pidonly == PROC_PIDONLY_ON) + s->s_iflags |= SB_I_RESTRICTED_VARIANT; + /* * procfs isn't actually a stacking filesystem; however, there is * too much magic going on inside it to permit stacking things on @@ -303,8 +315,7 @@ static int proc_reconfigure(struct fs_context *fc) sync_filesystem(sb); - proc_apply_options(fs_info, fc, current_user_ns()); - return 0; + return proc_apply_options(fs_info, fc, current_user_ns()); } static int proc_get_tree(struct fs_context *fc) @@ -350,6 +361,7 @@ static void proc_kill_sb(struct super_block *sb) kill_anon_super(sb); if (fs_info) { put_pid_ns(fs_info->pid_ns); + put_cred(fs_info->mounter_cred); kfree_rcu(fs_info, rcu); } } @@ -359,7 +371,7 @@ static 
struct file_system_type proc_fs_type = { .init_fs_context = proc_init_fs_context, .parameters = proc_fs_parameters, .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM, + .fs_flags = FS_USERNS_MOUNT | FS_USERNS_MOUNT_RESTRICTED | FS_DISALLOW_NOTIFY_PERM, }; void __init proc_root_init(void) diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 4deb0eeadbdef..42fcd500fad21 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -202,7 +202,8 @@ static int qnx4_fill_super(struct super_block *s, struct fs_context *fc) return -ENOMEM; s->s_fs_info = qs; - sb_set_blocksize(s, QNX4_BLOCK_SIZE); + if (!sb_set_blocksize(s, QNX4_BLOCK_SIZE)) + return -EINVAL; s->s_op = &qnx4_sops; s->s_magic = QNX4_SUPER_MAGIC; diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index e4295a5b55b34..88a4a1787ff04 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -241,6 +241,12 @@ static int __cifs_do_create(struct inode *dir, struct dentry *direntry, goto cifs_create_get_file_info; } + if ((oflags & __O_REGULAR) && !S_ISREG(newinode->i_mode)) { + CIFSSMBClose(xid, tcon, fid->netfid); + iput(newinode); + return -EFTYPE; + } + if (S_ISDIR(newinode->i_mode)) { CIFSSMBClose(xid, tcon, fid->netfid); iput(newinode); @@ -458,9 +464,15 @@ cifs_create_set_dentry: goto out_err; } - if (newinode && S_ISDIR(newinode->i_mode)) { - rc = -EISDIR; - goto out_err; + if (newinode) { + if ((oflags & __O_REGULAR) && !S_ISREG(newinode->i_mode)) { + rc = -EFTYPE; + goto out_err; + } + if (S_ISDIR(newinode->i_mode)) { + rc = -EISDIR; + goto out_err; + } } *inode = newinode; diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 826d36ed13ec9..56b0f109e41b4 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2873,7 +2873,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) } cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n", - full_path, inode, icount_read(inode), + full_path, inode, 
icount_read_once(inode), dentry, cifs_get_time(dentry), jiffies); again: diff --git a/fs/super.c b/fs/super.c index 378e81efe643b..5d46a0d5b6169 100644 --- a/fs/super.c +++ b/fs/super.c @@ -882,7 +882,6 @@ void drop_super_exclusive(struct super_block *sb) super_unlock_excl(sb); put_super(sb); } -EXPORT_SYMBOL(drop_super_exclusive); enum super_iter_flags_t { SUPER_ITER_EXCL = (1U << 0), diff --git a/fs/sync.c b/fs/sync.c index 942a60cfedfbf..4a84dd837b863 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -266,8 +266,7 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, i_mode = file_inode(file)->i_mode; ret = -ESPIPE; - if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) && - !S_ISLNK(i_mode)) + if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode)) goto out; mapping = file->f_mapping; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index b199e8ff79b1f..88c10823fcafa 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -23,20 +23,6 @@ static struct kernfs_root *sysfs_root; struct kernfs_node *sysfs_root_kn; -static int sysfs_get_tree(struct fs_context *fc) -{ - struct kernfs_fs_context *kfc = fc->fs_private; - int ret; - - ret = kernfs_get_tree(fc); - if (ret) - return ret; - - if (kfc->new_sb_created) - fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; - return 0; -} - static void sysfs_fs_context_free(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; @@ -49,7 +35,7 @@ static void sysfs_fs_context_free(struct fs_context *fc) static const struct fs_context_operations sysfs_fs_context_ops = { .free = sysfs_fs_context_free, - .get_tree = sysfs_get_tree, + .get_tree = kernfs_get_tree, }; static int sysfs_init_fs_context(struct fs_context *fc) @@ -93,7 +79,7 @@ static struct file_system_type sysfs_fs_type = { .name = "sysfs", .init_fs_context = sysfs_init_fs_context, .kill_sb = sysfs_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_MOUNT | FS_USERNS_MOUNT_RESTRICTED, }; int __init sysfs_init(void) diff 
--git a/fs/ubifs/super.c b/fs/ubifs/super.c index 9a77d8b64ffa7..38972786817e7 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -358,7 +358,7 @@ static void ubifs_evict_inode(struct inode *inode) goto out; dbg_gen("inode %llu, mode %#x", inode->i_ino, (int)inode->i_mode); - ubifs_assert(c, !icount_read(inode)); + ubifs_assert(c, !icount_read_once(inode)); truncate_inode_pages_final(&inode->i_data); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9978ac1422fc4..ddf2707c88945 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1040,7 +1040,7 @@ xfs_itruncate_extents_flags( int error = 0; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - if (icount_read(VFS_I(ip))) + if (icount_read_once(VFS_I(ip))) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); if (whichfork == XFS_DATA_FORK) ASSERT(new_size <= XFS_ISIZE(ip)); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 1c098cfc5c00d..f87c738d84b24 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1158,7 +1158,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->count = icount_read(VFS_I(ip)); + __entry->count = icount_read_once(VFS_I(ip)); __entry->pincount = atomic_read(&ip->i_pincount); __entry->iflags = ip->i_flags; __entry->caller_ip = caller_ip; |
