Diffstat (limited to 'queue-6.12/btrfs-skip-inodes-without-loaded-extent-maps-when-sh.patch')
-rw-r--r-- | queue-6.12/btrfs-skip-inodes-without-loaded-extent-maps-when-sh.patch | 161 |
1 file changed, 161 insertions, 0 deletions
diff --git a/queue-6.12/btrfs-skip-inodes-without-loaded-extent-maps-when-sh.patch b/queue-6.12/btrfs-skip-inodes-without-loaded-extent-maps-when-sh.patch
new file mode 100644
index 0000000000..91401e4adb
--- /dev/null
+++ b/queue-6.12/btrfs-skip-inodes-without-loaded-extent-maps-when-sh.patch
@@ -0,0 +1,161 @@
+From 6f7096b21055313b16be91b99f0c4fe1a05d9449 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 15 Feb 2025 11:04:15 +0000
+Subject: btrfs: skip inodes without loaded extent maps when shrinking extent
+ maps
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit c6c9c4d56483d941f567eb921434c25fc6086dfa ]
+
+If there are inodes that don't have any loaded extent maps, we end up
+grabbing a reference on them and later adding a delayed iput, which wakes
+up the cleaner and makes it do unnecessary work. This is common when for
+example the inodes were open only to run stat(2) or all their extent maps
+were already released through the folio release callback
+(btrfs_release_folio()) or released by a previous run of the shrinker, or
+directories which never have extent maps.
+
+Reported-by: Ivan Shapovalov <intelfx@intelfx.name>
+Tested-by: Ivan Shapovalov <intelfx@intelfx.name>
+Link: https://lore.kernel.org/linux-btrfs/0414d690ac5680d0d77dfc930606cdc36e42e12f.camel@intelfx.name/
+CC: stable@vger.kernel.org # 6.13+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/extent_map.c | 78 +++++++++++++++++++++++++++++++------------
+ 1 file changed, 57 insertions(+), 21 deletions(-)
+
+diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
+index 1d93e1202c339..d67abf5f97a77 100644
+--- a/fs/btrfs/extent_map.c
++++ b/fs/btrfs/extent_map.c
+@@ -1133,6 +1133,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
+ 	long nr_dropped = 0;
+ 	struct rb_node *node;
+ 
++	lockdep_assert_held_write(&tree->lock);
++
+ 	/*
+ 	 * Take the mmap lock so that we serialize with the inode logging phase
+ 	 * of fsync because we may need to set the full sync flag on the inode,
+@@ -1144,28 +1146,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
+ 	 * to find new extents, which may not be there yet because ordered
+ 	 * extents haven't completed yet.
+ 	 *
+-	 * We also do a try lock because otherwise we could deadlock. This is
+-	 * because the shrinker for this filesystem may be invoked while we are
+-	 * in a path that is holding the mmap lock in write mode. For example in
+-	 * a reflink operation while COWing an extent buffer, when allocating
+-	 * pages for a new extent buffer and under memory pressure, the shrinker
+-	 * may be invoked, and therefore we would deadlock by attempting to read
+-	 * lock the mmap lock while we are holding already a write lock on it.
++	 * We also do a try lock because we don't want to block for too long and
++	 * we are holding the extent map tree's lock in write mode.
+ 	 */
+ 	if (!down_read_trylock(&inode->i_mmap_lock))
+ 		return 0;
+ 
+-	/*
+-	 * We want to be fast so if the lock is busy we don't want to spend time
+-	 * waiting for it - either some task is about to do IO for the inode or
+-	 * we may have another task shrinking extent maps, here in this code, so
+-	 * skip this inode.
+-	 */
+-	if (!write_trylock(&tree->lock)) {
+-		up_read(&inode->i_mmap_lock);
+-		return 0;
+-	}
+-
+ 	node = rb_first(&tree->root);
+ 	while (node) {
+ 		struct rb_node *next = rb_next(node);
+@@ -1205,21 +1191,71 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
+ 			break;
+ 		node = next;
+ 	}
+-	write_unlock(&tree->lock);
+ 	up_read(&inode->i_mmap_lock);
+ 
+ 	return nr_dropped;
+ }
+ 
++static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
++						      u64 min_ino)
++{
++	struct btrfs_inode *inode;
++	unsigned long from = min_ino;
++
++	xa_lock(&root->inodes);
++	while (true) {
++		struct extent_map_tree *tree;
++
++		inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
++		if (!inode)
++			break;
++
++		tree = &inode->extent_tree;
++
++		/*
++		 * We want to be fast so if the lock is busy we don't want to
++		 * spend time waiting for it (some task is about to do IO for
++		 * the inode).
++		 */
++		if (!write_trylock(&tree->lock))
++			goto next;
++
++		/*
++		 * Skip inode if it doesn't have loaded extent maps, so we avoid
++		 * getting a reference and doing an iput later. This includes
++		 * cases like files that were opened for things like stat(2), or
++		 * files with all extent maps previously released through the
++		 * release folio callback (btrfs_release_folio()) or released in
++		 * a previous run, or directories which never have extent maps.
++		 */
++		if (RB_EMPTY_ROOT(&tree->root)) {
++			write_unlock(&tree->lock);
++			goto next;
++		}
++
++		if (igrab(&inode->vfs_inode))
++			break;
++
++		write_unlock(&tree->lock);
++next:
++		from = btrfs_ino(inode) + 1;
++		cond_resched_lock(&root->inodes.xa_lock);
++	}
++	xa_unlock(&root->inodes);
++
++	return inode;
++}
++
+ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
+ {
+ 	struct btrfs_inode *inode;
+ 	long nr_dropped = 0;
+ 	u64 min_ino = ctx->last_ino + 1;
+ 
+-	inode = btrfs_find_first_inode(root, min_ino);
++	inode = find_first_inode_to_shrink(root, min_ino);
+ 	while (inode) {
+ 		nr_dropped += btrfs_scan_inode(inode, ctx);
++		write_unlock(&inode->extent_tree.lock);
+ 
+ 		min_ino = btrfs_ino(inode) + 1;
+ 		ctx->last_ino = btrfs_ino(inode);
+@@ -1230,7 +1266,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
+ 
+ 		cond_resched();
+ 
+-		inode = btrfs_find_first_inode(root, min_ino);
++		inode = find_first_inode_to_shrink(root, min_ino);
+ 	}
+ 
+ 	if (inode) {
+-- 
+2.39.5
+
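
The locking rework above is a handoff: find_first_inode_to_shrink() returns with the inode's extent_tree.lock held in write mode (which the new lockdep_assert_held_write() in btrfs_scan_inode() documents), and btrfs_scan_root() releases it only after the scan. Below is a minimal userspace sketch of that trylock-skip-and-handoff pattern, not kernel code: the names (struct item, find_next_to_scan(), nr_entries) are hypothetical, and a POSIX rwlock stands in for the kernel's rwlock_t.

	#include <pthread.h>
	#include <stdio.h>

	/* Hypothetical stand-in for an inode and its extent map tree. */
	struct item {
		pthread_rwlock_t lock;	/* plays the role of tree->lock */
		int nr_entries;		/* plays the role of the rb-tree contents */
	};

	/*
	 * Analogous to find_first_inode_to_shrink(): skip busy items (trylock
	 * fails) and empty items, and on success return the item with its lock
	 * still held in write mode -- the caller must release it after scanning.
	 */
	static struct item *find_next_to_scan(struct item *items, int count, int *pos)
	{
		for (int i = *pos; i < count; i++) {
			/* Busy: some other task is using this item, skip it. */
			if (pthread_rwlock_trywrlock(&items[i].lock) != 0)
				continue;
			/* Empty: nothing to scan, drop the lock and skip. */
			if (items[i].nr_entries == 0) {
				pthread_rwlock_unlock(&items[i].lock);
				continue;
			}
			*pos = i + 1;
			return &items[i];	/* returned locked */
		}
		*pos = count;
		return NULL;
	}

	int main(void)
	{
		struct item items[4];
		struct item *it;
		int pos = 0;

		for (int i = 0; i < 4; i++) {
			pthread_rwlock_init(&items[i].lock, NULL);
			items[i].nr_entries = i % 2;	/* items 0 and 2 are "empty" */
		}

		/* Analogous to btrfs_scan_root(): scan, then drop the lock the finder took. */
		while ((it = find_next_to_scan(items, 4, &pos)) != NULL) {
			printf("scanning item %ld (%d entries)\n",
			       (long)(it - items), it->nr_entries);
			pthread_rwlock_unlock(&it->lock);
		}
		return 0;
	}

Doing the emptiness check, the reference grab, and the later scan under one continuously held write lock is what makes the skip safe: no other task can load or drop extent maps for the inode between the RB_EMPTY_ROOT() check and the scan.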