Diffstat (limited to 'queue-6.12/btrfs-skip-inodes-without-loaded-extent-maps-when-sh.patch')
-rw-r--r-- | queue-6.12/btrfs-skip-inodes-without-loaded-extent-maps-when-sh.patch | 161 |
1 file changed, 161 insertions, 0 deletions
diff --git a/queue-6.12/btrfs-skip-inodes-without-loaded-extent-maps-when-sh.patch b/queue-6.12/btrfs-skip-inodes-without-loaded-extent-maps-when-sh.patch
new file mode 100644
index 0000000000..91401e4adb
--- /dev/null
+++ b/queue-6.12/btrfs-skip-inodes-without-loaded-extent-maps-when-sh.patch
@@ -0,0 +1,161 @@
+From 6f7096b21055313b16be91b99f0c4fe1a05d9449 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 15 Feb 2025 11:04:15 +0000
+Subject: btrfs: skip inodes without loaded extent maps when shrinking extent
+ maps
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit c6c9c4d56483d941f567eb921434c25fc6086dfa ]
+
+If there are inodes that don't have any loaded extent maps, we end up
+grabbing a reference on them and later adding a delayed iput, which wakes
+up the cleaner and makes it do unnecessary work. This is common when for
+example the inodes were open only to run stat(2) or all their extent maps
+were already released through the folio release callback
+(btrfs_release_folio()) or released by a previous run of the shrinker, or
+directories which never have extent maps.
+
+Reported-by: Ivan Shapovalov <intelfx@intelfx.name>
+Tested-by: Ivan Shapovalov <intelfx@intelfx.name>
+Link: https://lore.kernel.org/linux-btrfs/0414d690ac5680d0d77dfc930606cdc36e42e12f.camel@intelfx.name/
+CC: stable@vger.kernel.org # 6.13+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/extent_map.c | 78 +++++++++++++++++++++++++++++++------------
+ 1 file changed, 57 insertions(+), 21 deletions(-)
+
+diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
+index 1d93e1202c339..d67abf5f97a77 100644
+--- a/fs/btrfs/extent_map.c
++++ b/fs/btrfs/extent_map.c
+@@ -1133,6 +1133,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
+ 	long nr_dropped = 0;
+ 	struct rb_node *node;
+ 
++	lockdep_assert_held_write(&tree->lock);
++
+ 	/*
+ 	 * Take the mmap lock so that we serialize with the inode logging phase
+ 	 * of fsync because we may need to set the full sync flag on the inode,
+@@ -1144,28 +1146,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
+ 	 * to find new extents, which may not be there yet because ordered
+ 	 * extents haven't completed yet.
+ 	 *
+-	 * We also do a try lock because otherwise we could deadlock. This is
+-	 * because the shrinker for this filesystem may be invoked while we are
+-	 * in a path that is holding the mmap lock in write mode. For example in
+-	 * a reflink operation while COWing an extent buffer, when allocating
+-	 * pages for a new extent buffer and under memory pressure, the shrinker
+-	 * may be invoked, and therefore we would deadlock by attempting to read
+-	 * lock the mmap lock while we are holding already a write lock on it.
++	 * We also do a try lock because we don't want to block for too long and
++	 * we are holding the extent map tree's lock in write mode.
+ 	 */
+ 	if (!down_read_trylock(&inode->i_mmap_lock))
+ 		return 0;
+ 
+-	/*
+-	 * We want to be fast so if the lock is busy we don't want to spend time
+-	 * waiting for it - either some task is about to do IO for the inode or
+-	 * we may have another task shrinking extent maps, here in this code, so
+-	 * skip this inode.
+-	 */
+-	if (!write_trylock(&tree->lock)) {
+-		up_read(&inode->i_mmap_lock);
+-		return 0;
+-	}
+-
+ 	node = rb_first(&tree->root);
+ 	while (node) {
+ 		struct rb_node *next = rb_next(node);
+@@ -1205,21 +1191,71 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
+ 			break;
+ 		node = next;
+ 	}
+-	write_unlock(&tree->lock);
+ 	up_read(&inode->i_mmap_lock);
+ 
+ 	return nr_dropped;
+ }
+ 
++static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
++						      u64 min_ino)
++{
++	struct btrfs_inode *inode;
++	unsigned long from = min_ino;
++
++	xa_lock(&root->inodes);
++	while (true) {
++		struct extent_map_tree *tree;
++
++		inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
++		if (!inode)
++			break;
++
++		tree = &inode->extent_tree;
++
++		/*
++		 * We want to be fast so if the lock is busy we don't want to
++		 * spend time waiting for it (some task is about to do IO for
++		 * the inode).
++		 */
++		if (!write_trylock(&tree->lock))
++			goto next;
++
++		/*
++		 * Skip inode if it doesn't have loaded extent maps, so we avoid
++		 * getting a reference and doing an iput later. This includes
++		 * cases like files that were opened for things like stat(2), or
++		 * files with all extent maps previously released through the
++		 * release folio callback (btrfs_release_folio()) or released in
++		 * a previous run, or directories which never have extent maps.
++		 */
++		if (RB_EMPTY_ROOT(&tree->root)) {
++			write_unlock(&tree->lock);
++			goto next;
++		}
++
++		if (igrab(&inode->vfs_inode))
++			break;
++
++		write_unlock(&tree->lock);
++next:
++		from = btrfs_ino(inode) + 1;
++		cond_resched_lock(&root->inodes.xa_lock);
++	}
++	xa_unlock(&root->inodes);
++
++	return inode;
++}
++
+ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
+ {
+ 	struct btrfs_inode *inode;
+ 	long nr_dropped = 0;
+ 	u64 min_ino = ctx->last_ino + 1;
+ 
+-	inode = btrfs_find_first_inode(root, min_ino);
++	inode = find_first_inode_to_shrink(root, min_ino);
+ 	while (inode) {
+ 		nr_dropped += btrfs_scan_inode(inode, ctx);
++		write_unlock(&inode->extent_tree.lock);
+ 
+ 		min_ino = btrfs_ino(inode) + 1;
+ 		ctx->last_ino = btrfs_ino(inode);
+@@ -1230,7 +1266,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
+ 
+ 		cond_resched();
+ 
+-		inode = btrfs_find_first_inode(root, min_ino);
++		inode = find_first_inode_to_shrink(root, min_ino);
+ 	}
+ 
+ 	if (inode) {
+-- 
+2.39.5
+
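
The locking rework above is a handoff: find_first_inode_to_shrink() returns with the inode's extent_tree.lock held in write mode (which the new lockdep_assert_held_write() in btrfs_scan_inode() documents), and btrfs_scan_root() releases it only after the scan. Below is a minimal userspace sketch of that trylock-skip-and-handoff pattern, not kernel code: the names (struct item, find_next_to_scan(), nr_entries) are hypothetical, and a POSIX rwlock stands in for the kernel's rwlock_t.

	#include <pthread.h>
	#include <stdio.h>

	/* Hypothetical stand-in for an inode and its extent map tree. */
	struct item {
		pthread_rwlock_t lock;	/* plays the role of tree->lock */
		int nr_entries;		/* plays the role of the rb-tree contents */
	};

	/*
	 * Analogous to find_first_inode_to_shrink(): skip busy items (trylock
	 * fails) and empty items, and on success return the item with its lock
	 * still held in write mode -- the caller must release it after scanning.
	 */
	static struct item *find_next_to_scan(struct item *items, int count, int *pos)
	{
		for (int i = *pos; i < count; i++) {
			/* Busy: some other task is using this item, skip it. */
			if (pthread_rwlock_trywrlock(&items[i].lock) != 0)
				continue;
			/* Empty: nothing to scan, drop the lock and skip. */
			if (items[i].nr_entries == 0) {
				pthread_rwlock_unlock(&items[i].lock);
				continue;
			}
			*pos = i + 1;
			return &items[i];	/* returned locked */
		}
		*pos = count;
		return NULL;
	}

	int main(void)
	{
		struct item items[4];
		struct item *it;
		int pos = 0;

		for (int i = 0; i < 4; i++) {
			pthread_rwlock_init(&items[i].lock, NULL);
			items[i].nr_entries = i % 2;	/* items 0 and 2 are "empty" */
		}

		/* Analogous to btrfs_scan_root(): scan, then drop the lock the finder took. */
		while ((it = find_next_to_scan(items, 4, &pos)) != NULL) {
			printf("scanning item %ld (%d entries)\n",
			       (long)(it - items), it->nr_entries);
			pthread_rwlock_unlock(&it->lock);
		}
		return 0;
	}

Doing the emptiness check, the reference grab, and the later scan under one continuously held write lock is what makes the skip safe: no other task can load or drop extent maps for the inode between the RB_EMPTY_ROOT() check and the scan.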