Diffstat (limited to 'queue-6.12/btrfs-skip-inodes-without-loaded-extent-maps-when-sh.patch')
-rw-r--r--  queue-6.12/btrfs-skip-inodes-without-loaded-extent-maps-when-sh.patch  161
1 file changed, 161 insertions(+), 0 deletions(-)
diff --git a/queue-6.12/btrfs-skip-inodes-without-loaded-extent-maps-when-sh.patch b/queue-6.12/btrfs-skip-inodes-without-loaded-extent-maps-when-sh.patch
new file mode 100644
index 0000000000..91401e4adb
--- /dev/null
+++ b/queue-6.12/btrfs-skip-inodes-without-loaded-extent-maps-when-sh.patch
@@ -0,0 +1,161 @@
+From 6f7096b21055313b16be91b99f0c4fe1a05d9449 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 15 Feb 2025 11:04:15 +0000
+Subject: btrfs: skip inodes without loaded extent maps when shrinking extent
+ maps
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit c6c9c4d56483d941f567eb921434c25fc6086dfa ]
+
+If there are inodes that don't have any loaded extent maps, we end up
+grabbing a reference on them and later adding a delayed iput, which wakes
+up the cleaner and makes it do unnecessary work. This is common when,
+for example, the inodes were opened only to run stat(2), when all their
+extent maps were already released through the folio release callback
+(btrfs_release_folio()) or by a previous run of the shrinker, or for
+directories, which never have extent maps.
+
+Reported-by: Ivan Shapovalov <intelfx@intelfx.name>
+Tested-by: Ivan Shapovalov <intelfx@intelfx.name>
+Link: https://lore.kernel.org/linux-btrfs/0414d690ac5680d0d77dfc930606cdc36e42e12f.camel@intelfx.name/
+CC: stable@vger.kernel.org # 6.13+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/extent_map.c | 78 +++++++++++++++++++++++++++++++------------
+ 1 file changed, 57 insertions(+), 21 deletions(-)
+
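+A minimal sketch of the behaviour change (function names are the ones
+appearing in the hunks below, control flow heavily simplified):
+
+	/* Old flow: take a reference on every inode, even empty ones. */
+	inode = btrfs_find_first_inode(root, min_ino);	/* always igrab()s */
+	while (inode) {
+		nr_dropped += btrfs_scan_inode(inode, ctx); /* may find no maps */
+		min_ino = btrfs_ino(inode) + 1;
+		/* the reference is dropped through a delayed iput, which
+		 * wakes the cleaner thread even when nothing was freed */
+		inode = btrfs_find_first_inode(root, min_ino);
+	}
+
+With the fix, the new helper find_first_inode_to_shrink() tests
+RB_EMPTY_ROOT(&inode->extent_tree.root) under the extent map tree lock
+before calling igrab(), so inodes without loaded extent maps are never
+referenced and never reach the delayed iput path.
+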
+diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
+index 1d93e1202c339..d67abf5f97a77 100644
+--- a/fs/btrfs/extent_map.c
++++ b/fs/btrfs/extent_map.c
+@@ -1133,6 +1133,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
+ long nr_dropped = 0;
+ struct rb_node *node;
+
++ lockdep_assert_held_write(&tree->lock);
++
+ /*
+ * Take the mmap lock so that we serialize with the inode logging phase
+ * of fsync because we may need to set the full sync flag on the inode,
+@@ -1144,28 +1146,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
+ * to find new extents, which may not be there yet because ordered
+ * extents haven't completed yet.
+ *
+- * We also do a try lock because otherwise we could deadlock. This is
+- * because the shrinker for this filesystem may be invoked while we are
+- * in a path that is holding the mmap lock in write mode. For example in
+- * a reflink operation while COWing an extent buffer, when allocating
+- * pages for a new extent buffer and under memory pressure, the shrinker
+- * may be invoked, and therefore we would deadlock by attempting to read
+- * lock the mmap lock while we are holding already a write lock on it.
++ * We also do a try lock because we don't want to block for too long and
++ * we are holding the extent map tree's lock in write mode.
+ */
+ if (!down_read_trylock(&inode->i_mmap_lock))
+ return 0;
+
+- /*
+- * We want to be fast so if the lock is busy we don't want to spend time
+- * waiting for it - either some task is about to do IO for the inode or
+- * we may have another task shrinking extent maps, here in this code, so
+- * skip this inode.
+- */
+- if (!write_trylock(&tree->lock)) {
+- up_read(&inode->i_mmap_lock);
+- return 0;
+- }
+-
+ node = rb_first(&tree->root);
+ while (node) {
+ struct rb_node *next = rb_next(node);
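+
+The resulting lock ordering, as a sketch of the new contract (the
+helper named here is added by the next hunk):
+
+	/* find_first_inode_to_shrink() */
+	write_trylock(&tree->lock);		/* taken before igrab() */
+
+	/* btrfs_scan_inode() */
+	lockdep_assert_held_write(&tree->lock);	/* lock is now inherited */
+	down_read_trylock(&inode->i_mmap_lock);	/* back off if busy */
+	/* ... scan and drop extent maps ... */
+	up_read(&inode->i_mmap_lock);		/* tree->lock stays held */
+
+The i_mmap_lock trylock is kept, but its justification changes: the old
+reflink/COW deadlock argument is replaced by not wanting to sleep while
+holding the extent map tree's lock in write mode.
+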
+@@ -1205,21 +1191,71 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
+ break;
+ node = next;
+ }
+- write_unlock(&tree->lock);
+ up_read(&inode->i_mmap_lock);
+
+ return nr_dropped;
+ }
+
++static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
++ u64 min_ino)
++{
++ struct btrfs_inode *inode;
++ unsigned long from = min_ino;
++
++ xa_lock(&root->inodes);
++ while (true) {
++ struct extent_map_tree *tree;
++
++ inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
++ if (!inode)
++ break;
++
++ tree = &inode->extent_tree;
++
++ /*
++ * We want to be fast so if the lock is busy we don't want to
++ * spend time waiting for it (some task is about to do IO for
++ * the inode).
++ */
++ if (!write_trylock(&tree->lock))
++ goto next;
++
++ /*
++ * Skip inode if it doesn't have loaded extent maps, so we avoid
++ * getting a reference and doing an iput later. This includes
++ * cases like files that were opened for things like stat(2), or
++ * files with all extent maps previously released through the
++ * release folio callback (btrfs_release_folio()) or released in
++ * a previous run, or directories which never have extent maps.
++ */
++ if (RB_EMPTY_ROOT(&tree->root)) {
++ write_unlock(&tree->lock);
++ goto next;
++ }
++
++ if (igrab(&inode->vfs_inode))
++ break;
++
++ write_unlock(&tree->lock);
++next:
++ from = btrfs_ino(inode) + 1;
++ cond_resched_lock(&root->inodes.xa_lock);
++ }
++ xa_unlock(&root->inodes);
++
++ return inode;
++}
++
+ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
+ {
+ struct btrfs_inode *inode;
+ long nr_dropped = 0;
+ u64 min_ino = ctx->last_ino + 1;
+
+- inode = btrfs_find_first_inode(root, min_ino);
++ inode = find_first_inode_to_shrink(root, min_ino);
+ while (inode) {
+ nr_dropped += btrfs_scan_inode(inode, ctx);
++ write_unlock(&inode->extent_tree.lock);
+
+ min_ino = btrfs_ino(inode) + 1;
+ ctx->last_ino = btrfs_ino(inode);
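+
+Note the contract the new helper establishes (a sketch assembled from
+the surrounding hunks): on success it returns with both an inode
+reference and the extent map tree lock held, and the caller releases
+the lock after scanning.
+
+	inode = find_first_inode_to_shrink(root, min_ino);
+	/* on success: reference held via igrab() AND
+	 * inode->extent_tree.lock held for write */
+	while (inode) {
+		nr_dropped += btrfs_scan_inode(inode, ctx);
+		write_unlock(&inode->extent_tree.lock);	/* caller unlocks */
+		/* ... advance min_ino, delayed iput, ... */
+	}
+
+Holding the tree lock from the RB_EMPTY_ROOT() test through the scan
+makes the emptiness check race-free: no extent map can be inserted
+between the test and the decision to take a reference.
+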
+@@ -1230,7 +1266,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
+
+ cond_resched();
+
+- inode = btrfs_find_first_inode(root, min_ino);
++ inode = find_first_inode_to_shrink(root, min_ino);
+ }
+
+ if (inode) {
+--
+2.39.5
+
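A self-contained userspace analogue of the same pattern, assuming
nothing from the kernel (pthreads C, with hypothetical names chosen
only for this illustration): skip candidates that have no work *before*
taking a reference, so the release path never runs for them.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct candidate {
	pthread_mutex_t lock;
	int nr_items;   /* analogue of the extent map tree's node count */
	int refcount;   /* analogue of the inode reference (igrab/iput) */
};

/* Returns true if a reference was taken and the caller must release it. */
static bool try_grab_if_has_work(struct candidate *c)
{
	/* Back off instead of sleeping, like the trylocks in the patch. */
	if (pthread_mutex_trylock(&c->lock) != 0)
		return false;

	/* The fix in miniature: check for work before taking a reference,
	 * so empty candidates never trigger the deferred-release path. */
	if (c->nr_items == 0) {
		pthread_mutex_unlock(&c->lock);
		return false;
	}

	c->refcount++;  /* analogue of igrab() */
	pthread_mutex_unlock(&c->lock);
	return true;
}

int main(void)
{
	struct candidate empty = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
	struct candidate busy  = { PTHREAD_MUTEX_INITIALIZER, 3, 0 };

	printf("empty: grabbed=%d\n", try_grab_if_has_work(&empty)); /* 0 */
	printf("busy:  grabbed=%d\n", try_grab_if_has_work(&busy));  /* 1 */
	return 0;
}

Unlike the kernel helper, this sketch drops the lock before returning;
the patch instead hands the still-held lock to the caller so the
emptiness check stays valid across the scan.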