Diffstat (limited to 'queue-6.12/btrfs-make-the-extent-map-shrinker-run-asynchronousl.patch')
-rw-r--r--  queue-6.12/btrfs-make-the-extent-map-shrinker-run-asynchronousl.patch  243
1 file changed, 243 insertions, 0 deletions
diff --git a/queue-6.12/btrfs-make-the-extent-map-shrinker-run-asynchronousl.patch b/queue-6.12/btrfs-make-the-extent-map-shrinker-run-asynchronousl.patch
new file mode 100644
index 0000000000..cff2e2ea54
--- /dev/null
+++ b/queue-6.12/btrfs-make-the-extent-map-shrinker-run-asynchronousl.patch
@@ -0,0 +1,243 @@
+From ff155cff3944f92ed4eebed0e2d6f093415a2126 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 29 Aug 2024 15:23:32 +0100
+Subject: btrfs: make the extent map shrinker run asynchronously as a work
+ queue job
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 1020443840569535f6025a855958f07ea3eebf71 ]
+
+Currently the extent map shrinker is run synchronously for kswapd tasks
+that end up calling the fs shrinker (fs/super.c:super_cache_scan()).
+This has some disadvantages and, for some heavy workloads with memory
+pressure, it can cause delays and stalls that make a machine
+unresponsive for periods of time. This happens because:
+
+1) We can have several kswapd tasks on machines with multiple NUMA zones,
+ and running the extent map shrinker concurrently can cause high
+ contention on some spin locks, namely the spin locks that protect
+ the radix tree that tracks roots, the per root xarray that tracks
+ open inodes and the list of delayed iputs. This not only delays the
+ shrinker but also causes high CPU consumption and makes the task
+ running the shrinker monopolize a core, resulting in the symptoms
+ of an unresponsive system. This was noted in previous commits such as
+ commit ae1e766f623f ("btrfs: only run the extent map shrinker from
+ kswapd tasks");
+
+2) The extent map shrinker's iteration over inodes can often be slow, even
+   after changing the data structure that tracks open inodes for a root
+   from a red black tree (up to kernel 6.10) to an xarray (kernel 6.10+).
+   While the transition to the xarray made things a bit faster, it is
+   still somewhat slow - for example in a test scenario with 10000 inodes
+   that have no extent maps loaded, the extent map shrinker took between
+   5ms and 8ms, using a release (non-debug) kernel. Iterating over the
+   extent maps of an inode can also be slow if the inode has many
+   thousands of extent maps, since we use a red black tree to track and
+   search extent maps (see the iteration sketch right after this list).
+   So having the extent map shrinker run synchronously adds extra delay
+   for other things a kswapd task does.
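+
+As a rough, hypothetical sketch (not part of this patch, with field and
+variable names that are only illustrative), iterating a root's open inodes
+via the xarray looks roughly like this; taking references on the inodes and
+all locking are omitted:
+
+    /* Assumes the btrfs internal headers for btrfs_root/btrfs_inode. */
+    #include <linux/xarray.h>
+
+    static void example_scan_root(struct btrfs_root *root, u64 first_ino)
+    {
+            struct btrfs_inode *inode;
+            unsigned long index;
+
+            xa_for_each_start(&root->inodes, index, inode, first_ino) {
+                    /*
+                     * Real code must take a reference on the inode before
+                     * touching it, then walk inode->extent_tree (a red
+                     * black tree) looking for extent maps safe to drop.
+                     */
+            }
+    }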
+
+So make the extent map shrinker run asynchronously, as a job for the
+system unbound workqueue, just like we do for the data and metadata
+space reclaim jobs.
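+
+As a rough illustration (not part of this patch), the deferral follows the
+common workqueue pattern: an atomic scan target is set only by the first
+caller, a work item is queued on the system unbound workqueue, and the
+worker clears the target when it is done. The example_reclaim_* names
+below are hypothetical:
+
+    #include <linux/atomic.h>
+    #include <linux/workqueue.h>
+
+    struct example_reclaim_state {
+            atomic64_t nr_to_scan;      /* target set by the shrinker */
+            struct work_struct work;    /* runs on system_unbound_wq */
+    };
+
+    static void example_reclaim_worker(struct work_struct *work)
+    {
+            struct example_reclaim_state *state;
+            long nr;
+
+            state = container_of(work, struct example_reclaim_state, work);
+            nr = atomic64_read(&state->nr_to_scan);
+            /* ... scan and drop up to 'nr' items here ... */
+            atomic64_set(&state->nr_to_scan, 0);
+    }
+
+    static void example_reclaim_init(struct example_reclaim_state *state)
+    {
+            atomic64_set(&state->nr_to_scan, 0);
+            INIT_WORK(&state->work, example_reclaim_worker);
+    }
+
+    static void example_reclaim_request(struct example_reclaim_state *state,
+                                        long nr)
+    {
+            /* Only the first caller queues the job; later calls no-op. */
+            if (atomic64_cmpxchg(&state->nr_to_scan, 0, nr) != 0)
+                    return;
+            queue_work(system_unbound_wq, &state->work);
+    }
+
+On teardown this pattern is paired with cancel_work_sync(), which the patch
+adds to close_ctree() below.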
+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/disk-io.c | 2 ++
+ fs/btrfs/extent_map.c | 51 ++++++++++++++++++++++++++++++++++++-------
+ fs/btrfs/extent_map.h | 3 ++-
+ fs/btrfs/fs.h | 2 ++
+ fs/btrfs/super.c | 13 +++--------
+ 5 files changed, 52 insertions(+), 19 deletions(-)
+
+diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
+index 96282bf28b19c..e655fa3bfd9be 100644
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -2785,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
+ btrfs_init_scrub(fs_info);
+ btrfs_init_balance(fs_info);
+ btrfs_init_async_reclaim_work(fs_info);
++ btrfs_init_extent_map_shrinker_work(fs_info);
+
+ rwlock_init(&fs_info->block_group_cache_lock);
+ fs_info->block_group_cache_tree = RB_ROOT_CACHED;
+@@ -4334,6 +4335,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
+ cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
+ cancel_work_sync(&fs_info->preempt_reclaim_work);
++ cancel_work_sync(&fs_info->extent_map_shrinker_work);
+
+ /* Cancel or finish ongoing discard work */
+ btrfs_discard_cleanup(fs_info);
+diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
+index d67abf5f97a77..61477cb69a6fd 100644
+--- a/fs/btrfs/extent_map.c
++++ b/fs/btrfs/extent_map.c
+@@ -1128,7 +1128,8 @@ struct btrfs_em_shrink_ctx {
+
+ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
+ {
+- const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
++ struct btrfs_fs_info *fs_info = inode->root->fs_info;
++ const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
+ struct extent_map_tree *tree = &inode->extent_tree;
+ long nr_dropped = 0;
+ struct rb_node *node;
+@@ -1187,7 +1188,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
+ * lock. This is to avoid slowing other tasks trying to take the
+ * lock.
+ */
+- if (need_resched() || rwlock_needbreak(&tree->lock))
++ if (need_resched() || rwlock_needbreak(&tree->lock) ||
++ btrfs_fs_closing(fs_info))
+ break;
+ node = next;
+ }
+@@ -1261,7 +1263,8 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
+ ctx->last_ino = btrfs_ino(inode);
+ btrfs_add_delayed_iput(inode);
+
+- if (ctx->scanned >= ctx->nr_to_scan)
++ if (ctx->scanned >= ctx->nr_to_scan ||
++ btrfs_fs_closing(inode->root->fs_info))
+ break;
+
+ cond_resched();
+@@ -1290,16 +1293,19 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
+ return nr_dropped;
+ }
+
+-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
++static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
+ {
++ struct btrfs_fs_info *fs_info;
+ struct btrfs_em_shrink_ctx ctx;
+ u64 start_root_id;
+ u64 next_root_id;
+ bool cycled = false;
+ long nr_dropped = 0;
+
++ fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work);
++
+ ctx.scanned = 0;
+- ctx.nr_to_scan = nr_to_scan;
++ ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan);
+
+ /*
+ * In case we have multiple tasks running this shrinker, make the next
+@@ -1317,12 +1323,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+ if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
+ s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+
+- trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
++ trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan,
+ nr, ctx.last_root,
+ ctx.last_ino);
+ }
+
+- while (ctx.scanned < ctx.nr_to_scan) {
++ while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
+ struct btrfs_root *root;
+ unsigned long count;
+
+@@ -1380,5 +1386,34 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+ ctx.last_ino);
+ }
+
+- return nr_dropped;
++ atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
++}
++
++void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
++{
++ /*
++ * Do nothing if the shrinker is already running. In case of high memory
++ * pressure we can have a lot of tasks calling us and all passing the
++ * same nr_to_scan value, but in reality we may need only to free
++ * nr_to_scan extent maps (or less). In case we need to free more than
++ * that, we will be called again by the fs shrinker, so no worries about
++ * not doing enough work to reclaim memory from extent maps.
++ * We can also be repeatedly called with the same nr_to_scan value
++ * simply because the shrinker runs asynchronously and multiple calls
++ * to this function are made before the shrinker does enough progress.
++ *
++ * That's why we set the atomic counter to nr_to_scan only if its
++ * current value is zero, instead of incrementing the counter by
++ * nr_to_scan.
++ */
++ if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
++ return;
++
++ queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work);
++}
++
++void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
++{
++ atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
++ INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker);
+ }
+diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
+index 5154a8f1d26c9..cd123b266b641 100644
+--- a/fs/btrfs/extent_map.h
++++ b/fs/btrfs/extent_map.h
+@@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
+ int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
+ struct extent_map *new_em,
+ bool modified);
+-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
++void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
++void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info);
+
+ #endif
+diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
+index bb822e425d7fa..374843aca60d8 100644
+--- a/fs/btrfs/fs.h
++++ b/fs/btrfs/fs.h
+@@ -639,6 +639,8 @@ struct btrfs_fs_info {
+ spinlock_t extent_map_shrinker_lock;
+ u64 extent_map_shrinker_last_root;
+ u64 extent_map_shrinker_last_ino;
++ atomic64_t extent_map_shrinker_nr_to_scan;
++ struct work_struct extent_map_shrinker_work;
+
+ /* Protected by 'trans_lock'. */
+ struct list_head dirty_cowonly_roots;
+diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
+index bcb8def4ade20..6119a06b05693 100644
+--- a/fs/btrfs/super.c
++++ b/fs/btrfs/super.c
+@@ -28,7 +28,6 @@
+ #include <linux/btrfs.h>
+ #include <linux/security.h>
+ #include <linux/fs_parser.h>
+-#include <linux/swap.h>
+ #include "messages.h"
+ #include "delayed-inode.h"
+ #include "ctree.h"
+@@ -2399,16 +2398,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
+ const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+- /*
+- * We may be called from any task trying to allocate memory and we don't
+- * want to slow it down with scanning and dropping extent maps. It would
+- * also cause heavy lock contention if many tasks concurrently enter
+- * here. Therefore only allow kswapd tasks to scan and drop extent maps.
+- */
+- if (!current_is_kswapd())
+- return 0;
++ btrfs_free_extent_maps(fs_info, nr_to_scan);
+
+- return btrfs_free_extent_maps(fs_info, nr_to_scan);
++ /* The extent map shrinker runs asynchronously, so always return 0. */
++ return 0;
+ }
+
+ static const struct super_operations btrfs_super_ops = {
+--
+2.39.5
+