Diffstat (limited to 'queue-6.12/btrfs-make-the-extent-map-shrinker-run-asynchronousl.patch')
-rw-r--r-- | queue-6.12/btrfs-make-the-extent-map-shrinker-run-asynchronousl.patch | 243
1 file changed, 243 insertions, 0 deletions
diff --git a/queue-6.12/btrfs-make-the-extent-map-shrinker-run-asynchronousl.patch b/queue-6.12/btrfs-make-the-extent-map-shrinker-run-asynchronousl.patch
new file mode 100644
index 0000000000..cff2e2ea54
--- /dev/null
+++ b/queue-6.12/btrfs-make-the-extent-map-shrinker-run-asynchronousl.patch
@@ -0,0 +1,243 @@
+From ff155cff3944f92ed4eebed0e2d6f093415a2126 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 29 Aug 2024 15:23:32 +0100
+Subject: btrfs: make the extent map shrinker run asynchronously as a work
+ queue job
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 1020443840569535f6025a855958f07ea3eebf71 ]
+
+Currently the extent map shrinker is run synchronously for kswapd tasks
+that end up calling the fs shrinker (fs/super.c:super_cache_scan()).
+This has some disadvantages and for some heavy workloads with memory
+pressure it can cause some delays and stalls that make a machine
+unresponsive for some periods. This happens because:
+
+1) We can have several kswapd tasks on machines with multiple NUMA zones,
+   and running the extent map shrinker concurrently can cause high
+   contention on some spin locks, namely the spin locks that protect
+   the radix tree that tracks roots, the per root xarray that tracks
+   open inodes and the list of delayed iputs. This not only delays the
+   shrinker but also causes high CPU consumption and makes the task
+   running the shrinker monopolize a core, resulting in the symptoms
+   of an unresponsive system. This was noted in previous commits such as
+   commit ae1e766f623f ("btrfs: only run the extent map shrinker from
+   kswapd tasks");
+
+2) The extent map shrinker's iteration over inodes can often be slow, even
+   after changing the data structure that tracks open inodes for a root
+   from a red black tree (up to kernel 6.10) to an xarray (kernel 6.10+).
+   While the transition to the xarray made things a bit faster, it's
+   still somewhat slow - for example in a test scenario with 10000 inodes
+   that have no extent maps loaded, the extent map shrinker took between
+   5ms and 8ms, using a release, non-debug kernel. Iterating over the
+   extent maps of an inode can also be slow if we have an inode with many
+   thousands of extent maps, since we use a red black tree to track and
+   search extent maps. So having the extent map shrinker run synchronously
+   adds extra delay for other things a kswapd task does.
+
+So make the extent map shrinker run asynchronously as a job for the
+system unbounded workqueue, just like what we do for data and metadata
+space reclaim jobs.
+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/disk-io.c    |  2 ++
+ fs/btrfs/extent_map.c | 51 ++++++++++++++++++++++++++++++++++++-------
+ fs/btrfs/extent_map.h |  3 ++-
+ fs/btrfs/fs.h         |  2 ++
+ fs/btrfs/super.c      | 13 +++--------
+ 5 files changed, 52 insertions(+), 19 deletions(-)
+
+diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
+index 96282bf28b19c..e655fa3bfd9be 100644
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -2785,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
+ 	btrfs_init_scrub(fs_info);
+ 	btrfs_init_balance(fs_info);
+ 	btrfs_init_async_reclaim_work(fs_info);
++	btrfs_init_extent_map_shrinker_work(fs_info);
+ 
+ 	rwlock_init(&fs_info->block_group_cache_lock);
+ 	fs_info->block_group_cache_tree = RB_ROOT_CACHED;
+@@ -4334,6 +4335,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
+ 	cancel_work_sync(&fs_info->async_reclaim_work);
+ 	cancel_work_sync(&fs_info->async_data_reclaim_work);
+ 	cancel_work_sync(&fs_info->preempt_reclaim_work);
++	cancel_work_sync(&fs_info->extent_map_shrinker_work);
+ 
+ 	/* Cancel or finish ongoing discard work */
+ 	btrfs_discard_cleanup(fs_info);
+diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
+index d67abf5f97a77..61477cb69a6fd 100644
+--- a/fs/btrfs/extent_map.c
++++ b/fs/btrfs/extent_map.c
+@@ -1128,7 +1128,8 @@ struct btrfs_em_shrink_ctx {
+ 
+ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
+ {
+-	const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
++	struct btrfs_fs_info *fs_info = inode->root->fs_info;
++	const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
+ 	struct extent_map_tree *tree = &inode->extent_tree;
+ 	long nr_dropped = 0;
+ 	struct rb_node *node;
+@@ -1187,7 +1188,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
+ 		 * lock. This is to avoid slowing other tasks trying to take the
+ 		 * lock.
+ 		 */
+-		if (need_resched() || rwlock_needbreak(&tree->lock))
++		if (need_resched() || rwlock_needbreak(&tree->lock) ||
++		    btrfs_fs_closing(fs_info))
+ 			break;
+ 		node = next;
+ 	}
+@@ -1261,7 +1263,8 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
+ 		ctx->last_ino = btrfs_ino(inode);
+ 		btrfs_add_delayed_iput(inode);
+ 
+-		if (ctx->scanned >= ctx->nr_to_scan)
++		if (ctx->scanned >= ctx->nr_to_scan ||
++		    btrfs_fs_closing(inode->root->fs_info))
+ 			break;
+ 
+ 		cond_resched();
+@@ -1290,16 +1293,19 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
+ 	return nr_dropped;
+ }
+ 
+-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
++static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
+ {
++	struct btrfs_fs_info *fs_info;
+ 	struct btrfs_em_shrink_ctx ctx;
+ 	u64 start_root_id;
+ 	u64 next_root_id;
+ 	bool cycled = false;
+ 	long nr_dropped = 0;
+ 
++	fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work);
++
+ 	ctx.scanned = 0;
+-	ctx.nr_to_scan = nr_to_scan;
++	ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan);
+ 
+ 	/*
+ 	 * In case we have multiple tasks running this shrinker, make the next
+@@ -1317,12 +1323,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+ 	if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
+ 		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+ 
+-		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
++		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan,
+ 							   nr, ctx.last_root,
+ 							   ctx.last_ino);
+ 	}
+ 
+-	while (ctx.scanned < ctx.nr_to_scan) {
++	while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
+ 		struct btrfs_root *root;
+ 		unsigned long count;
+ 
+@@ -1380,5 +1386,34 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+ 							   ctx.last_ino);
+ 	}
+ 
+-	return nr_dropped;
++	atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
++}
++
++void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
++{
++	/*
++	 * Do nothing if the shrinker is already running. In case of high memory
++	 * pressure we can have a lot of tasks calling us and all passing the
++	 * same nr_to_scan value, but in reality we may need only to free
++	 * nr_to_scan extent maps (or less). In case we need to free more than
++	 * that, we will be called again by the fs shrinker, so no worries about
++	 * not doing enough work to reclaim memory from extent maps.
++	 * We can also be repeatedly called with the same nr_to_scan value
++	 * simply because the shrinker runs asynchronously and multiple calls
++	 * to this function are made before the shrinker does enough progress.
++	 *
++	 * That's why we set the atomic counter to nr_to_scan only if its
++	 * current value is zero, instead of incrementing the counter by
++	 * nr_to_scan.
++	 */
++	if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
++		return;
++
++	queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work);
++}
++
++void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
++{
++	atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
++	INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker);
+ }
+diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
+index 5154a8f1d26c9..cd123b266b641 100644
+--- a/fs/btrfs/extent_map.h
++++ b/fs/btrfs/extent_map.h
+@@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
+ int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
+ 				   struct extent_map *new_em,
+ 				   bool modified);
+-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
++void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
++void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info);
+ 
+ #endif
+diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
+index bb822e425d7fa..374843aca60d8 100644
+--- a/fs/btrfs/fs.h
++++ b/fs/btrfs/fs.h
+@@ -639,6 +639,8 @@ struct btrfs_fs_info {
+ 	spinlock_t extent_map_shrinker_lock;
+ 	u64 extent_map_shrinker_last_root;
+ 	u64 extent_map_shrinker_last_ino;
++	atomic64_t extent_map_shrinker_nr_to_scan;
++	struct work_struct extent_map_shrinker_work;
+ 
+ 	/* Protected by 'trans_lock'. */
+ 	struct list_head dirty_cowonly_roots;
+diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
+index bcb8def4ade20..6119a06b05693 100644
+--- a/fs/btrfs/super.c
++++ b/fs/btrfs/super.c
+@@ -28,7 +28,6 @@
+ #include <linux/btrfs.h>
+ #include <linux/security.h>
+ #include <linux/fs_parser.h>
+-#include <linux/swap.h>
+ #include "messages.h"
+ #include "delayed-inode.h"
+ #include "ctree.h"
+@@ -2399,16 +2398,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
+ 	const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
+ 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ 
+-	/*
+-	 * We may be called from any task trying to allocate memory and we don't
+-	 * want to slow it down with scanning and dropping extent maps. It would
+-	 * also cause heavy lock contention if many tasks concurrently enter
+-	 * here. Therefore only allow kswapd tasks to scan and drop extent maps.
+-	 */
+-	if (!current_is_kswapd())
+-		return 0;
++	btrfs_free_extent_maps(fs_info, nr_to_scan);
+ 
+-	return btrfs_free_extent_maps(fs_info, nr_to_scan);
++	/* The extent map shrinker runs asynchronously, so always return 0. */
++	return 0;
+ }
+ 
+ static const struct super_operations btrfs_super_ops = {
+-- 
+2.39.5
+
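
Editor's note (not part of the patch): the core idea above is a queue-once handoff, where any number of callers may request a scan, but only the caller that flips the atomic counter from 0 to a non-zero target actually schedules the single worker, and the worker clears the counter when it finishes. The sketch below is a minimal userspace analogue of that pattern under stated assumptions: it uses C11 atomics and a POSIX thread, the names nr_to_scan, request_scan and shrinker_worker are illustrative rather than btrfs or kernel APIs, and pthread_create stands in for queue_work() on system_unbound_wq.

/*
 * Minimal userspace analogue of the queue-once pattern (illustrative only,
 * not kernel code): callers may race, but only the one that wins the
 * 0 -> n transition on the counter starts the worker.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_llong nr_to_scan;   /* plays the role of extent_map_shrinker_nr_to_scan */
static pthread_t worker;
static int worker_started;

static void *shrinker_worker(void *arg)
{
	long long target = atomic_load(&nr_to_scan);

	(void)arg;
	printf("worker: scanning up to %lld objects\n", target);
	/* ... reclaim work would happen here ... */

	/* Done: allow the next caller to arm the counter and schedule again. */
	atomic_store(&nr_to_scan, 0);
	return NULL;
}

/* Any number of tasks may call this under "memory pressure". */
static void request_scan(long long n)
{
	long long expected = 0;

	/* Only the caller that flips the counter from 0 to n starts the worker. */
	if (!atomic_compare_exchange_strong(&nr_to_scan, &expected, n))
		return;

	if (pthread_create(&worker, NULL, shrinker_worker, NULL) == 0)
		worker_started = 1;
}

int main(void)
{
	/* Three racing requests; only the first one schedules any work. */
	request_scan(128);
	request_scan(128);
	request_scan(128);

	if (worker_started)
		pthread_join(worker, NULL);
	return 0;
}

Build with: cc -std=c11 -pthread sketch.c. As in the patch, the counter is set rather than accumulated, since repeated callers under pressure all pass roughly the same target and the fs shrinker will simply call again if more reclaim is needed.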