btrfs: introduce support for huge folios

With all the previous preparations, it's finally time to enable huge folio support. - The max folio size: Here we define BTRFS_MAX_FOLIO_SIZE, which is fixed at 2MiB. This ensures we have a folio that is large enough, but not too large, for btrfs. This limit applies to all systems regardless of page size. We also define BTRFS_MAX_BLOCKS_PER_FOLIO, which depends on CONFIG_BTRFS_EXPERIMENTAL. If it's an experimental build, BTRFS_MAX_BLOCKS_PER_FOLIO is 512, otherwise it's BITS_PER_LONG. The filemap max order is calculated using both BTRFS_MAX_FOLIO_SIZE and BTRFS_MAX_BLOCKS_PER_FOLIO. E.g. for 64K page size with 64K fs block size, the limit will be BTRFS_MAX_FOLIO_SIZE (2M), which limits the filemap max order to 5. This is lower than the old order (6), but folios larger than 2M are rarely any better for IO performance. Meanwhile, excessively large folios can cause other problems, like stalling the IO pipeline for too long. For 4K page size and 4K fs block size in an experimental build, the limit will be increased to 2M from the old 256K. This new size is constrained by both BTRFS_MAX_FOLIO_SIZE (2M) and BTRFS_MAX_BLOCKS_PER_FOLIO (512 * 4K), allowing x86_64 to achieve huge folio support, and the filemap max order will be 9. - btrfs_bio_ctrl::submit_bitmap: This will be enlarged to contain BTRFS_MAX_BLOCKS_PER_FOLIO bits, and it will be on-stack memory. This increases on-stack memory usage by 56 bytes compared to the baseline (before the first patch in the series). - Local @delalloc_bitmap inside writepage_delalloc(): Unfortunately we cannot afford to handle an allocation error here, so again we use on-stack memory. This increases on-stack memory usage by another 56 bytes. So unfortunately this means that during the delalloc window, the writeback path will have +112 bytes of on-stack memory usage, and in all other cases the writeback path will have +56 bytes of on-stack memory usage. 
The +56 bytes (btrfs_bio_ctrl::submit_bitmap) can be removed after we have reworked the compression submission, so the current on-stack submit_bitmap is mostly a workaround until then. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
author: Qu Wenruo <wqu@suse.com> 2026-05-13 14:06:21 +0930
committer: David Sterba <dsterba@suse.com> 2026-05-26 16:34:51 +0200
commit: db44b97ba60724b959d217d2d327fcce7000efb1 (patch)
tree: c4c7a58f5bd6d1ba4637832089849e970d5f8dc5
parent: 80551189b5c12e36e03941079dca9d2c1500bdd5 (diff)
download: linux-next-history-db44b97ba60724b959d217d2d327fcce7000efb1.tar.gz
5 files changed, 42 insertions, 17 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index b9acea91cbe1f..9de04c37e11af 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -108,6 +108,8 @@ config BTRFS_EXPERIMENTAL
 
 	  - block size > page size support
 
+	  - huge folios for data - folios can be as large as 2MiB now
+
 	  - asynchronous checksum generation for data writes
 
 	  - remap-tree - logical address remapping tree
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e6ed52f5cd6d9..a6203bcf16e28 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3345,6 +3345,15 @@ static void invalidate_and_check_btree_folios(struct btrfs_fs_info *fs_info)
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 }
 
+static u32 calc_block_max_order(u32 sectorsize_bits)	/* Max folio order (in pages) for a block size of 1 << sectorsize_bits. */
+{
+	u32 max_size;	/* Largest allowed folio size, in bytes. */
+
+	max_size = min(BTRFS_MAX_BLOCKS_PER_FOLIO << sectorsize_bits,	/* Bytes covered by the max block count... */
+		       BTRFS_MAX_FOLIO_SIZE);				/* ...capped by the fixed folio size limit. */
+	return ilog2(round_up(max_size, PAGE_SIZE) >> PAGE_SHIFT);	/* Round up to whole pages, then take log2 of the page count. */
+}
+
 int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices)
 {
 	u32 sectorsize;
@@ -3467,7 +3476,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	fs_info->sectorsize = sectorsize;
 	fs_info->sectorsize_bits = ilog2(sectorsize);
 	fs_info->block_min_order = ilog2(round_up(sectorsize, PAGE_SIZE) >> PAGE_SHIFT);
-	fs_info->block_max_order = ilog2((BITS_PER_LONG << fs_info->sectorsize_bits) >> PAGE_SHIFT);
+	fs_info->block_max_order = calc_block_max_order(fs_info->sectorsize_bits);
 	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
 	fs_info->stripesize = stripesize;
 	fs_info->fs_devices->fs_info = fs_info;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index de0f37663790f..b03eb211def70 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -130,12 +130,7 @@ struct btrfs_bio_ctrl {
 	 * extent_writepage_io().
 	 * This is to avoid touching ranges covered by compression/inline.
 	 */
-	unsigned long *submit_bitmap;
-	/*
-	 * When blocks_per_folio <= BITS_PER_LONG, we can use the inline
-	 * one without allocating memory.
-	 */
-	unsigned long submit_bitmap_value;
+	unsigned long submit_bitmap[BITS_TO_LONGS(BTRFS_MAX_BLOCKS_PER_FOLIO)];
 
 	struct readahead_control *ractl;
 
@@ -1473,7 +1468,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 	const u64 page_start = folio_pos(folio);
 	const u64 page_end = page_start + folio_size(folio) - 1;
 	const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
-	unsigned long delalloc_bitmap = 0;
+	unsigned long delalloc_bitmap[BITS_TO_LONGS(BTRFS_MAX_BLOCKS_PER_FOLIO)] = { 0 };
 	/*
 	 * Save the last found delalloc end. As the delalloc end can go beyond
 	 * page boundary, thus we cannot rely on subpage bitmap to locate the
@@ -1516,7 +1511,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 			delalloc_start = delalloc_end + 1;
 			continue;
 		}
-		set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start,
+		set_delalloc_bitmap(folio, delalloc_bitmap, delalloc_start,
 				    min(delalloc_end, page_end) + 1 - delalloc_start);
 		last_delalloc_end = delalloc_end;
 		delalloc_start = delalloc_end + 1;
@@ -1542,7 +1537,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 			found_len = last_delalloc_end + 1 - found_start;
 			found = true;
 		} else {
-			found = find_next_delalloc_bitmap(folio, &delalloc_bitmap,
+			found = find_next_delalloc_bitmap(folio, delalloc_bitmap,
 					delalloc_start, &found_start, &found_len);
 		}
 		if (!found)
@@ -1864,13 +1859,15 @@ static void bio_ctrl_init_submit_bitmap(struct btrfs_fs_info *fs_info,
 {
 	const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
 
-	/* Only supported for blocks per folio <= BITS_PER_LONG for now. */
-	ASSERT(blocks_per_folio <= BITS_PER_LONG);
-	bio_ctrl->submit_bitmap_value = 0;
-	bio_ctrl->submit_bitmap = &bio_ctrl->submit_bitmap_value;
+	ASSERT(blocks_per_folio <= BTRFS_MAX_BLOCKS_PER_FOLIO);
+
 	/*
 	 * Default to unlock the whole folio.
 	 * The proper bitmap is not initialized until writepage_delalloc().
+	 *
+	 * We're safe just to set the bitmap range [0, blocks_per_folio), as
+	 * all later usage of the bitmap will follow the same range limit.
+	 * Any bits beyond blocks_per_folio will be ignored.
 	 */
 	bitmap_set(bio_ctrl->submit_bitmap, 0, blocks_per_folio);
 }
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index d3b439fd611f7..e7ec8ebabf218 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -54,6 +54,22 @@ struct btrfs_space_info;
 #define BTRFS_MIN_BLOCKSIZE	(SZ_4K)
 #define BTRFS_MAX_BLOCKSIZE	(SZ_64K)
 
+/* The maximum folio size btrfs supports. */
+#define BTRFS_MAX_FOLIO_SIZE	(SZ_2M)
+static_assert(BTRFS_MAX_FOLIO_SIZE > PAGE_SIZE);
+
+/*
+ * The maximum number of blocks a huge folio can support.
+ *
+ * Depending on the filesystem block size, the real maximum blocks per folio
+ * may also be limited by the above BTRFS_MAX_FOLIO_SIZE.
+ */
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+#define BTRFS_MAX_BLOCKS_PER_FOLIO		(512)
+#else
+#define BTRFS_MAX_BLOCKS_PER_FOLIO		(BITS_PER_LONG)
+#endif
+
 #define BTRFS_MAX_EXTENT_SIZE SZ_128M
 
 /*
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index df923009060df..56060acac2e9c 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -13,9 +13,10 @@
  * - Metadata must be fully aligned to node size
  *   So when nodesize <= page size, the metadata can never cross folio boundaries.
  *
- * - Only support blocks per folio <= BITS_PER_LONG
- *   This is to make bitmap copying much easier, a single unsigned long can handle
- *   one bitmap.
+ * - Only support blocks per folio <= min(BTRFS_MAX_FOLIO_SIZE / fs block size,
+ *					  BTRFS_MAX_BLOCKS_PER_FOLIO)
+ *   This is to ensure we can afford an on-stack bitmap, without the need to allocate
+ *   bitmap memory at runtime.
  *
  * Implementation:
  *
author	Qu Wenruo <wqu@suse.com>	2026-05-13 14:06:21 +0930
committer	David Sterba <dsterba@suse.com>	2026-05-26 16:34:51 +0200
commit	db44b97ba60724b959d217d2d327fcce7000efb1 (patch)
tree	c4c7a58f5bd6d1ba4637832089849e970d5f8dc5
parent	80551189b5c12e36e03941079dca9d2c1500bdd5 (diff)
download	linux-next-history-db44b97ba60724b959d217d2d327fcce7000efb1.tar.gz