about summary refs log tree commit diff stats
path: root/fs
diff options
author Mark Brown <broonie@kernel.org> 2026-05-29 12:27:09 +0100
committer Mark Brown <broonie@kernel.org> 2026-05-29 12:27:09 +0100
commit af628d83197362a1ce33e5cd4bdca0a4d5ad500a (patch)
tree 27f6f8e7daa1181a1dd5525c91449c74c38c83c4 /fs
parent ee58d910afb18694533fb17cc6ef3c2bb8daf9a6 (diff)
parent bc5a86e9c60a9caca2e1a51eb9051c29c435e66a (diff)
download linux-next-history-af628d83197362a1ce33e5cd4bdca0a4d5ad500a.tar.gz
Merge branch 'for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git
Diffstat (limited to 'fs')
-rw-r--r-- fs/btrfs/Kconfig 8
-rw-r--r-- fs/btrfs/backref.c 43
-rw-r--r-- fs/btrfs/backref.h 9
-rw-r--r-- fs/btrfs/block-group.c 40
-rw-r--r-- fs/btrfs/block-group.h 40
-rw-r--r-- fs/btrfs/btrfs_inode.h 37
-rw-r--r-- fs/btrfs/compression.c 83
-rw-r--r-- fs/btrfs/ctree.c 16
-rw-r--r-- fs/btrfs/defrag.c 18
-rw-r--r-- fs/btrfs/delalloc-space.c 2
-rw-r--r-- fs/btrfs/delayed-ref.c 5
-rw-r--r-- fs/btrfs/dev-replace.c 11
-rw-r--r-- fs/btrfs/disk-io.c 125
-rw-r--r-- fs/btrfs/extent-tree.c 28
-rw-r--r-- fs/btrfs/extent_io.c 260
-rw-r--r-- fs/btrfs/extent_io.h 20
-rw-r--r-- fs/btrfs/extent_map.c 6
-rw-r--r-- fs/btrfs/fiemap.c 2
-rw-r--r-- fs/btrfs/file-item.c 4
-rw-r--r-- fs/btrfs/file.c 116
-rw-r--r-- fs/btrfs/free-space-cache.c 151
-rw-r--r-- fs/btrfs/free-space-cache.h 14
-rw-r--r-- fs/btrfs/free-space-tree.c 36
-rw-r--r-- fs/btrfs/fs.h 48
-rw-r--r-- fs/btrfs/inode.c 311
-rw-r--r-- fs/btrfs/ioctl.c 642
-rw-r--r-- fs/btrfs/ordered-data.c 12
-rw-r--r-- fs/btrfs/qgroup.c 41
-rw-r--r-- fs/btrfs/raid-stripe-tree.c 4
-rw-r--r-- fs/btrfs/reflink.c 10
-rw-r--r-- fs/btrfs/relocation.c 45
-rw-r--r-- fs/btrfs/send.c 113
-rw-r--r-- fs/btrfs/space-info.c 9
-rw-r--r-- fs/btrfs/space-info.h 11
-rw-r--r-- fs/btrfs/subpage.c 286
-rw-r--r-- fs/btrfs/subpage.h 33
-rw-r--r-- fs/btrfs/super.c 51
-rw-r--r-- fs/btrfs/tests/btrfs-tests.c 1
-rw-r--r-- fs/btrfs/tests/free-space-tests.c 24
-rw-r--r-- fs/btrfs/transaction.c 33
-rw-r--r-- fs/btrfs/transaction.h 50
-rw-r--r-- fs/btrfs/tree-checker.c 183
-rw-r--r-- fs/btrfs/tree-log.c 273
-rw-r--r-- fs/btrfs/tree-log.h 7
-rw-r--r-- fs/btrfs/verity.c 4
-rw-r--r-- fs/btrfs/volumes.c 118
-rw-r--r-- fs/btrfs/volumes.h 1
-rw-r--r-- fs/btrfs/zoned.c 73
48 files changed, 1926 insertions, 1531 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 5e75438e0b738..9de04c37e11af 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -93,10 +93,6 @@ config BTRFS_EXPERIMENTAL
Current list:
- - COW fixup worker warning - last warning before removing the
- functionality catching out-of-band page
- dirtying, not necessary since 5.8
-
- RAID mirror read policy - additional read policies for balancing
reading from redundant block group
profiles (currently: pid, round-robin,
@@ -110,7 +106,9 @@ config BTRFS_EXPERIMENTAL
- extent tree v2 - complex rework of extent tracking
- - large folio and block size (> page size) support
+ - block size > page size support
+
+ - huge folios for data - folios can be as large as 2MiB now
- asynchronous checksum generation for data writes
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 273924ca912c2..0abcec0ceead0 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -2367,7 +2367,7 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
info = (struct btrfs_tree_block_info *)(ei + 1);
*out_level = btrfs_tree_block_level(eb, info);
} else {
- ASSERT(key->type == BTRFS_METADATA_ITEM_KEY);
+ ASSERT(key->type == BTRFS_METADATA_ITEM_KEY, "key->type=%hhu", key->type);
*out_level = (u8)key->offset;
}
@@ -2814,26 +2814,17 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
return ifp;
}
-struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info)
+int btrfs_backref_iter_init(struct btrfs_backref_iter *iter)
{
- struct btrfs_backref_iter *ret;
-
- ret = kzalloc_obj(*ret, GFP_NOFS);
- if (!ret)
- return NULL;
-
- ret->path = btrfs_alloc_path();
- if (!ret->path) {
- kfree(ret);
- return NULL;
- }
+ iter->path = btrfs_alloc_path();
+ if (!iter->path)
+ return -ENOMEM;
/* Current backref iterator only supports iteration in commit root */
- ret->path->search_commit_root = true;
- ret->path->skip_locking = true;
- ret->fs_info = fs_info;
+ iter->path->search_commit_root = true;
+ iter->path->skip_locking = true;
- return ret;
+ return 0;
}
static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
@@ -2846,9 +2837,8 @@ static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
memset(&iter->cur_key, 0, sizeof(iter->cur_key));
}
-int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
+int btrfs_backref_iter_start(struct btrfs_fs_info *fs_info, struct btrfs_backref_iter *iter, u64 bytenr)
{
- struct btrfs_fs_info *fs_info = iter->fs_info;
struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
struct btrfs_path *path = iter->path;
struct btrfs_extent_item *ei;
@@ -2963,7 +2953,7 @@ static bool btrfs_backref_iter_is_inline_ref(struct btrfs_backref_iter *iter)
* Return >0 if there is no extra backref for this bytenr.
* Return <0 if there is something wrong happened.
*/
-int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
+int btrfs_backref_iter_next(struct btrfs_fs_info *fs_info, struct btrfs_backref_iter *iter)
{
struct extent_buffer *eb = iter->path->nodes[0];
struct btrfs_root *extent_root;
@@ -2997,10 +2987,9 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
}
/* We're at keyed items, there is no inline item, go to the next one */
- extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr);
+ extent_root = btrfs_extent_root(fs_info, iter->bytenr);
if (unlikely(!extent_root)) {
- btrfs_err(iter->fs_info,
- "missing extent root for extent at bytenr %llu",
+ btrfs_err(fs_info, "missing extent root for extent at bytenr %llu",
iter->bytenr);
return -EUCLEAN;
}
@@ -3199,7 +3188,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *upper;
struct rb_node *rb_node;
- ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY, "ref_key->type=%hhu", ref_key->type);
/* Only reloc root uses backref pointing to itself */
if (ref_key->objectid == ref_key->offset) {
@@ -3454,7 +3443,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
struct btrfs_backref_node *exist;
int ret;
- ret = btrfs_backref_iter_start(iter, cur->bytenr);
+ ret = btrfs_backref_iter_start(trans->fs_info, iter, cur->bytenr);
if (ret < 0)
return ret;
/*
@@ -3462,7 +3451,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
* stored in it, but fetch it from the tree block
*/
if (btrfs_backref_has_tree_block_info(iter)) {
- ret = btrfs_backref_iter_next(iter);
+ ret = btrfs_backref_iter_next(trans->fs_info, iter);
if (ret < 0)
goto out;
/* No extra backref? This means the tree block is corrupted */
@@ -3492,7 +3481,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
exist = NULL;
}
- for (; ret == 0; ret = btrfs_backref_iter_next(iter)) {
+ for (; ret == 0; ret = btrfs_backref_iter_next(trans->fs_info, iter)) {
struct extent_buffer *eb;
struct btrfs_key key;
int type;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 1d009b0f4c699..179791de6b195 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -278,15 +278,12 @@ struct prelim_ref {
struct btrfs_backref_iter {
u64 bytenr;
struct btrfs_path *path;
- struct btrfs_fs_info *fs_info;
struct btrfs_key cur_key;
u32 item_ptr;
u32 cur_ptr;
u32 end_ptr;
};
-struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info);
-
/*
* For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data
* is btrfs_tree_block_info, without a btrfs_extent_inline_ref header.
@@ -302,9 +299,11 @@ static inline bool btrfs_backref_has_tree_block_info(
return false;
}
-int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr);
+int btrfs_backref_iter_init(struct btrfs_backref_iter *iter);
+
+int btrfs_backref_iter_start(struct btrfs_fs_info *fs_info, struct btrfs_backref_iter *iter, u64 bytenr);
-int btrfs_backref_iter_next(struct btrfs_backref_iter *iter);
+int btrfs_backref_iter_next(struct btrfs_fs_info *fs_info, struct btrfs_backref_iter *iter);
/*
* Backref cache related structures
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index b611c64119dbc..a0cb0db68c9a4 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -22,6 +22,34 @@
#include "accessors.h"
#include "extent-tree.h"
+static struct kmem_cache *block_group_cache;
+static struct kmem_cache *free_space_ctl_cache;
+
+int __init btrfs_init_block_group(void)
+{
+ block_group_cache = kmem_cache_create("btrfs_block_group",
+ sizeof(struct btrfs_block_group),
+ 0, 0, NULL);
+ if (!block_group_cache)
+ return -ENOMEM;
+
+ free_space_ctl_cache = kmem_cache_create("btrfs_free_space_ctl",
+ sizeof(struct btrfs_free_space_ctl),
+ 0, 0, NULL);
+ if (!free_space_ctl_cache) {
+ kmem_cache_destroy(block_group_cache);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void __cold btrfs_exit_block_group(void)
+{
+ kmem_cache_destroy(block_group_cache);
+ kmem_cache_destroy(free_space_ctl_cache);
+}
+
#ifdef CONFIG_BTRFS_DEBUG
int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group)
{
@@ -180,9 +208,9 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
cache);
- kfree(cache->free_space_ctl);
+ kmem_cache_free(free_space_ctl_cache, cache->free_space_ctl);
btrfs_free_chunk_map(cache->physical_map);
- kfree(cache);
+ kmem_cache_free(block_group_cache, cache);
}
}
@@ -2371,13 +2399,13 @@ static struct btrfs_block_group *btrfs_create_block_group(
{
struct btrfs_block_group *cache;
- cache = kzalloc_obj(*cache, GFP_NOFS);
+ cache = kmem_cache_zalloc(block_group_cache, GFP_NOFS);
if (!cache)
return NULL;
- cache->free_space_ctl = kzalloc_obj(*cache->free_space_ctl, GFP_NOFS);
+ cache->free_space_ctl = kmem_cache_zalloc(free_space_ctl_cache, GFP_NOFS);
if (!cache->free_space_ctl) {
- kfree(cache);
+ kmem_cache_free(block_group_cache, cache);
return NULL;
}
@@ -4089,7 +4117,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
struct btrfs_space_info *space_info;
space_info = btrfs_find_space_info(trans->fs_info, type);
- if (!space_info) {
+ if (unlikely(!space_info)) {
DEBUG_WARN();
return -EINVAL;
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 0504cb3579921..790c2d467af57 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -122,6 +122,7 @@ struct btrfs_block_group {
struct btrfs_fs_info *fs_info;
struct btrfs_inode *inode;
spinlock_t lock;
+ unsigned int ro;
u64 start;
u64 length;
u64 pinned;
@@ -134,7 +135,8 @@ struct btrfs_block_group {
u64 global_root_id;
u64 remap_bytes;
u32 identity_remap_count;
-
+ /* The last committed identity_remap_count value of this block group. */
+ u32 last_identity_remap_count;
/*
* The last committed used bytes of this block group, if the above @used
* is still the same as @last_used, we don't need to update block
@@ -143,8 +145,6 @@ struct btrfs_block_group {
u64 last_used;
/* The last committed remap_bytes value of this block group. */
u64 last_remap_bytes;
- /* The last commited identity_remap_count value of this block group. */
- u32 last_identity_remap_count;
/* The last committed flags value for this block group. */
u64 last_flags;
@@ -171,12 +171,10 @@ struct btrfs_block_group {
unsigned long full_stripe_len;
unsigned long runtime_flags;
- unsigned int ro;
-
- int disk_cache_state;
+ enum btrfs_disk_cache_state disk_cache_state;
/* Cache tracking stuff */
- int cached;
+ enum btrfs_caching_type cached;
struct btrfs_caching_control *caching_ctl;
struct btrfs_space_info *space_info;
@@ -193,6 +191,16 @@ struct btrfs_block_group {
refcount_t refs;
/*
+ * When non-zero it means the block group's logical address and its
+ * device extents can not be reused for future block group allocations
+ * until the counter goes down to 0. This is to prevent them from being
+ * reused while some task is still using the block group after it was
+ * deleted - we want to make sure they can only be reused for new block
+ * groups after that task is done with the deleted block group.
+ */
+ atomic_t frozen;
+
+ /*
* List of struct btrfs_free_clusters for this block group.
* Today it will only have one thing on it, but that may change
*/
@@ -211,22 +219,12 @@ struct btrfs_block_group {
/* For read-only block groups */
struct list_head ro_list;
- /*
- * When non-zero it means the block group's logical address and its
- * device extents can not be reused for future block group allocations
- * until the counter goes down to 0. This is to prevent them from being
- * reused while some task is still using the block group after it was
- * deleted - we want to make sure they can only be reused for new block
- * groups after that task is done with the deleted block group.
- */
- atomic_t frozen;
-
/* For discard operations */
struct list_head discard_list;
int discard_index;
+ enum btrfs_discard_state discard_state;
u64 discard_eligible_time;
u64 discard_cursor;
- enum btrfs_discard_state discard_state;
/* For dirty block groups */
struct list_head dirty_list;
@@ -263,6 +261,8 @@ struct btrfs_block_group {
/* Protected by @free_space_lock. */
bool using_free_space_bitmaps_cached;
+ enum btrfs_block_group_size_class size_class:8;
+
/*
* Number of extents in this block group used for swap files.
* All accesses protected by the spinlock 'lock'.
@@ -281,7 +281,6 @@ struct btrfs_block_group {
struct list_head active_bg_list;
struct work_struct zone_finish_work;
struct extent_buffer *last_eb;
- enum btrfs_block_group_size_class size_class;
u64 reclaim_mark;
};
@@ -320,6 +319,9 @@ static inline u64 btrfs_block_group_available_space(const struct btrfs_block_gro
int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group);
#endif
+int __init btrfs_init_block_group(void);
+void __cold btrfs_exit_block_group(void);
+
struct btrfs_block_group *btrfs_lookup_first_block_group(
struct btrfs_fs_info *info, u64 bytenr);
struct btrfs_block_group *btrfs_lookup_block_group(
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 55c272fe5d92a..d5d81f9546c37 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -128,15 +128,6 @@ struct btrfs_inode {
/* which subvolume this inode belongs to */
struct btrfs_root *root;
-#if BITS_PER_LONG == 32
- /*
- * The objectid of the corresponding BTRFS_INODE_ITEM_KEY.
- * On 64 bits platforms we can get it from vfs_inode.i_ino, which is an
- * unsigned long and therefore 64 bits on such platforms.
- */
- u64 objectid;
-#endif
-
/* Cached value of inode property 'compression'. */
u8 prop_compress;
@@ -372,30 +363,11 @@ static inline unsigned long btrfs_inode_hash(u64 objectid,
return (unsigned long)h;
}
-#if BITS_PER_LONG == 32
-
-/*
- * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so
- * we use the inode's location objectid which is a u64 to avoid truncation.
- */
-static inline u64 btrfs_ino(const struct btrfs_inode *inode)
-{
- u64 ino = inode->objectid;
-
- if (test_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags))
- ino = inode->vfs_inode.i_ino;
- return ino;
-}
-
-#else
-
static inline u64 btrfs_ino(const struct btrfs_inode *inode)
{
return inode->vfs_inode.i_ino;
}
-#endif
-
static inline void btrfs_get_inode_key(const struct btrfs_inode *inode,
struct btrfs_key *key)
{
@@ -406,9 +378,6 @@ static inline void btrfs_get_inode_key(const struct btrfs_inode *inode,
static inline void btrfs_set_inode_number(struct btrfs_inode *inode, u64 ino)
{
-#if BITS_PER_LONG == 32
- inode->objectid = ino;
-#endif
inode->vfs_inode.i_ino = ino;
}
@@ -531,12 +500,9 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode)
/* Metadata inode should not reach here. */
ASSERT(is_data_inode(inode));
- /* We only allow BITS_PER_LONGS blocks for each bitmap. */
-#ifdef CONFIG_BTRFS_EXPERIMENTAL
mapping_set_folio_order_range(inode->vfs_inode.i_mapping,
inode->root->fs_info->block_min_order,
inode->root->fs_info->block_max_order);
-#endif
}
void btrfs_calculate_block_csum_folio(struct btrfs_fs_info *fs_info,
@@ -569,6 +535,8 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
+int btrfs_reset_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
+ unsigned int extra_bits, struct extent_state **cached_state);
struct btrfs_new_inode_args {
/* Input */
@@ -630,7 +598,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
loff_t actual_len, u64 *alloc_hint);
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
u64 start, u64 end, struct writeback_control *wbc);
-int btrfs_writepage_cow_fixup(struct folio *folio);
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
int compress_type);
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a02b62e0a8f33..ffb6b52863a78 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -355,21 +355,16 @@ struct compressed_bio *btrfs_alloc_compressed_write(struct btrfs_inode *inode,
}
/*
- * Add extra pages in the same compressed file extent so that we don't need to
+ * Add extra folios in the same compressed file extent so that we don't need to
* re-read the same extent again and again.
*
- * NOTE: this won't work well for subpage, as for subpage read, we lock the
- * full page then submit bio for each compressed/regular extents.
- *
- * This means, if we have several sectors in the same page points to the same
- * on-disk compressed data, we will re-read the same extent many times and
- * this function can only help for the next page.
+ * If in the same folio, we have several non-contiguous blocks which are pointing
+ * to the same on-disk compressed data, we will re-read the same extent many
+ * times, as this function can only help cross folio situations.
*/
-static noinline int add_ra_bio_pages(struct inode *inode,
- u64 compressed_end,
- struct compressed_bio *cb,
- int *memstall, unsigned long *pflags,
- bool direct_reclaim)
+static noinline int add_ra_bio_folios(struct inode *inode, u64 compressed_end,
+ struct compressed_bio *cb, int *memstall,
+ unsigned long *pflags, bool direct_reclaim)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
pgoff_t end_index;
@@ -391,16 +386,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (isize == 0)
return 0;
- /*
- * For current subpage support, we only support 64K page size,
- * which means maximum compressed extent size (128K) is just 2x page
- * size.
- * This makes readahead less effective, so here disable readahead for
- * subpage for now, until full compressed write is supported.
- */
- if (fs_info->sectorsize < PAGE_SIZE)
- return 0;
-
/* For bs > ps cases, we don't support readahead for compressed folios for now. */
if (fs_info->block_min_order)
return 0;
@@ -416,7 +401,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
while (cur < compressed_end) {
- pgoff_t page_end;
+ u64 folio_end;
pgoff_t pg_index = cur >> PAGE_SHIFT;
gfp_t masked_constraint_gfp;
u32 add_size;
@@ -438,8 +423,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
break;
/*
- * Jump to next page start as we already have page for
- * current offset.
+ * Jump to the next folio as we already have a folio for
+ * the current offset.
*/
cur += (folio_sz - offset);
continue;
@@ -457,8 +442,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
break;
if (filemap_add_folio(mapping, folio, pg_index, cache_gfp)) {
- /* There is already a page, skip to page end */
- cur += folio_size(folio);
+ /* There is already a folio, skip to the folio end. */
+ cur += folio_size(folio) - offset_in_folio(folio, cur);
folio_put(folio);
continue;
}
@@ -475,14 +460,14 @@ static noinline int add_ra_bio_pages(struct inode *inode,
break;
}
- page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1;
- btrfs_lock_extent(tree, cur, page_end, NULL);
+ folio_end = folio_next_pos(folio) - 1;
+ btrfs_lock_extent(tree, cur, folio_end, NULL);
read_lock(&em_tree->lock);
- em = btrfs_lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
+ em = btrfs_lookup_extent_mapping(em_tree, cur, folio_end + 1 - cur);
read_unlock(&em_tree->lock);
/*
- * At this point, we have a locked page in the page cache for
+ * At this point, we have a locked folio in the page cache for
* these bytes in the file. But, we have to make sure they map
* to this compressed extent on disk.
*/
@@ -491,14 +476,14 @@ static noinline int add_ra_bio_pages(struct inode *inode,
(btrfs_extent_map_block_start(em) >> SECTOR_SHIFT) !=
orig_bio->bi_iter.bi_sector) {
btrfs_free_extent_map(em);
- btrfs_unlock_extent(tree, cur, page_end, NULL);
+ btrfs_unlock_extent(tree, cur, folio_end, NULL);
folio_unlock(folio);
folio_put(folio);
break;
}
- add_size = min(btrfs_extent_map_end(em), page_end + 1) - cur;
+ add_size = min(btrfs_extent_map_end(em), folio_end + 1) - cur;
btrfs_free_extent_map(em);
- btrfs_unlock_extent(tree, cur, page_end, NULL);
+ btrfs_unlock_extent(tree, cur, folio_end, NULL);
if (folio_contains(folio, end_index)) {
size_t zero_offset = offset_in_folio(folio, isize);
@@ -516,13 +501,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
folio_put(folio);
break;
}
- /*
- * If it's subpage, we also need to increase its
- * subpage::readers number, as at endio we will decrease
- * subpage::readers and to unlock the page.
- */
- if (fs_info->sectorsize < PAGE_SIZE)
- btrfs_folio_set_lock(fs_info, folio, cur, add_size);
+ btrfs_folio_set_lock(fs_info, folio, cur, add_size);
folio_put(folio);
cur += add_size;
}
@@ -613,8 +592,8 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
}
ASSERT(cb->bbio.bio.bi_iter.bi_size == compressed_len);
- add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall,
- &pflags, !(bbio->bio.bi_opf & REQ_RAHEAD));
+ add_ra_bio_folios(&inode->vfs_inode, em_start + em_len, cb, &memstall,
+ &pflags, !(bbio->bio.bi_opf & REQ_RAHEAD));
cb->len = bbio->bio.bi_iter.bi_size;
cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;
@@ -1192,22 +1171,6 @@ void __cold btrfs_exit_compress(void)
}
/*
- * The bvec is a single page bvec from a bio that contains folios from a filemap.
- *
- * Since the folio may be a large one, and if the bv_page is not a head page of
- * a large folio, then page->index is unreliable.
- *
- * Thus we need this helper to grab the proper file offset.
- */
-static u64 file_offset_from_bvec(const struct bio_vec *bvec)
-{
- const struct page *page = bvec->bv_page;
- const struct folio *folio = page_folio(page);
-
- return (page_pgoff(folio, page) << PAGE_SHIFT) + bvec->bv_offset;
-}
-
-/*
* Copy decompressed data from working buffer to pages.
*
* @buf: The decompressed data buffer
@@ -1259,7 +1222,7 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
* cb->start may underflow, but subtracting that value can still
* give us correct offset inside the full decompressed extent.
*/
- bvec_offset = file_offset_from_bvec(&bvec) - cb->start;
+ bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
/* Haven't reached the bvec range, exit */
if (decompressed + buf_len <= bvec_offset)
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d70da290bedfc..829d8be7f423b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1497,17 +1497,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (p->reada == READA_FORWARD_ALWAYS)
reada_for_search(fs_info, p, parent_level, slot, key->objectid);
- /* first we do an atomic uptodate check */
- if (btrfs_buffer_uptodate(tmp, check.transid, NULL) > 0) {
- /*
- * Do extra check for first_key, eb can be stale due to
- * being cached, read from scrub, or have multiple
- * parents (shared tree blocks).
- */
- if (unlikely(btrfs_verify_level_key(tmp, &check))) {
- ret = -EUCLEAN;
- goto out;
- }
+ /* Check if the cached eb is uptodate. */
+ ret = btrfs_buffer_uptodate(tmp, check.transid, &check);
+ if (unlikely(ret < 0))
+ goto out;
+ if (ret > 0) {
*eb_ret = tmp;
tmp = NULL;
ret = 0;
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 7e2db5d3a4d4c..f0c6758b7055d 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -859,23 +859,6 @@ again:
if (IS_ERR(folio))
return folio;
- /*
- * Since we can defragment files opened read-only, we can encounter
- * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS).
- *
- * The IO for such large folios is not fully tested, thus return
- * an error to reject such folios unless it's an experimental build.
- *
- * Filesystem transparent huge pages are typically only used for
- * executables that explicitly enable them, so this isn't very
- * restrictive.
- */
- if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- return ERR_PTR(-ETXTBSY);
- }
-
ret = set_folio_extent_mapped(folio);
if (ret < 0) {
folio_unlock(folio);
@@ -1179,7 +1162,6 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
if (start >= folio_next_pos(folio) ||
start + len <= folio_pos(folio))
continue;
- btrfs_folio_clamp_clear_checked(fs_info, folio, start, len);
btrfs_folio_clamp_set_dirty(fs_info, folio, start, len);
}
btrfs_delalloc_release_extents(inode, len);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 0970799d0aa44..4293a63834337 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -134,6 +134,8 @@ int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes)
if (btrfs_is_free_space_inode(inode))
flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
+ else if (btrfs_is_zoned(fs_info) && btrfs_is_data_reloc_root(root))
+ flush = BTRFS_RESERVE_FLUSH_ZONED_RELOCATION;
return btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), bytes, flush);
}
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 605858c2d9a95..8bc1929237f70 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -615,6 +615,9 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *exist;
int mod;
+ ASSERT(ref->action == BTRFS_ADD_DELAYED_REF ||
+ ref->action == BTRFS_DROP_DELAYED_REF);
+
spin_lock(&href->lock);
exist = tree_insert(&href->ref_tree, ref);
if (!exist) {
@@ -641,7 +644,7 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
ASSERT(!list_empty(&exist->add_list));
list_del_init(&exist->add_list);
} else {
- ASSERT(0);
+ DEBUG_WARN();
}
} else
mod = -ref->ref_mod;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 8f8fa14886ded..318ddb7904292 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -307,6 +307,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->bdev_file = bdev_file;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+ /* Check the comment in btrfs_init_new_device() for the reason. */
+ atomic_inc(&device->dev_stats_ccnt);
device->dev_stats_valid = 1;
set_blocksize(bdev_file, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_devices;
@@ -1013,8 +1015,15 @@ error:
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
- if (!IS_ERR(trans))
+ if (!IS_ERR(trans)) {
+ /*
+ * Ignore any error here, if we failed to remove the DEV_STATS
+ * item for devid 0, it's not a big deal. We have other ways
+ * to address it.
+ */
+ btrfs_remove_dev_stat_item(trans, BTRFS_DEV_REPLACE_DEVID);
btrfs_commit_transaction(trans);
+ }
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 27d7a24ff97ae..ec13eac2b3d7b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -491,10 +491,34 @@ static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
static void btree_invalidate_folio(struct folio *folio, size_t offset,
size_t length)
{
- struct extent_io_tree *tree;
+ struct extent_io_tree *tree = &folio_to_inode(folio)->io_tree;
+ struct extent_state *cached_state = NULL;
+ const u64 start = folio_pos(folio);
+ const u64 end = folio_next_pos(folio) - 1;
+
+ /*
+ * The range must cover the full @folio.
+ * Btree inode is never exposed to regular file operations, thus there
+ * is no partial truncation.
+ * The folio is only invalidated when the btree inode is evicted.
+ */
+ ASSERT(offset == 0, "folio=%llu offset=%zu", folio_pos(folio), offset);
+ ASSERT(length == folio_size(folio), "folio=%llu folio_size=%zu length=%zu",
+ folio_pos(folio), folio_size(folio), length);
+
+ /* This function is only called for the btree inode */
+ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
+
+ btrfs_lock_extent(tree, start, end, &cached_state);
+ folio_wait_writeback(folio);
+
+ /*
+ * Currently for btree io tree, only EXTENT_LOCKED is utilized,
+ * so here we only need to unlock the extent range to free any
+ * existing extent state.
+ */
+ btrfs_unlock_extent(tree, start, end, &cached_state);
- tree = &folio_to_inode(folio)->io_tree;
- extent_invalidate_folio(tree, folio, offset);
btree_release_folio(folio, GFP_NOFS);
if (folio_get_private(folio)) {
btrfs_warn(folio_to_fs_info(folio),
@@ -1736,7 +1760,6 @@ static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
/* helper to cleanup workers */
static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
- btrfs_destroy_workqueue(fs_info->fixup_workers);
btrfs_destroy_workqueue(fs_info->delalloc_workers);
btrfs_destroy_workqueue(fs_info->workers);
if (fs_info->endio_workers)
@@ -1944,9 +1967,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->caching_workers =
btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
- fs_info->fixup_workers =
- btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags);
-
fs_info->endio_workers =
alloc_workqueue("btrfs-endio", flags, max_active);
fs_info->endio_meta_workers =
@@ -1972,7 +1992,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->endio_write_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
- fs_info->caching_workers && fs_info->fixup_workers &&
+ fs_info->caching_workers &&
fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
fs_info->discard_ctl.discard_workers)) {
return -ENOMEM;
@@ -3276,6 +3296,64 @@ static bool fs_is_full_ro(const struct btrfs_fs_info *fs_info)
return false;
}
+/*
+ * Try to wait for any metadata readahead, and invalidate all btree folios.
+ *
+ * If the invalidation failed, report any dirty/held extent buffers.
+ */
+static void invalidate_and_check_btree_folios(struct btrfs_fs_info *fs_info)
+{
+ unsigned long index = 0;
+ struct extent_buffer *eb;
+ int ret;
+
+ ret = invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+ if (likely(ret == 0))
+ return;
+
+ /*
+ * Some btree pages can not be invalidated. This happens when some tree
+ * blocks are still held (either by readahead or by a task holding a ref).
+ */
+ rcu_read_lock();
+ xa_for_each(&fs_info->buffer_tree, index, eb) {
+ /* Increase the ref so that the eb won't disappear. */
+ if (!refcount_inc_not_zero(&eb->refs))
+ continue;
+ rcu_read_unlock();
+
+ /* Wait for any readahead first. */
+ if (test_bit(EXTENT_BUFFER_READING, &eb->bflags))
+ wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
+ TASK_UNINTERRUPTIBLE);
+ /*
+ * The refs threshold is 2, one held by us at the beginning
+ * of the loop, one for the ownership in the buffer tree.
+ */
+ if (unlikely(refcount_read(&eb->refs) > 2 || extent_buffer_under_io(eb))) {
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_warn(fs_info,
+ "unable to release extent buffer %llu owner %llu gen %llu refs %u flags 0x%lx",
+ eb->start, btrfs_header_owner(eb),
+ btrfs_header_generation(eb),
+ refcount_read(&eb->refs), eb->bflags);
+ }
+ free_extent_buffer(eb);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+}
+
+static u32 calc_block_max_order(u32 sectorsize_bits)
+{
+ u32 max_size;
+
+ max_size = min(BTRFS_MAX_BLOCKS_PER_FOLIO << sectorsize_bits,
+ BTRFS_MAX_FOLIO_SIZE);
+ return ilog2(round_up(max_size, PAGE_SIZE) >> PAGE_SHIFT);
+}
+
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices)
{
u32 sectorsize;
@@ -3398,7 +3476,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
fs_info->block_min_order = ilog2(round_up(sectorsize, PAGE_SIZE) >> PAGE_SHIFT);
- fs_info->block_max_order = ilog2((BITS_PER_LONG << fs_info->sectorsize_bits) >> PAGE_SHIFT);
+ fs_info->block_max_order = calc_block_max_order(fs_info->sectorsize_bits);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
fs_info->fs_devices->fs_info = fs_info;
@@ -3451,7 +3529,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
/* Update the values for the current filesystem. */
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
- memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
+ /*
+ * When temp_fsid is active, fs_devices->fsid is assigned a random UUID
+ * at mount. This inconsistent UUID causes issues for layered filesystems
+ * like OverlayFS. Since metadata_uuid may or may not be set, provide the
+ * on-disk UUID directly from the super_copy.
+ */
+ if (fs_info->fs_devices->temp_fsid)
+ memcpy(&sb->s_uuid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
+ else
+ memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_read_sys_array(fs_info);
@@ -3713,7 +3800,7 @@ fail_tree_roots:
if (fs_info->data_reloc_root)
btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
free_root_pointers(fs_info, true);
- invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+ invalidate_and_check_btree_folios(fs_info);
fail_sb_buffer:
btrfs_stop_all_workers(fs_info);
@@ -4216,7 +4303,6 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
list_del_init(&trans->list);
btrfs_put_transaction(trans);
- trace_btrfs_transaction_commit(fs_info);
}
ASSERT(!found);
}
@@ -4287,16 +4373,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_error_commit_super(fs_info);
/*
- * Wait for any fixup workers to complete.
- * If we don't wait for them here and they are still running by the time
- * we call kthread_stop() against the cleaner kthread further below, we
- * get an use-after-free on the cleaner because the fixup worker adds an
- * inode to the list of delayed iputs and then attempts to wakeup the
- * cleaner kthread, which was already stopped and destroyed. We parked
- * already the cleaner, but below we run all pending delayed iputs.
- */
- btrfs_flush_workqueue(fs_info->fixup_workers);
- /*
* Similar case here, we have to wait for delalloc workers before we
* proceed below and stop the cleaner kthread, otherwise we trigger a
* use-after-free on the cleaner kthread task_struct when a delalloc
@@ -4419,7 +4495,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
ASSERT(list_empty(&fs_info->delayed_iputs));
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
- if (btrfs_check_quota_leak(fs_info)) {
+ if (unlikely(btrfs_check_quota_leak(fs_info))) {
DEBUG_WARN("qgroup reserved space leaked");
btrfs_err(fs_info, "qgroup reserved space leaked");
}
@@ -4452,7 +4528,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
* We must make sure there is not any read request to
* submit after we stop all workers.
*/
- invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+ invalidate_and_check_btree_folios(fs_info);
btrfs_stop_all_workers(fs_info);
/*
@@ -4888,7 +4964,6 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->trans_lock);
btrfs_put_transaction(t);
- trace_btrfs_transaction_commit(fs_info);
spin_lock(&fs_info->trans_lock);
}
spin_unlock(&fs_info->trans_lock);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f87398910e59d..6030cdbdb7421 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -385,7 +385,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
return type;
}
} else {
- ASSERT(is_data == BTRFS_REF_TYPE_ANY);
+ ASSERT(is_data == BTRFS_REF_TYPE_ANY, "is_data=%d", is_data);
return type;
}
}
@@ -2532,8 +2532,11 @@ int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset,
struct btrfs_key key;
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- ASSERT(key.objectid == bytenr);
- ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY);
+ ASSERT(key.objectid == bytenr,
+ "key.objectid=%llu bytenr=%llu",
+ key.objectid, bytenr);
+ ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY, "key.type=%u",
+ key.type);
}
}
@@ -4599,10 +4602,12 @@ static noinline int find_free_extent(struct btrfs_root *root,
/* Use dedicated sub-space_info for dedicated block group users. */
if (ffe_ctl->for_data_reloc) {
space_info = space_info->sub_group[0];
- ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC,
+ "space_info->subgroup_id=%d", space_info->subgroup_id);
} else if (ffe_ctl->for_treelog) {
space_info = space_info->sub_group[0];
- ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_TREELOG);
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_TREELOG,
+ "space_info->subgroup_id=%d", space_info->subgroup_id);
}
}
if (!space_info) {
@@ -5782,16 +5787,21 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);
- if (btrfs_buffer_uptodate(next, generation, NULL))
- return 0;
-
check.level = level - 1;
check.transid = generation;
check.owner_root = btrfs_root_id(root);
check.has_first_key = true;
btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]);
+ ret = btrfs_buffer_uptodate(next, generation, &check);
+ if (ret > 0)
+ return 0;
btrfs_tree_unlock(next);
+ if (ret < 0) {
+ free_extent_buffer(next);
+ return ret;
+ }
+
if (level == 1)
reada_walk_down(trans, root, wc, path);
ret = btrfs_read_extent_buffer(next, &check);
@@ -6646,7 +6656,7 @@ static int btrfs_trim_free_extents_throttle(struct btrfs_device *device,
start = max(start, cur_start);
/* Check if there are any CHUNK_* bits left */
- if (start > device->total_bytes) {
+ if (unlikely(start > device->total_bytes)) {
DEBUG_WARN();
btrfs_warn(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0c6342995a00f..b7e3e83838d80 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -130,7 +130,8 @@ struct btrfs_bio_ctrl {
* extent_writepage_io().
* This is to avoid touching ranges covered by compression/inline.
*/
- unsigned long submit_bitmap;
+ unsigned long submit_bitmap[BITS_TO_LONGS(BTRFS_MAX_BLOCKS_PER_FOLIO)];
+
struct readahead_control *ractl;
/*
@@ -250,8 +251,6 @@ static void process_one_folio(struct btrfs_fs_info *fs_info,
ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
len = end + 1 - start;
- if (page_ops & PAGE_SET_ORDERED)
- btrfs_folio_clamp_set_ordered(fs_info, folio, start, len);
if (page_ops & PAGE_START_WRITEBACK) {
btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);
btrfs_folio_clamp_set_writeback(fs_info, folio, start, len);
@@ -530,8 +529,6 @@ static void end_bbio_data_write(struct btrfs_bio *bbio)
u32 len = fi.length;
bio_size += len;
- ASSERT(btrfs_folio_test_ordered(fs_info, folio, start, len));
- btrfs_folio_clear_ordered(fs_info, folio, start, len);
btrfs_folio_clear_writeback(fs_info, folio, start, len);
}
@@ -729,9 +726,9 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
bio_end_sector(bio) == sector;
}
-static void alloc_new_bio(struct btrfs_inode *inode,
- struct btrfs_bio_ctrl *bio_ctrl,
- u64 disk_bytenr, u64 file_offset)
+static int alloc_new_bio(struct btrfs_inode *inode,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ u64 disk_bytenr, u64 file_offset)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_bio *bbio;
@@ -748,13 +745,25 @@ static void alloc_new_bio(struct btrfs_inode *inode,
if (bio_ctrl->wbc) {
struct btrfs_ordered_extent *ordered;
+ /* This must be a write for data inodes. */
+ ASSERT(btrfs_op(&bio_ctrl->bbio->bio) == BTRFS_MAP_WRITE);
+ ASSERT(is_data_inode(inode));
+
ordered = btrfs_lookup_ordered_extent(inode, file_offset);
- if (ordered) {
- bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
- ordered->file_offset +
- ordered->disk_num_bytes - file_offset);
- bbio->ordered = ordered;
+ if (unlikely(!ordered)) {
+ bio_ctrl->bbio = NULL;
+ bio_ctrl->next_file_offset = 0;
+ bio_put(&bbio->bio);
+ btrfs_err_rl(fs_info,
+ "root %lld ino %llu file offset %llu is marked dirty without notifying the fs",
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ file_offset);
+ return -EUCLEAN;
}
+ bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
+ ordered->file_offset +
+ ordered->disk_num_bytes - file_offset);
+ bbio->ordered = ordered;
/*
* Pick the last added device to support cgroup writeback. For
@@ -765,6 +774,7 @@ static void alloc_new_bio(struct btrfs_inode *inode,
bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
wbc_init_bio(bio_ctrl->wbc, &bbio->bio);
}
+ return 0;
}
/*
@@ -780,14 +790,19 @@ static void alloc_new_bio(struct btrfs_inode *inode,
* new one in @bio_ctrl->bbio.
* The mirror number for this IO should already be initialized in
* @bio_ctrl->mirror_num.
+ *
+ * Return the number of bytes that were queued into a bio.
+ * If the returned value is smaller than @size, we hit a critical error on a
+ * data write: there is no ordered extent for the range.
*/
-static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
- u64 disk_bytenr, struct folio *folio,
- size_t size, unsigned long pg_offset,
- u64 read_em_generation)
+static unsigned int submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
+ u64 disk_bytenr, struct folio *folio,
+ size_t size, unsigned long pg_offset,
+ u64 read_em_generation)
{
struct btrfs_inode *inode = folio_to_inode(folio);
loff_t file_offset = folio_pos(folio) + pg_offset;
+ unsigned int queued = 0;
ASSERT(pg_offset + size <= folio_size(folio));
ASSERT(bio_ctrl->end_io_func);
@@ -800,8 +815,13 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
u32 len = size;
/* Allocate new bio if needed */
- if (!bio_ctrl->bbio)
- alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset);
+ if (!bio_ctrl->bbio) {
+ int ret;
+
+ ret = alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset);
+ if (ret < 0)
+ break;
+ }
/* Cap to the current ordered extent boundary if there is one. */
if (len > bio_ctrl->len_to_oe_boundary) {
@@ -829,6 +849,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
pg_offset += len;
disk_bytenr += len;
file_offset += len;
+ queued += len;
/*
* len_to_oe_boundary defaults to U32_MAX, which isn't folio or
@@ -868,6 +889,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
submit_one_bio(bio_ctrl);
} while (size);
+ return queued;
}
static int attach_extent_buffer_folio(struct extent_buffer *eb,
@@ -932,6 +954,17 @@ void clear_folio_extent_mapped(struct folio *folio)
struct btrfs_fs_info *fs_info;
ASSERT(folio->mapping);
+ /*
+ * The folio should have neither the writeback nor the dirty flag set.
+ *
+ * If the dirty flag is set, the folio can be written back again and we
+ * expect the private flag to be set for the folio.
+ *
+ * If the writeback flag is set, the endio may need to use the
+ * folio private for btrfs_folio_state.
+ */
+ ASSERT(!folio_test_dirty(folio));
+ ASSERT(!folio_test_writeback(folio));
if (!folio_test_private(folio))
return;
@@ -1040,6 +1073,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
u64 disk_bytenr;
u64 block_start;
u64 em_gen;
+ unsigned int queued;
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
@@ -1153,8 +1187,10 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
if (force_bio_submit)
submit_one_bio(bio_ctrl);
- submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
- pg_offset, em_gen);
+ queued = submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
+ pg_offset, em_gen);
+ /* Read submission should not fail. */
+ ASSERT(queued == blocksize);
}
return 0;
}
@@ -1432,7 +1468,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
const u64 page_start = folio_pos(folio);
const u64 page_end = page_start + folio_size(folio) - 1;
const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
- unsigned long delalloc_bitmap = 0;
+ unsigned long delalloc_bitmap[BITS_TO_LONGS(BTRFS_MAX_BLOCKS_PER_FOLIO)] = { 0 };
/*
* Save the last found delalloc end. As the delalloc end can go beyond
* page boundary, thus we cannot rely on subpage bitmap to locate the
@@ -1457,14 +1493,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
int ret = 0;
/* Save the dirty bitmap as our submission bitmap will be a subset of it. */
- if (btrfs_is_subpage(fs_info, folio)) {
- ASSERT(blocks_per_folio > 1);
- btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap);
- } else {
- bio_ctrl->submit_bitmap = 1;
- }
+ btrfs_copy_subpage_dirty_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
- for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap,
+ for_each_set_bitrange(start_bit, end_bit, bio_ctrl->submit_bitmap,
blocks_per_folio) {
u64 start = page_start + (start_bit << fs_info->sectorsize_bits);
u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits;
@@ -1480,7 +1511,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
delalloc_start = delalloc_end + 1;
continue;
}
- set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start,
+ set_delalloc_bitmap(folio, delalloc_bitmap, delalloc_start,
min(delalloc_end, page_end) + 1 - delalloc_start);
last_delalloc_end = delalloc_end;
delalloc_start = delalloc_end + 1;
@@ -1506,7 +1537,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
found_len = last_delalloc_end + 1 - found_start;
found = true;
} else {
- found = find_next_delalloc_bitmap(folio, &delalloc_bitmap,
+ found = find_next_delalloc_bitmap(folio, delalloc_bitmap,
delalloc_start, &found_start, &found_len);
}
if (!found)
@@ -1540,7 +1571,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
btrfs_ino(inode),
folio_pos(folio),
blocks_per_folio,
- &bio_ctrl->submit_bitmap,
+ bio_ctrl->submit_bitmap,
found_start, found_len, ret);
} else {
/*
@@ -1565,7 +1596,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
fs_info->sectorsize_bits;
unsigned int end_bit = (min(page_end + 1, found_start + found_len) -
page_start) >> fs_info->sectorsize_bits;
- bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit);
+ bitmap_clear(bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit);
}
/*
* Above btrfs_run_delalloc_range() may have unlocked the folio,
@@ -1586,12 +1617,11 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
fs_info->sectorsize_bits,
blocks_per_folio);
- for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap,
+ for_each_set_bitrange(start_bit, end_bit, bio_ctrl->submit_bitmap,
bitmap_size) {
u64 start = page_start + (start_bit << fs_info->sectorsize_bits);
u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits;
- btrfs_folio_clear_ordered(fs_info, folio, start, len);
btrfs_mark_ordered_io_finished(inode, start, len, false);
}
return ret;
@@ -1612,7 +1642,7 @@ out:
* If all ranges are submitted asynchronously, we just need to account
* for them here.
*/
- if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) {
+ if (bitmap_empty(bio_ctrl->submit_bitmap, blocks_per_folio)) {
wbc->nr_to_write -= delalloc_to_write;
return 1;
}
@@ -1647,6 +1677,7 @@ static int submit_one_sector(struct btrfs_inode *inode,
u64 extent_offset;
u64 em_end;
const u32 sectorsize = fs_info->sectorsize;
+ unsigned int queued;
ASSERT(IS_ALIGNED(filepos, sectorsize));
@@ -1668,7 +1699,6 @@ static int submit_one_sector(struct btrfs_inode *inode,
* ordered extent.
*/
btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
- btrfs_folio_clear_ordered(fs_info, folio, filepos, sectorsize);
btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
@@ -1713,8 +1743,14 @@ static int submit_one_sector(struct btrfs_inode *inode,
*/
ASSERT(folio_test_writeback(folio));
- submit_extent_folio(bio_ctrl, disk_bytenr, folio,
- sectorsize, filepos - folio_pos(folio), 0);
+ queued = submit_extent_folio(bio_ctrl, disk_bytenr, folio,
+ sectorsize, filepos - folio_pos(folio), 0);
+ if (unlikely(queued < sectorsize)) {
+ btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
+ btrfs_mark_ordered_io_finished(inode, filepos, fs_info->sectorsize,
+ false);
+ return -EUCLEAN;
+ }
return 0;
}
@@ -1733,7 +1769,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
loff_t i_size)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- unsigned long range_bitmap = 0;
bool submitted_io = false;
int found_error = 0;
const u64 end = start + len;
@@ -1748,28 +1783,18 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
ASSERT(end <= folio_end, "start=%llu len=%u folio_start=%llu folio_size=%zu",
start, len, folio_start, folio_size(folio));
- ret = btrfs_writepage_cow_fixup(folio);
- if (ret == -EAGAIN) {
- /* Fixup worker will requeue */
- folio_redirty_for_writepage(bio_ctrl->wbc, folio);
- folio_unlock(folio);
- return 1;
- }
- if (ret < 0) {
- btrfs_folio_clear_dirty(fs_info, folio, start, len);
- btrfs_folio_set_writeback(fs_info, folio, start, len);
- btrfs_folio_clear_writeback(fs_info, folio, start, len);
- return ret;
- }
-
- bitmap_set(&range_bitmap, (start - folio_pos(folio)) >> fs_info->sectorsize_bits,
- len >> fs_info->sectorsize_bits);
- bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap,
- blocks_per_folio);
+ /* Truncate the submit bitmap to the current range. */
+ if (start > folio_start)
+ bitmap_clear(bio_ctrl->submit_bitmap, 0,
+ (start - folio_start) >> fs_info->sectorsize_bits);
+ if (start + len < folio_end)
+ bitmap_clear(bio_ctrl->submit_bitmap,
+ (end - folio_start) >> fs_info->sectorsize_bits,
+ (folio_end - end) >> fs_info->sectorsize_bits);
bio_ctrl->end_io_func = end_bbio_data_write;
- for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
+ for_each_set_bit(bit, bio_ctrl->submit_bitmap, blocks_per_folio) {
cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);
if (cur >= i_size) {
@@ -1789,7 +1814,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
spin_unlock(&inode->ordered_tree_lock);
btrfs_put_ordered_extent(ordered);
- btrfs_folio_clear_ordered(fs_info, folio, cur, fs_info->sectorsize);
btrfs_mark_ordered_io_finished(inode, cur, fs_info->sectorsize, true);
/*
* This range is beyond i_size, thus we don't need to
@@ -1829,6 +1853,25 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
return found_error;
}
+/*
+ * Set bits [0, blocks_per_folio) of @bio_ctrl->submit_bitmap, where
+ * blocks_per_folio is btrfs_blocks_per_folio() for @folio.
+ *
+ * Bits at or beyond blocks_per_folio are left untouched by this helper.
+ */
+static void bio_ctrl_init_submit_bitmap(struct btrfs_fs_info *fs_info,
+ struct folio *folio,
+ struct btrfs_bio_ctrl *bio_ctrl)
+{
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
+
+ /* The bitmap array is sized for BTRFS_MAX_BLOCKS_PER_FOLIO bits. */
+ ASSERT(blocks_per_folio <= BTRFS_MAX_BLOCKS_PER_FOLIO);
+
+ /*
+ * Default to unlock the whole folio.
+ * The proper bitmap is not initialized until writepage_delalloc().
+ *
+ * We're safe just to set the bitmap range [0, blocks_per_folio), as
+ * all later usage of the bitmap will follow the same range limit.
+ * Any bits beyond blocks_per_folio will be ignored.
+ */
+ bitmap_set(bio_ctrl->submit_bitmap, 0, blocks_per_folio);
+}
+
/*
* the writepage semantics are similar to regular writepage. extent
* records are inserted to lock ranges in the tree, and as dirty areas
@@ -1863,12 +1906,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
if (folio_contains(folio, end_index))
folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset);
- /*
- * Default to unlock the whole folio.
- * The proper bitmap can only be initialized until writepage_delalloc().
- */
- bio_ctrl->submit_bitmap = (unsigned long)-1;
-
+ bio_ctrl_init_submit_bitmap(fs_info, folio, bio_ctrl);
/*
* If the page is dirty but without private set, it's marked dirty
* without informing the fs.
@@ -1877,13 +1915,9 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
*
* So here we check if the page has private set to rule out such
* case.
- * But we also have a long history of relying on the COW fixup,
- * so here we only enable this check for experimental builds until
- * we're sure it's safe.
*/
- if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) &&
- unlikely(!folio_test_private(folio))) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ if (unlikely(!folio_test_private(folio))) {
+ DEBUG_WARN();
btrfs_err_rl(fs_info,
"root %lld ino %llu folio %llu is marked dirty without notifying the fs",
btrfs_root_id(inode->root),
@@ -1911,7 +1945,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d",
btrfs_root_id(inode->root), btrfs_ino(inode),
folio_pos(folio), blocks_per_folio,
- &bio_ctrl->submit_bitmap, ret);
+ bio_ctrl->submit_bitmap, ret);
bio_ctrl->wbc->nr_to_write--;
@@ -2554,7 +2588,7 @@ retry:
}
if (folio_test_writeback(folio) ||
- !folio_clear_dirty_for_io(folio)) {
+ !folio_test_dirty(folio)) {
folio_unlock(folio);
continue;
}
@@ -2658,7 +2692,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
* Set the submission bitmap to submit all sectors.
* extent_writepage_io() will do the truncation correctly.
*/
- bio_ctrl.submit_bitmap = (unsigned long)-1;
+ bio_ctrl_init_submit_bitmap(fs_info, folio, &bio_ctrl);
ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len,
&bio_ctrl, i_size);
if (ret == 1)
@@ -2727,38 +2761,6 @@ void btrfs_readahead(struct readahead_control *rac)
}
/*
- * basic invalidate_folio code, this waits on any locked or writeback
- * ranges corresponding to the folio, and then deletes any extent state
- * records from the tree
- */
-int extent_invalidate_folio(struct extent_io_tree *tree,
- struct folio *folio, size_t offset)
-{
- struct extent_state *cached_state = NULL;
- u64 start = folio_pos(folio);
- u64 end = start + folio_size(folio) - 1;
- size_t blocksize = folio_to_fs_info(folio)->sectorsize;
-
- /* This function is only called for the btree inode */
- ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
-
- start += ALIGN(offset, blocksize);
- if (start > end)
- return 0;
-
- btrfs_lock_extent(tree, start, end, &cached_state);
- folio_wait_writeback(folio);
-
- /*
- * Currently for btree io tree, only EXTENT_LOCKED is utilized,
- * so here we only need to unlock the extent range to free any
- * existing extent state.
- */
- btrfs_unlock_extent(tree, start, end, &cached_state);
- return 0;
-}
-
-/*
* A helper for struct address_space_operations::release_folio, this tests for
* areas of the folio that are locked or under IO and drops the related state
* bits if it is safe to drop the folio.
@@ -2887,12 +2889,6 @@ next:
return try_release_extent_state(io_tree, folio);
}
-static int extent_buffer_under_io(const struct extent_buffer *eb)
-{
- return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
- test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
-}
-
static bool folio_range_has_eb(struct folio *folio)
{
struct btrfs_folio_state *bfs;
@@ -3404,8 +3400,8 @@ retry:
finish:
spin_lock(&mapping->i_private_lock);
if (existing_folio && btrfs_meta_is_subpage(fs_info)) {
- /* We're going to reuse the existing page, can drop our folio now. */
- __free_page(folio_page(eb->folios[i], 0));
+ /* We're going to reuse the existing folio, can drop our folio now. */
+ folio_put(eb->folios[i]);
eb->folios[i] = existing_folio;
} else if (existing_folio) {
struct extent_buffer *existing_eb;
@@ -3420,7 +3416,7 @@ finish:
return 1;
}
/* The extent buffer no longer exists, we can reuse the folio. */
- __free_page(folio_page(eb->folios[i], 0));
+ folio_put(eb->folios[i]);
eb->folios[i] = existing_folio;
}
eb->folio_size = folio_size(eb->folios[i]);
@@ -3749,17 +3745,6 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
release_extent_buffer(eb);
}
-static void btree_clear_folio_dirty_tag(struct folio *folio)
-{
- ASSERT(!folio_test_dirty(folio));
- ASSERT(folio_test_locked(folio));
- xa_lock_irq(&folio->mapping->i_pages);
- if (!folio_test_dirty(folio))
- __xa_clear_mark(&folio->mapping->i_pages, folio->index,
- PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(&folio->mapping->i_pages);
-}
-
void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
@@ -3800,7 +3785,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
folio_lock(folio);
last = btrfs_meta_folio_clear_and_test_dirty(folio, eb);
if (last)
- btree_clear_folio_dirty_tag(folio);
+ btrfs_clear_folio_dirty_tag(folio);
folio_unlock(folio);
}
WARN_ON(refcount_read(&eb->refs) == 0);
@@ -3991,15 +3976,14 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
return 0;
}
-static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
- unsigned long len)
+/* Never inlined to decrease code size, as this is called in a cold path. */
+static noinline void report_eb_range(const struct extent_buffer *eb,
+ unsigned long start, unsigned long len)
{
btrfs_warn(eb->fs_info,
"access to eb bytenr %llu len %u out of range start %lu len %lu",
eb->start, eb->len, start, len);
DEBUG_WARN();
-
- return true;
}
/*
@@ -4009,14 +3993,16 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
*
* Caller should not touch the dst/src memory if this function returns error.
*/
-static inline int check_eb_range(const struct extent_buffer *eb,
- unsigned long start, unsigned long len)
+static inline bool check_eb_range(const struct extent_buffer *eb,
+ unsigned long start, unsigned long len)
{
unsigned long offset;
/* start, start + len should not go beyond eb->len nor overflow */
- if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
- return report_eb_range(eb, start, len);
+ if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) {
+ report_eb_range(eb, start, len);
+ return true;
+ }
return false;
}
@@ -4670,7 +4656,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
if (IS_ERR(eb))
return;
- if (btrfs_buffer_uptodate(eb, gen, NULL)) {
+ if (btrfs_buffer_uptodate(eb, gen, &check)) {
free_extent_buffer(eb);
return;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b310a5145cf69..8c58b114f5b3e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -55,7 +55,6 @@ enum {
/* Page starts writeback, clear dirty bit and set writeback bit */
ENUM_BIT(PAGE_START_WRITEBACK),
ENUM_BIT(PAGE_END_WRITEBACK),
- ENUM_BIT(PAGE_SET_ORDERED),
};
/*
@@ -327,6 +326,12 @@ static inline bool extent_buffer_uptodate(const struct extent_buffer *eb)
return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
}
+/*
+ * Return true if @eb has EXTENT_BUFFER_WRITEBACK or EXTENT_BUFFER_DIRTY set
+ * in its bflags.
+ */
+static inline bool extent_buffer_under_io(const struct extent_buffer *eb)
+{
+ return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+ test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+}
+
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len);
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
@@ -381,11 +386,20 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
const struct folio *locked_folio,
struct extent_state **cached,
u32 bits_to_clear, unsigned long page_ops);
-int extent_invalidate_folio(struct extent_io_tree *tree,
- struct folio *folio, size_t offset);
void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *buf);
+/*
+ * Clear PAGECACHE_TAG_DIRTY for @folio in its mapping's i_pages xarray.
+ *
+ * The folio must be locked, must not be dirty and must still have a mapping;
+ * all three conditions are asserted. The mark is cleared while holding the
+ * i_pages lock via xa_lock_irq().
+ */
+static inline void btrfs_clear_folio_dirty_tag(struct folio *folio)
+{
+ ASSERT(!folio_test_dirty(folio));
+ ASSERT(folio_test_locked(folio));
+ ASSERT(folio->mapping);
+ xa_lock_irq(&folio->mapping->i_pages);
+ __xa_clear_mark(&folio->mapping->i_pages, folio->index,
+ PAGECACHE_TAG_DIRTY);
+ xa_unlock_irq(&folio->mapping->i_pages);
+}
+
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
bool nofail);
int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6b79bff241f21..fce9c5cc01228 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -717,7 +717,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
* file offset. Here just do a sanity check.
*/
if (em->disk_bytenr == EXTENT_MAP_INLINE)
- ASSERT(em->start == 0);
+ ASSERT(em->start == 0, "em->start=%llu", em->start);
ret = add_extent_mapping(inode, em, false);
/* it is possible that someone inserted the extent into the tree
@@ -761,7 +761,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
}
}
- ASSERT(ret == 0 || ret == -EEXIST);
+ ASSERT(ret == 0 || ret == -EEXIST, "ret=%d", ret);
return ret;
}
@@ -943,7 +943,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
ret = add_extent_mapping(inode, split, modified);
/* Logic error, shouldn't happen. */
- ASSERT(ret == 0);
+ ASSERT(ret == 0, "ret=%d", ret);
if (WARN_ON(ret != 0) && modified)
btrfs_set_inode_full_sync(inode);
}
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
index 27d361c7adc4d..6263e837093e4 100644
--- a/fs/btrfs/fiemap.c
+++ b/fs/btrfs/fiemap.c
@@ -112,7 +112,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
u64 cache_end;
/* Set at the end of extent_fiemap(). */
- ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
+ ASSERT((flags & FIEMAP_EXTENT_LAST) == 0, "flags=0x%x", flags);
if (!cache->cached)
goto assign;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d72249390030f..82ae4a2afd347 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -325,7 +325,9 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
csum_start = key.offset;
csum_len = (itemsize / csum_size) * sectorsize;
- ASSERT(in_range(disk_bytenr, csum_start, csum_len));
+ ASSERT(in_range(disk_bytenr, csum_start, csum_len),
+ "disk_bytenr=%llu csum_start=%llu csum_len=%llu",
+ disk_bytenr, csum_start, csum_len);
found:
ret = (min(csum_start + csum_len, disk_bytenr + len) -
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8c171ed07008b..d786b9666755a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -49,14 +49,6 @@ static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
ASSERT(block_len <= U32_MAX);
- /*
- * Folio checked is some magic around finding folios that have been
- * modified without going through btrfs_dirty_folio(). Clear it here.
- * There should be no need to mark the pages accessed as
- * prepare_one_folio() should have marked them accessed in
- * prepare_one_folio() via find_or_create_page()
- */
- btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
folio_unlock(folio);
folio_put(folio);
}
@@ -65,7 +57,7 @@ static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
* After copy_folio_from_iter_atomic(), update the following things for delalloc:
* - Mark newly dirtied folio as DELALLOC in the io tree.
* Used to advise which range is to be written back.
- * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
+ * - Mark modified folio as Uptodate/Dirty
* - Update inode size for past EOF write
*/
int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
@@ -93,21 +85,12 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos
end_of_last_block = start_pos + num_bytes - 1;
- /*
- * The pages may have already been dirty, clear out old accounting so
- * we can set things up properly
- */
- btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- cached);
-
- ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
- extra_bits, cached);
+ ret = btrfs_reset_extent_delalloc(inode, start_pos, end_of_last_block,
+ extra_bits, cached);
if (ret)
return ret;
btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
- btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);
/*
@@ -1232,8 +1215,11 @@ static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter,
return ret;
reserved_len = ret;
/* Write range must be inside the reserved range. */
- ASSERT(reserved_start <= start);
- ASSERT(start + write_bytes <= reserved_start + reserved_len);
+ ASSERT(reserved_start <= start, "reserved_start=%llu start=%llu",
+ reserved_start, start);
+ ASSERT(start + write_bytes <= reserved_start + reserved_len,
+ "start=%llu write_bytes=%zu reserved_start=%llu reserved_len=%llu",
+ start, write_bytes, reserved_start, reserved_len);
again:
ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping,
@@ -1578,7 +1564,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
btrfs_assert_inode_locked(inode);
}
- trace_btrfs_sync_file(file, datasync);
+ trace_btrfs_sync_file_enter(file, datasync);
btrfs_init_log_ctx(&ctx, inode);
@@ -1702,14 +1688,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* reason, it's no longer relevant.
*/
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
- /*
- * An ordered extent might have started before and completed
- * already with io errors, in which case the inode was not
- * updated and we end up here. So check the inode's mapping
- * for any errors that might have happened since we last
- * checked called fsync.
- */
- ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
goto out_release_extents;
}
@@ -1824,10 +1802,17 @@ out:
free_extent_buffer(ctx.scratch_eb);
ASSERT(list_empty(&ctx.list));
ASSERT(list_empty(&ctx.conflict_inodes));
+ ASSERT(ret <= 0, "ret=%d", ret);
+ /*
+ * Ordered extents might have started and completed before this fsync,
+ * so check for any io errors and advance the writeback error sequence.
+ */
err = file_check_and_advance_wb_err(file);
if (!ret)
ret = err;
- return ret > 0 ? -EIO : ret;
+ trace_btrfs_sync_file_exit(file, ret);
+
+ return ret;
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
@@ -1966,18 +1951,7 @@ again:
}
}
- /*
- * page_mkwrite gets called when the page is firstly dirtied after it's
- * faulted in, but write(2) could also dirty a page and set delalloc
- * bits, thus in this case for space account reason, we still need to
- * clear any delalloc bits within this page range since we have to
- * reserve data&meta space before lock_page() (see above comments).
- */
- btrfs_clear_extent_bit(io_tree, page_start, end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, &cached_state);
-
- ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state);
+ ret = btrfs_reset_extent_delalloc(inode, page_start, end, 0, &cached_state);
if (ret < 0) {
btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
goto out_unlock;
@@ -1992,7 +1966,6 @@ again:
if (zero_start != fsize)
folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
- btrfs_folio_clear_checked(fs_info, folio, page_start, fsize);
btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
@@ -2098,6 +2071,10 @@ static int fill_holes(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item *fi;
struct extent_map *hole_em;
struct btrfs_key key;
+ int modify_slot = -1;
+ int del_slot = -1;
+ bool update_offset = false;
+ u64 num_bytes = 0;
int ret;
if (btrfs_fs_incompat(fs_info, NO_HOLES))
@@ -2107,7 +2084,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = offset;
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret <= 0) {
/*
* We should have dropped this offset, so if we find it then
@@ -2120,33 +2097,44 @@ static int fill_holes(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
- u64 num_bytes;
-
- path->slots[0]--;
- fi = btrfs_item_ptr(leaf, path->slots[0],
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
end - offset;
- btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
- btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
- btrfs_set_file_extent_offset(leaf, fi, 0);
- btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- goto out;
+ modify_slot = path->slots[0] - 1;
}
-
if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
- u64 num_bytes;
-
- key.offset = offset;
- btrfs_set_item_key_safe(trans, path, &key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
- offset;
+ if (modify_slot != -1) {
+ num_bytes += btrfs_file_extent_num_bytes(leaf, fi);
+ del_slot = path->slots[0];
+ } else {
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
+ end - offset;
+ modify_slot = path->slots[0];
+ update_offset = true;
+ }
+ }
+ if (modify_slot >= 0) {
+ fi = btrfs_item_ptr(leaf, modify_slot,
+ struct btrfs_file_extent_item);
btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+ if (update_offset) {
+ key.offset = offset;
+ btrfs_set_item_key_safe(trans, path, &key);
+ }
btrfs_set_file_extent_offset(leaf, fi, 0);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ if (del_slot >= 0) {
+ ret = btrfs_del_items(trans, root, path, del_slot, 1);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_release_path(path);
+ return ret;
+ }
+ }
goto out;
}
btrfs_release_path(path);
@@ -2407,7 +2395,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
+ const u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
struct btrfs_block_rsv rsv;
@@ -2420,7 +2408,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
return -EINVAL;
btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
- rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1);
+ rsv.size = min_size;
rsv.failfast = true;
/*
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ab22e4f9ffdde..2354cb6fce369 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -433,10 +433,6 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
for (i = 0; i < io_ctl->num_pages; i++) {
if (io_ctl->pages[i]) {
- btrfs_folio_clear_checked(io_ctl->fs_info,
- page_folio(io_ctl->pages[i]),
- page_offset(io_ctl->pages[i]),
- PAGE_SIZE);
unlock_page(io_ctl->pages[i]);
put_page(io_ctl->pages[i]);
}
@@ -690,11 +686,12 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
struct btrfs_block_group *block_group = ctl->block_group;
+ const int unit = block_group->fs_info->sectorsize;
u64 max_bytes;
u64 bitmap_bytes;
u64 extent_bytes;
u64 size = block_group->length;
- u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
+ u64 bytes_per_bg = BITS_PER_BITMAP * unit;
u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
max_bitmaps = max_t(u64, max_bitmaps, 1);
@@ -703,7 +700,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
btrfs_err(block_group->fs_info,
"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
block_group->start, block_group->length,
- ctl->total_bitmaps, ctl->unit, max_bitmaps,
+ ctl->total_bitmaps, unit, max_bitmaps,
bytes_per_bg);
ASSERT(ctl->total_bitmaps <= max_bitmaps);
@@ -718,7 +715,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
else
max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
- bitmap_bytes = ctl->total_bitmaps * ctl->unit;
+ bitmap_bytes = ctl->total_bitmaps * unit;
/*
* we want the extent entry threshold to always be at most 1/2 the max
@@ -896,8 +893,7 @@ free_cache:
goto out;
}
-static int copy_free_space_cache(struct btrfs_block_group *block_group,
- struct btrfs_free_space_ctl *ctl)
+static int copy_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
struct btrfs_free_space *info;
struct rb_node *n;
@@ -912,17 +908,17 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group,
unlink_free_space(ctl, info, true);
spin_unlock(&ctl->tree_lock);
kmem_cache_free(btrfs_free_space_cachep, info);
- ret = btrfs_add_free_space(block_group, offset, bytes);
+ ret = btrfs_add_free_space(ctl->block_group, offset, bytes);
spin_lock(&ctl->tree_lock);
} else {
u64 offset = info->offset;
- u64 bytes = ctl->unit;
+ u64 bytes = ctl->block_group->fs_info->sectorsize;
ret = search_bitmap(ctl, info, &offset, &bytes, false);
if (ret == 0) {
bitmap_clear_bits(ctl, info, offset, bytes, true);
spin_unlock(&ctl->tree_lock);
- ret = btrfs_add_free_space(block_group, offset,
+ ret = btrfs_add_free_space(ctl->block_group, offset,
bytes);
spin_lock(&ctl->tree_lock);
} else {
@@ -1025,7 +1021,7 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
if (matched) {
spin_lock(&tmp_ctl.tree_lock);
- ret = copy_free_space_cache(block_group, &tmp_ctl);
+ ret = copy_free_space_cache(&tmp_ctl);
spin_unlock(&tmp_ctl.tree_lock);
/*
* ret == 1 means we successfully loaded the free space cache,
@@ -1068,12 +1064,12 @@ out:
static noinline_for_stack
int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
- struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group *block_group,
int *entries, int *bitmaps,
struct list_head *bitmap_list)
{
int ret;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_cluster *cluster = NULL;
struct btrfs_free_cluster *cluster_locked = NULL;
struct rb_node *node = rb_first(&ctl->free_space_offset);
@@ -1367,10 +1363,10 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
* or an errno if it was not.
*/
static int __btrfs_write_out_cache(struct inode *inode,
- struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group *block_group,
struct btrfs_trans_handle *trans)
{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_io_ctl *io_ctl = &block_group->io_ctl;
struct extent_state *cached_state = NULL;
LIST_HEAD(bitmap_list);
@@ -1416,8 +1412,7 @@ static int __btrfs_write_out_cache(struct inode *inode,
mutex_lock(&ctl->cache_writeout_mutex);
/* Write out the extent entries in the free space cache */
spin_lock(&ctl->tree_lock);
- ret = write_cache_extent_entries(io_ctl, ctl,
- block_group, &entries, &bitmaps,
+ ret = write_cache_extent_entries(io_ctl, block_group, &entries, &bitmaps,
&bitmap_list);
if (ret)
goto out_nospc_locked;
@@ -1516,7 +1511,6 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct inode *inode;
int ret = 0;
@@ -1531,7 +1525,7 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
if (IS_ERR(inode))
return 0;
- ret = __btrfs_write_out_cache(inode, ctl, block_group, trans);
+ ret = __btrfs_write_out_cache(inode, block_group, trans);
if (ret) {
btrfs_debug(fs_info,
"failed to write free space cache for block group %llu error %d",
@@ -1571,11 +1565,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
u64 bitmap_start;
u64 bytes_per_bitmap;
- bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
- bitmap_start = offset - ctl->start;
+ bytes_per_bitmap = BITS_PER_BITMAP * ctl->block_group->fs_info->sectorsize;
+ bitmap_start = offset - ctl->block_group->start;
bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
bitmap_start *= bytes_per_bitmap;
- bitmap_start += ctl->start;
+ bitmap_start += ctl->block_group->start;
return bitmap_start;
}
@@ -1702,6 +1696,7 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
{
struct rb_node *n = ctl->free_space_offset.rb_node;
struct btrfs_free_space *entry = NULL, *prev = NULL;
+ const int unit = ctl->block_group->fs_info->sectorsize;
lockdep_assert_held(&ctl->tree_lock);
@@ -1785,7 +1780,7 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
prev->offset + prev->bytes > offset)
return prev;
}
- if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
+ if (entry->offset + BITS_PER_BITMAP * unit > offset)
return entry;
} else if (entry->offset + entry->bytes > offset)
return entry;
@@ -1799,8 +1794,7 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
return NULL;
entry = rb_entry(n, struct btrfs_free_space, offset_index);
if (entry->bitmap) {
- if (entry->offset + BITS_PER_BITMAP *
- ctl->unit > offset)
+ if (entry->offset + BITS_PER_BITMAP * unit > offset)
break;
} else {
if (entry->offset + entry->bytes > offset)
@@ -1875,18 +1869,19 @@ static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info,
u64 offset, u64 bytes, bool update_stat)
{
+ const int unit = ctl->block_group->fs_info->sectorsize;
unsigned long start, count, end;
int extent_delta = -1;
- start = offset_to_bit(info->offset, ctl->unit, offset);
- count = bytes_to_bits(bytes, ctl->unit);
+ start = offset_to_bit(info->offset, unit, offset);
+ count = bytes_to_bits(bytes, unit);
end = start + count;
ASSERT(end <= BITS_PER_BITMAP);
bitmap_clear(info->bitmap, start, count);
info->bytes -= bytes;
- if (info->max_extent_size > ctl->unit)
+ if (info->max_extent_size > unit)
info->max_extent_size = 0;
relink_bitmap_entry(ctl, info);
@@ -1911,11 +1906,12 @@ static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes)
{
+ const int unit = ctl->block_group->fs_info->sectorsize;
unsigned long start, count, end;
int extent_delta = 1;
- start = offset_to_bit(info->offset, ctl->unit, offset);
- count = bytes_to_bits(bytes, ctl->unit);
+ start = offset_to_bit(info->offset, unit, offset);
+ count = bytes_to_bits(bytes, unit);
end = start + count;
ASSERT(end <= BITS_PER_BITMAP);
@@ -1952,6 +1948,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info, u64 *offset,
u64 *bytes, bool for_alloc)
{
+ const int unit = ctl->block_group->fs_info->sectorsize;
unsigned long found_bits = 0;
unsigned long max_bits = 0;
unsigned long bits, i;
@@ -1969,9 +1966,9 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
return -1;
}
- i = offset_to_bit(bitmap_info->offset, ctl->unit,
+ i = offset_to_bit(bitmap_info->offset, unit,
max_t(u64, *offset, bitmap_info->offset));
- bits = bytes_to_bits(*bytes, ctl->unit);
+ bits = bytes_to_bits(*bytes, unit);
for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
if (for_alloc && bits == 1) {
@@ -1991,12 +1988,12 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
}
if (found_bits) {
- *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
- *bytes = (u64)(found_bits) * ctl->unit;
+ *offset = (u64)(i * unit) + bitmap_info->offset;
+ *bytes = (u64)(found_bits) * unit;
return 0;
}
- *bytes = (u64)(max_bits) * ctl->unit;
+ *bytes = (u64)(max_bits) * unit;
bitmap_info->max_extent_size = *bytes;
relink_bitmap_entry(ctl, bitmap_info);
return -1;
@@ -2054,9 +2051,9 @@ again:
* to match our requested alignment
*/
if (*bytes >= align) {
- tmp = entry->offset - ctl->start + align - 1;
+ tmp = entry->offset - ctl->block_group->start + align - 1;
tmp = div64_u64(tmp, align);
- tmp = tmp * align + ctl->start;
+ tmp = tmp * align + ctl->block_group->start;
align_off = tmp - entry->offset;
} else {
align_off = 0;
@@ -2148,12 +2145,13 @@ static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info,
u64 *offset, u64 *bytes)
{
+ const int unit = ctl->block_group->fs_info->sectorsize;
u64 end;
u64 search_start, search_bytes;
int ret;
again:
- end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
+ end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * unit) - 1;
/*
* We need to search for bits in this bitmap. We could only cover some
@@ -2162,7 +2160,7 @@ again:
* go searching for the next bit.
*/
search_start = *offset;
- search_bytes = ctl->unit;
+ search_bytes = unit;
search_bytes = min(search_bytes, end - search_start + 1);
ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
false);
@@ -2208,7 +2206,7 @@ again:
* everything over again.
*/
search_start = *offset;
- search_bytes = ctl->unit;
+ search_bytes = unit;
ret = search_bitmap(ctl, bitmap_info, &search_start,
&search_bytes, false);
if (ret < 0 || search_start != *offset)
@@ -2225,6 +2223,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes, enum btrfs_trim_state trim_state)
{
+ const int unit = ctl->block_group->fs_info->sectorsize;
u64 bytes_to_set = 0;
u64 end;
@@ -2241,7 +2240,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
}
- end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
+ end = info->offset + (u64)(BITS_PER_BITMAP * unit);
bytes_to_set = min(end - offset, bytes);
@@ -2251,7 +2250,8 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
}
-static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
+EXPORT_FOR_TESTS
+bool btrfs_use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
struct btrfs_block_group *block_group = ctl->block_group;
@@ -2295,21 +2295,17 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
* so allow those block groups to still be allowed to have a bitmap
* entry.
*/
- if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
+ if (((BITS_PER_BITMAP * fs_info->sectorsize) >> 1) > block_group->length)
return false;
return true;
}
-static const struct btrfs_free_space_op free_space_op = {
- .use_bitmap = use_bitmap,
-};
-
static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
struct btrfs_free_space *bitmap_info;
- struct btrfs_block_group *block_group = NULL;
+ struct btrfs_block_group *block_group = ctl->block_group;
int added = 0;
u64 bytes, offset, bytes_added;
enum btrfs_trim_state trim_state;
@@ -2319,18 +2315,20 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
offset = info->offset;
trim_state = info->trim_state;
- if (!ctl->op->use_bitmap(ctl, info))
- return 0;
-
- if (ctl->op == &free_space_op)
- block_group = ctl->block_group;
+ if (btrfs_is_testing(block_group->fs_info)) {
+ if (!block_group->fs_info->use_bitmap(ctl, info))
+ return 0;
+ } else {
+ if (!btrfs_use_bitmap(ctl, info))
+ return 0;
+ }
again:
/*
* Since we link bitmaps right into the cluster we need to see if we
* have a cluster here, and if so and it has our bitmap we need to add
* the free space to that bitmap.
*/
- if (block_group && !list_empty(&block_group->cluster_list)) {
+ if (!list_empty(&block_group->cluster_list)) {
struct btrfs_free_cluster *cluster;
struct rb_node *node;
struct btrfs_free_space *entry;
@@ -2494,6 +2492,7 @@ static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info,
bool update_stat)
{
+ const int unit = ctl->block_group->fs_info->sectorsize;
struct btrfs_free_space *bitmap;
unsigned long i;
unsigned long j;
@@ -2505,11 +2504,11 @@ static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
if (!bitmap)
return false;
- i = offset_to_bit(bitmap->offset, ctl->unit, end);
+ i = offset_to_bit(bitmap->offset, unit, end);
j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
if (j == i)
return false;
- bytes = (j - i) * ctl->unit;
+ bytes = (j - i) * unit;
info->bytes += bytes;
/* See try_merge_free_space() comment. */
@@ -2528,6 +2527,7 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info,
bool update_stat)
{
+ const int unit = ctl->block_group->fs_info->sectorsize;
struct btrfs_free_space *bitmap;
u64 bitmap_offset;
unsigned long i;
@@ -2547,7 +2547,7 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
if (!bitmap)
return false;
- i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
+ i = offset_to_bit(bitmap->offset, unit, info->offset) - 1;
j = 0;
prev_j = (unsigned long)-1;
for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
@@ -2559,9 +2559,9 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
return false;
if (prev_j == (unsigned long)-1)
- bytes = (i + 1) * ctl->unit;
+ bytes = (i + 1) * unit;
else
- bytes = (i - prev_j) * ctl->unit;
+ bytes = (i - prev_j) * unit;
info->offset -= bytes;
info->bytes += bytes;
@@ -2947,13 +2947,8 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
struct btrfs_free_space_ctl *ctl)
{
- struct btrfs_fs_info *fs_info = block_group->fs_info;
-
spin_lock_init(&ctl->tree_lock);
- ctl->unit = fs_info->sectorsize;
- ctl->start = block_group->start;
ctl->block_group = block_group;
- ctl->op = &free_space_op;
ctl->free_space_bytes = RB_ROOT_CACHED;
INIT_LIST_HEAD(&ctl->trimming_ranges);
mutex_init(&ctl->cache_writeout_mutex);
@@ -3327,6 +3322,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ const int unit = block_group->fs_info->sectorsize;
unsigned long next_zero;
unsigned long i;
unsigned long want_bits;
@@ -3339,10 +3335,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
lockdep_assert_held(&ctl->tree_lock);
- i = offset_to_bit(entry->offset, ctl->unit,
+ i = offset_to_bit(entry->offset, unit,
max_t(u64, offset, entry->offset));
- want_bits = bytes_to_bits(bytes, ctl->unit);
- min_bits = bytes_to_bits(min_bytes, ctl->unit);
+ want_bits = bytes_to_bits(bytes, unit);
+ min_bits = bytes_to_bits(min_bytes, unit);
/*
* Don't bother looking for a cluster in this bitmap if it's heavily
@@ -3368,7 +3364,7 @@ again:
}
if (!found_bits) {
- entry->max_extent_size = (u64)max_bits * ctl->unit;
+ entry->max_extent_size = (u64)max_bits * unit;
return -ENOSPC;
}
@@ -3379,15 +3375,15 @@ again:
total_found += found_bits;
- if (cluster->max_size < found_bits * ctl->unit)
- cluster->max_size = found_bits * ctl->unit;
+ if (cluster->max_size < found_bits * unit)
+ cluster->max_size = found_bits * unit;
if (total_found < want_bits || cluster->max_size < cont1_bytes) {
i = next_zero + 1;
goto again;
}
- cluster->window_start = start * ctl->unit + entry->offset;
+ cluster->window_start = start * unit + entry->offset;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
@@ -3403,8 +3399,7 @@ again:
ret = tree_insert_offset(ctl, cluster, entry);
ASSERT(!ret); /* -EEXIST; Logic error */
- trace_btrfs_setup_cluster(block_group, cluster,
- total_found * ctl->unit, 1);
+ trace_btrfs_setup_cluster(block_group, cluster, total_found * unit, 1);
return 0;
}
@@ -4044,7 +4039,9 @@ static int trim_bitmaps(struct btrfs_block_group *block_group,
}
next:
if (next_bitmap) {
- offset += BITS_PER_BITMAP * ctl->unit;
+ const int unit = block_group->fs_info->sectorsize;
+
+ offset += BITS_PER_BITMAP * unit;
start = offset;
} else {
start += bytes;
@@ -4071,6 +4068,7 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ const int unit = block_group->fs_info->sectorsize;
int ret;
u64 rem = 0;
@@ -4091,7 +4089,7 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group,
goto out;
ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
- div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
+ div64_u64_rem(end, BITS_PER_BITMAP * unit, &rem);
/* If we ended in the middle of a bitmap, reset the trimming flag */
if (rem)
reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
@@ -4310,6 +4308,7 @@ int test_check_exists(struct btrfs_block_group *cache,
u64 offset, u64 bytes)
{
struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+ const int unit = cache->fs_info->sectorsize;
struct btrfs_free_space *info;
int ret = 0;
@@ -4329,7 +4328,7 @@ have_info:
struct btrfs_free_space *tmp;
bit_off = offset;
- bit_bytes = ctl->unit;
+ bit_bytes = unit;
ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
if (!ret) {
if (bit_off == offset) {
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 33fc3b2456485..53fe8e293af1d 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -74,28 +74,20 @@ enum {
};
struct btrfs_free_space_ctl {
- spinlock_t tree_lock;
struct rb_root free_space_offset;
struct rb_root_cached free_space_bytes;
- u64 free_space;
+ spinlock_t tree_lock;
int extents_thresh;
int free_extents;
int total_bitmaps;
- int unit;
- u64 start;
+ u64 free_space;
s32 discardable_extents[BTRFS_STAT_NR_ENTRIES];
s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES];
- const struct btrfs_free_space_op *op;
struct btrfs_block_group *block_group;
struct mutex cache_writeout_mutex;
struct list_head trimming_ranges;
};
-struct btrfs_free_space_op {
- bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info);
-};
-
struct btrfs_io_ctl {
void *cur, *orig;
struct page *page;
@@ -172,6 +164,8 @@ bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info);
int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active);
/* Support functions for running our sanity tests */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+bool btrfs_use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info);
int test_add_free_space_entry(struct btrfs_block_group *cache,
u64 offset, u64 bytes, bool bitmap);
int test_check_exists(struct btrfs_block_group *cache, u64 offset, u64 bytes);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 472b3060e5ac3..5e61612f96128 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -109,7 +109,7 @@ struct btrfs_free_space_info *btrfs_search_free_space_info(
ret = btrfs_search_slot(trans, root, &key, path, 0, cow);
if (ret < 0)
return ERR_PTR(ret);
- if (ret != 0) {
+ if (unlikely(ret != 0)) {
btrfs_warn(fs_info, "missing free space info for %llu",
block_group->start);
DEBUG_WARN();
@@ -1545,6 +1545,29 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
return 0;
}
+/*
+ * Sanity check a free space tree key found while loading a block group.
+ *
+ * Returns 0 if @key has type @expected_type and the range it describes
+ * (key->offset bytes starting at key->objectid) ends no later than the end
+ * of @block_group. Otherwise logs an error and returns -EUCLEAN.
+ */
+static int validate_free_space_key(struct btrfs_block_group *block_group,
+				   const struct btrfs_key *key, u8 expected_type)
+{
+	const u64 end = btrfs_block_group_end(block_group);
+
+	if (unlikely(key->type != expected_type)) {
+		btrfs_err(block_group->fs_info,
+			  "block group %llu has unexpected free space key type %u, expected %u",
+			  block_group->start, key->type, expected_type);
+		return -EUCLEAN;
+	}
+
+	/*
+	 * A corrupted key could make objectid + offset wrap around u64 and
+	 * slip past a plain "sum > end" test, so compare the length against
+	 * the room left before @end instead of adding the two fields.
+	 */
+	if (unlikely(key->objectid >= end ||
+		     key->offset > end - key->objectid)) {
+		btrfs_err(block_group->fs_info,
+			  "block group %llu has invalid free space key (%llu %u %llu)",
+			  block_group->start, key->objectid, key->type,
+			  key->offset);
+		return -EUCLEAN;
+	}
+
+	return 0;
+}
+
static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
struct btrfs_path *path,
u32 expected_extent_count)
@@ -1576,8 +1599,9 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
break;
- ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
- ASSERT(key.objectid < end && key.objectid + key.offset <= end);
+ ret = validate_free_space_key(block_group, &key, BTRFS_FREE_SPACE_BITMAP_KEY);
+ if (unlikely(ret))
+ return ret;
offset = key.objectid;
while (offset < key.objectid + key.offset) {
@@ -1633,7 +1657,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *root;
struct btrfs_key key;
- const u64 end = btrfs_block_group_end(block_group);
u64 total_found = 0;
u32 extent_count = 0;
int ret;
@@ -1654,8 +1677,9 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
break;
- ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
- ASSERT(key.objectid < end && key.objectid + key.offset <= end);
+ ret = validate_free_space_key(block_group, &key, BTRFS_FREE_SPACE_EXTENT_KEY);
+ if (unlikely(ret))
+ return ret;
ret = btrfs_add_new_free_space(block_group, key.objectid,
key.objectid + key.offset,
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 1782f228c45c0..da87292420fa9 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -50,20 +50,26 @@ struct btrfs_subpage_info;
struct btrfs_stripe_hash_table;
struct btrfs_space_info;
+/* Minimum and maximum data and metadata block size. */
+#define BTRFS_MIN_BLOCKSIZE (SZ_4K)
+#define BTRFS_MAX_BLOCKSIZE (SZ_64K)
+
+/* The maximum folio size btrfs supports. */
+#define BTRFS_MAX_FOLIO_SIZE (SZ_2M)
+static_assert(BTRFS_MAX_FOLIO_SIZE > PAGE_SIZE);
+
/*
- * Minimum data and metadata block size.
+ * The maximum number of blocks a huge folio can support.
*
- * Normally it's 4K, but for testing subpage block size on 4K page systems, we
- * allow DEBUG builds to accept 2K page size.
+ * Depending on the filesystem block size, the real maximum blocks per folio
+ * may also be limited by the above BTRFS_MAX_FOLIO_SIZE.
*/
-#ifdef CONFIG_BTRFS_DEBUG
-#define BTRFS_MIN_BLOCKSIZE (SZ_2K)
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+#define BTRFS_MAX_BLOCKS_PER_FOLIO (512)
#else
-#define BTRFS_MIN_BLOCKSIZE (SZ_4K)
+#define BTRFS_MAX_BLOCKS_PER_FOLIO (BITS_PER_LONG)
#endif
-#define BTRFS_MAX_BLOCKSIZE (SZ_64K)
-
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
/*
@@ -89,6 +95,10 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
#define BTRFS_KEY_FMT "(%llu %u %llu)"
#define BTRFS_KEY_FMT_VALUE(key) (key)->objectid, (key)->type, (key)->offset
+#define BTRFS_QGROUP_FMT "%hu/%llu"
+#define BTRFS_QGROUP_FMT_VALUE(qgroup) btrfs_qgroup_level((qgroup)->qgroupid), \
+ btrfs_qgroup_subvolid((qgroup)->qgroupid)
+
/*
* Number of metadata items necessary for an unlink operation:
*
@@ -486,6 +496,9 @@ struct btrfs_delayed_root {
wait_queue_head_t wait;
};
+struct btrfs_free_space_ctl;
+struct btrfs_free_space;
+
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
@@ -698,13 +711,6 @@ struct btrfs_fs_info {
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
struct btrfs_workqueue *caching_workers;
-
- /*
- * Fixup workers take dirty pages that didn't properly go through the
- * cow mechanism and make them safe to write. It happens for the
- * sys_munmap function call path.
- */
- struct btrfs_workqueue *fixup_workers;
struct btrfs_workqueue *delayed_workers;
struct task_struct *transaction_kthread;
@@ -959,6 +965,10 @@ struct btrfs_fs_info {
spinlock_t eb_leak_lock;
struct list_head allocated_ebs;
#endif
+
+ /* Used by self tests only. */
+ bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info);
};
#define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \
@@ -1208,14 +1218,6 @@ static inline void btrfs_force_shutdown(struct btrfs_fs_info *fs_info)
}
}
-/*
- * We use folio flag owner_2 to indicate there is an ordered extent with
- * unfinished IO.
- */
-#define folio_test_ordered(folio) folio_test_owner_2(folio)
-#define folio_set_ordered(folio) folio_set_owner_2(folio)
-#define folio_clear_ordered(folio) folio_clear_owner_2(folio)
-
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
#define EXPORT_FOR_TESTS
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1ca1cbdf25bcd..61b5594c4206f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -401,28 +401,6 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
u64 offset, u64 bytes)
{
- pgoff_t index = offset >> PAGE_SHIFT;
- const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT;
- struct folio *folio;
-
- while (index <= end_index) {
- folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
- if (IS_ERR(folio)) {
- index++;
- continue;
- }
-
- index = folio_next_index(folio);
- /*
- * Here we just clear all Ordered bits for every page in the
- * range, then btrfs_mark_ordered_io_finished() will handle
- * the ordered extent accounting for the range.
- */
- btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
- offset, bytes);
- folio_put(folio);
- }
-
return btrfs_mark_ordered_io_finished(inode, offset, bytes, false);
}
@@ -755,7 +733,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- if (!btrfs_inode_can_compress(inode)) {
+ if (unlikely(!btrfs_inode_can_compress(inode))) {
DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode));
return 0;
}
@@ -842,7 +820,7 @@ static struct folio *compressed_bio_last_folio(struct compressed_bio *cb)
ASSERT(bio->bi_vcnt);
bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
- paddr = page_to_phys(bvec->bv_page) + bvec->bv_offset + bvec->bv_len - 1;
+ paddr = bvec_phys(bvec) + bvec->bv_len - 1;
return page_folio(phys_to_page(paddr));
}
@@ -1406,7 +1384,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* setup for writepage.
*/
page_ops = ((flags & COW_FILE_RANGE_KEEP_LOCKED) ? 0 : PAGE_UNLOCK);
- page_ops |= PAGE_SET_ORDERED;
/*
* Relocation relies on the relocated extents to have exactly the same
@@ -1972,8 +1949,7 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio
goto error;
extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_CLEAR_DATA_RESV,
- PAGE_SET_ORDERED);
+ EXTENT_CLEAR_DATA_RESV, 0);
return ret;
error:
@@ -2317,7 +2293,7 @@ error:
static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
- if (inode->defrag_bytes &&
+ if (data_race(inode->defrag_bytes) &&
btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
return false;
return true;
@@ -2605,8 +2581,7 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
lockdep_assert_held(&inode->io_tree.lock);
- if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
- WARN_ON(1);
+ WARN_ON((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC));
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testing for the DELALLOC
@@ -2810,7 +2785,13 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state)
{
- WARN_ON(PAGE_ALIGNED(end));
+ const u32 blocksize = inode->root->fs_info->sectorsize;
+
+ /* Basic alignment check. */
+ ASSERT(IS_ALIGNED(start, blocksize), "start=%llu blocksize=%u",
+ start, blocksize);
+ ASSERT(IS_ALIGNED(end + 1, blocksize), "inclusive end=%llu blocksize=%u",
+ end, blocksize);
if (start >= i_size_read(&inode->vfs_inode) &&
!(inode->flags & BTRFS_INODE_PREALLOC)) {
@@ -2833,206 +2814,50 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
EXTENT_DELALLOC | extra_bits, cached_state);
}
-/* see btrfs_writepage_start_hook for details on why this is required */
-struct btrfs_writepage_fixup {
- struct folio *folio;
- struct btrfs_inode *inode;
- struct btrfs_work work;
-};
-
-static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
-{
- struct btrfs_writepage_fixup *fixup =
- container_of(work, struct btrfs_writepage_fixup, work);
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- struct extent_changeset *data_reserved = NULL;
- struct folio *folio = fixup->folio;
- struct btrfs_inode *inode = fixup->inode;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u64 page_start = folio_pos(folio);
- u64 page_end = folio_next_pos(folio) - 1;
- int ret = 0;
- bool free_delalloc_space = true;
-
- /*
- * This is similar to page_mkwrite, we need to reserve the space before
- * we take the folio lock.
- */
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
- folio_size(folio));
-again:
- folio_lock(folio);
-
- /*
- * Before we queued this fixup, we took a reference on the folio.
- * folio->mapping may go NULL, but it shouldn't be moved to a different
- * address space.
- */
- if (!folio->mapping || !folio_test_dirty(folio) ||
- !folio_test_checked(folio)) {
- /*
- * Unfortunately this is a little tricky, either
- *
- * 1) We got here and our folio had already been dealt with and
- * we reserved our space, thus ret == 0, so we need to just
- * drop our space reservation and bail. This can happen the
- * first time we come into the fixup worker, or could happen
- * while waiting for the ordered extent.
- * 2) Our folio was already dealt with, but we happened to get an
- * ENOSPC above from the btrfs_delalloc_reserve_space. In
- * this case we obviously don't have anything to release, but
- * because the folio was already dealt with we don't want to
- * mark the folio with an error, so make sure we're resetting
- * ret to 0. This is why we have this check _before_ the ret
- * check, because we do not want to have a surprise ENOSPC
- * when the folio was already properly dealt with.
- */
- if (!ret) {
- btrfs_delalloc_release_extents(inode, folio_size(folio));
- btrfs_delalloc_release_space(inode, data_reserved,
- page_start, folio_size(folio),
- true);
- }
- ret = 0;
- goto out_page;
- }
-
- /*
- * We can't mess with the folio state unless it is locked, so now that
- * it is locked bail if we failed to make our space reservation.
- */
- if (ret)
- goto out_page;
-
- btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
-
- /* already ordered? We're done */
- if (folio_test_ordered(folio))
- goto out_reserved;
-
- ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
- if (ordered) {
- btrfs_unlock_extent(&inode->io_tree, page_start, page_end,
- &cached_state);
- folio_unlock(folio);
- btrfs_start_ordered_extent(ordered);
- btrfs_put_ordered_extent(ordered);
- goto again;
- }
-
- ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
- &cached_state);
- if (ret)
- goto out_reserved;
-
- /*
- * Everything went as planned, we're now the owner of a dirty page with
- * delayed allocation bits set and space reserved for our COW
- * destination.
- *
- * The page was dirty when we started, nothing should have cleaned it.
- */
- BUG_ON(!folio_test_dirty(folio));
- free_delalloc_space = false;
-out_reserved:
- btrfs_delalloc_release_extents(inode, PAGE_SIZE);
- if (free_delalloc_space)
- btrfs_delalloc_release_space(inode, data_reserved, page_start,
- PAGE_SIZE, true);
- btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
-out_page:
- if (ret) {
- /*
- * We hit ENOSPC or other errors. Update the mapping and page
- * to reflect the errors and clean the page.
- */
- mapping_set_error(folio->mapping, ret);
- btrfs_folio_clear_ordered(fs_info, folio, page_start,
- folio_size(folio));
- btrfs_mark_ordered_io_finished(inode, page_start,
- folio_size(folio), !ret);
- folio_clear_dirty_for_io(folio);
- }
- btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
- folio_unlock(folio);
- folio_put(folio);
- kfree(fixup);
- extent_changeset_free(data_reserved);
- /*
- * As a precaution, do a delayed iput in case it would be the last iput
- * that could need flushing space. Recursing back to fixup worker would
- * deadlock.
- */
- btrfs_add_delayed_iput(inode);
-}
-
/*
- * There are a few paths in the higher layers of the kernel that directly
- * set the folio dirty bit without asking the filesystem if it is a
- * good idea. This causes problems because we want to make sure COW
- * properly happens and the data=ordered rules are followed.
+ * Clear the old accounting flags and set EXTENT_DELALLOC for the range.
*
- * In our case any range that doesn't have the ORDERED bit set
- * hasn't been properly setup for IO. We kick off an async process
- * to fix it up. The async helper will wait for ordered extents, set
- * the delalloc bit and make it safe to write the folio.
+ * Return <0 on error, in which case no EXTENT_DELALLOC bit has been cleared or set.
*/
-int btrfs_writepage_cow_fixup(struct folio *folio)
+int btrfs_reset_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
+ unsigned int extra_bits, struct extent_state **cached_state)
{
- struct inode *inode = folio->mapping->host;
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_writepage_fixup *fixup;
+ const u32 blocksize = inode->root->fs_info->sectorsize;
- /* This folio has ordered extent covering it already */
- if (folio_test_ordered(folio))
- return 0;
+ /* The @extra_bits can only be EXTENT_NORESERVE for now. */
+ ASSERT(!(extra_bits & ~EXTENT_NORESERVE), "extra_bits=0x%x", extra_bits);
- /*
- * For experimental build, we error out instead of EAGAIN.
- *
- * We should not hit such out-of-band dirty folios anymore.
- */
- if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
- DEBUG_WARN();
- btrfs_err_rl(fs_info,
- "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
- btrfs_root_id(BTRFS_I(inode)->root),
- btrfs_ino(BTRFS_I(inode)),
- folio_pos(folio));
- return -EUCLEAN;
- }
+ /* Basic alignment check. */
+ ASSERT(IS_ALIGNED(start, blocksize), "start=%llu blocksize=%u",
+ start, blocksize);
+ ASSERT(IS_ALIGNED(end + 1, blocksize), "inclusive end=%llu blocksize=%u",
+ end, blocksize);
/*
- * folio_checked is set below when we create a fixup worker for this
- * folio, don't try to create another one if we're already
- * folio_test_checked.
- *
- * The extent_io writepage code will redirty the foio if we send back
- * EAGAIN.
+ * Check and set the DELALLOC_NEW flag first. This needs to search the tree
+ * and can therefore fail, so do it before clearing EXTENT_DELALLOC.
*/
- if (folio_test_checked(folio))
- return -EAGAIN;
-
- fixup = kzalloc_obj(*fixup, GFP_NOFS);
- if (!fixup)
- return -EAGAIN;
-
- /*
- * We are already holding a reference to this inode from
- * write_cache_pages. We need to hold it because the space reservation
- * takes place outside of the folio lock, and we can't trust
- * folio->mapping outside of the folio lock.
- */
- ihold(inode);
- btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
- folio_get(folio);
- btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
- fixup->folio = folio;
- fixup->inode = BTRFS_I(inode);
- btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
+ if (start >= i_size_read(&inode->vfs_inode) &&
+ !(inode->flags & BTRFS_INODE_PREALLOC)) {
+ /*
+ * There can't be any extents following EOF in this case so just
+ * set the delalloc new bit for the range directly.
+ */
+ extra_bits |= EXTENT_DELALLOC_NEW;
+ } else {
+ int ret;
- return -EAGAIN;
+ ret = btrfs_find_new_delalloc_bytes(inode, start, end + 1 - start,
+ NULL);
+ if (unlikely(ret))
+ return ret;
+ }
+ /* Clear the old accounting as the range may already be dirty. */
+ btrfs_clear_extent_bit(&inode->io_tree, start, end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, cached_state);
+ return btrfs_set_extent_bit(&inode->io_tree, start, end,
+ EXTENT_DELALLOC | extra_bits, cached_state);
}
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -5181,12 +5006,7 @@ again:
goto again;
}
- btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- &cached_state);
-
- ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
- &cached_state);
+ ret = btrfs_reset_extent_delalloc(inode, block_start, block_end, 0, &cached_state);
if (ret) {
btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
goto out_unlock;
@@ -5211,8 +5031,6 @@ again:
folio_zero_range(folio, zero_start - folio_pos(folio),
zero_end - zero_start + 1);
- btrfs_folio_clear_checked(fs_info, folio, block_start,
- block_end + 1 - block_start);
btrfs_folio_set_dirty(fs_info, folio, block_start,
block_end + 1 - block_start);
@@ -7657,12 +7475,6 @@ static int btrfs_migrate_folio(struct address_space *mapping,
if (ret)
return ret;
-
- if (folio_test_ordered(src)) {
- folio_clear_ordered(src);
- folio_set_ordered(dst);
- }
-
return 0;
}
#else
@@ -7751,18 +7563,20 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
page_end);
ASSERT(range_end + 1 - cur < U32_MAX);
range_len = range_end + 1 - cur;
- if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
- /*
- * If Ordered is cleared, it means endio has
- * already been executed for the range.
- * We can't delete the extent states as
- * btrfs_finish_ordered_io() may still use some of them.
- */
+ /*
+ * If the range is not dirty, the range has been submitted and
+ * since we have waited for the writeback, endio has been
+ * executed, thus we must skip the range to avoid double
+ * accounting for the ordered extent.
+ */
+ if (!btrfs_folio_test_dirty(fs_info, folio, cur, range_len))
goto next;
- }
- btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
/*
+ * The range is dirty meaning it has not been submitted.
+ * Here we need to truncate the OE range as the range will never
+ * be submitted.
+ *
* IO on this page will never be started, so we need to account
* for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
* here, must leave that up for the ordered extent completion.
@@ -7823,13 +7637,8 @@ next:
&cached_state);
cur = range_end + 1;
}
- /*
- * We have iterated through all ordered extents of the page, the page
- * should not have Ordered anymore, or the above iteration
- * did something wrong.
- */
- ASSERT(!folio_test_ordered(folio));
- btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
+ btrfs_folio_clear_dirty(fs_info, folio, page_start, folio_size(folio));
+ btrfs_clear_folio_dirty_tag(folio);
if (!inode_evicting)
__btrfs_release_folio(folio, GFP_NOFS);
clear_folio_extent_mapped(folio);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a39460bf68a77..d4981d2a42d71 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -56,6 +56,7 @@
#include "uuid-tree.h"
#include "ioctl.h"
#include "file.h"
+#include "file-item.h"
#include "scrub.h"
#include "super.h"
@@ -707,7 +708,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct inode *inode;
- struct btrfs_pending_snapshot *pending_snapshot;
+ struct btrfs_pending_snapshot AUTO_KFREE(pending_snapshot);
unsigned int trans_num_items;
struct btrfs_trans_handle *trans;
struct btrfs_block_rsv *block_rsv;
@@ -816,7 +817,6 @@ free_pending:
free_anon_bdev(pending_snapshot->anon_dev);
kfree(pending_snapshot->root_item);
btrfs_free_path(pending_snapshot->path);
- kfree(pending_snapshot);
return ret;
}
@@ -961,7 +961,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
u64 new_size;
u64 old_size;
u64 devid = 1;
- struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_ioctl_vol_args AUTO_KFREE(vol_args);
struct btrfs_device *device = NULL;
char *sizestr;
char *devstr = NULL;
@@ -987,13 +987,13 @@ static noinline int btrfs_ioctl_resize(struct file *file,
}
ret = btrfs_check_ioctl_vol_args_path(vol_args);
if (ret < 0)
- goto out_free;
+ goto out_drop;
sizestr = vol_args->name;
cancel = (strcmp("cancel", sizestr) == 0);
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel);
if (ret)
- goto out_free;
+ goto out_drop;
/* Exclusive operation is now claimed */
devstr = strchr(sizestr, ':');
@@ -1100,8 +1100,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
old_size, new_size);
out_finish:
btrfs_exclop_finish(fs_info);
-out_free:
- kfree(vol_args);
out_drop:
mnt_drop_write_file(file);
return ret;
@@ -1114,7 +1112,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
struct btrfs_qgroup_inherit *inherit)
{
int ret;
- struct qstr qname = QSTR_INIT(name, strlen(name));
+ struct qstr qname = QSTR(name);
if (!S_ISDIR(file_inode(file)->i_mode))
return -ENOTDIR;
@@ -1179,7 +1177,7 @@ out_drop_write:
static noinline int btrfs_ioctl_snap_create(struct file *file,
void __user *arg, bool subvol)
{
- struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_ioctl_vol_args AUTO_KFREE(vol_args);
int ret;
if (!S_ISDIR(file_inode(file)->i_mode))
@@ -1190,24 +1188,20 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
return PTR_ERR(vol_args);
ret = btrfs_check_ioctl_vol_args_path(vol_args);
if (ret < 0)
- goto out;
-
- ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
- vol_args->name, vol_args->fd, subvol,
- false, NULL);
+ return ret;
-out:
- kfree(vol_args);
- return ret;
+ return __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
+ vol_args->name, vol_args->fd, subvol,
+ false, NULL);
}
static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
void __user *arg, bool subvol)
{
- struct btrfs_ioctl_vol_args_v2 *vol_args;
+ struct btrfs_ioctl_vol_args_v2 AUTO_KFREE(vol_args);
+ struct btrfs_qgroup_inherit AUTO_KFREE(inherit);
int ret;
bool readonly = false;
- struct btrfs_qgroup_inherit *inherit = NULL;
if (!S_ISDIR(file_inode(file)->i_mode))
return -ENOTDIR;
@@ -1217,44 +1211,32 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
return PTR_ERR(vol_args);
ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args);
if (ret < 0)
- goto free_args;
+ return ret;
- if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
- ret = -EOPNOTSUPP;
- goto free_args;
- }
+ if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK)
+ return -EOPNOTSUPP;
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
- if (vol_args->size < sizeof(*inherit) ||
- vol_args->size > PAGE_SIZE) {
- ret = -EINVAL;
- goto free_args;
- }
+ if (vol_args->size < sizeof(*inherit) || vol_args->size > PAGE_SIZE)
+ return -EINVAL;
+
inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
if (IS_ERR(inherit)) {
- ret = PTR_ERR(inherit);
- goto free_args;
+ return PTR_ERR(inherit);
}
ret = btrfs_qgroup_check_inherit(fs_info, inherit, vol_args->size);
if (ret < 0)
- goto free_inherit;
+ return ret;
}
- ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
- vol_args->name, vol_args->fd, subvol,
- readonly, inherit);
- if (ret)
- goto free_inherit;
-free_inherit:
- kfree(inherit);
-free_args:
- kfree(vol_args);
- return ret;
+ return __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
+ vol_args->name, vol_args->fd, subvol,
+ readonly, inherit);
}
static noinline int btrfs_ioctl_subvol_getflags(struct btrfs_inode *inode,
@@ -1865,7 +1847,7 @@ out_put:
static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
void __user *argp)
{
- struct btrfs_ioctl_ino_lookup_args *args;
+ struct btrfs_ioctl_ino_lookup_args AUTO_KFREE(args);
int ret = 0;
args = memdup_user(argp, sizeof(*args));
@@ -1895,9 +1877,8 @@ static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
out:
if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
- ret = -EFAULT;
+ return -EFAULT;
- kfree(args);
return ret;
}
@@ -1915,7 +1896,7 @@ out:
*/
static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
{
- struct btrfs_ioctl_ino_lookup_user_args *args;
+ struct btrfs_ioctl_ino_lookup_user_args AUTO_KFREE(args);
struct inode *inode;
int ret;
@@ -1931,7 +1912,6 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
* The subvolume does not exist under fd with which this is
* called
*/
- kfree(args);
return -EACCES;
}
@@ -1940,14 +1920,13 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
ret = -EFAULT;
- kfree(args);
return ret;
}
/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
{
- struct btrfs_ioctl_get_subvol_info_args *subvol_info;
+ struct btrfs_ioctl_get_subvol_info_args AUTO_KFREE(subvol_info);
struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_path *path;
@@ -1956,7 +1935,6 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
struct btrfs_root_ref *rref;
struct extent_buffer *leaf;
unsigned long item_off;
- unsigned long item_len;
int slot;
int ret = 0;
@@ -2031,17 +2009,17 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid == subvol_info->treeid &&
key.type == BTRFS_ROOT_BACKREF_KEY) {
+ u16 name_len;
+
subvol_info->parent_id = key.offset;
rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+ name_len = btrfs_root_ref_name_len(leaf, rref);
subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);
- item_off = btrfs_item_ptr_offset(leaf, slot)
- + sizeof(struct btrfs_root_ref);
- item_len = btrfs_item_size(leaf, slot)
- - sizeof(struct btrfs_root_ref);
+ item_off = btrfs_item_ptr_offset(leaf, slot) + sizeof(*rref);
read_extent_buffer(leaf, subvol_info->name,
- item_off, item_len);
+ item_off, name_len);
} else {
ret = -ENOENT;
goto out;
@@ -2057,7 +2035,6 @@ out:
btrfs_put_root(root);
out_free:
btrfs_free_path(path);
- kfree(subvol_info);
return ret;
}
@@ -2068,7 +2045,7 @@ out_free:
static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
void __user *argp)
{
- struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
+ struct btrfs_ioctl_get_subvol_rootref_args AUTO_KFREE(rootrefs);
struct btrfs_root_ref *rref;
struct btrfs_path *path;
struct btrfs_key key;
@@ -2151,8 +2128,6 @@ out:
ret = -EFAULT;
}
- kfree(rootrefs);
-
return ret;
}
@@ -2167,8 +2142,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *dest = NULL;
- struct btrfs_ioctl_vol_args *vol_args = NULL;
- struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
+ struct btrfs_ioctl_vol_args AUTO_KFREE(vol_args);
+ struct btrfs_ioctl_vol_args_v2 AUTO_KFREE(vol_args2);
struct mnt_idmap *idmap = file_mnt_idmap(file);
char *subvol_name, *subvol_name_ptr = NULL;
int ret = 0;
@@ -2186,10 +2161,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (IS_ERR(vol_args2))
return PTR_ERR(vol_args2);
- if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) {
- ret = -EOPNOTSUPP;
- goto out;
- }
+ if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK)
+ return -EOPNOTSUPP;
/*
* If SPEC_BY_ID is not set, we are looking for the subvolume by
@@ -2198,23 +2171,21 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) {
ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2);
if (ret < 0)
- goto out;
+ return ret;
subvol_name = vol_args2->name;
ret = mnt_want_write_file(file);
if (ret)
- goto out;
+ return ret;
} else {
struct inode *old_dir;
- if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
- ret = -EINVAL;
- goto out;
- }
+ if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID)
+ return -EINVAL;
ret = mnt_want_write_file(file);
if (ret)
- goto out;
+ return ret;
dentry = btrfs_get_dentry(fs_info->sb,
BTRFS_FIRST_FREE_OBJECTID,
@@ -2284,13 +2255,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
ret = btrfs_check_ioctl_vol_args_path(vol_args);
if (ret < 0)
- goto out;
+ return ret;
subvol_name = vol_args->name;
ret = mnt_want_write_file(file);
if (ret)
- goto out;
+ return ret;
}
if (strchr(subvol_name, '/') ||
@@ -2371,9 +2342,6 @@ free_parent:
dput(parent);
out_drop_write:
mnt_drop_write_file(file);
-out:
- kfree(vol_args2);
- kfree(vol_args);
return ret;
}
@@ -2461,7 +2429,7 @@ out:
static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
{
- struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_ioctl_vol_args AUTO_KFREE(vol_args);
bool restore_op = false;
int ret;
@@ -2501,15 +2469,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
ret = btrfs_check_ioctl_vol_args_path(vol_args);
if (ret < 0)
- goto out_free;
+ goto out;
ret = btrfs_init_new_device(fs_info, vol_args->name);
if (!ret)
btrfs_info(fs_info, "disk added %s", vol_args->name);
-out_free:
- kfree(vol_args);
out:
if (restore_op)
btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
@@ -2523,7 +2489,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_ioctl_vol_args_v2 *vol_args;
+ struct btrfs_ioctl_vol_args_v2 AUTO_KFREE(vol_args);
struct file *bdev_file = NULL;
int ret;
bool cancel = false;
@@ -2582,7 +2548,6 @@ err_drop:
bdev_fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
- kfree(vol_args);
return ret;
}
@@ -2591,7 +2556,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_ioctl_vol_args AUTO_KFREE(vol_args);
struct file *bdev_file = NULL;
int ret;
bool cancel = false;
@@ -2605,7 +2570,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
ret = btrfs_check_ioctl_vol_args_path(vol_args);
if (ret < 0)
- goto out_free;
+ return ret;
if (!strcmp("cancel", vol_args->name)) {
cancel = true;
@@ -2633,19 +2598,16 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
bdev_fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
-out_free:
- kfree(vol_args);
return ret;
}
static long btrfs_ioctl_fs_info(const struct btrfs_fs_info *fs_info,
void __user *arg)
{
- struct btrfs_ioctl_fs_info_args *fi_args;
+ struct btrfs_ioctl_fs_info_args AUTO_KFREE(fi_args);
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 flags_in;
- int ret = 0;
fi_args = memdup_user(arg, sizeof(*fi_args));
if (IS_ERR(fi_args))
@@ -2686,17 +2648,16 @@ static long btrfs_ioctl_fs_info(const struct btrfs_fs_info *fs_info,
}
if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
- ret = -EFAULT;
+ return -EFAULT;
- kfree(fi_args);
- return ret;
+ return 0;
}
static long btrfs_ioctl_dev_info(const struct btrfs_fs_info *fs_info,
void __user *arg)
{
BTRFS_DEV_LOOKUP_ARGS(args);
- struct btrfs_ioctl_dev_info_args *di_args;
+ struct btrfs_ioctl_dev_info_args AUTO_KFREE(di_args);
struct btrfs_device *dev;
int ret = 0;
@@ -2730,7 +2691,6 @@ out:
if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
ret = -EFAULT;
- kfree(di_args);
return ret;
}
@@ -3011,7 +2971,7 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
- struct btrfs_ioctl_scrub_args *sa;
+ struct btrfs_ioctl_scrub_args AUTO_KFREE(sa);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -3026,15 +2986,13 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
if (IS_ERR(sa))
return PTR_ERR(sa);
- if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) {
- ret = -EOPNOTSUPP;
- goto out;
- }
+ if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS)
+ return -EOPNOTSUPP;
if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
ret = mnt_want_write_file(file);
if (ret)
- goto out;
+ return ret;
}
ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
@@ -3058,8 +3016,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
if (!(sa->flags & BTRFS_SCRUB_READONLY))
mnt_drop_write_file(file);
-out:
- kfree(sa);
+
return ret;
}
@@ -3074,7 +3031,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info)
static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
void __user *arg)
{
- struct btrfs_ioctl_scrub_args *sa;
+ struct btrfs_ioctl_scrub_args AUTO_KFREE(sa);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -3087,40 +3044,36 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
- ret = -EFAULT;
+ return -EFAULT;
- kfree(sa);
return ret;
}
static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
void __user *arg)
{
- struct btrfs_ioctl_get_dev_stats *sa;
+ struct btrfs_ioctl_get_dev_stats AUTO_KFREE(sa);
int ret;
sa = memdup_user(arg, sizeof(*sa));
if (IS_ERR(sa))
return PTR_ERR(sa);
- if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
- kfree(sa);
+ if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN))
return -EPERM;
- }
ret = btrfs_get_dev_stats(fs_info, sa);
if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
- ret = -EFAULT;
+ return -EFAULT;
- kfree(sa);
return ret;
}
static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
void __user *arg)
{
- struct btrfs_ioctl_dev_replace_args *p;
+ struct btrfs_ioctl_dev_replace_args AUTO_KFREE(p);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -3137,10 +3090,8 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
switch (p->cmd) {
case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
- if (sb_rdonly(fs_info->sb)) {
- ret = -EROFS;
- goto out;
- }
+ if (sb_rdonly(fs_info->sb))
+ return -EROFS;
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
@@ -3162,9 +3113,8 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
}
if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
- ret = -EFAULT;
-out:
- kfree(p);
+ return -EFAULT;
+
return ret;
}
@@ -3174,7 +3124,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
int i;
u64 rel_ptr;
int size;
- struct btrfs_ioctl_ino_path_args *ipa = NULL;
+ struct btrfs_ioctl_ino_path_args AUTO_KFREE(ipa);
struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL;
struct btrfs_path *path;
@@ -3223,7 +3173,6 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
out:
btrfs_free_path(path);
- kfree(ipa);
return ret;
}
@@ -3233,8 +3182,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
{
int ret = 0;
int size;
- struct btrfs_ioctl_logical_ino_args *loi;
- struct btrfs_data_container *inodes = NULL;
+ struct btrfs_ioctl_logical_ino_args AUTO_KFREE(loi);
+ struct btrfs_data_container AUTO_KVFREE(inodes);
bool ignore_offset;
if (!capable(CAP_SYS_ADMIN))
@@ -3249,41 +3198,32 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
size = min_t(u32, loi->size, SZ_64K);
} else {
/* All reserved bits must be 0 for now */
- if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) {
- ret = -EINVAL;
- goto out_loi;
- }
+ if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved)))
+ return -EINVAL;
+
/* Only accept flags we have defined so far */
- if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
- ret = -EINVAL;
- goto out_loi;
- }
+ if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET))
+ return -EINVAL;
+
ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
size = min_t(u32, loi->size, SZ_16M);
}
inodes = init_data_container(size);
- if (IS_ERR(inodes)) {
- ret = PTR_ERR(inodes);
- goto out_loi;
- }
+ if (IS_ERR(inodes))
+ return PTR_ERR(inodes);
ret = iterate_inodes_from_logical(loi->logical, fs_info, inodes, ignore_offset);
if (ret == -EINVAL)
- ret = -ENOENT;
+ return -ENOENT;
if (ret < 0)
- goto out;
+ return ret;
ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes,
size);
if (ret)
ret = -EFAULT;
-out:
- kvfree(inodes);
-out_loi:
- kfree(loi);
-
return ret;
}
@@ -3380,7 +3320,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
{
struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_ioctl_balance_args *bargs;
+ struct btrfs_ioctl_balance_args AUTO_KFREE(bargs);
struct btrfs_balance_control *bctl;
bool need_unlock = true;
int ret;
@@ -3465,7 +3405,6 @@ out_unlock:
btrfs_exclop_finish(fs_info);
out:
mnt_drop_write_file(file);
- kfree(bargs);
return ret;
}
@@ -3518,7 +3457,7 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_ioctl_quota_ctl_args *sa;
+ struct btrfs_ioctl_quota_ctl_args AUTO_KFREE(sa);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -3577,7 +3516,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
break;
}
- kfree(sa);
drop_write:
mnt_drop_write_file(file);
return ret;
@@ -3588,8 +3526,8 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_ioctl_qgroup_assign_args *sa;
- struct btrfs_qgroup_list *prealloc = NULL;
+ struct btrfs_ioctl_qgroup_assign_args AUTO_KFREE(sa);
+ struct btrfs_qgroup_list AUTO_KFREE(prealloc);
struct btrfs_trans_handle *trans;
int ret;
int err;
@@ -3614,7 +3552,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
prealloc = kzalloc_obj(*prealloc);
if (!prealloc) {
ret = -ENOMEM;
- goto out;
+ goto drop_write;
}
}
@@ -3622,7 +3560,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto out;
+ goto drop_write;
}
/*
@@ -3648,9 +3586,6 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
if (err && !ret)
ret = err;
-out:
- kfree(prealloc);
- kfree(sa);
drop_write:
mnt_drop_write_file(file);
return ret;
@@ -3660,7 +3595,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_ioctl_qgroup_create_args *sa;
+ struct btrfs_ioctl_qgroup_create_args AUTO_KFREE(sa);
struct btrfs_trans_handle *trans;
int ret;
int err;
@@ -3683,12 +3618,12 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
if (!sa->qgroupid) {
ret = -EINVAL;
- goto out;
+ goto drop_write;
}
if (sa->create && btrfs_is_fstree(sa->qgroupid)) {
ret = -EINVAL;
- goto out;
+ goto drop_write;
}
/*
@@ -3698,7 +3633,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto out;
+ goto drop_write;
}
if (sa->create) {
@@ -3711,8 +3646,6 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
if (err && !ret)
ret = err;
-out:
- kfree(sa);
drop_write:
mnt_drop_write_file(file);
return ret;
@@ -3722,7 +3655,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_ioctl_qgroup_limit_args *sa;
+ struct btrfs_ioctl_qgroup_limit_args AUTO_KFREE(sa);
struct btrfs_trans_handle *trans;
int ret;
int err;
@@ -3748,7 +3681,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto out;
+ goto drop_write;
}
qgroupid = sa->qgroupid;
@@ -3763,8 +3696,6 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
if (err && !ret)
ret = err;
-out:
- kfree(sa);
drop_write:
mnt_drop_write_file(file);
return ret;
@@ -3774,7 +3705,7 @@ static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_ioctl_quota_rescan_args *qsa;
+ struct btrfs_ioctl_quota_rescan_args AUTO_KFREE(qsa);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -3795,13 +3726,11 @@ static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
if (qsa->flags) {
ret = -EINVAL;
- goto out;
+ goto drop_write;
}
ret = btrfs_qgroup_rescan(fs_info);
-out:
- kfree(qsa);
drop_write:
mnt_drop_write_file(file);
return ret;
@@ -3946,8 +3875,8 @@ out:
static long btrfs_ioctl_set_received_subvol_32(struct file *file,
void __user *arg)
{
- struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
- struct btrfs_ioctl_received_subvol_args *args64 = NULL;
+ struct btrfs_ioctl_received_subvol_args_32 AUTO_KFREE(args32);
+ struct btrfs_ioctl_received_subvol_args AUTO_KFREE(args64);
int ret = 0;
args32 = memdup_user(arg, sizeof(*args32));
@@ -3955,10 +3884,8 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
return PTR_ERR(args32);
args64 = kmalloc_obj(*args64);
- if (!args64) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!args64)
+ return -ENOMEM;
memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
args64->stransid = args32->stransid;
@@ -3971,7 +3898,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), args64);
if (ret)
- goto out;
+ return ret;
memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
args32->stransid = args64->stransid;
@@ -3984,19 +3911,16 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
ret = copy_to_user(arg, args32, sizeof(*args32));
if (ret)
- ret = -EFAULT;
+ return -EFAULT;
-out:
- kfree(args32);
- kfree(args64);
- return ret;
+ return 0;
}
#endif
static long btrfs_ioctl_set_received_subvol(struct file *file,
void __user *arg)
{
- struct btrfs_ioctl_received_subvol_args *sa = NULL;
+ struct btrfs_ioctl_received_subvol_args AUTO_KFREE(sa);
int ret = 0;
sa = memdup_user(arg, sizeof(*sa));
@@ -4004,17 +3928,14 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
return PTR_ERR(sa);
ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), sa);
-
if (ret)
- goto out;
+ return ret;
ret = copy_to_user(arg, sa, sizeof(*sa));
if (ret)
- ret = -EFAULT;
+ return -EFAULT;
-out:
- kfree(sa);
- return ret;
+ return 0;
}
static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info,
@@ -4254,11 +4175,11 @@ out_drop_write:
static int _btrfs_ioctl_send(struct btrfs_root *root, void __user *argp, bool compat)
{
- struct btrfs_ioctl_send_args *arg;
- int ret;
+ struct btrfs_ioctl_send_args AUTO_KFREE(arg);
if (compat) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ int ret;
struct btrfs_ioctl_send_args_32 args32 = { 0 };
ret = copy_from_user(&args32, argp, sizeof(args32));
@@ -4283,9 +4204,7 @@ static int _btrfs_ioctl_send(struct btrfs_root *root, void __user *argp, bool co
if (IS_ERR(arg))
return PTR_ERR(arg);
}
- ret = btrfs_ioctl_send(root, arg);
- kfree(arg);
- return ret;
+ return btrfs_ioctl_send(root, arg);
}
static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
@@ -5092,7 +5011,8 @@ static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *a
return -ENOENT;
wait_for_deletion = true;
- ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD);
+ ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD, "root_flags=0x%llx",
+ root_flags);
sched_ret = schedule_timeout_interruptible(HZ);
/* Early wake up or error. */
if (sched_ret != 0)
@@ -5140,6 +5060,342 @@ static int btrfs_ioctl_shutdown(struct btrfs_fs_info *fs_info, unsigned long arg
return ret;
}
+#define GET_CSUMS_BUF_MAX SZ_16M
+
+/*
+ * Copy the data checksums covering [disk_bytenr, disk_bytenr + len) to the
+ * user buffer @buf.
+ *
+ * Each checksum run returned by btrfs_lookup_csums_list() is written at the
+ * position matching its sector index relative to @disk_bytenr, i.e.
+ * sector_index * csum_size bytes into @buf.
+ *
+ * Returns 0 on success, -EUCLEAN if no csum root exists for @disk_bytenr,
+ * -EFAULT if a copy to userspace fails, or the negative error returned by
+ * btrfs_lookup_csums_list().
+ */
+static int copy_csums_to_user(struct btrfs_fs_info *fs_info, u64 disk_bytenr,
+			      u64 len, u8 __user *buf)
+{
+	const u32 csum_size = fs_info->csum_size;
+	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
+	LIST_HEAD(list);
+	int ret;
+
+	if (unlikely(!csum_root)) {
+		btrfs_err(fs_info, "missing csum root for extent at bytenr %llu", disk_bytenr);
+		return -EUCLEAN;
+	}
+
+	ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
+				      disk_bytenr + len - 1, &list, false);
+	if (ret < 0)
+		return ret;
+	ret = 0;
+
+	/*
+	 * Drain the whole list in one pass. Every node is unlinked and freed;
+	 * once a copy has faulted the remaining nodes are only freed, not
+	 * copied.
+	 */
+	while (!list_empty(&list)) {
+		struct btrfs_ordered_sum *sums;
+
+		sums = list_first_entry(&list, struct btrfs_ordered_sum, list);
+		list_del(&sums->list);
+
+		if (ret == 0) {
+			const u64 offset = ((sums->logical - disk_bytenr) >>
+					    fs_info->sectorsize_bits) * csum_size;
+			const size_t copy_size = (sums->len >>
+						  fs_info->sectorsize_bits) * csum_size;
+
+			if (copy_to_user(buf + offset, sums->sums, copy_size))
+				ret = -EFAULT;
+		}
+
+		kfree(sums);
+	}
+
+	return ret;
+}
+
+/*
+ * BTRFS_IOC_GET_CSUMS: describe the extents backing a byte range of a regular
+ * file and return the data checksums of plain (uncompressed, unencrypted)
+ * regular extents.
+ *
+ * @argp points to a struct btrfs_ioctl_get_csums_args. The output area starts
+ * at its 'buf' member and receives a sequence of
+ * struct btrfs_ioctl_get_csums_entry records; an entry of type
+ * BTRFS_GET_CSUMS_HAS_CSUMS is immediately followed by its checksum bytes.
+ * At most min(args.buf_size, GET_CSUMS_BUF_MAX) bytes are written.
+ *
+ * On success the args struct is copied back with 'offset'/'length' set to the
+ * part of the range that was not processed (length 0 when everything was
+ * reported) and 'buf_size' set to the number of output bytes used.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -EBADF      file not opened for reading
+ *   -EINVAL     not a regular file, unaligned/zero range, unknown flags, or
+ *               buffer smaller than one entry
+ *   -EOVERFLOW  offset + length wraps, or the first entry plus its checksums
+ *               does not fit in the buffer
+ *   -EFAULT     a userspace copy failed
+ *   -ENOMEM     path allocation failed
+ *   -EINTR      fatal signal pending before anything was produced
+ * plus any error from the helpers called below.
+ */
+static int btrfs_ioctl_get_csums(struct file *file, void __user *argp)
+{
+	struct inode *vfs_inode = file_inode(file);
+	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_root *root = inode->root;
+	struct btrfs_ioctl_get_csums_args args;
+	BTRFS_PATH_AUTO_FREE(path);
+	const u64 ino = btrfs_ino(inode);
+	const u32 csum_size = fs_info->csum_size;
+	u8 __user *ubuf;
+	u64 buf_limit;
+	u64 buf_used = 0;
+	u64 cur_offset;
+	u64 end_offset;
+	u64 prev_extent_end;
+	struct btrfs_key key;
+	int ret;
+
+	if (!(file->f_mode & FMODE_READ))
+		return -EBADF;
+
+	if (!S_ISREG(vfs_inode->i_mode))
+		return -EINVAL;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	if (!IS_ALIGNED(args.offset, fs_info->sectorsize) ||
+	    !IS_ALIGNED(args.length, fs_info->sectorsize))
+		return -EINVAL;
+	if (args.length == 0)
+		return -EINVAL;
+	if (args.offset + args.length < args.offset)
+		return -EOVERFLOW;
+	if (args.flags != 0)
+		return -EINVAL;
+	if (args.buf_size < sizeof(struct btrfs_ioctl_get_csums_entry))
+		return -EINVAL;
+
+	buf_limit = min_t(u64, args.buf_size, GET_CSUMS_BUF_MAX);
+	ubuf = (u8 __user *)(argp + offsetof(struct btrfs_ioctl_get_csums_args, buf));
+
+	/*
+	 * Zero the usable part of the output buffer up front, so any bytes
+	 * copy_csums_to_user() does not write read back as zero.
+	 */
+	if (clear_user(ubuf, buf_limit))
+		return -EFAULT;
+
+	cur_offset = args.offset;
+	end_offset = args.offset + args.length;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * Wait for ordered extents once without the inode lock and once more
+	 * after taking it.
+	 * NOTE(review): presumably the first wait keeps the time spent holding
+	 * i_rwsem short and the second catches writes that raced in - confirm.
+	 */
+	ret = btrfs_wait_ordered_range(inode, cur_offset, args.length);
+	if (ret)
+		return ret;
+
+	ret = down_read_interruptible(&vfs_inode->i_rwsem);
+	if (ret)
+		return ret;
+
+	ret = btrfs_wait_ordered_range(inode, cur_offset, args.length);
+	if (ret)
+		goto out_unlock;
+
+	/* NODATASUM early exit. */
+	if (inode->flags & BTRFS_INODE_NODATASUM) {
+		struct btrfs_ioctl_get_csums_entry entry = {
+			.offset = cur_offset,
+			.length = end_offset - cur_offset,
+			.type = BTRFS_GET_CSUMS_NODATASUM,
+		};
+
+		if (copy_to_user(ubuf, &entry, sizeof(entry))) {
+			ret = -EFAULT;
+			goto out_unlock;
+		}
+
+		buf_used = sizeof(entry);
+		cur_offset = end_offset;
+		goto done;
+	}
+
+	prev_extent_end = cur_offset;
+
+	while (cur_offset < end_offset) {
+		struct btrfs_file_extent_item *ei;
+		struct extent_buffer *leaf;
+		struct btrfs_ioctl_get_csums_entry entry = { 0 };
+		u64 extent_end;
+		u64 disk_bytenr = 0;
+		u64 extent_offset = 0;
+		u64 range_start, range_len;
+		u64 entry_csum_size;
+		u64 key_offset;
+		int extent_type;
+		u8 compression;
+		u8 encryption;
+
+		/* Search for the extent at or before cur_offset. */
+		key.objectid = ino;
+		key.type = BTRFS_EXTENT_DATA_KEY;
+		key.offset = cur_offset;
+
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out_unlock;
+
+		/*
+		 * No exact match: step back to the previous item if it is an
+		 * extent of this inode that still covers cur_offset.
+		 */
+		if (ret > 0 && path->slots[0] > 0) {
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0] - 1);
+			if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) {
+				path->slots[0]--;
+				if (btrfs_file_extent_end(path) <= cur_offset)
+					path->slots[0]++;
+			}
+		}
+
+		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out_unlock;
+			if (ret > 0) {
+				/* No further leaves: no more extents. */
+				ret = 0;
+				btrfs_release_path(path);
+				break;
+			}
+		}
+
+		leaf = path->nodes[0];
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+			/* Ran past this inode's file extent items. */
+			btrfs_release_path(path);
+			break;
+		}
+
+		extent_end = btrfs_file_extent_end(path);
+		key_offset = key.offset;
+
+		/* Read extent fields before releasing the path. */
+		ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(leaf, ei);
+		compression = btrfs_file_extent_compression(leaf, ei);
+		encryption = btrfs_file_extent_encryption(leaf, ei);
+
+		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+			if (disk_bytenr && compression == BTRFS_COMPRESS_NONE)
+				extent_offset = btrfs_file_extent_offset(leaf, ei);
+		}
+
+		btrfs_release_path(path);
+
+		/* Implicit hole (NO_HOLES feature). */
+		if (prev_extent_end < key_offset) {
+			u64 hole_end = min(key_offset, end_offset);
+			u64 hole_len = hole_end - prev_extent_end;
+
+			if (prev_extent_end >= cur_offset) {
+				entry.offset = prev_extent_end;
+				entry.length = hole_len;
+				entry.type = BTRFS_GET_CSUMS_ZEROED;
+
+				if (buf_used + sizeof(entry) > buf_limit)
+					goto done;
+				if (copy_to_user(ubuf + buf_used, &entry, sizeof(entry))) {
+					ret = -EFAULT;
+					goto out_unlock;
+				}
+				buf_used += sizeof(entry);
+				cur_offset = hole_end;
+			}
+
+			if (key_offset >= end_offset) {
+				cur_offset = end_offset;
+				break;
+			}
+		}
+
+		/* Clamp to our query range. */
+		range_start = max(cur_offset, key_offset);
+		range_len = min(extent_end, end_offset) - range_start;
+
+		entry.offset = range_start;
+		entry.length = range_len;
+
+		if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			entry.type = BTRFS_GET_CSUMS_INLINE;
+			if (compression != BTRFS_COMPRESS_NONE)
+				entry.type |= BTRFS_GET_CSUMS_COMPRESSED;
+			if (encryption != 0)
+				entry.type |= BTRFS_GET_CSUMS_ENCRYPTED;
+			entry_csum_size = 0;
+		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+			entry.type = BTRFS_GET_CSUMS_ZEROED;
+			entry_csum_size = 0;
+		} else {
+			/* BTRFS_FILE_EXTENT_REG */
+			if (disk_bytenr == 0) {
+				/* Explicit hole. */
+				entry.type = BTRFS_GET_CSUMS_ZEROED;
+				entry_csum_size = 0;
+			} else if (encryption != 0 || compression != BTRFS_COMPRESS_NONE) {
+				entry.type = 0;
+				if (encryption != 0)
+					entry.type |= BTRFS_GET_CSUMS_ENCRYPTED;
+				if (compression != BTRFS_COMPRESS_NONE)
+					entry.type |= BTRFS_GET_CSUMS_COMPRESSED;
+				entry_csum_size = 0;
+			} else {
+				entry.type = BTRFS_GET_CSUMS_HAS_CSUMS;
+				entry_csum_size = (range_len >> fs_info->sectorsize_bits) * csum_size;
+			}
+		}
+
+		/* Check if this entry (+ csum data) fits in the buffer. */
+		if (buf_used + sizeof(entry) + entry_csum_size > buf_limit) {
+			/* Nothing emitted yet: the buffer can never make progress. */
+			if (buf_used == 0) {
+				ret = -EOVERFLOW;
+				goto out_unlock;
+			}
+			goto done;
+		}
+
+		if (copy_to_user(ubuf + buf_used, &entry, sizeof(entry))) {
+			ret = -EFAULT;
+			goto out_unlock;
+		}
+		buf_used += sizeof(entry);
+
+		if (entry.type == BTRFS_GET_CSUMS_HAS_CSUMS) {
+			ret = copy_csums_to_user(fs_info,
+					disk_bytenr + extent_offset + (range_start - key_offset),
+					range_len, ubuf + buf_used);
+			if (ret)
+				goto out_unlock;
+			buf_used += entry_csum_size;
+		}
+
+		cur_offset = range_start + range_len;
+		prev_extent_end = extent_end;
+
+		if (fatal_signal_pending(current)) {
+			if (buf_used == 0) {
+				ret = -EINTR;
+				goto out_unlock;
+			}
+			goto done;
+		}
+
+		cond_resched();
+	}
+
+	/* Handle trailing implicit hole. */
+	if (cur_offset < end_offset) {
+		struct btrfs_ioctl_get_csums_entry entry = {
+			.offset = prev_extent_end,
+			.length = end_offset - prev_extent_end,
+			.type = BTRFS_GET_CSUMS_ZEROED,
+		};
+
+		if (buf_used + sizeof(entry) <= buf_limit) {
+			if (copy_to_user(ubuf + buf_used, &entry, sizeof(entry))) {
+				ret = -EFAULT;
+				goto out_unlock;
+			}
+			buf_used += sizeof(entry);
+			cur_offset = end_offset;
+		}
+	}
+
+done:
+	/*
+	 * Every path reaching this label has succeeded (all failures jump to
+	 * out_unlock), but ret may still hold the positive "key not found"
+	 * result of the last btrfs_search_slot(). Reset it so that value is
+	 * never returned to userspace as the ioctl result.
+	 */
+	ret = 0;
+
+	args.offset = cur_offset;
+	args.length = (cur_offset < end_offset) ? end_offset - cur_offset : 0;
+	args.buf_size = buf_used;
+
+	if (copy_to_user(argp, &args, sizeof(args)))
+		ret = -EFAULT;
+
+out_unlock:
+	up_read(&vfs_inode->i_rwsem);
+	return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -5297,6 +5553,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_subvol_sync(fs_info, argp);
case BTRFS_IOC_SHUTDOWN:
return btrfs_ioctl_shutdown(fs_info, arg);
+ case BTRFS_IOC_GET_CSUMS:
+ return btrfs_ioctl_get_csums(file, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e5a24b3ff95e4..f5f77c33cf592 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -157,7 +157,8 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)));
/* Only one type flag can be set. */
- ASSERT(has_single_bit_set(flags & BTRFS_ORDERED_EXCLUSIVE_FLAGS));
+ ASSERT(has_single_bit_set(flags & BTRFS_ORDERED_EXCLUSIVE_FLAGS),
+ "flags=0x%lx", flags);
/* DIRECT cannot be set with COMPRESSED nor ENCODED. */
if (test_bit(BTRFS_ORDERED_DIRECT, &flags)) {
@@ -302,7 +303,7 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
{
struct btrfs_ordered_extent *entry;
- ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
+ ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0, "flags=0x%lx", flags);
/*
* For regular writes, we just use the members in @file_extent.
@@ -1238,7 +1239,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
trace_btrfs_ordered_extent_split(inode, ordered);
- ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));
+ ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)), "flags=0x%lx", flags);
/*
* The entire bio must be covered by the ordered extent, but we can't
@@ -1260,7 +1261,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
}
/* We cannot split partially completed ordered extents. */
if (ordered->bytes_left) {
- ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
+ ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS), "flags=0x%lx", flags);
if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
return ERR_PTR(-EINVAL);
}
@@ -1307,7 +1308,8 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
ordered->ram_bytes -= len;
if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
- ASSERT(ordered->bytes_left == 0);
+ ASSERT(ordered->bytes_left == 0, "ordered->bytes_left=%llu",
+ ordered->bytes_left);
new->bytes_left = 0;
} else {
ordered->bytes_left -= len;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 6838faceb6d5c..502fb4a55cb22 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -373,10 +373,9 @@ static bool squota_check_parent_usage(struct btrfs_fs_info *fs_info, struct btrf
parent->excl_cmpr != excl_cmpr_sum || parent->rfer_cmpr != rfer_cmpr_sum);
WARN(mismatch,
- "parent squota qgroup %hu/%llu has mismatched usage from its %d members. "
+ "parent squota qgroup " BTRFS_QGROUP_FMT " has mismatched usage from its %d members. "
"%llu %llu %llu %llu vs %llu %llu %llu %llu\n",
- btrfs_qgroup_level(parent->qgroupid),
- btrfs_qgroup_subvolid(parent->qgroupid), nr_members, parent->excl,
+ BTRFS_QGROUP_FMT_VALUE(parent), nr_members, parent->excl,
parent->rfer, parent->excl_cmpr, parent->rfer_cmpr, excl_sum,
rfer_sum, excl_cmpr_sum, rfer_cmpr_sum);
return mismatch;
@@ -652,9 +651,8 @@ bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info)
if (qgroup->rsv.values[i]) {
ret = true;
btrfs_warn(fs_info,
- "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
- btrfs_qgroup_level(qgroup->qgroupid),
- btrfs_qgroup_subvolid(qgroup->qgroupid),
+ "qgroup " BTRFS_QGROUP_FMT " has unreleased space, type %d rsv %llu",
+ BTRFS_QGROUP_FMT_VALUE(qgroup),
i, qgroup->rsv.values[i]);
}
}
@@ -1858,14 +1856,13 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
* Thus its reserved space should all be zero, no matter if qgroup
* is consistent or the mode.
*/
- if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
- qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
- qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) {
+ if (unlikely(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS])) {
DEBUG_WARN();
btrfs_warn_rl(fs_info,
-"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu",
- btrfs_qgroup_level(qgroup->qgroupid),
- btrfs_qgroup_subvolid(qgroup->qgroupid),
+"to be deleted qgroup " BTRFS_QGROUP_FMT " has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu",
+ BTRFS_QGROUP_FMT_VALUE(qgroup),
qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA],
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC],
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
@@ -1879,13 +1876,12 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
*/
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
- if (qgroup->rfer || qgroup->excl ||
- qgroup->rfer_cmpr || qgroup->excl_cmpr) {
+ if (unlikely(qgroup->rfer || qgroup->excl ||
+ qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
DEBUG_WARN();
qgroup_mark_inconsistent(fs_info,
- "to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
- btrfs_qgroup_level(qgroup->qgroupid),
- btrfs_qgroup_subvolid(qgroup->qgroupid),
+"to be deleted qgroup " BTRFS_QGROUP_FMT " has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
+ BTRFS_QGROUP_FMT_VALUE(qgroup),
qgroup->rfer, qgroup->rfer_cmpr,
qgroup->excl, qgroup->excl_cmpr);
}
@@ -4822,9 +4818,9 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
entry = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
- if (entry->subvol_generation != block->subvol_generation ||
- entry->reloc_bytenr != block->reloc_bytenr ||
- entry->reloc_generation != block->reloc_generation) {
+ if (unlikely(entry->subvol_generation != block->subvol_generation ||
+ entry->reloc_bytenr != block->reloc_bytenr ||
+ entry->reloc_generation != block->reloc_generation)) {
/*
* Duplicated but mismatch entry found. Shouldn't happen.
* Marking qgroup inconsistent should be enough for end
@@ -4971,9 +4967,8 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
ASSERT(qg->excl == qg->rfer);
if (WARN_ON_ONCE(sign < 0 && qg->excl < num_bytes)) {
btrfs_warn(fs_info,
- "squota underflow qg %hu/%llu excl %llu num_bytes %llu",
- btrfs_qgroup_level(qg->qgroupid),
- btrfs_qgroup_subvolid(qg->qgroupid),
+ "squota underflow qg " BTRFS_QGROUP_FMT " excl %llu num_bytes %llu",
+ BTRFS_QGROUP_FMT_VALUE(qg),
qg->excl, num_bytes);
qg->excl = 0;
qg->rfer = 0;
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 4b0186c83ad1d..454a95bf542a0 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -272,7 +272,9 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
&key,
key.offset - length,
length);
- ASSERT(key.offset - diff_end == length);
+ ASSERT(key.offset - diff_end == length,
+ "key.offset=%llu diff_end=%llu length=%llu",
+ key.offset, diff_end, length);
break;
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 49865a4637809..0a4628b3007df 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -69,7 +69,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
struct address_space *mapping = inode->vfs_inode.i_mapping;
int ret;
- ASSERT(IS_ALIGNED(file_offset, block_size));
+ ASSERT(IS_ALIGNED(file_offset, block_size), "file_offset=%llu block_size=%u",
+ file_offset, block_size);
/*
* We have flushed and locked the ranges of the source and destination
@@ -94,9 +95,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (ret < 0)
goto out_unlock;
- btrfs_clear_extent_bit(&inode->io_tree, file_offset, range_end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL);
- ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
+ ret = btrfs_reset_extent_delalloc(inode, file_offset, range_end, 0, NULL);
if (ret)
goto out_unlock;
@@ -141,7 +140,6 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
folio_zero_range(folio, datal, block_size - datal);
btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size);
- btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size);
btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size);
out_unlock:
if (!IS_ERR(folio)) {
@@ -459,7 +457,7 @@ process_slot:
key.objectid != btrfs_ino(BTRFS_I(src)))
break;
- ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
+ ASSERT(key.type == BTRFS_EXTENT_DATA_KEY, "key.type=%u", key.type);
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 3ebaf5880125f..955e338dcfd89 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -407,7 +407,7 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
struct reloc_control *rc, struct btrfs_key *node_key,
int level, u64 bytenr)
{
- struct btrfs_backref_iter *iter;
+ struct btrfs_backref_iter iter;
struct btrfs_backref_cache *cache = &rc->backref_cache;
/* For searching parent of TREE_BLOCK_REF */
struct btrfs_path *path;
@@ -416,9 +416,9 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
struct btrfs_backref_edge *edge;
int ret;
- iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
- if (!iter)
- return ERR_PTR(-ENOMEM);
+ ret = btrfs_backref_iter_init(&iter);
+ if (ret < 0)
+ return ERR_PTR(ret);
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -435,7 +435,7 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
/* Breadth-first search to build backref cache */
do {
- ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
+ ret = btrfs_backref_add_tree_node(trans, cache, path, &iter,
node_key, cur);
if (ret < 0)
goto out;
@@ -460,8 +460,7 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
if (handle_useless_nodes(rc, node))
node = NULL;
out:
- btrfs_free_path(iter->path);
- kfree(iter);
+ btrfs_free_path(iter.path);
btrfs_free_path(path);
if (ret) {
btrfs_backref_error_cleanup(cache, node);
@@ -590,7 +589,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_key root_key;
int ret = 0;
- root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
+ root_item = kmalloc_obj(*root_item, GFP_NOFS);
if (!root_item)
return ERR_PTR(-ENOMEM);
@@ -814,6 +813,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
u64 bytenr, u64 num_bytes)
{
struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
BTRFS_PATH_AUTO_FREE(path);
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
@@ -835,10 +835,23 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
- btrfs_file_extent_compression(leaf, fi) ||
- btrfs_file_extent_encryption(leaf, fi) ||
- btrfs_file_extent_other_encoding(leaf, fi));
+ /*
+ * The cluster-boundary key searched above is always written by
+ * relocation with offset 0: either by insert_prealloc_file_extent()
+ * (memsets the stack item to 0) or by the front portion of a partial
+ * writeback (offset=0 by construction). A non-zero value here means
+ * the on-disk leaf does not match what relocation wrote, i.e.
+ * corruption. The other encoding fields are caught earlier by
+ * tree-checker's check_extent_data_item().
+ */
+ if (unlikely(btrfs_file_extent_offset(leaf, fi))) {
+ btrfs_print_leaf(leaf);
+ btrfs_err(fs_info,
+"unexpected non-zero offset in file extent item for data reloc inode %llu key offset %llu offset %llu",
+ btrfs_ino(BTRFS_I(reloc_inode)), bytenr,
+ btrfs_file_extent_offset(leaf, fi));
+ return -EUCLEAN;
+ }
if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi))
return -EINVAL;
@@ -2944,7 +2957,7 @@ static int relocate_file_extent_cluster(struct reloc_control *rc)
if (!cluster->nr)
return 0;
- ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ ra = kzalloc_obj(*ra, GFP_NOFS);
if (!ra)
return -ENOMEM;
@@ -3863,7 +3876,7 @@ static int add_remap_tree_entries(struct btrfs_trans_handle *trans, struct btrfs
max_items = BTRFS_LEAF_DATA_SIZE(trans->fs_info) / sizeof(struct btrfs_item);
- data_sizes = kzalloc(sizeof(u32) * min_t(u32, num_entries, max_items), GFP_NOFS);
+ data_sizes = kzalloc_objs(u32, min_t(u32, num_entries, max_items), GFP_NOFS);
if (!data_sizes)
return -ENOMEM;
@@ -4454,7 +4467,7 @@ static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
- space_runs = kmalloc(sizeof(*space_runs) * extent_count, GFP_NOFS);
+ space_runs = kmalloc_objs(*space_runs, extent_count, GFP_NOFS);
if (!space_runs) {
mutex_unlock(&bg->free_space_lock);
return -ENOMEM;
@@ -4543,7 +4556,7 @@ static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
mutex_unlock(&bg->free_space_lock);
max_entries = extent_count + 2;
- entries = kmalloc(sizeof(*entries) * max_entries, GFP_NOFS);
+ entries = kmalloc_objs(*entries, max_entries, GFP_NOFS);
if (!entries) {
ret = -ENOMEM;
goto out;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 89d72d8cb85fd..c36b741dbe6de 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -17,6 +17,7 @@
#include <linux/compat.h>
#include <linux/crc32c.h>
#include <linux/fsverity.h>
+#include <linux/cleanup.h>
#include "send.h"
#include "ctree.h"
#include "backref.h"
@@ -72,6 +73,8 @@ struct fs_path {
#define FS_PATH_INLINE_SIZE \
sizeof_field(struct fs_path, inline_buf)
+static void fs_path_free(struct fs_path *p);
+DEFINE_FREE(fs_path_free, struct fs_path *, fs_path_free(_T))
/* reused for each extent */
struct clone_root {
@@ -981,7 +984,7 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_inode_ref *iref;
struct btrfs_inode_extref *extref;
BTRFS_PATH_AUTO_FREE(tmp_path);
- struct fs_path *p;
+ struct fs_path *p __free(fs_path_free) = NULL;
u32 cur = 0;
u32 total;
int slot = path->slots[0];
@@ -998,11 +1001,8 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
return -ENOMEM;
tmp_path = alloc_path_for_send();
- if (!tmp_path) {
- fs_path_free(p);
+ if (!tmp_path)
return -ENOMEM;
- }
-
if (found_key->type == BTRFS_INODE_REF_KEY) {
ptr = (unsigned long)btrfs_item_ptr(eb, slot,
@@ -1034,30 +1034,27 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
start = btrfs_ref_to_path(root, tmp_path, name_len,
name_off, eb, dir,
p->buf, p->buf_len);
- if (IS_ERR(start)) {
- ret = PTR_ERR(start);
- goto out;
- }
+ if (IS_ERR(start))
+ return PTR_ERR(start);
+
if (start < p->buf) {
/* overflow , try again with larger buffer */
ret = fs_path_ensure_buf(p,
p->buf_len + p->buf - start);
if (ret < 0)
- goto out;
+ return ret;
start = btrfs_ref_to_path(root, tmp_path,
name_len, name_off,
eb, dir,
p->buf, p->buf_len);
- if (IS_ERR(start)) {
- ret = PTR_ERR(start);
- goto out;
- }
+ if (IS_ERR(start))
+ return PTR_ERR(start);
+
if (unlikely(start < p->buf)) {
btrfs_err(root->fs_info,
"send: path ref buffer underflow for key " BTRFS_KEY_FMT,
BTRFS_KEY_FMT_VALUE(found_key));
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
}
p->start = start;
@@ -1065,17 +1062,15 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
ret = fs_path_add_from_extent_buffer(p, eb, name_off,
name_len);
if (ret < 0)
- goto out;
+ return ret;
}
cur += elem_size + name_len;
ret = iterate(dir, p, ctx);
if (ret)
- goto out;
+ return ret;
}
-out:
- fs_path_free(p);
return ret;
}
@@ -2028,7 +2023,7 @@ static int is_first_ref(struct btrfs_root *root,
const char *name, int name_len)
{
int ret;
- struct fs_path *tmp_name;
+ struct fs_path *tmp_name __free(fs_path_free) = NULL;
u64 tmp_dir;
tmp_name = fs_path_alloc();
@@ -2037,17 +2032,13 @@ static int is_first_ref(struct btrfs_root *root,
ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
if (ret < 0)
- goto out;
+ return ret;
- if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
- ret = 0;
- goto out;
- }
+ if (dir != tmp_dir || name_len != fs_path_len(tmp_name))
+ return 0;
ret = !memcmp(tmp_name->start, name, name_len);
-out:
- fs_path_free(tmp_name);
return ret;
}
@@ -2196,13 +2187,13 @@ static int did_overwrite_ref(struct send_ctx *sctx,
*/
static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
{
- int ret = 0;
- struct fs_path *name = NULL;
+ int ret;
+ struct fs_path *name __free(fs_path_free) = NULL;
u64 dir;
u64 dir_gen;
if (!sctx->parent_root)
- goto out;
+ return 0;
name = fs_path_alloc();
if (!name)
@@ -2210,14 +2201,10 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
if (ret < 0)
- goto out;
-
- ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
- name->start, fs_path_len(name));
+ return ret;
-out:
- fs_path_free(name);
- return ret;
+ return did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
+ name->start, fs_path_len(name));
}
static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
@@ -2375,7 +2362,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
struct fs_path *dest)
{
int ret = 0;
- struct fs_path *name = NULL;
+ struct fs_path *name __free(fs_path_free) = NULL;
u64 parent_inode = 0;
u64 parent_gen = 0;
int stop = 0;
@@ -2389,10 +2376,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
}
name = fs_path_alloc();
- if (!name) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!name)
+ return -ENOMEM;
dest->reversed = 1;
fs_path_reset(dest);
@@ -2437,7 +2422,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
}
out:
- fs_path_free(name);
if (!ret) {
fs_path_unreverse(dest);
if (is_cur_inode && dest != &sctx->cur_inode_path)
@@ -2787,7 +2771,7 @@ static int trim_dir_utimes_cache(struct send_ctx *sctx)
static int send_create_inode(struct send_ctx *sctx, u64 ino)
{
int ret = 0;
- struct fs_path *p;
+ struct fs_path *p __free(fs_path_free) = NULL;
int cmd;
struct btrfs_inode_info info;
u64 gen;
@@ -2801,7 +2785,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
if (ino != sctx->cur_ino) {
ret = get_inode_info(sctx->send_root, ino, &info);
if (ret < 0)
- goto out;
+ return ret;
gen = info.gen;
mode = info.mode;
rdev = info.rdev;
@@ -2826,17 +2810,16 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
} else {
btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
(int)(mode & S_IFMT));
- ret = -EOPNOTSUPP;
- goto out;
+ return -EOPNOTSUPP;
}
ret = begin_cmd(sctx, cmd);
if (ret < 0)
- goto out;
+ return ret;
ret = gen_unique_name(sctx, ino, gen, p);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
@@ -2845,7 +2828,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
fs_path_reset(p);
ret = read_symlink(sctx->send_root, ino, p);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
S_ISFIFO(mode) || S_ISSOCK(mode)) {
@@ -2855,12 +2838,9 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
ret = send_cmd(sctx);
if (ret < 0)
- goto out;
-
+ return ret;
tlv_put_failure:
-out:
- fs_path_free(p);
return ret;
}
@@ -3039,7 +3019,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
struct fs_path *path)
{
int ret;
- struct fs_path *orphan;
+ struct fs_path *orphan __free(fs_path_free) = NULL;
orphan = fs_path_alloc();
if (!orphan)
@@ -3047,17 +3027,15 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
ret = gen_unique_name(sctx, ino, gen, orphan);
if (ret < 0)
- goto out;
+ return ret;
ret = send_rename(sctx, path, orphan);
if (ret < 0)
- goto out;
+ return ret;
if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen)
ret = fs_path_copy(&sctx->cur_inode_path, orphan);
-out:
- fs_path_free(orphan);
return ret;
}
@@ -3467,9 +3445,9 @@ static int path_loop(struct send_ctx *sctx, struct fs_path *name,
static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
{
- struct fs_path *from_path = NULL;
- struct fs_path *to_path = NULL;
- struct fs_path *name = NULL;
+ struct fs_path *from_path __free(fs_path_free) = NULL;
+ struct fs_path *to_path __free(fs_path_free) = NULL;
+ struct fs_path *name __free(fs_path_free) = NULL;
u64 orig_progress = sctx->send_progress;
struct recorded_ref *cur;
u64 parent_ino, parent_gen;
@@ -3482,10 +3460,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
name = fs_path_alloc();
from_path = fs_path_alloc();
- if (!name || !from_path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!name || !from_path)
+ return -ENOMEM;
dm = get_waiting_dir_move(sctx, pm->ino);
ASSERT(dm);
@@ -3599,9 +3575,6 @@ finish:
}
out:
- fs_path_free(name);
- fs_path_free(from_path);
- fs_path_free(to_path);
sctx->send_progress = orig_progress;
return ret;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index f0436eea15445..e6641597b321e 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1411,6 +1411,13 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* This is where we reclaim all of the pinned space generated by running the
* iputs
*
+ * RECLAIM_ZONES
+ * This state only works for the zoned mode. We scan the block groups in the
+ * reclaim_bgs_list and check if we can relocate them. If so, perform the
+ * relocation to garbage collect the zone. On each of these runs
+ * BTRFS_ZONED_SYNC_RECLAIM_BATCH (5) block-groups will be reclaimed, after all
+ * unused block-groups have been deleted.
+ *
* RESET_ZONES
* This state works only for the zoned mode. We scan the unused block group
* list and reset the zones and reuse the block group.
@@ -1698,6 +1705,7 @@ static int handle_reserve_ticket(struct btrfs_space_info *space_info,
ARRAY_SIZE(evict_flush_states));
break;
case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
+ case BTRFS_RESERVE_FLUSH_ZONED_RELOCATION:
priority_reclaim_data_space(space_info, ticket);
break;
default:
@@ -1961,6 +1969,7 @@ int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes,
ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
+ flush == BTRFS_RESERVE_FLUSH_ZONED_RELOCATION ||
flush == BTRFS_RESERVE_NO_FLUSH, "flush=%d", flush);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA,
"current->journal_info=0x%lx flush=%d",
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 24f45072ca4b1..aa836e8a9d4a6 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -78,6 +78,17 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_ALL_STEAL,
/*
+ * This is for relocation on zoned filesystems only. We need to use
+ * priority flushing for this, because otherwise we can deadlock
+ * waiting for a ticket that cannot be granted, because we cannot do
+ * any allocations.
+ *
+ * Apart from being specific to zoned relocation, it is equal to
+ * BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE.
+ */
+ BTRFS_RESERVE_FLUSH_ZONED_RELOCATION,
+
+ /*
* This is for btrfs_use_block_rsv only. We have exhausted our block
* rsv and our global block rsv. This can happen for things like
* delalloc where we are overwriting a lot of extents with a single
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index f82e71f5d88b6..56060acac2e9c 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -10,41 +10,13 @@
*
* Limitations:
*
- * - Only support 64K page size for now
- * This is to make metadata handling easier, as 64K page would ensure
- * all nodesize would fit inside one page, thus we don't need to handle
- * cases where a tree block crosses several pages.
+ * - Metadata must be fully aligned to node size
+ * So when nodesize <= page size, the metadata can never cross folio boundaries.
*
- * - Only metadata read-write for now
- * The data read-write part is in development.
- *
- * - Metadata can't cross 64K page boundary
- * btrfs-progs and kernel have done that for a while, thus only ancient
- * filesystems could have such problem. For such case, do a graceful
- * rejection.
- *
- * Special behavior:
- *
- * - Metadata
- * Metadata read is fully supported.
- * Meaning when reading one tree block will only trigger the read for the
- * needed range, other unrelated range in the same page will not be touched.
- *
- * Metadata write support is partial.
- * The writeback is still for the full page, but we will only submit
- * the dirty extent buffers in the page.
- *
- * This means, if we have a metadata page like this:
- *
- * Page offset
- * 0 16K 32K 48K 64K
- * |/////////| |///////////|
- * \- Tree block A \- Tree block B
- *
- * Even if we just want to writeback tree block A, we will also writeback
- * tree block B if it's also dirty.
- *
- * This may cause extra metadata writeback which results more COW.
+ * - Only support blocks per folio <= min(BTRFS_MAX_FOLIO_SIZE / fs block size,
+ * BTRFS_MAX_BLOCKS_PER_FOLIO)
+ * This is to ensure we can afford an on-stack bitmap, without the need to allocate
+ * bitmap memory at runtime.
*
* Implementation:
*
@@ -224,11 +196,8 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
struct btrfs_folio_state *bfs = folio_get_private(folio);
- const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = (len >> fs_info->sectorsize_bits);
unsigned long flags;
- unsigned int cleared = 0;
- int bit = start_bit;
bool last;
btrfs_subpage_assert(fs_info, folio, start, len);
@@ -245,15 +214,10 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
spin_unlock_irqrestore(&bfs->lock, flags);
return true;
}
-
- for_each_set_bit_from(bit, bfs->bitmaps, start_bit + nbits) {
- clear_bit(bit, bfs->bitmaps);
- cleared++;
- }
- ASSERT(atomic_read(&bfs->nr_locked) >= cleared,
- "atomic_read(&bfs->nr_locked)=%d cleared=%d",
- atomic_read(&bfs->nr_locked), cleared);
- last = atomic_sub_and_test(cleared, &bfs->nr_locked);
+ ASSERT(atomic_read(&bfs->nr_locked) >= nbits,
+ "atomic_read(&bfs->nr_locked)=%d nbits=%d",
+ atomic_read(&bfs->nr_locked), nbits);
+ last = atomic_sub_and_test(nbits, &bfs->nr_locked);
spin_unlock_irqrestore(&bfs->lock, flags);
return last;
}
@@ -305,15 +269,13 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
}
void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
- struct folio *folio, unsigned long bitmap)
+ struct folio *folio, unsigned long *bitmap)
{
struct btrfs_folio_state *bfs = folio_get_private(folio);
const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
- const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked;
+ const unsigned int nbits = bitmap_weight(bitmap, blocks_per_folio);
unsigned long flags;
bool last = false;
- int cleared = 0;
- int bit;
if (!btrfs_is_subpage(fs_info, folio)) {
folio_unlock(folio);
@@ -327,14 +289,10 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
}
spin_lock_irqsave(&bfs->lock, flags);
- for_each_set_bit(bit, &bitmap, blocks_per_folio) {
- if (test_and_clear_bit(bit + start_bit, bfs->bitmaps))
- cleared++;
- }
- ASSERT(atomic_read(&bfs->nr_locked) >= cleared,
- "atomic_read(&bfs->nr_locked)=%d cleared=%d",
- atomic_read(&bfs->nr_locked), cleared);
- last = atomic_sub_and_test(cleared, &bfs->nr_locked);
+ ASSERT(atomic_read(&bfs->nr_locked) >= nbits,
+ "atomic_read(&bfs->nr_locked)=%d nbits=%d",
+ atomic_read(&bfs->nr_locked), nbits);
+ last = atomic_sub_and_test(nbits, &bfs->nr_locked);
spin_unlock_irqrestore(&bfs->lock, flags);
if (last)
folio_unlock(folio);
@@ -479,64 +437,6 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
spin_unlock_irqrestore(&bfs->lock, flags);
}
-void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- struct btrfs_folio_state *bfs = folio_get_private(folio);
- unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
- ordered, start, len);
- unsigned long flags;
-
- spin_lock_irqsave(&bfs->lock, flags);
- bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- folio_set_ordered(folio);
- spin_unlock_irqrestore(&bfs->lock, flags);
-}
-
-void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- struct btrfs_folio_state *bfs = folio_get_private(folio);
- unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
- ordered, start, len);
- unsigned long flags;
-
- spin_lock_irqsave(&bfs->lock, flags);
- bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_zero(fs_info, folio, ordered))
- folio_clear_ordered(folio);
- spin_unlock_irqrestore(&bfs->lock, flags);
-}
-
-void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- struct btrfs_folio_state *bfs = folio_get_private(folio);
- unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
- checked, start, len);
- unsigned long flags;
-
- spin_lock_irqsave(&bfs->lock, flags);
- bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_set(fs_info, folio, checked))
- folio_set_checked(folio);
- spin_unlock_irqrestore(&bfs->lock, flags);
-}
-
-void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- struct btrfs_folio_state *bfs = folio_get_private(folio);
- unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
- checked, start, len);
- unsigned long flags;
-
- spin_lock_irqsave(&bfs->lock, flags);
- bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- folio_clear_checked(folio);
- spin_unlock_irqrestore(&bfs->lock, flags);
-}
-
/*
* Unlike set/clear which is dependent on each page status, for test all bits
* are tested in the same way.
@@ -560,8 +460,6 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
-IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
-IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
/*
* Note that, in selftests (extent-io-tests), we can have empty fs_info passed
@@ -657,30 +555,55 @@ IMPLEMENT_BTRFS_PAGE_OPS(dirty, folio_mark_dirty, folio_clear_dirty_for_io,
folio_test_dirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, folio_start_writeback, folio_end_writeback,
folio_test_writeback);
-IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,
- folio_test_ordered);
-IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
- folio_test_checked);
-#define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \
-{ \
- const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
- const struct btrfs_folio_state *__bfs = folio_get_private(folio); \
- \
- ASSERT(__bpf <= BITS_PER_LONG); \
- *dst = bitmap_read(__bfs->bitmaps, \
- __bpf * btrfs_bitmap_nr_##name, __bpf); \
+#define DEFINE_GET_SUBPAGE_BITMAP(name) \
+static inline unsigned long get_bitmap_value_##name( \
+ const struct btrfs_fs_info *fs_info, \
+ struct folio *folio) \
+{ \
+ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
+ const struct btrfs_folio_state *__bfs = folio_get_private(folio); \
+ unsigned long value; \
+ \
+ ASSERT(__bpf <= BITS_PER_LONG); \
+ value = bitmap_read(__bfs->bitmaps, __bpf * btrfs_bitmap_nr_##name, \
+ __bpf); \
+ return value; \
+} \
+static inline const unsigned long *get_bitmap_pointer_##name( \
+ const struct btrfs_fs_info *fs_info, \
+ struct folio *folio) \
+{ \
+ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
+ struct btrfs_folio_state *__bfs = folio_get_private(folio); \
+ unsigned long *pointer; \
+ \
+ ASSERT(__bpf >= BITS_PER_LONG); \
+ ASSERT(IS_ALIGNED(__bpf, BITS_PER_LONG)); \
+ pointer = __bfs->bitmaps + (BIT_WORD(__bpf) * btrfs_bitmap_nr_##name); \
+ return pointer; \
}
-#define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \
-{ \
- unsigned long bitmap; \
- const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
- \
- GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \
- btrfs_warn(fs_info, \
- "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
- start, len, folio_pos(folio), __bpf, &bitmap); \
+DEFINE_GET_SUBPAGE_BITMAP(uptodate);
+DEFINE_GET_SUBPAGE_BITMAP(dirty);
+DEFINE_GET_SUBPAGE_BITMAP(writeback);
+
+#define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \
+{ \
+ const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \
+ \
+ if (__bpf <= BITS_PER_LONG) { \
+ unsigned long bitmap = get_bitmap_value_##name(fs_info, folio); \
+ \
+ btrfs_warn(fs_info, \
+ "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
+ start, len, folio_pos(folio), __bpf, &bitmap); \
+ } else { \
+ btrfs_warn(fs_info, \
+ "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
+ start, len, folio_pos(folio), __bpf, \
+ get_bitmap_pointer_##name(fs_info, folio)); \
+ } \
}
/*
@@ -728,7 +651,6 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
{
struct btrfs_folio_state *bfs;
unsigned long flags;
- unsigned int start_bit;
unsigned int nbits;
int ret;
@@ -737,15 +659,8 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
return;
bfs = folio_get_private(folio);
- start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
nbits = len >> fs_info->sectorsize_bits;
spin_lock_irqsave(&bfs->lock, flags);
- /* Target range should not yet be locked. */
- if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) {
- SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len);
- ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits));
- }
- bitmap_set(bfs->bitmaps, start_bit, nbits);
ret = atomic_add_return(nbits, &bfs->nr_locked);
ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio));
spin_unlock_irqrestore(&bfs->lock, flags);
@@ -778,51 +693,74 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
{
struct btrfs_folio_state *bfs;
const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
- unsigned long uptodate_bitmap;
- unsigned long dirty_bitmap;
- unsigned long writeback_bitmap;
- unsigned long ordered_bitmap;
- unsigned long checked_bitmap;
- unsigned long locked_bitmap;
unsigned long flags;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(blocks_per_folio > 1);
bfs = folio_get_private(folio);
- spin_lock_irqsave(&bfs->lock, flags);
- GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap);
- GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap);
- GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap);
- GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap);
- GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap);
- GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap);
- spin_unlock_irqrestore(&bfs->lock, flags);
-
dump_page(folio_page(folio, 0), "btrfs folio state dump");
+
+ if (blocks_per_folio <= BITS_PER_LONG) {
+ unsigned long uptodate;
+ unsigned long dirty;
+ unsigned long writeback;
+
+ spin_lock_irqsave(&bfs->lock, flags);
+ uptodate = get_bitmap_value_uptodate(fs_info, folio);
+ dirty = get_bitmap_value_dirty(fs_info, folio);
+ writeback = get_bitmap_value_writeback(fs_info, folio);
+
+ spin_unlock_irqrestore(&bfs->lock, flags);
+
+ btrfs_warn(fs_info,
+"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl",
+ start, len, folio_pos(folio),
+ blocks_per_folio, &uptodate,
+ blocks_per_folio, &dirty,
+ blocks_per_folio, &writeback);
+ return;
+ }
+
+ spin_lock_irqsave(&bfs->lock, flags);
btrfs_warn(fs_info,
-"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
+"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl",
start, len, folio_pos(folio),
- blocks_per_folio, &uptodate_bitmap,
- blocks_per_folio, &dirty_bitmap,
- blocks_per_folio, &locked_bitmap,
- blocks_per_folio, &writeback_bitmap,
- blocks_per_folio, &ordered_bitmap,
- blocks_per_folio, &checked_bitmap);
+ blocks_per_folio, get_bitmap_pointer_uptodate(fs_info, folio),
+ blocks_per_folio, get_bitmap_pointer_dirty(fs_info, folio),
+ blocks_per_folio, get_bitmap_pointer_writeback(fs_info, folio));
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
-void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
- struct folio *folio,
- unsigned long *ret_bitmap)
+void btrfs_copy_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
+ struct folio *folio,
+ unsigned long *dst)
{
struct btrfs_folio_state *bfs;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
unsigned long flags;
+ unsigned long value;
+
+ if (blocks_per_folio == 1) {
+ value = 1;
+ bitmap_copy(dst, &value, 1);
+ return;
+ }
ASSERT(folio_test_private(folio) && folio_get_private(folio));
- ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1);
+ ASSERT(blocks_per_folio > 1);
bfs = folio_get_private(folio);
+ if (blocks_per_folio <= BITS_PER_LONG) {
+ spin_lock_irqsave(&bfs->lock, flags);
+ value = bitmap_read(bfs->bitmaps, btrfs_bitmap_nr_dirty * blocks_per_folio,
+ blocks_per_folio);
+ spin_unlock_irqrestore(&bfs->lock, flags);
+ bitmap_copy(dst, &value, blocks_per_folio);
+ return;
+ }
spin_lock_irqsave(&bfs->lock, flags);
- GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap);
+ bitmap_copy(dst, get_bitmap_pointer_dirty(fs_info, folio),
+ blocks_per_folio);
spin_unlock_irqrestore(&bfs->lock, flags);
}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index d81a0ade559fd..c6d7394e6418a 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -14,15 +14,15 @@ struct folio;
/*
* Extra info for subpage bitmap.
*
- * For subpage we pack all uptodate/dirty/writeback/ordered bitmaps into
+ * For subpage we pack all uptodate/dirty/writeback bitmaps into
* one larger bitmap.
*
* This structure records how they are organized in the bitmap:
*
- * /- uptodate /- dirty /- ordered
+ * /- uptodate /- dirty /- writeback
* | | |
* v v v
- * |u|u|u|u|........|u|u|d|d|.......|d|d|o|o|.......|o|o|
+ * |u|u|u|u|........|u|u|d|d|.......|d|d|w|w|.......|w|w|
* |< sectors_per_page >|
*
* Unlike regular macro-like enums, here we do not go upper-case names, as
@@ -40,23 +40,6 @@ enum {
*/
btrfs_bitmap_nr_writeback,
- /*
- * The ordered and checked flags are for COW fixup, already marked
- * deprecated, and will be removed eventually.
- */
- btrfs_bitmap_nr_ordered,
- btrfs_bitmap_nr_checked,
-
- /*
- * The locked bit is for async delalloc range (compression), currently
- * async extent is queued with the range locked, until the compression
- * is done.
- * So an async extent can unlock the range at any random timing.
- *
- * This will need a rework on the async extent lifespan (mark writeback
- * and do compression) before deprecating this flag.
- */
- btrfs_bitmap_nr_locked,
btrfs_bitmap_nr_max
};
@@ -133,7 +116,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
- struct folio *folio, unsigned long bitmap);
+ struct folio *folio, unsigned long *bitmap);
/*
* Template for subpage related operations.
*
@@ -181,8 +164,6 @@ bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffe
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
DECLARE_BTRFS_SUBPAGE_OPS(writeback);
-DECLARE_BTRFS_SUBPAGE_OPS(ordered);
-DECLARE_BTRFS_SUBPAGE_OPS(checked);
/*
* Helper for error cleanup, where a folio will have its dirty flag cleared,
@@ -203,9 +184,9 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb);
-void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
- struct folio *folio,
- unsigned long *ret_bitmap);
+void btrfs_copy_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
+ struct folio *folio,
+ unsigned long *dst);
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 64514d600eec7..1a5d1c126dfd5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -60,6 +60,7 @@
#include "verity.h"
#include "super.h"
#include "extent-tree.h"
+#include "tree-log.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -1633,8 +1634,7 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
}
}
- devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
- GFP_KERNEL);
+ devices_info = kmalloc_objs(*devices_info, nr_devices);
if (!devices_info)
return -ENOMEM;
@@ -1732,15 +1732,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 total_free_data = 0;
u64 total_free_meta = 0;
u32 bits = fs_info->sectorsize_bits;
- __be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;
+ __be32 *fsid;
unsigned factor = 1;
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
u64 thresh = 0;
int mixed = 0;
+ __kernel_fsid_t f_fsid;
list_for_each_entry(found, &fs_info->space_info, list) {
- if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
+ if (found->flags & BTRFS_BLOCK_GROUP_DATA &&
+ found->subgroup_id != BTRFS_SUB_GROUP_DATA_RELOC) {
int i;
total_free_data += found->disk_total - found->disk_used;
@@ -1818,14 +1820,38 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bsize = fs_info->sectorsize;
buf->f_namelen = BTRFS_NAME_LEN;
- /* We treat it as constant endianness (it doesn't matter _which_)
- because we want the fsid to come out the same whether mounted
- on a big-endian or little-endian host */
- buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
- buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
+ /*
+ * fs_devices->fsid is dynamically generated when temp_fsid is active
+ * to support cloned filesystems. Use the original on-disk fsid instead,
+ * as it remains consistent across mount cycles.
+ */
+ if (fs_info->fs_devices->temp_fsid)
+ fsid = (__be32 *)fs_info->super_copy->fsid;
+ else
+ fsid = (__be32 *)fs_info->fs_devices->fsid;
+
+ /*
+ * We treat it as constant endianness (it doesn't matter _which_)
+ * because we want the fsid to come out the same whether mounted
+ * on a big-endian or little-endian host.
+ */
+ f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
+ f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
+
/* Mask in the root object ID too, to disambiguate subvols */
- buf->f_fsid.val[0] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root) >> 32;
- buf->f_fsid.val[1] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root);
+ f_fsid.val[0] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root) >> 32;
+ f_fsid.val[1] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root);
+
+ /* Hash dev_t to avoid f_fsid collision with cloned filesystems. */
+ if (fs_info->fs_devices->total_devices == 1) {
+ __kernel_fsid_t dev_fsid =
+ u64_to_fsid(huge_encode_dev(fs_info->fs_devices->latest_dev->bdev->bd_dev));
+
+ f_fsid.val[0] ^= dev_fsid.val[1];
+ f_fsid.val[1] ^= dev_fsid.val[0];
+ }
+
+ memcpy(&buf->f_fsid, &f_fsid, sizeof(f_fsid));
return 0;
}
@@ -2606,6 +2632,9 @@ static const struct init_sequence mod_init_seq[] = {
.init_func = btrfs_init_compress,
.exit_func = btrfs_exit_compress,
}, {
+ .init_func = btrfs_init_block_group,
+ .exit_func = btrfs_exit_block_group,
+ }, {
.init_func = btrfs_init_cachep,
.exit_func = btrfs_destroy_cachep,
}, {
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 19c127ac6d10c..6287d940323d6 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -145,6 +145,7 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
fs_info->csum_size = 4;
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) /
fs_info->csum_size;
+ fs_info->use_bitmap = btrfs_use_bitmap;
set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
test_mnt->mnt_sb->s_fs_info = fs_info;
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index ebf68fcd2149d..0425b3b68716a 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -398,10 +398,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
int ret;
u64 offset;
u64 max_extent_size;
- const struct btrfs_free_space_op test_free_space_ops = {
- .use_bitmap = test_use_bitmap,
- };
- const struct btrfs_free_space_op *orig_free_space_ops;
+ bool (*orig_use_bitmap)(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info);
test_msg("running space stealing from bitmap to extent tests");
@@ -423,8 +421,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
* that forces use of bitmaps as soon as we have at least 1
* extent entry.
*/
- orig_free_space_ops = cache->free_space_ctl->op;
- cache->free_space_ctl->op = &test_free_space_ops;
+ orig_use_bitmap = cache->fs_info->use_bitmap;
+ cache->fs_info->use_bitmap = test_use_bitmap;
/*
* Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
@@ -818,7 +816,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
if (ret)
return ret;
- cache->free_space_ctl->op = orig_free_space_ops;
+ cache->fs_info->use_bitmap = orig_use_bitmap;
btrfs_remove_free_space_cache(cache);
return 0;
@@ -832,10 +830,8 @@ static bool bytes_index_use_bitmap(struct btrfs_free_space_ctl *ctl,
static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
{
- const struct btrfs_free_space_op test_free_space_ops = {
- .use_bitmap = bytes_index_use_bitmap,
- };
- const struct btrfs_free_space_op *orig_free_space_ops;
+ bool (*orig_use_bitmap)(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info);
struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
struct btrfs_free_space *entry;
struct rb_node *node;
@@ -892,8 +888,8 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
/* Now validate bitmaps with different ->max_extent_size. */
btrfs_remove_free_space_cache(cache);
- orig_free_space_ops = cache->free_space_ctl->op;
- cache->free_space_ctl->op = &test_free_space_ops;
+ orig_use_bitmap = cache->fs_info->use_bitmap;
+ cache->fs_info->use_bitmap = bytes_index_use_bitmap;
ret = test_add_free_space_entry(cache, 0, sectorsize, 1);
if (ret) {
@@ -997,7 +993,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
return -EINVAL;
}
- cache->free_space_ctl->op = orig_free_space_ops;
+ cache->fs_info->use_bitmap = orig_use_bitmap;
btrfs_remove_free_space_cache(cache);
return 0;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 194f581b36f36..a289a8fa237c8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -394,6 +394,7 @@ loop:
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
cur_trans->aborted = 0;
+ trace_btrfs_transaction_start(cur_trans);
spin_unlock(&fs_info->trans_lock);
return 0;
@@ -2114,7 +2115,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
btrfs_put_transaction(cur_trans);
btrfs_put_transaction(cur_trans);
- trace_btrfs_transaction_commit(fs_info);
+ trace_btrfs_transaction_commit(trans);
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -2320,6 +2321,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
}
cur_trans->state = TRANS_STATE_COMMIT_PREP;
+ trace_btrfs_transaction_commit(trans);
wake_up(&fs_info->transaction_blocked_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
@@ -2358,6 +2360,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
}
cur_trans->state = TRANS_STATE_COMMIT_START;
+ trace_btrfs_transaction_commit(trans);
wake_up(&fs_info->transaction_blocked_wait);
spin_unlock(&fs_info->trans_lock);
@@ -2413,6 +2416,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
spin_lock(&fs_info->trans_lock);
add_pending_snapshot(trans);
cur_trans->state = TRANS_STATE_COMMIT_DOING;
+ trace_btrfs_transaction_commit(trans);
spin_unlock(&fs_info->trans_lock);
/*
@@ -2561,6 +2565,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
spin_lock(&fs_info->trans_lock);
cur_trans->state = TRANS_STATE_UNBLOCKED;
+ trace_btrfs_transaction_commit(trans);
fs_info->running_transaction = NULL;
spin_unlock(&fs_info->trans_lock);
mutex_unlock(&fs_info->reloc_mutex);
@@ -2603,6 +2608,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* which can change it.
*/
cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
+ trace_btrfs_transaction_commit(trans);
wake_up(&cur_trans->commit_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
@@ -2619,6 +2625,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* which can change it.
*/
cur_trans->state = TRANS_STATE_COMPLETED;
+ trace_btrfs_transaction_commit(trans);
wake_up(&cur_trans->commit_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
@@ -2632,8 +2639,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(fs_info->sb);
- trace_btrfs_transaction_commit(fs_info);
-
btrfs_scrub_continue(fs_info);
if (current->journal_info == trans)
@@ -2722,17 +2727,33 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
*
* We'll complete the cleanup in btrfs_end_transaction and
* btrfs_commit_transaction.
+ *
+ * Note: the parameter @error encodes whether the transaction abort was first hit
+ * (setting the FS_ERROR state bit in btrfs_abort_transaction())
+ * - positive number - first hit
+ * - negative number - abort after it was already done
*/
void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int error, bool first_hit)
+ unsigned int line, int error)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ bool first_hit = false;
+
+ if (error > 0) {
+ error = -error;
+ first_hit = true;
+ }
WRITE_ONCE(trans->aborted, error);
WRITE_ONCE(trans->transaction->aborted, error);
- if (first_hit && error == -ENOSPC)
- btrfs_dump_space_info_for_trans_abort(fs_info);
+ trace_btrfs_transaction_abort(trans);
+ if (first_hit) {
+ btrfs_err(fs_info, "Transaction %llu aborted (error %d)",
+ trans->transid, error);
+ if (error == -ENOSPC)
+ btrfs_dump_space_info_for_trans_abort(fs_info);
+ }
/* Wake up anybody who may be waiting on this transaction */
wake_up(&fs_info->transaction_wait);
wake_up(&fs_info->transaction_blocked_wait);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 7d70fe4867587..5e4b1106fd905 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -243,29 +243,47 @@ static inline bool btrfs_abort_should_print_stack(int error)
}
/*
- * Call btrfs_abort_transaction as early as possible when an error condition is
- * detected, that way the exact stack trace is reported for some errors.
+ * Compile-time and run-time verification of error passed to transaction abort.
+ * Direct constants will be caught at compile time, errors read from variables
+ * can be caught only at run-time and will warn under debugging config.
+ *
+ * How verification works:
+ * - accepted builtin constants are all -EIO and such
+ * - for compile-time check, invalid condition produces a negative-sized array
+ * type, valid zero-sized
+ * - when a variable is passed as error the first check is a no-op
+ * - with enabled debugging, the second array type size is constructed from the
+ * real variable value, valid condition produces array of size 1
+ * - sizeof(type) does not generate any code
+ */
+#define VERIFY_NEGATIVE_ERROR(error) \
+do { \
+ (void)sizeof(char[-!(__builtin_constant_p(error) ? (error) < 0 : 1)]); \
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { \
+ if (sizeof(char[(error) < 0]) != 1) \
+ DEBUG_WARN("error >= 0 passed to btrfs_abort_transaction()"); \
+ } \
+} while(0)
+
+/*
+ * Call btrfs_abort_transaction() as early as possible when an error condition
+ * is detected, that way the exact stack trace is reported for some errors.
+ *
+ * Error number must be negative as it encodes whether it's the first abort.
*/
#define btrfs_abort_transaction(trans, error) \
do { \
- bool __first = false; \
+ int __error = (error); \
+ \
+ VERIFY_NEGATIVE_ERROR(error); \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
- __first = true; \
- if (WARN(btrfs_abort_should_print_stack(error), \
- KERN_ERR \
- "BTRFS: Transaction aborted (error %d)\n", \
- (error))) { \
- /* Stack trace printed. */ \
- } else { \
- btrfs_err((trans)->fs_info, \
- "Transaction aborted (error %d)", \
- (error)); \
- } \
+ WARN_ON(btrfs_abort_should_print_stack(__error)); \
+ __error = -__error; \
} \
__btrfs_abort_transaction((trans), __func__, \
- __LINE__, (error), __first); \
+ __LINE__, __error); \
} while (0)
int btrfs_end_transaction(struct btrfs_trans_handle *trans);
@@ -303,7 +321,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int error, bool first_hit);
+ unsigned int line, int error);
int __init btrfs_transaction_init(void);
void __cold btrfs_transaction_exit(void);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 1f15d0793a9c9..cb3e676a81cc4 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -296,6 +296,33 @@ static int check_extent_data_item(struct extent_buffer *leaf,
return 0;
}
+ /*
+ * For the data reloc tree, file extent items are written by
+ * relocation's own paths. The data reloc inode is created with
+ * BTRFS_INODE_NOCOMPRESS, so insert_ordered_extent_file_extent()
+ * always leaves the compression field at 0. Encryption and
+ * other_encoding are reserved-and-zero in btrfs. A non-zero value
+ * for any of these means the leaf decoded from disk does not match
+ * what the kernel wrote, i.e. on-disk corruption.
+ *
+ * The file_extent_item's offset field is NOT a universal invariant
+ * here: partial-PREALLOC writebacks legitimately produce REG items
+ * with non-zero offset at non-boundary keys. The offset check is
+ * performed at the call site in get_new_location(), which only
+ * inspects cluster-boundary keys where offset is always 0.
+ */
+ if (unlikely(btrfs_header_owner(leaf) == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ (btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi)))) {
+ file_extent_err(leaf, slot,
+"invalid encoding fields for data reloc tree, compression=%u encryption=%u other_encoding=%u",
+ btrfs_file_extent_compression(leaf, fi),
+ btrfs_file_extent_encryption(leaf, fi),
+ btrfs_file_extent_other_encoding(leaf, fi));
+ return -EUCLEAN;
+ }
+
/* Regular or preallocated extent has fixed item size */
if (unlikely(item_size != sizeof(*fi))) {
file_extent_err(leaf, slot,
@@ -1371,6 +1398,37 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
return 0;
}
+static int check_root_ref(struct extent_buffer *leaf, struct btrfs_key *key, int slot)
+{
+ struct btrfs_root_ref *rref;
+ u32 item_size = btrfs_item_size(leaf, slot);
+ u32 name_len;
+
+ if (unlikely(item_size <= sizeof(*rref))) {
+ generic_err(leaf, slot,
+ "invalid root ref item size for key type %u, have %u expect > %zu",
+ key->type, item_size, sizeof(*rref));
+ return -EUCLEAN;
+ }
+
+ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+ name_len = btrfs_root_ref_name_len(leaf, rref);
+ if (unlikely(name_len > BTRFS_NAME_LEN)) {
+ generic_err(leaf, slot,
+ "root ref name too long for key type %u, have %u max %u",
+ key->type, name_len, BTRFS_NAME_LEN);
+ return -EUCLEAN;
+ }
+ if (unlikely(item_size != sizeof(*rref) + name_len)) {
+ generic_err(leaf, slot,
+ "invalid root ref item size for key type %u, have %u expect %zu",
+ key->type, item_size, sizeof(*rref) + name_len);
+ return -EUCLEAN;
+ }
+
+ return 0;
+}
+
__printf(3,4)
__cold
static void extent_err(const struct extent_buffer *eb, int slot,
@@ -2071,6 +2129,7 @@ static int check_free_space_info(struct extent_buffer *leaf, struct btrfs_key *k
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_free_space_info *fsi;
const u32 blocksize = fs_info->sectorsize;
+ u64 end;
u32 flags;
if (unlikely(!IS_ALIGNED(key->objectid, blocksize))) {
@@ -2085,6 +2144,12 @@ static int check_free_space_info(struct extent_buffer *leaf, struct btrfs_key *k
blocksize, BTRFS_KEY_FMT_VALUE(key));
return -EUCLEAN;
}
+ if (unlikely(check_add_overflow(key->objectid, key->offset, &end))) {
+ generic_err(leaf, slot,
+ "free space info key overflows, has " BTRFS_KEY_FMT,
+ BTRFS_KEY_FMT_VALUE(key));
+ return -EUCLEAN;
+ }
if (unlikely(btrfs_item_size(leaf, slot) !=
sizeof(struct btrfs_free_space_info))) {
generic_err(leaf, slot,
@@ -2112,23 +2177,98 @@ static int check_free_space_info(struct extent_buffer *leaf, struct btrfs_key *k
return 0;
}
-static int check_free_space_extent(struct extent_buffer *leaf, struct btrfs_key *key, int slot)
+static int check_free_space_common_key(struct extent_buffer *leaf, struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
const u32 blocksize = fs_info->sectorsize;
+ const char *type_str = (key->type == BTRFS_FREE_SPACE_EXTENT_KEY) ? "extent" : "bitmap";
+ u64 end;
if (unlikely(!IS_ALIGNED(key->objectid, blocksize))) {
generic_err(leaf, slot,
- "free space extent key objectid is not aligned to %u, has " BTRFS_KEY_FMT,
- blocksize, BTRFS_KEY_FMT_VALUE(key));
+ "free space %s key objectid is not aligned to %u, has " BTRFS_KEY_FMT,
+ type_str, blocksize, BTRFS_KEY_FMT_VALUE(key));
return -EUCLEAN;
}
if (unlikely(!IS_ALIGNED(key->offset, blocksize))) {
generic_err(leaf, slot,
- "free space extent key offset is not aligned to %u, has " BTRFS_KEY_FMT,
- blocksize, BTRFS_KEY_FMT_VALUE(key));
+ "free space %s key offset is not aligned to %u, has " BTRFS_KEY_FMT,
+ type_str, blocksize, BTRFS_KEY_FMT_VALUE(key));
+ return -EUCLEAN;
+ }
+ if (unlikely(key->offset == 0)) {
+ generic_err(leaf, slot, "free space %s length is 0", type_str);
+ return -EUCLEAN;
+ }
+ if (unlikely(check_add_overflow(key->objectid, key->offset, &end))) {
+ generic_err(leaf, slot,
+ "free space %s end overflow, have objectid %llu offset %llu",
+ type_str, key->objectid, key->offset);
+ return -EUCLEAN;
+ }
+ if (slot == 0)
+ return 0;
+
+ /*
+ * Make sure the current key is inside the block group, and matching
+ * the expected info type.
+ */
+ if (prev_key->type == BTRFS_FREE_SPACE_INFO_KEY) {
+ struct btrfs_free_space_info *fsi;
+ u32 info_flags;
+
+ if (unlikely(key->objectid < prev_key->objectid ||
+ key->objectid + key->offset > prev_key->objectid + prev_key->offset)) {
+ generic_err(leaf, slot,
+"free space %s is not inside the space info, prev key " BTRFS_KEY_FMT " current key " BTRFS_KEY_FMT,
+ type_str, BTRFS_KEY_FMT_VALUE(prev_key),
+ BTRFS_KEY_FMT_VALUE(key));
+ return -EUCLEAN;
+ }
+ fsi = btrfs_item_ptr(leaf, slot - 1, struct btrfs_free_space_info);
+ info_flags = btrfs_free_space_flags(leaf, fsi);
+ if (unlikely((info_flags == BTRFS_FREE_SPACE_USING_BITMAPS &&
+ key->type == BTRFS_FREE_SPACE_EXTENT_KEY) ||
+ (info_flags != BTRFS_FREE_SPACE_USING_BITMAPS &&
+ key->type == BTRFS_FREE_SPACE_BITMAP_KEY))) {
+ generic_err(leaf, slot,
+"free space %s key type is not matching the type of space info, key type %u space info flags %u",
+ type_str, key->type, info_flags);
+ return -EUCLEAN;
+ }
+ return 0;
+ }
+ /*
+ * Previous key should be either FREE_SPACE_EXTENT or FREE_SPACE_BITMAP.
+ * Inside the same block group the key type should match each other, and
+ * no overlaps.
+ */
+ if (unlikely(key->type != prev_key->type)) {
+ generic_err(leaf, slot,
+"free space %s key type is not matching the type of previous key, key type %u prev key type %u",
+ type_str, key->type, prev_key->type);
+ return -EUCLEAN;
+ }
+ if (unlikely(prev_key->objectid + prev_key->offset > key->objectid)) {
+ generic_err(leaf, slot,
+"free space %s key overlaps previous key, prev key " BTRFS_KEY_FMT " current key " BTRFS_KEY_FMT,
+ type_str, BTRFS_KEY_FMT_VALUE(prev_key),
+ BTRFS_KEY_FMT_VALUE(key));
return -EUCLEAN;
}
+ return 0;
+}
+
+static int check_free_space_extent(struct extent_buffer *leaf, struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
+{
+ int ret;
+
+ ret = check_free_space_common_key(leaf, key, slot, prev_key);
+ if (unlikely(ret < 0))
+ return ret;
+
if (unlikely(btrfs_item_size(leaf, slot) != 0)) {
generic_err(leaf, slot,
"invalid item size for free space info, has %u expect 0",
@@ -2139,28 +2279,17 @@ static int check_free_space_extent(struct extent_buffer *leaf, struct btrfs_key
}
static int check_free_space_bitmap(struct extent_buffer *leaf,
- struct btrfs_key *key, int slot)
+ struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
- const u32 blocksize = fs_info->sectorsize;
u32 expected_item_size;
+ int ret;
+
+ ret = check_free_space_common_key(leaf, key, slot, prev_key);
+ if (unlikely(ret < 0))
+ return ret;
- if (unlikely(!IS_ALIGNED(key->objectid, blocksize))) {
- generic_err(leaf, slot,
- "free space bitmap key objectid is not aligned to %u, has " BTRFS_KEY_FMT,
- blocksize, BTRFS_KEY_FMT_VALUE(key));
- return -EUCLEAN;
- }
- if (unlikely(!IS_ALIGNED(key->offset, blocksize))) {
- generic_err(leaf, slot,
- "free space bitmap key offset is not aligned to %u, has " BTRFS_KEY_FMT,
- blocksize, BTRFS_KEY_FMT_VALUE(key));
- return -EUCLEAN;
- }
- if (unlikely(key->offset == 0)) {
- generic_err(leaf, slot, "free space bitmap length is 0");
- return -EUCLEAN;
- }
/*
* The item must hold exactly the right number of bitmap bytes for the
* range described by key->offset. A mismatch means the item was
@@ -2226,6 +2355,10 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
case BTRFS_ROOT_ITEM_KEY:
ret = check_root_item(leaf, key, slot);
break;
+ case BTRFS_ROOT_REF_KEY:
+ case BTRFS_ROOT_BACKREF_KEY:
+ ret = check_root_ref(leaf, key, slot);
+ break;
case BTRFS_EXTENT_ITEM_KEY:
case BTRFS_METADATA_ITEM_KEY:
ret = check_extent_item(leaf, key, slot, prev_key);
@@ -2245,10 +2378,10 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
ret = check_free_space_info(leaf, key, slot);
break;
case BTRFS_FREE_SPACE_EXTENT_KEY:
- ret = check_free_space_extent(leaf, key, slot);
+ ret = check_free_space_extent(leaf, key, slot, prev_key);
break;
case BTRFS_FREE_SPACE_BITMAP_KEY:
- ret = check_free_space_bitmap(leaf, key, slot);
+ ret = check_free_space_bitmap(leaf, key, slot, prev_key);
break;
case BTRFS_IDENTITY_REMAP_KEY:
case BTRFS_REMAP_KEY:
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9123adafa0d19..bcccddcc568c0 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -33,17 +33,6 @@
#define MAX_CONFLICT_INODES 10
-/* magic values for the inode_only field in btrfs_log_inode:
- *
- * LOG_INODE_ALL means to log everything
- * LOG_INODE_EXISTS means to log just enough to recreate the inode
- * during log replay
- */
-enum {
- LOG_INODE_ALL,
- LOG_INODE_EXISTS,
-};
-
/*
* directory trouble cases
*
@@ -227,7 +216,7 @@ static void do_abort_log_replay(struct walk_control *wc, const char *function,
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
- int inode_only,
+ enum btrfs_log_mode log_mode,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct walk_control *wc, u64 objectid);
static noinline int replay_dir_deletes(struct walk_control *wc,
@@ -502,7 +491,7 @@ static int overwrite_item(struct walk_control *wc)
* the leaf before writing into the log tree. See the comments at
* copy_items() for more details.
*/
- ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID, "root_id=%llu", btrfs_root_id(root));
+ ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
item_size = btrfs_item_size(wc->log_leaf, wc->log_slot);
src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
@@ -3333,8 +3322,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
u64 log_root_level;
mutex_lock(&root->log_mutex);
+ trace_btrfs_sync_log_enter(trans, root, ctx);
log_transid = ctx->log_transid;
if (root->log_transid_committed >= log_transid) {
+ trace_btrfs_sync_log_exit(trans, root, ctx, ctx->log_ret);
mutex_unlock(&root->log_mutex);
return ctx->log_ret;
}
@@ -3342,6 +3333,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index1 = log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
wait_log_commit(root, log_transid);
+ trace_btrfs_sync_log_exit(trans, root, ctx, ctx->log_ret);
mutex_unlock(&root->log_mutex);
return ctx->log_ret;
}
@@ -3370,6 +3362,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* bail out if we need to do a full commit */
if (btrfs_need_log_full_commit(trans)) {
ret = BTRFS_LOG_FORCE_COMMIT;
+ trace_btrfs_sync_log_exit(trans, root, ctx, ret);
mutex_unlock(&root->log_mutex);
goto out;
}
@@ -3396,6 +3389,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
ret = 0;
if (ret) {
+ trace_btrfs_sync_log_exit(trans, root, ctx, ret);
blk_finish_plug(&plug);
btrfs_set_log_full_commit(trans);
mutex_unlock(&root->log_mutex);
@@ -3433,6 +3427,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (!log_root_tree->node) {
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
+ trace_btrfs_sync_log_exit(trans, root, ctx, ret);
mutex_unlock(&fs_info->tree_root->log_mutex);
blk_finish_plug(&plug);
goto out;
@@ -3456,6 +3451,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
ret = update_log_root(trans, log, &new_root_item);
if (ret) {
+ trace_btrfs_sync_log_exit(trans, root, ctx, ret);
list_del_init(&root_log_ctx.list);
blk_finish_plug(&plug);
btrfs_set_log_full_commit(trans);
@@ -3473,6 +3469,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
list_del_init(&root_log_ctx.list);
mutex_unlock(&log_root_tree->log_mutex);
ret = root_log_ctx.log_ret;
+ trace_btrfs_sync_log_exit(trans, root, ctx, ret);
goto out;
}
@@ -3484,6 +3481,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
if (!ret)
ret = root_log_ctx.log_ret;
+ trace_btrfs_sync_log_exit(trans, root, ctx, ret);
goto out;
}
ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid,
@@ -3505,6 +3503,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
ret = BTRFS_LOG_FORCE_COMMIT;
+ trace_btrfs_sync_log_exit(trans, root, ctx, ret);
goto out_wake_log_root;
}
@@ -3518,11 +3517,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* deadlock. Bail out to the full commit instead.
*/
if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
+ trace_btrfs_sync_log_exit(trans, root, ctx, ret);
btrfs_set_log_full_commit(trans);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
} else if (ret) {
+ trace_btrfs_sync_log_exit(trans, root, ctx, ret);
btrfs_set_log_full_commit(trans);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
@@ -3532,6 +3533,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_wait_tree_log_extents(log_root_tree,
EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
if (ret) {
+ trace_btrfs_sync_log_exit(trans, root, ctx, ret);
btrfs_set_log_full_commit(trans);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
@@ -3568,6 +3570,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
if (unlikely(BTRFS_FS_ERROR(fs_info))) {
ret = -EIO;
+ trace_btrfs_sync_log_exit(trans, root, ctx, ret);
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
mutex_unlock(&fs_info->tree_log_mutex);
@@ -3579,6 +3582,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = write_all_supers(trans);
mutex_unlock(&fs_info->tree_log_mutex);
if (unlikely(ret)) {
+ trace_btrfs_sync_log_exit(trans, root, ctx, ret);
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
goto out_wake_log_root;
@@ -4771,7 +4775,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *dst_path,
struct btrfs_path *src_path,
- int start_slot, int nr, int inode_only,
+ int start_slot, int nr, enum btrfs_log_mode log_mode,
u64 logged_isize, struct btrfs_log_ctx *ctx)
{
struct btrfs_root *log = inode->root->log_root;
@@ -4985,7 +4989,7 @@ copy_item:
inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
struct btrfs_inode_item);
fill_inode_item(trans, dst_path->nodes[0], inode_item,
- inode, inode_only == LOG_INODE_EXISTS,
+ inode, log_mode == LOG_INODE_EXISTS,
logged_isize);
} else {
copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
@@ -5913,9 +5917,13 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_inode *curr_inode = start_inode;
int ret = 0;
+ trace_btrfs_log_new_dir_dentries_enter(trans, start_inode);
+
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
/* Pairs with btrfs_add_delayed_iput below. */
ihold(&curr_inode->vfs_inode);
@@ -6034,6 +6042,8 @@ out:
kfree(dir_elem);
}
+ trace_btrfs_log_new_dir_dentries_exit(trans, start_inode, ret);
+
return ret;
}
@@ -6126,6 +6136,9 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
{
struct btrfs_ino_list *ino_elem;
struct btrfs_inode *inode;
+ int ret = 0;
+
+ trace_btrfs_add_conflicting_inode_enter(trans, ctx, ino, parent);
/*
* It's rare to have a lot of conflicting inodes, in practice it is not
@@ -6134,8 +6147,10 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
* LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
* commits.
*/
- if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
- return BTRFS_LOG_FORCE_COMMIT;
+ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out;
+ }
inode = btrfs_iget_logging(ino, root);
/*
@@ -6162,23 +6177,25 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
- int ret = PTR_ERR(inode);
+ ret = PTR_ERR(inode);
if (ret != -ENOENT)
- return ret;
+ goto out;
ret = conflicting_inode_is_dir(root, ino, path);
/* Not a directory or we got an error. */
if (ret <= 0)
- return ret;
+ goto out;
/* Conflicting inode is a directory, so we'll log its parent. */
ino_elem = kmalloc_obj(*ino_elem, GFP_NOFS);
- if (!ino_elem)
- return -ENOMEM;
+ if (!ino_elem) {
+ ret = -ENOMEM;
+ goto out;
+ }
ino_elem->ino = ino;
ino_elem->parent = parent;
list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
ctx->num_conflict_inodes++;
-
- return 0;
+ ret = 0;
+ goto out;
}
/*
@@ -6218,25 +6235,31 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
*/
if (!need_log_inode(trans, inode)) {
btrfs_add_delayed_iput(inode);
- return 0;
+ goto out;
}
if (!can_log_conflicting_inode(trans, inode)) {
btrfs_add_delayed_iput(inode);
- return BTRFS_LOG_FORCE_COMMIT;
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out;
}
btrfs_add_delayed_iput(inode);
ino_elem = kmalloc_obj(*ino_elem, GFP_NOFS);
- if (!ino_elem)
- return -ENOMEM;
+ if (!ino_elem) {
+ ret = -ENOMEM;
+ goto out;
+ }
ino_elem->ino = ino;
ino_elem->parent = parent;
list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
ctx->num_conflict_inodes++;
- return 0;
+out:
+ trace_btrfs_add_conflicting_inode_exit(trans, ctx, ino, parent, ret);
+
+ return ret;
}
static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
@@ -6254,7 +6277,15 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
if (ctx->logging_conflict_inodes)
return 0;
+ /*
+ * Avoid any work if no conflicting inodes and emitting the trace event
+ * which only adds noise and it's useless if there are no inodes.
+ */
+ if (list_empty(&ctx->conflict_inodes))
+ return 0;
+
ctx->logging_conflict_inodes = true;
+ trace_btrfs_log_conflicting_inodes_enter(trans, ctx);
/*
* New conflicting inodes may be found and added to the list while we
@@ -6348,6 +6379,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
ctx->logging_conflict_inodes = false;
if (ret)
free_conflicting_inodes(ctx);
+ trace_btrfs_log_conflicting_inodes_exit(trans, ctx, ret);
return ret;
}
@@ -6359,7 +6391,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_path *dst_path,
const u64 logged_isize,
- const int inode_only,
+ const enum btrfs_log_mode log_mode,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
{
@@ -6415,7 +6447,7 @@ again:
}
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr,
- inode_only, logged_isize, ctx);
+ log_mode, logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -6434,7 +6466,7 @@ again:
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot,
- ins_nr, inode_only, logged_isize, ctx);
+ ins_nr, log_mode, logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -6451,7 +6483,7 @@ again:
}
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
- ins_nr, inode_only, logged_isize, ctx);
+ ins_nr, log_mode, logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 1;
@@ -6465,7 +6497,7 @@ next_slot:
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot, ins_nr, inode_only,
+ ins_start_slot, ins_nr, log_mode,
logged_isize, ctx);
if (ret < 0)
return ret;
@@ -6491,12 +6523,12 @@ next_key:
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
- ins_nr, inode_only, logged_isize, ctx);
+ ins_nr, log_mode, logged_isize, ctx);
if (ret)
return ret;
}
- if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
+ if (log_mode == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
/*
* Release the path because otherwise we might attempt to double
* lock the same leaf with btrfs_log_prealloc_extents() below.
@@ -6827,8 +6859,16 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
*/
lockdep_assert_not_held(&inode->log_mutex);
- ASSERT(!ctx->logging_new_delayed_dentries,
- "ctx->logging_new_delayed_dentries=%d", ctx->logging_new_delayed_dentries);
+ ASSERT(!ctx->logging_new_delayed_dentries);
+
+ /*
+ * Return early if empty list, avoid emitting redundant trace events
+ * that generate noise only.
+ */
+ if (list_empty(delayed_ins_list))
+ return 0;
+
+ trace_btrfs_log_new_delayed_dentries_enter(trans, inode);
ctx->logging_new_delayed_dentries = true;
list_for_each_entry(item, delayed_ins_list, log_list) {
@@ -6871,6 +6911,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
ctx->log_new_dentries = orig_log_new_dentries;
ctx->logging_new_delayed_dentries = false;
+ trace_btrfs_log_new_delayed_dentries_exit(trans, inode, ret);
return ret;
}
@@ -6891,11 +6932,11 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
*/
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
- int inode_only,
+ enum btrfs_log_mode log_mode,
struct btrfs_log_ctx *ctx)
{
struct btrfs_path *path;
- struct btrfs_path *dst_path;
+ struct btrfs_path *dst_path = NULL;
struct btrfs_key min_key;
struct btrfs_key max_key;
struct btrfs_root *log = inode->root->log_root;
@@ -6911,13 +6952,17 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
LIST_HEAD(delayed_ins_list);
LIST_HEAD(delayed_del_list);
+ trace_btrfs_log_inode_enter(trans, inode, ctx, log_mode);
+
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
dst_path = btrfs_alloc_path();
if (!dst_path) {
- btrfs_free_path(path);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
min_key.objectid = ino;
@@ -6931,13 +6976,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (S_ISDIR(inode->vfs_inode.i_mode) ||
(!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags) &&
- inode_only >= LOG_INODE_EXISTS))
+ log_mode >= LOG_INODE_EXISTS))
max_key.type = BTRFS_XATTR_ITEM_KEY;
else
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
- if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
+ if (S_ISDIR(inode->vfs_inode.i_mode) && log_mode == LOG_INODE_ALL)
full_dir_logging = true;
/*
@@ -6988,7 +7033,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* for symlinks).
*/
if (S_ISLNK(inode->vfs_inode.i_mode))
- inode_only = LOG_INODE_ALL;
+ log_mode = LOG_INODE_ALL;
/*
* Before logging the inode item, cache the value returned by
@@ -7023,7 +7068,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
ret = drop_inode_items(trans, log, path, inode,
BTRFS_XATTR_ITEM_KEY);
} else {
- if (inode_only == LOG_INODE_EXISTS) {
+ if (log_mode == LOG_INODE_EXISTS) {
/*
* Make sure the new inode item we write to the log has
* the same isize as the current one (if it exists).
@@ -7043,7 +7088,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags)) {
- if (inode_only == LOG_INODE_EXISTS) {
+ if (log_mode == LOG_INODE_EXISTS) {
max_key.type = BTRFS_XATTR_ITEM_KEY;
if (ctx->logged_before)
ret = drop_inode_items(trans, log, path,
@@ -7059,15 +7104,15 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags) ||
- inode_only == LOG_INODE_EXISTS) {
- if (inode_only == LOG_INODE_ALL)
+ log_mode == LOG_INODE_EXISTS) {
+ if (log_mode == LOG_INODE_ALL)
fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
if (ctx->logged_before)
ret = drop_inode_items(trans, log, path, inode,
max_key.type);
} else {
- if (inode_only == LOG_INODE_ALL)
+ if (log_mode == LOG_INODE_ALL)
fast_search = true;
inode_item_dropped = false;
goto log_extents;
@@ -7102,8 +7147,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize,
- inode_only, ctx,
- &need_log_inode_item);
+ log_mode, ctx, &need_log_inode_item);
if (ret)
goto out_unlock;
@@ -7146,7 +7190,7 @@ log_extents:
ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
if (ret)
goto out_unlock;
- } else if (inode_only == LOG_INODE_ALL) {
+ } else if (log_mode == LOG_INODE_ALL) {
struct extent_map *em, *n;
write_lock(&em_tree->lock);
@@ -7202,7 +7246,7 @@ log_extents:
* a power failure unless the log was synced as part of an fsync
* against any other unrelated inode.
*/
- if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS)
+ if (!ctx->logging_new_name && log_mode != LOG_INODE_EXISTS)
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
@@ -7210,7 +7254,7 @@ log_extents:
* Reset the last_reflink_trans so that the next fsync does not need to
* go through the slower path when logging extents and their checksums.
*/
- if (inode_only == LOG_INODE_ALL)
+ if (log_mode == LOG_INODE_ALL)
inode->last_reflink_trans = 0;
out_unlock:
@@ -7233,6 +7277,8 @@ out:
&delayed_del_list);
}
+ trace_btrfs_log_inode_exit(trans, inode, ret);
+
return ret;
}
@@ -7246,9 +7292,13 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
struct btrfs_root *root = inode->root;
const u64 ino = btrfs_ino(inode);
+ trace_btrfs_log_all_parents_enter(trans, inode);
+
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
path->skip_locking = true;
path->search_commit_root = true;
@@ -7257,7 +7307,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- return ret;
+ goto out;
while (true) {
struct extent_buffer *leaf = path->nodes[0];
@@ -7269,9 +7319,11 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- return ret;
- if (ret > 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
break;
+ }
continue;
}
@@ -7324,8 +7376,10 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
* at both parents and the old parent B would still
* exist.
*/
- if (IS_ERR(dir_inode))
- return PTR_ERR(dir_inode);
+ if (IS_ERR(dir_inode)) {
+ ret = PTR_ERR(dir_inode);
+ goto out;
+ }
if (!need_log_inode(trans, dir_inode)) {
btrfs_add_delayed_iput(dir_inode);
@@ -7338,11 +7392,14 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
ret = log_new_dir_dentries(trans, dir_inode, ctx);
btrfs_add_delayed_iput(dir_inode);
if (ret)
- return ret;
+ goto out;
}
path->slots[0]++;
}
- return 0;
+out:
+ trace_btrfs_log_all_parents_exit(trans, inode, ret);
+
+ return ret;
}
static int log_new_ancestors(struct btrfs_trans_handle *trans,
@@ -7457,16 +7514,22 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
struct btrfs_key search_key;
int ret;
+ trace_btrfs_log_all_new_ancestors_enter(trans, inode);
+
/*
* For a single hard link case, go through a fast path that does not
* need to iterate the fs/subvolume tree.
*/
- if (inode->vfs_inode.i_nlink < 2)
- return log_new_ancestors_fast(trans, inode, parent, ctx);
+ if (inode->vfs_inode.i_nlink < 2) {
+ ret = log_new_ancestors_fast(trans, inode, parent, ctx);
+ goto out;
+ }
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
search_key.objectid = ino;
search_key.type = BTRFS_INODE_REF_KEY;
@@ -7474,7 +7537,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
- return ret;
+ goto out;
if (ret == 0)
path->slots[0]++;
@@ -7486,9 +7549,11 @@ again:
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- return ret;
- if (ret > 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
break;
+ }
continue;
}
@@ -7504,8 +7569,10 @@ again:
* this loop, etc). So just return some error to fallback to
* a transaction commit.
*/
- if (found_key.type == BTRFS_INODE_EXTREF_KEY)
- return -EMLINK;
+ if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = -EMLINK;
+ goto out;
+ }
/*
* Logging ancestors needs to do more searches on the fs/subvol
@@ -7517,11 +7584,13 @@ again:
ret = log_new_ancestors(trans, root, path, ctx);
if (ret)
- return ret;
+ goto out;
btrfs_release_path(path);
goto again;
}
- return 0;
+out:
+ trace_btrfs_log_all_new_ancestors_exit(trans, inode, ret);
+ return ret;
}
/*
@@ -7533,7 +7602,7 @@ again:
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct dentry *parent,
- int inode_only,
+ enum btrfs_log_mode log_mode,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
@@ -7541,29 +7610,39 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
int ret = 0;
bool log_dentries;
- if (btrfs_test_opt(fs_info, NOTREELOG))
- return BTRFS_LOG_FORCE_COMMIT;
+ trace_btrfs_log_inode_parent_enter(trans, inode);
+
+ if (btrfs_test_opt(fs_info, NOTREELOG)) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out;
+ }
- if (btrfs_root_refs(&root->root_item) == 0)
- return BTRFS_LOG_FORCE_COMMIT;
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out;
+ }
/*
* If we're logging an inode from a subvolume created in the current
* transaction we must force a commit since the root is not persisted.
*/
- if (btrfs_root_generation(&root->root_item) == trans->transid)
- return BTRFS_LOG_FORCE_COMMIT;
+ if (btrfs_root_generation(&root->root_item) == trans->transid) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out;
+ }
/* Skip already logged inodes and without new extents. */
if (btrfs_inode_in_log(inode, trans->transid) &&
- list_empty(&ctx->ordered_extents))
- return BTRFS_NO_LOG_SYNC;
+ list_empty(&ctx->ordered_extents)) {
+ ret = BTRFS_NO_LOG_SYNC;
+ goto out;
+ }
ret = start_log_trans(trans, root, ctx);
if (ret)
- return ret;
+ goto out;
- ret = btrfs_log_inode(trans, inode, inode_only, ctx);
+ ret = btrfs_log_inode(trans, inode, log_mode, ctx);
if (ret)
goto end_trans;
@@ -7649,6 +7728,9 @@ end_trans:
btrfs_remove_log_ctx(root, ctx);
btrfs_end_log_trans(root);
+out:
+ trace_btrfs_log_inode_parent_exit(trans, inode, ret);
+
return ret;
}
@@ -7872,6 +7954,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, struct btrfs_inode *inode,
bool for_rename)
{
+ trace_btrfs_record_unlink_dir(trans, dir, inode, for_rename);
+
/*
* when we're logging a file, if it hasn't been renamed
* or unlinked, and its inode is fully committed on disk,
@@ -7934,6 +8018,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir)
{
+ trace_btrfs_record_snapshot_destroy(trans, dir);
+
mutex_lock(&dir->log_mutex);
dir->last_unlink_trans = trans->transid;
mutex_unlock(&dir->log_mutex);
@@ -7954,6 +8040,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
struct btrfs_inode *dir)
{
+ trace_btrfs_record_new_subvolume(trans, dir);
+
mutex_lock(&dir->log_mutex);
dir->last_unlink_trans = trans->transid;
mutex_unlock(&dir->log_mutex);
@@ -7986,6 +8074,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
bool log_pinned = false;
int ret;
+ trace_btrfs_log_new_name_enter(trans, inode, old_dir, old_dir_index);
+
/* The inode has a new name (ref/extref), so make sure we log it. */
set_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
@@ -8008,7 +8098,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
goto out;
} else if (ret == 0) {
if (!old_dir)
- return;
+ goto out;
/*
* If the inode was not logged and we are doing a rename (old_dir is not
* NULL), check if old_dir was logged - if it was not we can return and
@@ -8018,7 +8108,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
else if (ret == 0)
- return;
+ goto out;
}
ret = 0;
@@ -8117,6 +8207,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
ASSERT(list_empty(&ctx.conflict_inodes));
out:
+ trace_btrfs_log_new_name_exit(trans, inode, old_dir, ret);
/*
* If an error happened mark the log for a full commit because it's not
* consistent and up to date or we couldn't find out if one of the
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 4a626dc6a58b9..81ab5eeeb9747 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -11,6 +11,13 @@
#include <linux/fscrypt.h>
#include "transaction.h"
+enum btrfs_log_mode { /* How much of an inode to log; passed as log_mode to btrfs_log_inode_parent() and btrfs_log_inode(). */
+ /* Log everything about an inode. */
+ LOG_INODE_ALL, /* Enumerator order is significant: btrfs_log_inode() compares log_mode >= LOG_INODE_EXISTS. */
+ /* Log just enough to recreate the inode during log replay. */
+ LOG_INODE_EXISTS,
+};
+
struct inode;
struct dentry;
struct btrfs_ordered_extent;
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 0062b3a557819..983365a735416 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -458,7 +458,7 @@ static int rollback_verity(struct btrfs_inode *inode)
if (ret) {
btrfs_handle_fs_error(root->fs_info, ret,
"failed to drop verity items in rollback %llu",
- (u64)inode->vfs_inode.i_ino);
+ inode->vfs_inode.i_ino);
goto out;
}
@@ -472,7 +472,7 @@ static int rollback_verity(struct btrfs_inode *inode)
trans = NULL;
btrfs_handle_fs_error(root->fs_info, ret,
"failed to start transaction in verity rollback %llu",
- (u64)inode->vfs_inode.i_ino);
+ inode->vfs_inode.i_ino);
goto out;
}
inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ed009bc9da2ad..4cd9033320f76 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2286,6 +2286,38 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_devic
update_dev_time(rcu_dereference_raw(device->name));
}
+int btrfs_remove_dev_stat_item(struct btrfs_trans_handle *trans, u64 devid)
+{ /* Delete the dev stats item keyed by @devid from fs_info->dev_root. Returns 0 on success or when no such item exists; a negative value from the search or the delete is logged with btrfs_warn() and returned. */
+ BTRFS_PATH_AUTO_RELEASE(path);
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = BTRFS_DEV_STATS_OBJECTID; /* Key: (BTRFS_DEV_STATS_OBJECTID, BTRFS_PERSISTENT_ITEM_KEY, devid). */
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
+ key.offset = devid;
+
+ ret = btrfs_search_slot(trans, dev_root, &key, &path, -1, 1); /* NOTE(review): -1/1 presumably request a search for deletion with COW of the path - confirm against btrfs_search_slot(). */
+ if (ret < 0) {
+ btrfs_warn(fs_info,
+ "error %d while searching for dev_stats item for devid %llu",
+ ret, devid);
+ return ret;
+ }
+ /* The dev stats item does not exist, so there is nothing to delete. */
+ if (ret > 0)
+ return 0;
+ ret = btrfs_del_item(trans, dev_root, &path); /* Exact match found: remove the item the path points at. */
+ if (ret < 0) {
+ btrfs_warn(fs_info,
+ "error %d while deleting dev_stats item for devid %llu",
+ ret, devid);
+ return ret;
+ }
+ return 0;
+}
+
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
struct file **bdev_file)
@@ -2365,6 +2397,12 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
return ret;
}
+ ret = btrfs_remove_dev_stat_item(trans, device->devid);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
btrfs_scrub_cancel_dev(device);
@@ -2889,6 +2927,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->commit_total_bytes = device->total_bytes;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+
+ /*
+ * Increase dev_stats_ccnt so that corresponding DEV_STATS item can be
+ * created at the next transaction commit.
+ */
+ atomic_inc(&device->dev_stats_ccnt);
device->dev_stats_valid = 1;
set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
@@ -3718,7 +3762,11 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
u64 chunk_type;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
- ASSERT(cache);
+ if (unlikely(!cache)) {
+ btrfs_err(fs_info, "balance: chunk at bytenr %llu has no corresponding block group",
+ chunk_offset);
+ return -EUCLEAN;
+ }
chunk_type = cache->flags;
btrfs_put_block_group(cache);
@@ -3957,16 +4005,21 @@ static bool chunk_profiles_filter(u64 chunk_type, struct btrfs_balance_args *bar
return true;
}
-static bool chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
- struct btrfs_balance_args *bargs)
+static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_block_group *cache;
u64 chunk_used;
u64 user_thresh_min;
u64 user_thresh_max;
- bool ret = true;
+ int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ if (unlikely(!cache)) {
+ btrfs_err(fs_info, "balance: chunk at bytenr %llu has no corresponding block group",
+ chunk_offset);
+ return -EUCLEAN;
+ }
chunk_used = cache->used;
if (bargs->usage_min == 0)
@@ -3982,20 +4035,25 @@ static bool chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_of
user_thresh_max = mult_perc(cache->length, bargs->usage_max);
if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
- ret = false;
+ ret = 0;
btrfs_put_block_group(cache);
return ret;
}
-static bool chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
- struct btrfs_balance_args *bargs)
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_block_group *cache;
u64 chunk_used, user_thresh;
- bool ret = true;
+ int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ if (unlikely(!cache)) {
+ btrfs_err(fs_info, "balance: chunk at bytenr %llu has no corresponding block group",
+ chunk_offset);
+ return -EUCLEAN;
+ }
chunk_used = cache->used;
if (bargs->usage_min == 0)
@@ -4006,7 +4064,7 @@ static bool chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
user_thresh = mult_perc(cache->length, bargs->usage);
if (chunk_used < user_thresh)
- ret = false;
+ ret = 0;
btrfs_put_block_group(cache);
return ret;
@@ -4111,8 +4169,8 @@ static bool chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args
return false;
}
-static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
- u64 chunk_offset)
+static int should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -4145,12 +4203,22 @@ static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk
}
/* usage filter */
- if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
- chunk_usage_filter(fs_info, chunk_offset, bargs)) {
- return false;
- } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
- chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
- return false;
+ if (bargs->flags & BTRFS_BALANCE_ARGS_USAGE) {
+ int ret2;
+
+ ret2 = chunk_usage_filter(fs_info, chunk_offset, bargs);
+ if (ret2 < 0)
+ return ret2;
+ if (ret2)
+ return false;
+ } else if (bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) {
+ int ret2;
+
+ ret2 = chunk_usage_range_filter(fs_info, chunk_offset, bargs);
+ if (ret2 < 0)
+ return ret2;
+ if (ret2)
+ return false;
}
/* devid filter */
@@ -4430,6 +4498,10 @@ again:
ret = should_balance_chunk(leaf, chunk, found_key.offset);
btrfs_release_path(path);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ goto error;
+ }
if (!ret) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto loop;
@@ -6053,7 +6125,7 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
lockdep_assert_held(&info->chunk_mutex);
- if (!alloc_profile_is_valid(type, 0)) {
+ if (unlikely(!alloc_profile_is_valid(type, 0))) {
DEBUG_WARN("invalid alloc profile for type %llu", type);
return ERR_PTR(-EINVAL);
}
@@ -6064,7 +6136,7 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
return ERR_PTR(-ENOSPC);
}
- if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ if (unlikely(!(type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
btrfs_err(info, "invalid chunk type 0x%llx requested", type);
DEBUG_WARN();
return ERR_PTR(-EINVAL);
@@ -6234,7 +6306,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
meta_space_info = btrfs_find_space_info(fs_info, alloc_profile);
- if (!meta_space_info) {
+ if (unlikely(!meta_space_info)) {
DEBUG_WARN();
return -EINVAL;
}
@@ -6244,7 +6316,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
alloc_profile = btrfs_system_alloc_profile(fs_info);
sys_space_info = btrfs_find_space_info(fs_info, alloc_profile);
- if (!sys_space_info) {
+ if (unlikely(!sys_space_info)) {
DEBUG_WARN();
return -EINVAL;
}
@@ -8137,8 +8209,8 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device,
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
if (item_size >= (1 + i) * sizeof(__le64))
- btrfs_dev_stat_set(device, i,
- btrfs_dev_stats_value(eb, ptr, i));
+ atomic_set(device->dev_stat_values + i,
+ btrfs_dev_stats_value(eb, ptr, i));
else
btrfs_dev_stat_set(device, i, 0);
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 96904d18f686b..63be45c3298ca 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -933,6 +933,7 @@ bool btrfs_first_pending_extent(struct btrfs_device *device, u64 start, u64 len,
u64 *pending_start, u64 *pending_end);
bool btrfs_find_hole_in_pending_extents(struct btrfs_device *device,
u64 *start, u64 *len, u64 min_hole_size);
+int btrfs_remove_dev_stat_item(struct btrfs_trans_handle *trans, u64 devid);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 5f75cf0e14b95..139d49db9978d 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -356,12 +356,33 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
return ret;
}
+static int btrfs_get_max_active_zones(struct btrfs_device *device,
+ struct btrfs_zoned_device_info *zone_info)
+{ /* Fill in zone_info->max_active_zones for @device. Returns 0 on success, or -EINVAL when zone_info->nr_zones is below BTRFS_MIN_ACTIVE_ZONES. */
+ struct block_device *bdev = device->bdev;
+ int max_active_zones;
+
+ if (unlikely(zone_info->nr_zones < BTRFS_MIN_ACTIVE_ZONES)) { /* Reject the device before computing any limit. */
+ btrfs_err(device->fs_info, "zoned: not enough zones to mount filesystem: %u < %d",
+ zone_info->nr_zones, BTRFS_MIN_ACTIVE_ZONES);
+ return -EINVAL;
+ }
+
+ max_active_zones = min_not_zero(bdev_max_active_zones(bdev), /* Smaller non-zero of the two block device limits; 0 only when both are 0. */
+ bdev_max_open_zones(bdev));
+ if (max_active_zones == 0) /* Neither limit is set: fall back to a quarter of the zones, capped at BTRFS_DEFAULT_MAX_ACTIVE_ZONES. */
+ max_active_zones = min(zone_info->nr_zones / 4,
+ BTRFS_DEFAULT_MAX_ACTIVE_ZONES);
+
+ zone_info->max_active_zones = max(max_active_zones, BTRFS_MIN_ACTIVE_ZONES); /* Clamp from below, so the stored value is never less than BTRFS_MIN_ACTIVE_ZONES. */
+ return 0;
+}
+
int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_zoned_device_info *zone_info = NULL;
struct block_device *bdev = device->bdev;
- unsigned int max_active_zones;
unsigned int nactive;
sector_t nr_sectors;
sector_t sector = 0;
@@ -426,19 +447,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
- max_active_zones = min_not_zero(bdev_max_active_zones(bdev),
- bdev_max_open_zones(bdev));
- if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
- max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
- if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
- btrfs_err(fs_info,
-"zoned: %s: max active zones %u is too small, need at least %u active zones",
- rcu_dereference(device->name), max_active_zones,
- BTRFS_MIN_ACTIVE_ZONES);
- ret = -EINVAL;
+ ret = btrfs_get_max_active_zones(device, zone_info);
+ if (ret)
goto out;
- }
- zone_info->max_active_zones = max_active_zones;
zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
if (!zone_info->seq_zones) {
@@ -519,26 +530,29 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
goto out;
}
- if (max_active_zones) {
- if (unlikely(nactive > max_active_zones)) {
- if (bdev_max_active_zones(bdev) == 0) {
- max_active_zones = 0;
- zone_info->max_active_zones = 0;
- goto validate;
- }
+ if (unlikely(nactive > zone_info->max_active_zones)) {
+ if (bdev_max_active_zones(bdev) > 0) {
btrfs_err(device->fs_info,
- "zoned: %u active zones on %s exceeds max_active_zones %u",
- nactive, rcu_dereference(device->name),
- max_active_zones);
+ "zoned: %u active zones on %s exceeds max_active_zones %u",
+ nactive, rcu_dereference(device->name),
+ zone_info->max_active_zones);
ret = -EIO;
goto out;
}
+
+ /*
+ * This is for backward compatibility with old filesystems that
+ * have a lot of active zones because the device doesn't report
+ * a maximum number of active zones and we previously did not
+ * enforce any limit.
+ */
+ zone_info->max_active_zones = 0;
+ } else {
atomic_set(&zone_info->active_zones_left,
- max_active_zones - nactive);
+ zone_info->max_active_zones - nactive);
set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
}
-validate:
/* Validate superblock log */
nr_zones = BTRFS_NR_SB_LOG_ZONES;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -2765,7 +2779,6 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
struct btrfs_block_group *bg;
struct list_head *bg_list;
u64 alloc_flags;
- bool first = true;
bool did_chunk_alloc = false;
int index;
int ret;
@@ -2782,17 +2795,12 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
index = btrfs_bg_flags_to_raid_index(alloc_flags);
- /* Scan the data space_info to find empty block groups. Take the second one. */
again:
bg_list = &space_info->block_groups[index];
list_for_each_entry(bg, bg_list, list) {
- if (bg->alloc_offset != 0)
- continue;
- if (first) {
- first = false;
+ if (bg->alloc_offset != 0)
continue;
- }
if (space_info == data_sinfo) {
/* Migrate the block group to the data relocation space_info. */
@@ -2851,7 +2859,6 @@ again:
* We allocated a new block group in the data relocation space_info. We
* can take that one.
*/
- first = false;
did_chunk_alloc = true;
goto again;
}