diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 3cfc440c636ca3..2d5f0482678b82 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -44,4 +44,4 @@ btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ tests/free-space-tree-tests.o tests/extent-map-tests.o \ - tests/raid-stripe-tree-tests.o + tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 04f53ca548e188..6d9f39c1d89c71 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -3022,9 +3022,6 @@ void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, cache->rb_root = RB_ROOT; for (i = 0; i < BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&cache->pending[i]); - INIT_LIST_HEAD(&cache->changed); - INIT_LIST_HEAD(&cache->detached); - INIT_LIST_HEAD(&cache->leaves); INIT_LIST_HEAD(&cache->pending_edge); INIT_LIST_HEAD(&cache->useless_node); cache->fs_info = fs_info; @@ -3132,29 +3129,17 @@ void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node) { - struct btrfs_backref_node *upper; struct btrfs_backref_edge *edge; if (!node) return; - BUG_ON(!node->lowest && !node->detached); while (!list_empty(&node->upper)) { edge = list_entry(node->upper.next, struct btrfs_backref_edge, list[LOWER]); - upper = edge->node[UPPER]; list_del(&edge->list[LOWER]); list_del(&edge->list[UPPER]); btrfs_backref_free_edge(cache, edge); - - /* - * Add the node to leaf node list if no other child block - * cached. - */ - if (list_empty(&upper->lower)) { - list_add_tail(&upper->lower, &cache->leaves); - upper->lowest = 1; - } } btrfs_backref_drop_node(cache, node); @@ -3166,33 +3151,13 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) { struct btrfs_backref_node *node; - int i; - while (!list_empty(&cache->detached)) { - node = list_entry(cache->detached.next, - struct btrfs_backref_node, list); + while ((node = rb_entry_safe(rb_first(&cache->rb_root), + struct btrfs_backref_node, rb_node))) btrfs_backref_cleanup_node(cache, node); - } - while (!list_empty(&cache->leaves)) { - node = list_entry(cache->leaves.next, - struct btrfs_backref_node, lower); - btrfs_backref_cleanup_node(cache, node); - } - - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - while (!list_empty(&cache->pending[i])) { - node = list_first_entry(&cache->pending[i], - struct btrfs_backref_node, - list); - btrfs_backref_cleanup_node(cache, node); - } - } ASSERT(list_empty(&cache->pending_edge)); ASSERT(list_empty(&cache->useless_node)); - ASSERT(list_empty(&cache->changed)); - ASSERT(list_empty(&cache->detached)); - ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); ASSERT(!cache->nr_nodes); ASSERT(!cache->nr_edges); } @@ -3316,8 +3281,12 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, root = btrfs_get_fs_root(fs_info, ref_key->offset, false); if (IS_ERR(root)) return PTR_ERR(root); - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - cur->cowonly = 1; + + /* We shouldn't be using backref cache for non-shareable roots. 
 */
+	if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
+		btrfs_put_root(root);
+		return -EUCLEAN;
+	}
 
 	if (btrfs_root_level(&root->root_item) == cur->level) {
 		/* Tree root */
@@ -3403,8 +3372,15 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
 			goto out;
 		}
 		upper->owner = btrfs_header_owner(eb);
-		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
-			upper->cowonly = 1;
+
+		/* We shouldn't be using backref cache for non-shareable roots. */
+		if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
+			btrfs_put_root(root);
+			btrfs_backref_free_edge(cache, edge);
+			btrfs_backref_free_node(cache, upper);
+			ret = -EUCLEAN;
+			goto out;
+		}
 
 		/*
 		 * If we know the block isn't shared we can avoid
@@ -3595,15 +3571,9 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
 
 	ASSERT(start->checked);
 
-	/* Insert this node to cache if it's not COW-only */
-	if (!start->cowonly) {
-		rb_node = rb_simple_insert(&cache->rb_root, start->bytenr,
-					   &start->rb_node);
-		if (rb_node)
-			btrfs_backref_panic(cache->fs_info, start->bytenr,
-					    -EEXIST);
-		list_add_tail(&start->lower, &cache->leaves);
-	}
+	rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node);
+	if (rb_node)
+		btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST);
 
 	/*
 	 * Use breadth first search to iterate all related edges.
@@ -3642,11 +3612,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
 		 * parents have already been linked.
 		 */
 		if (!RB_EMPTY_NODE(&upper->rb_node)) {
-			if (upper->lowest) {
-				list_del_init(&upper->lower);
-				upper->lowest = 0;
-			}
-
 			list_add_tail(&edge->list[UPPER], &upper->lower);
 			continue;
 		}
@@ -3657,23 +3622,13 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
 			return -EUCLEAN;
 		}
 
-		/* Sanity check, COW-only node has non-COW-only parent */
-		if (start->cowonly != upper->cowonly) {
-			ASSERT(0);
+		rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
+					   &upper->rb_node);
+		if (unlikely(rb_node)) {
+			btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST);
 			return -EUCLEAN;
 		}
 
-		/* Only cache non-COW-only (subvolume trees) tree blocks */
-		if (!upper->cowonly) {
-			rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
-						   &upper->rb_node);
-			if (rb_node) {
-				btrfs_backref_panic(cache->fs_info,
-						    upper->bytenr, -EEXIST);
-				return -EUCLEAN;
-			}
-		}
-
 		list_add_tail(&edge->list[UPPER], &upper->lower);
 
 		/*
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index e8c22cccb5c132..74e6140312747f 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -318,6 +318,12 @@ struct btrfs_backref_node {
 		u64 bytenr;
 	}; /* Use rb_simple_node for search/insert */
 
+	/*
+	 * This is a sanity check, whenever we COW a block we will update
+	 * new_bytenr with its current location, and we will check this in
+	 * various places to validate that the cache makes sense. It shouldn't
+	 * be used for anything else.
+ */ u64 new_bytenr; /* Objectid of tree block owner, can be not uptodate */ u64 owner; @@ -335,10 +341,6 @@ struct btrfs_backref_node { struct extent_buffer *eb; /* Level of the tree block */ unsigned int level:8; - /* Is the block in a non-shareable tree */ - unsigned int cowonly:1; - /* 1 if no child node is in the cache */ - unsigned int lowest:1; /* Is the extent buffer locked */ unsigned int locked:1; /* Has the block been processed */ @@ -391,12 +393,6 @@ struct btrfs_backref_cache { * level blocks may not reflect the new location */ struct list_head pending[BTRFS_MAX_LEVEL]; - /* List of backref nodes with no child node */ - struct list_head leaves; - /* List of blocks that have been COWed in current transaction */ - struct list_head changed; - /* List of detached backref node. */ - struct list_head detached; u64 last_trans; diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 1f216d07eff65c..7ea6f0b43b9507 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -81,6 +81,9 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS, &btrfs_clone_bioset); + if (IS_ERR(bio)) + return ERR_CAST(bio); + bbio = btrfs_bio(bio); btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); bbio->inode = orig_bbio->inode; @@ -355,7 +358,7 @@ static void btrfs_simple_end_io(struct bio *bio) INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) btrfs_record_physical_zoned(bbio); btrfs_bio_end_io(bbio, bbio->bio.bi_status); } @@ -398,7 +401,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; btrfs_bio_end_io(bbio, bbio->bio.bi_status); @@ -412,7 +415,7 @@ static void btrfs_clone_write_end_io(struct bio *bio) if (bio->bi_status) { atomic_inc(&stripe->bioc->error); btrfs_log_dev_io_error(bio, stripe->dev); - } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + } else if (bio_is_zone_append(bio)) { stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; } @@ -649,8 +652,14 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) map_length = min(map_length, bbio->fs_info->max_zone_append_size); sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, &nr_segs, map_length); - if (sector_offset) - return sector_offset << SECTOR_SHIFT; + if (sector_offset) { + /* + * bio_split_rw_at() could split at a size smaller than our + * sectorsize and thus cause unaligned I/Os. Fix that by + * always rounding down to the nearest boundary. 
+ */ + return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize); + } return map_length; } @@ -678,7 +687,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) &bioc, &smap, &mirror_num); if (error) { ret = errno_to_blk_status(error); - goto fail; + btrfs_bio_counter_dec(fs_info); + goto end_bbio; } map_length = min(map_length, length); @@ -686,7 +696,15 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) map_length = btrfs_append_map_length(bbio, map_length); if (map_length < length) { - bbio = btrfs_split_bio(fs_info, bbio, map_length); + struct btrfs_bio *split; + + split = btrfs_split_bio(fs_info, bbio, map_length); + if (IS_ERR(split)) { + ret = errno_to_blk_status(PTR_ERR(split)); + btrfs_bio_counter_dec(fs_info); + goto end_bbio; + } + bbio = split; bio = &bbio->bio; } @@ -760,6 +778,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) btrfs_bio_end_io(remaining, ret); } +end_bbio: btrfs_bio_end_io(bbio, ret); /* Do not submit another chunk */ return true; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 4427c1b835e8b2..5be029734cfa2c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1223,7 +1223,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, block_group->space_info->total_bytes -= block_group->length; block_group->space_info->bytes_readonly -= (block_group->length - block_group->zone_unusable); - btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info, + btrfs_space_info_update_bytes_zone_unusable(block_group->space_info, -block_group->zone_unusable); block_group->space_info->disk_total -= block_group->length * factor; @@ -1396,8 +1396,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes to readonly */ sinfo->bytes_readonly += cache->zone_unusable; - btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, - -cache->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable); cache->zone_unusable = 0; } cache->ro++; @@ -1645,8 +1644,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - btrfs_space_info_update_bytes_pinned(fs_info, space_info, - -block_group->pinned); + btrfs_space_info_update_bytes_pinned(space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; block_group->pinned = 0; @@ -3060,8 +3058,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) (cache->alloc_offset - cache->used - cache->pinned - cache->reserved) + (cache->length - cache->zone_capacity); - btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, - cache->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable); sinfo->bytes_readonly -= cache->zone_unusable; } num_bytes = cache->length - cache->reserved - @@ -3699,7 +3696,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, old_val -= num_bytes; cache->used = old_val; cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes); + btrfs_space_info_update_bytes_pinned(space_info, num_bytes); space_info->bytes_used -= num_bytes; space_info->disk_used -= num_bytes * factor; if (READ_ONCE(space_info->periodic_reclaim)) @@ -3781,8 +3778,7 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, space_info->bytes_reserved += num_bytes; 
 	trace_btrfs_space_reservation(cache->fs_info, "space_info",
 					space_info->flags, num_bytes, 1);
-	btrfs_space_info_update_bytes_may_use(cache->fs_info,
-					      space_info, -ram_bytes);
+	btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes);
 	if (delalloc)
 		cache->delalloc_bytes += num_bytes;
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index a07b9594dc7067..3f3608299c0b7e 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -150,9 +150,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 			spin_unlock(&dest->lock);
 		}
 		if (num_bytes)
-			btrfs_space_info_free_bytes_may_use(fs_info,
-							    space_info,
-							    num_bytes);
+			btrfs_space_info_free_bytes_may_use(space_info, num_bytes);
 	}
 	if (qgroup_to_release_ret)
 		*qgroup_to_release_ret = qgroup_to_release;
@@ -383,13 +381,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
 
 	if (block_rsv->reserved < block_rsv->size) {
 		num_bytes = block_rsv->size - block_rsv->reserved;
-		btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
-						      num_bytes);
+		btrfs_space_info_update_bytes_may_use(sinfo, num_bytes);
 		block_rsv->reserved = block_rsv->size;
 	} else if (block_rsv->reserved > block_rsv->size) {
 		num_bytes = block_rsv->reserved - block_rsv->size;
-		btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
-						      -num_bytes);
+		btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes);
 		block_rsv->reserved = block_rsv->size;
 		btrfs_try_granting_tickets(fs_info, sinfo);
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 307dedf95c7066..2c341956a01ce6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -370,6 +370,25 @@ static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transi
 	WRITE_ONCE(root->last_trans, transid);
 }
 
+/*
+ * Return the generation this root started with.
+ *
+ * Every normal root is created with root->root_key.offset set to its
+ * originating generation. If it is a snapshot it is the generation when the
+ * snapshot was created.
+ *
+ * However, for TREE_RELOC roots root_key.offset is the objectid of the owning
+ * tree root. Thankfully we copy the root item of the owning tree root, which
+ * has its last_snapshot set to what we would have root_key.offset set to, so
+ * return that if this is a TREE_RELOC root.
+ */
+static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root)
+{
+	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
+		return btrfs_root_last_snapshot(&root->root_item);
+	return root->root_key.offset;
+}
+
 /*
  * Structure that conveys information about an extent that is going to replace
  * all the extents in a file range.
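For illustration, here is a minimal sketch (not part of the patch, wrapper name hypothetical) of how the new helper is consumed by the tree walk-down checks converted in the extent-tree.c hunks below:

/*
 * Sketch only: a child block whose generation is at or before the root's
 * origin generation predates this root, so it is shared with the source
 * tree and the walk-down code can skip updating its backrefs.
 */
static bool block_predates_root(const struct btrfs_root *root, u64 generation)
{
	return generation <= btrfs_root_origin_generation(root);
}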
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 7aa8a395d838c1..88e900e5a43d43 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -176,7 +176,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); data_sinfo = fs_info->data_sinfo; - btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len); + btrfs_space_info_free_bytes_may_use(data_sinfo, len); } /* diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 0d878dbbabba44..30f7079fa28eac 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -93,6 +93,9 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) u64 num_bytes; u64 reserved_bytes; + if (btrfs_is_testing(fs_info)) + return; + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates); num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, trans->delayed_ref_csum_deletions); @@ -254,7 +257,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, spin_unlock(&block_rsv->lock); if (to_free > 0) - btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free); + btrfs_space_info_free_bytes_may_use(space_info, to_free); if (refilled_bytes > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, @@ -555,6 +558,32 @@ void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, delayed_refs->num_heads_ready--; } +struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_node *ref; + + lockdep_assert_held(&head->mutex); + lockdep_assert_held(&head->lock); + + if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) + return NULL; + + /* + * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. + * This is to prevent a ref count from going down to zero, which deletes + * the extent item from the extent tree, when there still are references + * to add, which would fail because they would not find the extent item. + */ + if (!list_empty(&head->ref_add_list)) + return list_first_entry(&head->ref_add_list, + struct btrfs_delayed_ref_node, add_list); + + ref = rb_entry(rb_first_cached(&head->ref_tree), + struct btrfs_delayed_ref_node, ref_node); + ASSERT(list_empty(&ref->add_list)); + return ref; +} + /* * Helper to insert the ref_node to the tail or merge with tail. 
 *
@@ -1234,6 +1263,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
 {
 	struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
 	struct btrfs_fs_info *fs_info = trans->fs_info;
+	bool testing = btrfs_is_testing(fs_info);
 
 	spin_lock(&delayed_refs->lock);
 	while (true) {
@@ -1263,7 +1293,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
 		spin_unlock(&delayed_refs->lock);
 		mutex_unlock(&head->mutex);
 
-		if (pin_bytes) {
+		if (!testing && pin_bytes) {
 			struct btrfs_block_group *bg;
 
 			bg = btrfs_lookup_block_group(fs_info, head->bytenr);
@@ -1281,8 +1311,7 @@
 				spin_lock(&bg->space_info->lock);
 				spin_lock(&bg->lock);
 				bg->pinned += head->num_bytes;
-				btrfs_space_info_update_bytes_pinned(fs_info,
-					bg->space_info,
+				btrfs_space_info_update_bytes_pinned(bg->space_info,
 					head->num_bytes);
 				bg->reserved -= head->num_bytes;
 				bg->space_info->bytes_reserved -= head->num_bytes;
@@ -1295,12 +1324,15 @@
 			btrfs_error_unpin_extent_range(fs_info, head->bytenr,
 				head->bytenr + head->num_bytes - 1);
 		}
-		btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+		if (!testing)
+			btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
 		btrfs_put_delayed_ref_head(head);
 		cond_resched();
 		spin_lock(&delayed_refs->lock);
 	}
-	btrfs_qgroup_destroy_extent_records(trans);
+
+	if (!testing)
+		btrfs_qgroup_destroy_extent_records(trans);
 
 	spin_unlock(&delayed_refs->lock);
 }
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 611fb3388f82bd..a35067cebb974b 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -402,6 +402,7 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
 		struct btrfs_delayed_ref_root *delayed_refs);
 void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
 			     struct btrfs_delayed_ref_head *head);
+struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head);
 
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8143209486450d..eff0dd1ae62f06 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4262,6 +4262,15 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	 * already the cleaner, but below we run all pending delayed iputs.
 	 */
 	btrfs_flush_workqueue(fs_info->fixup_workers);
+	/*
+	 * Similar case here, we have to wait for delalloc workers before we
+	 * proceed below and stop the cleaner kthread, otherwise we trigger a
+	 * use-after-free on the cleaner kthread task_struct when a delalloc
+	 * worker running submit_compressed_extents() adds a delayed iput, which
+	 * does a wake up on the cleaner kthread, which was already freed below
+	 * when we call kthread_stop().
+	 */
+	btrfs_flush_workqueue(fs_info->delalloc_workers);
 
 	/*
 	 * After we parked the cleaner kthread, ordered extents may have
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index bd09dd3ad1a0a5..2f9126528a0170 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1803,30 +1803,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static inline struct btrfs_delayed_ref_node *
-select_delayed_ref(struct btrfs_delayed_ref_head *head)
-{
-	struct btrfs_delayed_ref_node *ref;
-
-	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
-		return NULL;
-
-	/*
-	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
- * This is to prevent a ref count from going down to zero, which deletes - * the extent item from the extent tree, when there still are references - * to add, which would fail because they would not find the extent item. - */ - if (!list_empty(&head->ref_add_list)) - return list_first_entry(&head->ref_add_list, - struct btrfs_delayed_ref_node, add_list); - - ref = rb_entry(rb_first_cached(&head->ref_tree), - struct btrfs_delayed_ref_node, ref_node); - ASSERT(list_empty(&ref->add_list)); - return ref; -} - static struct btrfs_delayed_extent_op *cleanup_extent_op( struct btrfs_delayed_ref_head *head) { @@ -1959,7 +1935,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, lockdep_assert_held(&locked_ref->mutex); lockdep_assert_held(&locked_ref->lock); - while ((ref = select_delayed_ref(locked_ref))) { + while ((ref = btrfs_select_delayed_ref(locked_ref))) { if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); @@ -2571,13 +2547,10 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { - struct btrfs_fs_info *fs_info = cache->fs_info; - spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, - num_bytes); + btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -2724,15 +2697,15 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, { struct btrfs_block_group *cache = NULL; struct btrfs_space_info *space_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_free_cluster *cluster = NULL; - u64 len; u64 total_unpinned = 0; u64 empty_cluster = 0; bool readonly; int ret = 0; while (start <= end) { + u64 len; + readonly = false; if (!cache || start >= cache->start + cache->length) { @@ -2778,37 +2751,19 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); + btrfs_space_info_update_bytes_pinned(space_info, -len); space_info->max_extent_size = 0; if (cache->ro) { space_info->bytes_readonly += len; readonly = true; } else if (btrfs_is_zoned(fs_info)) { /* Need reset before reusing in a zoned block group */ - btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info, - len); + btrfs_space_info_update_bytes_zone_unusable(space_info, len); readonly = true; } spin_unlock(&cache->lock); - if (!readonly && return_free_space && - global_rsv->space_info == space_info) { - spin_lock(&global_rsv->lock); - if (!global_rsv->full) { - u64 to_add = min(len, global_rsv->size - - global_rsv->reserved); - - global_rsv->reserved += to_add; - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, to_add); - if (global_rsv->reserved >= global_rsv->size) - global_rsv->full = 1; - len -= to_add; - } - spin_unlock(&global_rsv->lock); - } - /* Add to any tickets we may have */ - if (!readonly && return_free_space && len) - btrfs_try_granting_tickets(fs_info, space_info); + if (!readonly && return_free_space) + btrfs_return_free_space(space_info, len); spin_unlock(&space_info->lock); } @@ -5285,7 +5240,7 @@ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control * * reference to it. 
 */
 	generation = btrfs_node_ptr_generation(eb, slot);
-	if (!wc->update_ref || generation <= root->root_key.offset)
+	if (!wc->update_ref || generation <= btrfs_root_origin_generation(root))
 		return false;
 
 	/*
@@ -5340,7 +5295,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
 			goto reada;
 
 		if (wc->stage == UPDATE_BACKREF &&
-		    generation <= root->root_key.offset)
+		    generation <= btrfs_root_origin_generation(root))
 			continue;
 
 		/* We don't lock the tree block, it's OK to be racy here */
@@ -5683,7 +5638,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	 * for the subtree
 	 */
 	if (wc->stage == UPDATE_BACKREF &&
-	    generation <= root->root_key.offset) {
+	    generation <= btrfs_root_origin_generation(root)) {
 		wc->lookup_info = 1;
 		return 1;
 	}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b923d0cec61c73..f4fb1fb3454a08 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1167,6 +1167,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 	 * last delalloc end.
 	 */
 	u64 last_delalloc_end = 0;
+	/*
+	 * Save the last successfully run delalloc range end (exclusive).
+	 * This is for error handling, to avoid ranges where an ordered extent
+	 * was created but no IO will be submitted due to an error.
+	 */
+	u64 last_finished = page_start;
 	u64 delalloc_start = page_start;
 	u64 delalloc_end = page_end;
 	u64 delalloc_to_write = 0;
@@ -1235,11 +1241,28 @@
 			found_len = last_delalloc_end + 1 - found_start;
 
 		if (ret >= 0) {
+			/*
+			 * Some delalloc ranges may have been created by
+			 * previous folios. Thus we still need to clean those
+			 * ranges up during error handling.
+			 */
+			last_finished = found_start;
 			/* No errors hit so far, run the current delalloc range. */
 			ret = btrfs_run_delalloc_range(inode, folio, found_start,
 						       found_start + found_len - 1,
 						       wbc);
+			if (ret >= 0)
+				last_finished = found_start + found_len;
+			if (unlikely(ret < 0))
+				btrfs_err_rl(fs_info,
"failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d",
+					     inode->root->root_key.objectid,
+					     btrfs_ino(inode),
+					     folio_pos(folio),
+					     fs_info->sectors_per_page,
+					     &bio_ctrl->submit_bitmap,
+					     found_start, found_len, ret);
 		} else {
 			/*
 			 * We've hit an error during previous delalloc range,
@@ -1274,8 +1297,21 @@
 		delalloc_start = found_start + found_len;
 	}
-	if (ret < 0)
+	/*
+	 * It's possible we have some ordered extents created before we hit
+	 * an error, so clean up the non-async successfully created delalloc
+	 * ranges.
+ */ + if (unlikely(ret < 0)) { + unsigned int bitmap_size = min( + (last_finished - page_start) >> fs_info->sectorsize_bits, + fs_info->sectors_per_page); + + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) + btrfs_mark_ordered_io_finished(inode, folio, + page_start + (bit << fs_info->sectorsize_bits), + fs_info->sectorsize, false); return ret; + } out: if (last_delalloc_end) delalloc_end = last_delalloc_end; @@ -1335,7 +1371,7 @@ static int submit_one_sector(struct btrfs_inode *inode, em = btrfs_get_extent(inode, NULL, filepos, sectorsize); if (IS_ERR(em)) - return PTR_ERR_OR_ZERO(em); + return PTR_ERR(em); extent_offset = filepos - em->start; em_end = extent_map_end(em); @@ -1391,6 +1427,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; bool submitted_io = false; + bool error = false; const u64 folio_start = folio_pos(folio); u64 cur; int bit; @@ -1433,11 +1470,21 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, break; } ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); - if (ret < 0) - goto out; + if (unlikely(ret < 0)) { + submit_one_bio(bio_ctrl); + /* + * Failed to grab the extent map which should be very rare. + * Since there is no bio submitted to finish the ordered + * extent, we have to manually finish this sector. + */ + btrfs_mark_ordered_io_finished(inode, folio, cur, + fs_info->sectorsize, false); + error = true; + continue; + } submitted_io = true; } -out: + /* * If we didn't submitted any sector (>= i_size), folio dirty get * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared @@ -1445,8 +1492,11 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, * * Here we set writeback and clear for the range. If the full folio * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. + * + * If we hit any error, the corresponding sector will still be dirty + * thus no need to clear PAGECACHE_TAG_DIRTY. */ - if (!submitted_io) { + if (!submitted_io && !error) { btrfs_folio_set_writeback(fs_info, folio, start, len); btrfs_folio_clear_writeback(fs_info, folio, start, len); } @@ -1466,7 +1516,6 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl { struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - const u64 page_start = folio_pos(folio); int ret; size_t pg_offset; loff_t i_size = i_size_read(inode); @@ -1506,16 +1555,19 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl PAGE_SIZE, bio_ctrl, i_size); if (ret == 1) return 0; + if (ret < 0) + btrfs_err_rl(fs_info, +"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", + BTRFS_I(inode)->root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), + folio_pos(folio), fs_info->sectors_per_page, + &bio_ctrl->submit_bitmap, ret); bio_ctrl->wbc->nr_to_write--; done: - if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, - page_start, PAGE_SIZE, !ret); + if (ret < 0) mapping_set_error(folio->mapping, ret); - } - /* * Only unlock ranges that are submitted. As there can be some async * submitted ranges inside the folio. 
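The error paths above map set bits of bio_ctrl->submit_bitmap back to file offsets before finishing the corresponding ordered extents. A standalone sketch of that arithmetic (helper name hypothetical, not part of the patch):

/*
 * Sketch only: one submit_bitmap bit covers one sector of the folio, so a
 * set bit maps back to a file offset via the sector size shift.
 */
static u64 submit_bit_to_offset(const struct btrfs_fs_info *fs_info,
				u64 folio_start, unsigned int bit)
{
	return folio_start + ((u64)bit << fs_info->sectorsize_bits);
}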
@@ -2292,11 +2344,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f if (ret == 1) goto next_page; - if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, - cur, cur_len, !ret); + if (ret) mapping_set_error(mapping, ret); - } btrfs_folio_end_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 588c353d296935..3c00dc48b925e9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -36,52 +36,7 @@ #include "ioctl.h" #include "file.h" #include "super.h" - -/* - * Helper to fault in page and copy. This should go away and be replaced with - * calls into generic code. - */ -static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, - struct folio *folio, struct iov_iter *i) -{ - size_t copied = 0; - size_t total_copied = 0; - int offset = offset_in_page(pos); - - while (write_bytes > 0) { - size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes); - /* - * Copy data from userspace to the current page - */ - copied = copy_folio_from_iter_atomic(folio, offset, count, i); - - /* Flush processor's dcache for this page */ - flush_dcache_folio(folio); - - /* - * if we get a partial write, we can end up with - * partially up to date page. These add - * a lot of complexity, so make sure they don't - * happen by forcing this copy to be retried. - * - * The rest of the btrfs_file_write code will fall - * back to page at a time copies after we return 0. - */ - if (unlikely(copied < count)) { - if (!folio_test_uptodate(folio)) { - iov_iter_revert(i, copied); - copied = 0; - } - if (!copied) - break; - } - - write_bytes -= copied; - total_copied += copied; - offset += copied; - } - return total_copied; -} +#include "print-tree.h" /* * Unlock folio after btrfs_file_write() is done with it. @@ -106,7 +61,7 @@ static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio, } /* - * After btrfs_copy_from_user(), update the following things for delalloc: + * After copy_folio_from_iter_atomic(), update the following things for delalloc: * - Mark newly dirtied folio as DELALLOC in the io tree. * Used to advise which range is to be written back. 
* - Mark modified folio as Uptodate/Dirty and not needing COW fixup @@ -224,7 +179,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->drop_cache) btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false); - if (args->start >= inode->disk_i_size && !args->replace_extent) + if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent) modify_tree = 0; update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); @@ -245,7 +200,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } ret = btrfs_next_leaf(root, path); if (ret < 0) break; @@ -321,7 +280,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, * | -------- extent -------- | */ if (args->start > key.offset && args->end < extent_end) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; @@ -409,7 +372,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, * | -------- extent -------- | */ if (args->start > key.offset && args->end >= extent_end) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; @@ -437,7 +404,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, del_slot = path->slots[0]; del_nr = 1; } else { - BUG_ON(del_slot + del_nr != path->slots[0]); + if (WARN_ON(del_slot + del_nr != path->slots[0])) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } del_nr++; } @@ -911,6 +882,7 @@ static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ ret = PTR_ERR(folio); return ret; } + folio_wait_writeback(folio); /* Only support page sized folio yet. */ ASSERT(folio_order(folio) == 0); ret = set_folio_extent_mapped(folio); @@ -1251,7 +1223,23 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) break; } - copied = btrfs_copy_from_user(pos, write_bytes, folio, i); + copied = copy_folio_from_iter_atomic(folio, + offset_in_folio(folio, pos), write_bytes, i); + flush_dcache_folio(folio); + + /* + * If we get a partial write, we can end up with partially + * uptodate page. Although if sector size < page size we can + * handle it, but if it's not sector aligned it can cause + * a lot of complexity, so make sure they don't happen by + * forcing retry this copy. + */ + if (unlikely(copied < write_bytes)) { + if (!folio_test_uptodate(folio)) { + iov_iter_revert(i, copied); + copied = 0; + } + } num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); dirty_sectors = round_up(copied + sector_offset, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fa648ab6fe8066..a5d33ebf90d49a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -393,34 +393,13 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) * extent (btrfs_finish_ordered_io()). 
*/ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, - struct folio *locked_folio, u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start = 0, page_end = 0; struct folio *folio; - if (locked_folio) { - page_start = folio_pos(locked_folio); - page_end = page_start + folio_size(locked_folio) - 1; - } - while (index <= end_index) { - /* - * For locked page, we will call btrfs_mark_ordered_io_finished - * through btrfs_mark_ordered_io_finished() on it - * in run_delalloc_range() for the error handling, which will - * clear page Ordered and run the ordered extent accounting. - * - * Here we can't just clear the Ordered bit, or - * btrfs_mark_ordered_io_finished() would skip the accounting - * for the page range, and the ordered extent will never finish. - */ - if (locked_folio && index == (page_start >> PAGE_SHIFT)) { - index++; - continue; - } folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); index++; if (IS_ERR(folio)) @@ -436,23 +415,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, folio_put(folio); } - if (locked_folio) { - /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_start + folio_size(locked_folio)) - return; - /* - * In case this page belongs to the delalloc range being - * instantiated then skip it, since the first page of a range is - * going to be properly cleaned up by the caller of - * run_delalloc_range - */ - if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - folio_pos(locked_folio) - - folio_size(locked_folio); - offset = folio_pos(locked_folio) + folio_size(locked_folio); - } - } - return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } @@ -1129,19 +1091,14 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, locked_folio, - start, end - start + 1); - if (locked_folio) { - const u64 page_start = folio_pos(locked_folio); - - folio_start_writeback(locked_folio); - folio_end_writeback(locked_folio); - btrfs_mark_ordered_io_finished(inode, locked_folio, - page_start, PAGE_SIZE, - !ret); - mapping_set_error(locked_folio->mapping, ret); - folio_unlock(locked_folio); - } + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); + if (locked_folio) + btrfs_folio_end_lock(inode->root->fs_info, locked_folio, + start, async_extent->ram_size); + btrfs_err_rl(inode->root->fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, async_extent->ram_size, ret); } } @@ -1254,7 +1211,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, free_async_extent_pages(async_extent); if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); - btrfs_debug(fs_info, + btrfs_debug_rl(fs_info, "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", btrfs_root_id(root), btrfs_ino(inode), start, async_extent->ram_size, ret); @@ -1372,6 +1329,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode, alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); + /* + * We're not doing compressed IO, don't unlock the first page + * (which the caller expects to stay locked), don't clear any + * dirty bits and don't set any writeback bits + * + * Do set the Ordered (Private2) bit so we know 
this page was + * properly setup for writepage. + */ + page_ops = (keep_locked ? 0 : PAGE_UNLOCK); + page_ops |= PAGE_SET_ORDERED; + /* * Relocation relies on the relocated extents to have exactly the same * size as the original extents. Normally writeback for relocation data @@ -1431,6 +1399,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode, file_extent.offset = 0; file_extent.compression = BTRFS_COMPRESS_NONE; + /* + * Locked range will be released either during error clean up or + * after the whole range is finished. + */ lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, &cached); @@ -1476,21 +1448,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* - * We're not doing compressed IO, don't unlock the first page - * (which the caller expects to stay locked), don't clear any - * dirty bits and don't set any writeback bits - * - * Do set the Ordered flag so we know this page was - * properly setup for writepage. - */ - page_ops = (keep_locked ? 0 : PAGE_UNLOCK); - page_ops |= PAGE_SET_ORDERED; - - extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - locked_folio, &cached, - EXTENT_LOCKED | EXTENT_DELALLOC, - page_ops); if (num_bytes < cur_alloc_size) num_bytes = 0; else @@ -1507,6 +1464,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } + extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC, + page_ops); done: if (done_offset) *done_offset = end; @@ -1527,35 +1487,31 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * We process each region below. */ - clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; - page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; - /* * For the range (1). We have already instantiated the ordered extents * for this region. They are cleaned up by * btrfs_cleanup_ordered_extents() in e.g, - * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are - * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | - * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup - * function. + * btrfs_run_delalloc_range(). + * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV + * are also handled by the cleanup function. * - * However, in case of @keep_locked, we still need to unlock the pages - * (except @locked_folio) to ensure all the pages are unlocked. + * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, + * and finish the writeback of the involved folios, which will be + * never submitted. */ - if (keep_locked && orig_start < start) { + if (orig_start < start) { + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_folio, NULL, 0, page_ops); + locked_folio, NULL, clear_bits, page_ops); } - /* - * At this point we're unlocked, we want to make sure we're only - * clearing these flags under the extent lock, so lock the rest of the - * range and clear everything up. 
- */ - lock_extent(&inode->io_tree, start, end, NULL); + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * For the range (2). If we reserved an extent for our delalloc range @@ -1589,6 +1545,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, end - start - cur_alloc_size + 1, NULL); } + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); return ret; } @@ -1809,7 +1769,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, bytes = range_bytes; spin_lock(&sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); + btrfs_space_info_update_bytes_may_use(sinfo, bytes); spin_unlock(&sinfo->lock); if (count > 0) @@ -1970,6 +1930,48 @@ static int can_nocow_file_extent(struct btrfs_path *path, return ret < 0 ? ret : can_nocow; } +static void cleanup_dirty_folios(struct btrfs_inode *inode, + struct folio *locked_folio, + u64 start, u64 end, int error) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + u32 len; + + ASSERT(end + 1 - start < U32_MAX); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + len = end + 1 - start; + + /* + * Handle the locked folio first. + * btrfs_folio_clamp_*() helpers can handle range out of the folio case. + */ + btrfs_folio_clamp_clear_dirty(fs_info, locked_folio, start, len); + btrfs_folio_clamp_set_writeback(fs_info, locked_folio, start, len); + btrfs_folio_clamp_clear_writeback(fs_info, locked_folio, start, len); + + for (pgoff_t index = start_index; index <= end_index; index++) { + struct folio *folio; + + /* Already handled at the beginning. */ + if (index == locked_folio->index) + continue; + folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS); + /* Cache already dropped, no need to do any cleanup. */ + if (IS_ERR(folio)) + continue; + btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len); + btrfs_folio_clamp_set_writeback(fs_info, folio, start, len); + btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); + folio_unlock(folio); + folio_put(folio); + } + mapping_set_error(mapping, error); +} + /* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. @@ -1985,6 +1987,11 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; + /* + * If not 0, represents the inclusive end of the last fallback_to_cow() + * range. Only for error handling. 
+	 */
+	u64 cow_end = 0;
 	u64 cur_offset = start;
 	int ret;
 	bool check_prev = true;
@@ -2145,6 +2152,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
 						found_key.offset - 1);
 			cow_start = (u64)-1;
 			if (ret) {
+				cow_end = found_key.offset - 1;
 				btrfs_dec_nocow_writers(nocow_bg);
 				goto error;
 			}
@@ -2218,24 +2226,55 @@
 		cow_start = cur_offset;
 
 	if (cow_start != (u64)-1) {
-		cur_offset = end;
 		ret = fallback_to_cow(inode, locked_folio, cow_start, end);
 		cow_start = (u64)-1;
-		if (ret)
+		if (ret) {
+			cow_end = end;
 			goto error;
+		}
 	}
 
 	btrfs_free_path(path);
 	return 0;
 
 error:
+	/*
+	 * There are several error cases:
+	 *
+	 * 1) Failed without falling back to COW
+	 *    start         cur_offset              end
+	 *    |/////////////|                       |
+	 *
+	 * For range [start, cur_offset) the folios are already unlocked (except
+	 * @locked_folio), EXTENT_DELALLOC already removed.
+	 * Only need to clear the dirty flag as they will never be submitted.
+	 * Ordered extent and extent maps are handled by
+	 * btrfs_mark_ordered_io_finished() inside run_delalloc_range().
+	 *
+	 * 2) Failed with error from fallback_to_cow()
+	 *    start         cur_offset  cow_end     end
+	 *    |/////////////|-----------|           |
+	 *
+	 * For range [start, cur_offset) it's the same as case 1).
+	 * But for range [cur_offset, cow_end), the folios have dirty flag
+	 * cleared and unlocked, EXTENT_DELALLOC cleared.
+	 * There may or may not be any ordered extents/extent maps allocated.
+	 *
+	 * We should not call extent_clear_unlock_delalloc() on range [cur_offset,
+	 * cow_end), as the folios are already unlocked.
+	 *
+	 * So clear the folio dirty flags for [start, cur_offset) first.
+	 */
+	if (cur_offset > start)
+		cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
+
 	/*
 	 * If an error happened while a COW region is outstanding, cur_offset
-	 * needs to be reset to cow_start to ensure the COW region is unlocked
-	 * as well.
+	 * needs to be reset to @cow_end + 1 to skip the COW range, as
+	 * cow_file_range() will do the proper cleanup on error.
*/ - if (cow_start != (u64)-1) - cur_offset = cow_start; + if (cow_end) + cur_offset = cow_end + 1; /* * We need to lock the extent here because we're clearing DELALLOC and @@ -2255,6 +2294,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); } btrfs_free_path(path); + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, end + 1 - start, ret); return ret; } @@ -2305,8 +2348,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol out: if (ret < 0) - btrfs_cleanup_ordered_extents(inode, locked_folio, start, - end - start + 1); + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); return ret; } @@ -9078,9 +9120,9 @@ static ssize_t btrfs_encoded_read_inline( } struct btrfs_encoded_read_private { - wait_queue_head_t wait; + struct completion done; void *uring_ctx; - atomic_t pending; + refcount_t pending_refs; blk_status_t status; }; @@ -9099,14 +9141,14 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (atomic_dec_and_test(&priv->pending)) { + if (refcount_dec_and_test(&priv->pending_refs)) { int err = blk_status_to_errno(READ_ONCE(priv->status)); if (priv->uring_ctx) { btrfs_uring_read_extent_endio(priv->uring_ctx, err); kfree(priv); } else { - wake_up(&priv->wait); + complete(&priv->done); } } bio_put(&bbio->bio); @@ -9126,8 +9168,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (!priv) return -ENOMEM; - init_waitqueue_head(&priv->wait); - atomic_set(&priv->pending, 1); + init_completion(&priv->done); + refcount_set(&priv->pending_refs, 1); priv->status = 0; priv->uring_ctx = uring_ctx; @@ -9140,7 +9182,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, @@ -9155,11 +9197,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); if (uring_ctx) { - if (atomic_dec_return(&priv->pending) == 0) { + if (refcount_dec_and_test(&priv->pending_refs)) { ret = blk_status_to_errno(READ_ONCE(priv->status)); btrfs_uring_read_extent_endio(uring_ctx, ret); kfree(priv); @@ -9168,8 +9210,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, return -EIOCBQUEUED; } else { - if (atomic_dec_return(&priv->pending) != 0) - io_wait_event(priv->wait, !atomic_read(&priv->pending)); + if (!refcount_dec_and_test(&priv->pending_refs)) + wait_for_completion_io(&priv->done); /* See btrfs_encoded_read_endio() for ordering. 
*/ ret = blk_status_to_errno(READ_ONCE(priv->status)); kfree(priv); @@ -9876,6 +9918,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (btrfs_root_dead(root)) { spin_unlock(&root->root_item_lock); + btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3af8bb0c8d75d7..dc5faa89cdba38 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4984,15 +4984,14 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue * undo this. */ if (!iov) { - iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS); + iov = kmemdup(iovstack, sizeof(struct iovec) * args.iovcnt, + GFP_NOFS); if (!iov) { unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); ret = -ENOMEM; goto out_acct; } - - memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt); } count = min_t(u64, iov_iter_count(&iter), disk_io_size); @@ -5300,6 +5299,8 @@ long btrfs_ioctl(struct file *file, unsigned int return fsverity_ioctl_enable(file, (const void __user *)argp); case FS_IOC_MEASURE_VERITY: return fsverity_ioctl_measure(file, argp); + case FS_IOC_READ_VERITY_METADATA: + return fsverity_ioctl_read_metadata(file, argp); case BTRFS_IOC_ENCODED_READ: return btrfs_ioctl_encoded_read(file, argp, false); case BTRFS_IOC_ENCODED_WRITE: diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a6f92836c9b113..c82df56fbf3dcf 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1839,9 +1839,19 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) * Thus its reserved space should all be zero, no matter if qgroup * is consistent or the mode. */ - WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || - qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || - qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]); + if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_warn_rl(fs_info, +"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA], + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC], + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]); + + } /* * The same for rfer/excl numbers, but that's only if our qgroup is * consistent and if it's in regular qgroup mode. 
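Both qgroup warnings in this area follow the same shape; a condensed sketch of the pattern (helper name and message simplified, hypothetical):

/*
 * Sketch only: always emit a ratelimited warning so production kernels log
 * the inconsistency, but only trip a stack-dumping WARN on debug builds.
 */
static void warn_stale_qgroup_numbers(struct btrfs_fs_info *fs_info, u64 qgroupid)
{
	WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
	btrfs_warn_rl(fs_info,
		      "to be deleted qgroup %llu has non-zero numbers", qgroupid);
}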
@@ -1850,8 +1860,9 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) */ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL && !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) { - if (WARN_ON(qgroup->rfer || qgroup->excl || - qgroup->rfer_cmpr || qgroup->excl_cmpr)) { + if (qgroup->rfer || qgroup->excl || + qgroup->rfer_cmpr || qgroup->excl_cmpr) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); btrfs_warn_rl(fs_info, "to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", btrfs_qgroup_level(qgroup->qgroupid), diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index bf267bdfa8f811..6d3d2a9b6ed33f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -342,12 +342,6 @@ static bool handle_useless_nodes(struct reloc_control *rc, if (cur == node) ret = true; - /* The node is the lowest node */ - if (cur->lowest) { - list_del_init(&cur->lower); - cur->lowest = 0; - } - /* Cleanup the lower edges */ while (!list_empty(&cur->lower)) { struct btrfs_backref_edge *edge; @@ -373,7 +367,6 @@ static bool handle_useless_nodes(struct reloc_control *rc, * cache to avoid unnecessary backref lookup. */ if (cur->level > 0) { - list_add(&cur->list, &cache->detached); cur->detached = 1; } else { rb_erase(&cur->rb_node, &cache->rb_root); @@ -426,7 +419,6 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( goto out; } - node->lowest = 1; cur = node; /* Breadth-first search to build backref cache */ @@ -469,92 +461,6 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( return node; } -/* - * helper to add backref node for the newly created snapshot. - * the backref node is created by cloning backref node that - * corresponds to root of source tree - */ -static int clone_backref_node(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - const struct btrfs_root *src, - struct btrfs_root *dest) -{ - struct btrfs_root *reloc_root = src->reloc_root; - struct btrfs_backref_cache *cache = &rc->backref_cache; - struct btrfs_backref_node *node = NULL; - struct btrfs_backref_node *new_node; - struct btrfs_backref_edge *edge; - struct btrfs_backref_edge *new_edge; - struct rb_node *rb_node; - - rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct btrfs_backref_node, rb_node); - if (node->detached) - node = NULL; - else - BUG_ON(node->new_bytenr != reloc_root->node->start); - } - - if (!node) { - rb_node = rb_simple_search(&cache->rb_root, - reloc_root->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct btrfs_backref_node, - rb_node); - BUG_ON(node->detached); - } - } - - if (!node) - return 0; - - new_node = btrfs_backref_alloc_node(cache, dest->node->start, - node->level); - if (!new_node) - return -ENOMEM; - - new_node->lowest = node->lowest; - new_node->checked = 1; - new_node->root = btrfs_grab_root(dest); - ASSERT(new_node->root); - - if (!node->lowest) { - list_for_each_entry(edge, &node->lower, list[UPPER]) { - new_edge = btrfs_backref_alloc_edge(cache); - if (!new_edge) - goto fail; - - btrfs_backref_link_edge(new_edge, edge->node[LOWER], - new_node, LINK_UPPER); - } - } else { - list_add_tail(&new_node->lower, &cache->leaves); - } - - rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr, - &new_node->rb_node); - if (rb_node) - btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST); - - if (!new_node->lowest) { - list_for_each_entry(new_edge, 
&new_node->lower, list[UPPER]) { - list_add_tail(&new_edge->list[LOWER], - &new_edge->node[LOWER]->upper); - } - } - return 0; -fail: - while (!list_empty(&new_node->lower)) { - new_edge = list_entry(new_node->lower.next, - struct btrfs_backref_edge, list[UPPER]); - list_del(&new_edge->list[UPPER]); - btrfs_backref_free_edge(cache, new_edge); - } - btrfs_backref_free_node(cache, new_node); - return -ENOMEM; -} - /* * helper to add 'address of tree root -> reloc tree' mapping */ @@ -2058,100 +1964,72 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, int index = 0; int ret; - next = node; - while (1) { - cond_resched(); - next = walk_up_backref(next, edges, &index); - root = next->root; + next = walk_up_backref(node, edges, &index); + root = next->root; - /* - * If there is no root, then our references for this block are - * incomplete, as we should be able to walk all the way up to a - * block that is owned by a root. - * - * This path is only for SHAREABLE roots, so if we come upon a - * non-SHAREABLE root then we have backrefs that resolve - * improperly. - * - * Both of these cases indicate file system corruption, or a bug - * in the backref walking code. - */ - if (!root) { - ASSERT(0); - btrfs_err(trans->fs_info, - "bytenr %llu doesn't have a backref path ending in a root", - node->bytenr); - return ERR_PTR(-EUCLEAN); - } - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { - ASSERT(0); - btrfs_err(trans->fs_info, - "bytenr %llu has multiple refs with one ending in a non-shareable root", - node->bytenr); - return ERR_PTR(-EUCLEAN); - } - - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { - ret = record_reloc_root_in_trans(trans, root); - if (ret) - return ERR_PTR(ret); - break; - } + /* + * If there is no root, then our references for this block are + * incomplete, as we should be able to walk all the way up to a block + * that is owned by a root. + * + * This path is only for SHAREABLE roots, so if we come upon a + * non-SHAREABLE root then we have backrefs that resolve improperly. + * + * Both of these cases indicate file system corruption, or a bug in the + * backref walking code. + */ + if (unlikely(!root)) { + btrfs_err(trans->fs_info, + "bytenr %llu doesn't have a backref path ending in a root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_err(trans->fs_info, + "bytenr %llu has multiple refs with one ending in a non-shareable root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } - ret = btrfs_record_root_in_trans(trans, root); + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { + ret = record_reloc_root_in_trans(trans, root); if (ret) return ERR_PTR(ret); - root = root->reloc_root; - - /* - * We could have raced with another thread which failed, so - * root->reloc_root may not be set, return ENOENT in this case. - */ - if (!root) - return ERR_PTR(-ENOENT); + goto found; + } - if (next->new_bytenr != root->node->start) { - /* - * We just created the reloc root, so we shouldn't have - * ->new_bytenr set and this shouldn't be in the changed - * list. If it is then we have multiple roots pointing - * at the same bytenr which indicates corruption, or - * we've made a mistake in the backref walking code. 
- */ - ASSERT(next->new_bytenr == 0); - ASSERT(list_empty(&next->list)); - if (next->new_bytenr || !list_empty(&next->list)) { - btrfs_err(trans->fs_info, - "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", - node->bytenr, next->bytenr); - return ERR_PTR(-EUCLEAN); - } + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); + root = root->reloc_root; - next->new_bytenr = root->node->start; - btrfs_put_root(next->root); - next->root = btrfs_grab_root(root); - ASSERT(next->root); - list_add_tail(&next->list, - &rc->backref_cache.changed); - mark_block_processed(rc, next); - break; - } + /* + * We could have raced with another thread which failed, so + * root->reloc_root may not be set, return ENOENT in this case. + */ + if (!root) + return ERR_PTR(-ENOENT); - WARN_ON(1); - root = NULL; - next = walk_down_backref(edges, &index); - if (!next || next->level <= node->level) - break; - } - if (!root) { + if (next->new_bytenr) { /* - * This can happen if there's fs corruption or if there's a bug - * in the backref lookup code. + * We just created the reloc root, so we shouldn't have + * ->new_bytenr set yet. If it is then we have multiple roots + * pointing at the same bytenr which indicates corruption, or + * we've made a mistake in the backref walking code. */ - ASSERT(0); - return ERR_PTR(-ENOENT); + ASSERT(next->new_bytenr == 0); + btrfs_err(trans->fs_info, + "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", + node->bytenr, next->bytenr); + return ERR_PTR(-EUCLEAN); } + next->new_bytenr = root->node->start; + btrfs_put_root(next->root); + next->root = btrfs_grab_root(root); + ASSERT(next->root); + mark_block_processed(rc, next); +found: next = node; /* setup backref node path for btrfs_reloc_cow_block */ while (1) { @@ -2247,17 +2125,11 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, return num_bytes; } -static int reserve_metadata_space(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_backref_node *node) +static int refill_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, u64 num_bytes) { - struct btrfs_root *root = rc->extent_root; - struct btrfs_fs_info *fs_info = root->fs_info; - u64 num_bytes; + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - u64 tmp; - - num_bytes = calcu_metadata_size(rc, node) * 2; trans->block_rsv = rc->block_rsv; rc->reserved_bytes += num_bytes; @@ -2270,7 +2142,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { - tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; + u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) tmp <<= 1; /* @@ -2288,6 +2161,16 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, return 0; } +static int reserve_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_backref_node *node) +{ + u64 num_bytes; + + num_bytes = calcu_metadata_size(rc, node) * 2; + return refill_metadata_space(trans, rc, num_bytes); +} + /* * relocate a block tree, and then update pointers in upper level * blocks that reference the block to point to the new location. 
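The hunks above flatten select_reloc_root() into a single walk_up_backref() call instead of the old retry loop, with the impossible cases reported as -EUCLEAN, and split the metadata reservation into a wrapper pair: refill_metadata_space() now takes a plain byte count, while reserve_metadata_space() keeps deriving that count from the backref node. The split exists so that relocate_cowonly_block(), added in a later hunk, can reserve space for a plain COW walk without building a backref tree. A reduced sketch of the wrapper shape, with the flushing and ENOSPC fallback elided:

/* Reduced sketch of the split; the real refill also handles ENOSPC. */
static int refill_metadata_space(struct btrfs_trans_handle *trans,
				 struct reloc_control *rc, u64 num_bytes)
{
	trans->block_rsv = rc->block_rsv;
	rc->reserved_bytes += num_bytes;
	/* ... btrfs_block_rsv_refill() and the reservation growth logic ... */
	return 0;
}

static int reserve_metadata_space(struct btrfs_trans_handle *trans,
				  struct reloc_control *rc,
				  struct btrfs_backref_node *node)
{
	/* Same sizing as before the split: twice the computed path size. */
	return refill_metadata_space(trans, rc, calcu_metadata_size(rc, node) * 2);
}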
@@ -2442,7 +2325,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!ret && node->pending) { btrfs_backref_drop_node_buffer(node); - list_move_tail(&node->list, &rc->backref_cache.changed); + list_del_init(&node->list); node->pending = 0; } @@ -2605,8 +2488,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, /* * This block was the root block of a root, and this is * the first time we're processing the block and thus it - * should not have had the ->new_bytenr modified and - * should have not been included on the changed list. + * should not have had the ->new_bytenr modified. * * However in the case of corruption we could have * multiple refs pointing to the same block improperly, @@ -2616,8 +2498,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, * normal user in the case of corruption. */ ASSERT(node->new_bytenr == 0); - ASSERT(list_empty(&node->list)); - if (node->new_bytenr || !list_empty(&node->list)) { + if (node->new_bytenr) { btrfs_err(root->fs_info, "bytenr %llu has improper references to it", node->bytenr); @@ -2640,17 +2521,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, btrfs_put_root(node->root); node->root = btrfs_grab_root(root); ASSERT(node->root); - list_add_tail(&node->list, &rc->backref_cache.changed); } else { - path->lowest_level = node->level; - if (root == root->fs_info->chunk_root) - btrfs_reserve_chunk_metadata(trans, false); - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - btrfs_release_path(path); - if (root == root->fs_info->chunk_root) - btrfs_trans_release_chunk_metadata(trans); - if (ret > 0) - ret = 0; + btrfs_err(root->fs_info, + "bytenr %llu resolved to a non-shareable root", + node->bytenr); + ret = -EUCLEAN; + goto out; } if (!ret) update_processed_blocks(rc, node); @@ -2658,11 +2534,50 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, ret = do_relocation(trans, rc, node, key, path, 1); } out: - if (ret || node->level == 0 || node->cowonly) + if (ret || node->level == 0) btrfs_backref_cleanup_node(&rc->backref_cache, node); return ret; } +static int relocate_cowonly_block(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct tree_block *block, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root; + u64 num_bytes; + int nr_levels; + int ret; + + root = btrfs_get_fs_root(fs_info, block->owner, true); + if (IS_ERR(root)) + return PTR_ERR(root); + + nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1; + + num_bytes = fs_info->nodesize * nr_levels; + ret = refill_metadata_space(trans, rc, num_bytes); + if (ret) { + btrfs_put_root(root); + return ret; + } + path->lowest_level = block->level; + if (root == root->fs_info->chunk_root) + btrfs_reserve_chunk_metadata(trans, false); + + ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1); + path->lowest_level = 0; + btrfs_release_path(path); + + if (root == root->fs_info->chunk_root) + btrfs_trans_release_chunk_metadata(trans); + if (ret > 0) + ret = 0; + btrfs_put_root(root); + + return ret; +} + /* * relocate a list of blocks */ @@ -2702,6 +2617,20 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Do tree relocation */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { + /* + * For COWonly blocks, or the data reloc tree, we only need to + * COW down to the block, there's no need to generate a backref + * tree. 
+ */ + if (block->owner && + (!is_fstree(block->owner) || + block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { + ret = relocate_cowonly_block(trans, rc, block, path); + if (ret) + break; + continue; + } + node = build_backref_tree(trans, rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { @@ -4399,8 +4328,18 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, WARN_ON(!first_cow && level == 0); node = rc->backref_cache.path[level]; - BUG_ON(node->bytenr != buf->start && - node->new_bytenr != buf->start); + + /* + * If node->bytenr != buf->start and node->new_bytenr != + * buf->start then we've got the wrong backref node for what we + * expected to see here and the cache is incorrect. + */ + if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) { + btrfs_err(fs_info, +"bytenr %llu was found but our backref cache was expecting %llu or %llu", + buf->start, node->bytenr, node->new_bytenr); + return -EUCLEAN; + } btrfs_backref_drop_node_buffer(node); atomic_inc(&cow->refs); @@ -4500,10 +4439,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, return ret; } new_root->reloc_root = btrfs_grab_root(reloc_root); - - if (rc->create_reloc_tree) - ret = clone_backref_node(trans, rc, root, reloc_root); - return ret; + return 0; } /* diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7254279c3cc92c..c5a318feb8aed4 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7253,7 +7253,7 @@ static int changed_cb(struct btrfs_path *left_path, enum btrfs_compare_tree_result result, struct send_ctx *sctx) { - int ret = 0; + int ret; /* * We can not hold the commit root semaphore here. This is because in @@ -7313,7 +7313,6 @@ static int changed_cb(struct btrfs_path *left_path, return 0; } result = BTRFS_COMPARE_TREE_CHANGED; - ret = 0; } sctx->left_path = left_path; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 255e85f78313ca..a341d087567aa0 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -14,6 +14,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "zoned.h" /* * HOW DOES SPACE RESERVATION WORK @@ -127,6 +128,14 @@ * churn a lot and we can avoid making some extent tree modifications if we * are able to delay for as long as possible. * + * RESET_ZONES + * This state works only for the zoned mode. On the zoned mode, we cannot + * reuse once allocated then freed region until we reset the zone, due to + * the sequential write zone requirement. The RESET_ZONES state resets the + * zones of an unused block group and let us reuse the space. The reusing + * is faster than removing the block group and allocating another block + * group on the zones. 
+ * * ALLOC_CHUNK * We will skip this the first time through space reservation, because of * overcommit and we don't want to have a lot of useless metadata space when @@ -316,7 +325,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, found->bytes_used += block_group->used; found->disk_used += block_group->used * factor; found->bytes_readonly += block_group->bytes_super; - btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(found, block_group->zone_unusable); if (block_group->length > 0) found->full = 0; btrfs_try_granting_tickets(info, found); @@ -489,9 +498,7 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, if ((used + ticket->bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, - ticket->bytes); + btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes); remove_ticket(space_info, ticket); ticket->bytes = 0; space_info->tickets_id++; @@ -834,6 +841,9 @@ static void flush_space(struct btrfs_fs_info *fs_info, */ ret = btrfs_commit_current_transaction(root); break; + case RESET_ZONES: + ret = btrfs_reset_unused_block_groups(space_info, num_bytes); + break; default: ret = -ENOSPC; break; @@ -1086,9 +1096,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) enum btrfs_flush_state flush_state; int commit_cycles = 0; u64 last_tickets_id; + enum btrfs_flush_state final_state; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + if (btrfs_is_zoned(fs_info)) + final_state = RESET_ZONES; + else + final_state = COMMIT_TRANS; spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); @@ -1141,7 +1156,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) flush_state++; - if (flush_state > COMMIT_TRANS) { + if (flush_state > final_state) { commit_cycles++; if (commit_cycles > 2) { if (maybe_fail_all_tickets(fs_info, space_info)) { @@ -1155,7 +1170,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } } spin_unlock(&space_info->lock); - } while (flush_state <= COMMIT_TRANS); + } while (flush_state <= final_state); } /* @@ -1286,6 +1301,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * This is where we reclaim all of the pinned space generated by running the * iputs * + * RESET_ZONES + * This state works only for the zoned mode. We scan the unused block group + * list and reset the zones and reuse the block group. 
+ * * ALLOC_CHUNK_FORCE * For data we start with alloc chunk force, however we could have been full * before, and then the transaction commit could have freed new block groups, @@ -1295,6 +1314,7 @@ static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_FULL, RUN_DELAYED_IPUTS, COMMIT_TRANS, + RESET_ZONES, ALLOC_CHUNK_FORCE, }; @@ -1386,6 +1406,7 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) static const enum btrfs_flush_state priority_flush_states[] = { FLUSH_DELAYED_ITEMS_NR, FLUSH_DELAYED_ITEMS, + RESET_ZONES, ALLOC_CHUNK, }; @@ -1399,6 +1420,7 @@ static const enum btrfs_flush_state evict_flush_states[] = { FLUSH_DELALLOC_FULL, ALLOC_CHUNK, COMMIT_TRANS, + RESET_ZONES, }; static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, @@ -1690,8 +1712,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); + btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } @@ -1703,8 +1724,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { used = btrfs_space_info_used(space_info, false); if (used + orig_bytes <= space_info->total_bytes) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); + btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } } @@ -2082,3 +2102,32 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) do_reclaim_sweep(space_info, raid); } } + +void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + + lockdep_assert_held(&space_info->lock); + + /* Prioritize the global reservation to receive the freed space. */ + if (global_rsv->space_info != space_info) + goto grant; + + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + u64 to_add = min(len, global_rsv->size - global_rsv->reserved); + + global_rsv->reserved += to_add; + btrfs_space_info_update_bytes_may_use(space_info, to_add); + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + len -= to_add; + } + spin_unlock(&global_rsv->lock); + +grant: + /* Add to any tickets we may have. */ + if (len) + btrfs_try_granting_tickets(fs_info, space_info); +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index efbecc0c5258d2..a96efdb5e681a3 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -79,6 +79,10 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_EMERGENCY, }; +/* + * Please be aware that the order of enum values will be the order of the reclaim + * process in btrfs_async_reclaim_metadata_space(). 
+ */ enum btrfs_flush_state { FLUSH_DELAYED_ITEMS_NR = 1, FLUSH_DELAYED_ITEMS = 2, @@ -91,6 +95,7 @@ enum btrfs_flush_state { ALLOC_CHUNK_FORCE = 9, RUN_DELAYED_IPUTS = 10, COMMIT_TRANS = 11, + RESET_ZONES = 12, }; struct btrfs_space_info { @@ -229,10 +234,10 @@ static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_i */ #define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \ static inline void \ -btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ - struct btrfs_space_info *sinfo, \ +btrfs_space_info_update_##name(struct btrfs_space_info *sinfo, \ s64 bytes) \ { \ + struct btrfs_fs_info *fs_info = sinfo->fs_info; \ const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \ lockdep_assert_held(&sinfo->lock); \ trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ @@ -275,13 +280,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( - struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes) { spin_lock(&space_info->lock); - btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); - btrfs_try_granting_tickets(fs_info, space_info); + btrfs_space_info_update_bytes_may_use(space_info, -num_bytes); + btrfs_try_granting_tickets(space_info->fs_info, space_info); spin_unlock(&space_info->lock); } int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, @@ -295,5 +299,6 @@ void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info); void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); +void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len); #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 8c68059ac1b0c1..7f47bc61389ca7 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -635,6 +635,28 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, folio_test_checked); +#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ +{ \ + const int sectors_per_page = fs_info->sectors_per_page; \ + \ + ASSERT(sectors_per_page < BITS_PER_LONG); \ + *dst = bitmap_read(subpage->bitmaps, \ + sectors_per_page * btrfs_bitmap_nr_##name, \ + sectors_per_page); \ +} + +#define subpage_dump_bitmap(fs_info, folio, name, start, len) \ +{ \ + struct btrfs_subpage *subpage = folio_get_private(folio); \ + unsigned long bitmap; \ + \ + GET_SUBPAGE_BITMAP(subpage, fs_info, name, &bitmap); \ + btrfs_warn(fs_info, \ + "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ + start, len, folio_pos(folio), \ + fs_info->sectors_per_page, &bitmap); \ +} + /* * Make sure not only the page dirty bit is cleared, but also subpage dirty bit * is cleared.
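The GET_SUBPAGE_BITMAP() and subpage_dump_bitmap() macros were moved up so the assertion sites in the next two hunks can dump the relevant bitmap before tripping ASSERT(), which on its own carries no state (the dump hunk further below also fixes a copy-paste slip where the locked bitmap was read into checked_bitmap). The macros rely on all per-type subpage bitmaps being packed into a single word array, each type owning a window of sectors_per_page bits. A small self-contained sketch of that window arithmetic, with hypothetical window indexes:

/* Sketch of the packed per-type bitmap layout (hypothetical indexes). */
#include <linux/bitmap.h>

enum subpage_window { WIN_UPTODATE, WIN_DIRTY, WIN_LOCKED };

static unsigned long read_window(const unsigned long *bitmaps,
				 unsigned int sectors_per_page,
				 enum subpage_window which)
{
	/* Each state owns sectors_per_page consecutive bits of the array. */
	return bitmap_read(bitmaps, sectors_per_page * which, sectors_per_page);
}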
@@ -660,6 +682,10 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, subpage = folio_get_private(folio); ASSERT(subpage); spin_lock_irqsave(&subpage->lock, flags); + if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { + subpage_dump_bitmap(fs_info, folio, dirty, start, len); + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + } ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -689,23 +715,17 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, nbits = len >> fs_info->sectorsize_bits; spin_lock_irqsave(&subpage->lock, flags); /* Target range should not yet be locked. */ - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { + subpage_dump_bitmap(fs_info, folio, locked, start, len); + btrfs_warn(fs_info, "nr_locked=%u\n", atomic_read(&subpage->nr_locked)); + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + } bitmap_set(subpage->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &subpage->nr_locked); ASSERT(ret <= fs_info->sectors_per_page); spin_unlock_irqrestore(&subpage->lock, flags); } -#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ -{ \ - const int sectors_per_page = fs_info->sectors_per_page; \ - \ - ASSERT(sectors_per_page < BITS_PER_LONG); \ - *dst = bitmap_read(subpage->bitmaps, \ - sectors_per_page * btrfs_bitmap_nr_##name, \ - sectors_per_page); \ -} - void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { @@ -716,6 +736,7 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, unsigned long writeback_bitmap; unsigned long ordered_bitmap; unsigned long checked_bitmap; + unsigned long locked_bitmap; unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -728,15 +749,16 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap); GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap); GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, -"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", +"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), sectors_per_page, &uptodate_bitmap, sectors_per_page, &dirty_bitmap, + sectors_per_page, &locked_bitmap, sectors_per_page, &writeback_bitmap, sectors_per_page, &ordered_bitmap, sectors_per_page, &checked_bitmap); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 97a85d180b61ab..f6eaaf20229d84 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -971,7 +971,7 @@ static int btrfs_fill_super(struct super_block *sb, err = open_ctree(sb, fs_devices); if (err) { - btrfs_err(fs_info, "open_ctree failed"); + btrfs_err(fs_info, "open_ctree failed: %d", err); return err; } @@ -1885,18 +1885,21 @@ static int btrfs_get_tree_super(struct fs_context *fc) if (sb->s_root) { btrfs_close_devices(fs_devices); - if ((fc->sb_flags ^ 
sb->s_flags) & SB_RDONLY) - ret = -EBUSY; + /* + * At this stage we may have RO flag mismatch between + * fc->sb_flags and sb->s_flags. Caller should detect such + * mismatch and reconfigure with sb->s_umount rwsem held if + * needed. + */ } else { snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; ret = btrfs_fill_super(sb, fs_devices); - } - - if (ret) { - deactivate_locked_super(sb); - return ret; + if (ret) { + deactivate_locked_super(sb); + return ret; + } } btrfs_clear_oneshot_options(fs_info); @@ -1982,39 +1985,18 @@ static int btrfs_get_tree_super(struct fs_context *fc) * btrfs or not, setting the whole super block RO. To make per-subvolume mounting * work with different options work we need to keep backward compatibility. */ -static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc) +static int btrfs_reconfigure_for_mount(struct fs_context *fc, struct vfsmount *mnt) { - struct vfsmount *mnt; - int ret; - const bool ro2rw = !(fc->sb_flags & SB_RDONLY); - - /* - * We got an EBUSY because our SB_RDONLY flag didn't match the existing - * super block, so invert our setting here and retry the mount so we - * can get our vfsmount. - */ - if (ro2rw) - fc->sb_flags |= SB_RDONLY; - else - fc->sb_flags &= ~SB_RDONLY; - - mnt = fc_mount(fc); - if (IS_ERR(mnt)) - return mnt; + int ret = 0; - if (!ro2rw) - return mnt; + if (fc->sb_flags & SB_RDONLY) + return ret; - /* We need to convert to rw, call reconfigure. */ - fc->sb_flags &= ~SB_RDONLY; down_write(&mnt->mnt_sb->s_umount); - ret = btrfs_reconfigure(fc); + if (!(fc->sb_flags & SB_RDONLY) && (mnt->mnt_sb->s_flags & SB_RDONLY)) + ret = btrfs_reconfigure(fc); up_write(&mnt->mnt_sb->s_umount); - if (ret) { - mntput(mnt); - return ERR_PTR(ret); - } - return mnt; + return ret; } static int btrfs_get_tree_subvol(struct fs_context *fc) @@ -2024,6 +2006,7 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) struct fs_context *dup_fc; struct dentry *dentry; struct vfsmount *mnt; + int ret = 0; /* * Setup a dummy root and fs_info for test/set super. This is because @@ -2066,11 +2049,16 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) fc->security = NULL; mnt = fc_mount(dup_fc); - if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) - mnt = btrfs_reconfigure_for_mount(dup_fc); - put_fs_context(dup_fc); - if (IS_ERR(mnt)) + if (IS_ERR(mnt)) { + put_fs_context(dup_fc); return PTR_ERR(mnt); + } + ret = btrfs_reconfigure_for_mount(dup_fc, mnt); + put_fs_context(dup_fc); + if (ret) { + mntput(mnt); + return ret; + } /* * This free's ->subvol_name, because if it isn't set we have to diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index e607b5d52fb1d2..5eff8d7d236053 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -30,6 +30,7 @@ const char *test_error[] = { [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map", [TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context", + [TEST_ALLOC_TRANSACTION] = "cannot allocate transaction", }; static const struct super_operations btrfs_test_super_ops = { @@ -142,6 +143,11 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + + /* CRC32C csum size. 
*/ + fs_info->csum_size = 4; + fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / + fs_info->csum_size; set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); test_mnt->mnt_sb->s_fs_info = fs_info; @@ -247,6 +253,15 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache) kfree(cache); } +void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) +{ + memset(trans, 0, sizeof(*trans)); + trans->fs_info = fs_info; + xa_init(&trans->delayed_refs.head_refs); + xa_init(&trans->delayed_refs.dirty_extents); + spin_lock_init(&trans->delayed_refs.lock); +} + void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -295,6 +310,9 @@ int btrfs_run_sanity_tests(void) ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize); if (ret) goto out; + ret = btrfs_test_delayed_refs(sectorsize, nodesize); + if (ret) + goto out; } } ret = btrfs_test_extent_map(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index b524ecf2f452b6..4307bdaa67497c 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -6,6 +6,8 @@ #ifndef BTRFS_TESTS_H #define BTRFS_TESTS_H +#include <linux/types.h> + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_run_sanity_tests(void); @@ -25,12 +27,14 @@ enum { TEST_ALLOC_EXTENT_MAP, TEST_ALLOC_CHUNK_MAP, TEST_ALLOC_IO_CONTEXT, + TEST_ALLOC_TRANSACTION, }; extern const char *test_error[]; struct btrfs_root; struct btrfs_trans_handle; +struct btrfs_transaction; int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); @@ -40,6 +44,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize); int btrfs_test_extent_map(void); +int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); @@ -49,6 +54,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long lengt void btrfs_free_dummy_block_group(struct btrfs_block_group *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info); #else static inline int btrfs_run_sanity_tests(void) diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c new file mode 100644 index 00000000000000..6558508c2ddf50 --- /dev/null +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -0,0 +1,1015 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/sizes.h> +#include "btrfs-tests.h" +#include "../transaction.h" +#include "../delayed-ref.h" +#include "../extent-tree.h" + +#define FAKE_ROOT_OBJECTID 256 +#define FAKE_BYTENR 0 +#define FAKE_LEVEL 1 +#define FAKE_INO 256 +#define FAKE_FILE_OFFSET 0 +#define FAKE_PARENT SZ_1M + +struct ref_head_check { + u64 bytenr; + u64 num_bytes; + int ref_mod; + int total_ref_mod; + int must_insert; +}; + +struct ref_node_check { + u64 bytenr; + u64 num_bytes; + int ref_mod; + enum btrfs_delayed_ref_action action; + u8 type; + u64 parent; + u64 root; + u64 owner; + u64 offset; +}; + +static enum btrfs_ref_type ref_type_from_disk_ref_type(u8 type) +{ + if ((type ==
BTRFS_TREE_BLOCK_REF_KEY) || + (type == BTRFS_SHARED_BLOCK_REF_KEY)) + return BTRFS_REF_METADATA; + return BTRFS_REF_DATA; +} + +static void delete_delayed_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + + spin_lock(&delayed_refs->lock); + spin_lock(&head->lock); + btrfs_delete_ref_head(fs_info, delayed_refs, head); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + + btrfs_delayed_ref_unlock(head); + btrfs_put_delayed_ref_head(head); +} + +static void delete_delayed_ref_node(struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *node) +{ + rb_erase_cached(&node->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&node->ref_node); + if (!list_empty(&node->add_list)) + list_del_init(&node->add_list); + btrfs_put_delayed_ref(node); +} + +static int validate_ref_head(struct btrfs_delayed_ref_head *head, + struct ref_head_check *check) +{ + if (head->bytenr != check->bytenr) { + test_err("invalid bytenr have: %llu want: %llu", head->bytenr, + check->bytenr); + return -EINVAL; + } + + if (head->num_bytes != check->num_bytes) { + test_err("invalid num_bytes have: %llu want: %llu", + head->num_bytes, check->num_bytes); + return -EINVAL; + } + + if (head->ref_mod != check->ref_mod) { + test_err("invalid ref_mod have: %d want: %d", head->ref_mod, + check->ref_mod); + return -EINVAL; + } + + if (head->total_ref_mod != check->total_ref_mod) { + test_err("invalid total_ref_mod have: %d want: %d", + head->total_ref_mod, check->total_ref_mod); + return -EINVAL; + } + + if (head->must_insert_reserved != check->must_insert) { + test_err("invalid must_insert have: %d want: %d", + head->must_insert_reserved, check->must_insert); + return -EINVAL; + } + + return 0; +} + +static int validate_ref_node(struct btrfs_delayed_ref_node *node, + struct ref_node_check *check) +{ + if (node->bytenr != check->bytenr) { + test_err("invalid bytenr have: %llu want: %llu", node->bytenr, + check->bytenr); + return -EINVAL; + } + + if (node->num_bytes != check->num_bytes) { + test_err("invalid num_bytes have: %llu want: %llu", + node->num_bytes, check->num_bytes); + return -EINVAL; + } + + if (node->ref_mod != check->ref_mod) { + test_err("invalid ref_mod have: %d want: %d", node->ref_mod, + check->ref_mod); + return -EINVAL; + } + + if (node->action != check->action) { + test_err("invalid action have: %d want: %d", node->action, + check->action); + return -EINVAL; + } + + if (node->parent != check->parent) { + test_err("invalid parent have: %llu want: %llu", node->parent, + check->parent); + return -EINVAL; + } + + if (node->ref_root != check->root) { + test_err("invalid root have: %llu want: %llu", node->ref_root, + check->root); + return -EINVAL; + } + + if (node->type != check->type) { + test_err("invalid type have: %d want: %d", node->type, + check->type); + return -EINVAL; + } + + if (btrfs_delayed_ref_owner(node) != check->owner) { + test_err("invalid owner have: %llu want: %llu", + btrfs_delayed_ref_owner(node), check->owner); + return -EINVAL; + } + + if (btrfs_delayed_ref_offset(node) != check->offset) { + test_err("invalid offset have: %llu want: %llu", + btrfs_delayed_ref_offset(node), check->offset); + return -EINVAL; + } + + return 0; +} + +static int simple_test(struct btrfs_trans_handle *trans, + struct ref_head_check *head_check, + struct ref_node_check *node_check) +{ + struct btrfs_delayed_ref_root *delayed_refs 
= + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = ref_type_from_disk_ref_type(node_check->type), + .action = node_check->action, + .parent = node_check->parent, + .ref_root = node_check->root, + .bytenr = node_check->bytenr, + .num_bytes = fs_info->nodesize, + }; + int ret; + + if (ref.type == BTRFS_REF_METADATA) + btrfs_init_tree_ref(&ref, node_check->owner, node_check->root, + false); + else + btrfs_init_data_ref(&ref, node_check->owner, node_check->offset, + node_check->root, true); + + if (ref.type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + return -EINVAL; + } + + ret = -EINVAL; + if (validate_ref_head(head, head_check)) + goto out; + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, node_check)) + goto out; + ret = 0; +out: + btrfs_unselect_ref_head(delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +/* + * These are simple tests, make sure that our btrfs_ref's get turned into the + * appropriate btrfs_delayed_ref_node based on their settings and action. + */ +static int simple_tests(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .total_ref_mod = 1, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .action = BTRFS_ADD_DELAYED_REF, + .type = BTRFS_TREE_BLOCK_REF_KEY, + .parent = 0, + .root = FAKE_ROOT_OBJECTID, + .owner = FAKE_LEVEL, + .offset = 0, + }; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add tree block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add extent data failed"); + return -EINVAL; + } + + node_check.parent = FAKE_PARENT; + node_check.type = BTRFS_SHARED_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add shared block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_SHARED_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add shared data failed"); + return -EINVAL; + } + + head_check.ref_mod = -1; + head_check.total_ref_mod = -1; + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.type = BTRFS_TREE_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + node_check.parent = 0; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop tree block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = 
FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop extent data failed"); + return -EINVAL; + } + + node_check.parent = FAKE_PARENT; + node_check.type = BTRFS_SHARED_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop shared block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_SHARED_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop shared data failed"); + return -EINVAL; + } + + return 0; +} + +/* + * Merge tests, validate that we do delayed ref merging properly, the ref counts + * all end up properly, and delayed refs are deleted once they're no longer + * needed. + */ +static int merge_tests(struct btrfs_trans_handle *trans, + enum btrfs_ref_type type) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head = NULL; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = type, + .action = BTRFS_ADD_DELAYED_REF, + .parent = 0, + .ref_root = FAKE_ROOT_OBJECTID, + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + }; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 0, + .total_ref_mod = 0, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 2, + .action = BTRFS_ADD_DELAYED_REF, + .parent = 0, + .root = FAKE_ROOT_OBJECTID, + }; + int ret; + + /* + * First add a ref and then drop it, make sure we get a head ref with a + * 0 total ref mod and no nodes. + */ + if (type == BTRFS_REF_METADATA) { + node_check.type = BTRFS_TREE_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false); + } else { + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + btrfs_init_data_ref(&ref, FAKE_INO, FAKE_FILE_OFFSET, + FAKE_ROOT_OBJECTID, true); + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + ref.action = BTRFS_DROP_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("single add and drop failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Add a ref, then add another ref, make sure we get a head ref with a + * 2 total ref mod and 1 node. 
+ */ + ref.action = BTRFS_ADD_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + head_check.ref_mod = 2; + head_check.total_ref_mod = 2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double add failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Add two drop refs, make sure they are merged properly. */ + ref.action = BTRFS_DROP_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + head_check.ref_mod = -2; + head_check.total_ref_mod = -2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double drop failed"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Add multiple refs, then drop until we go negative again. 
*/ + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 10; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 12; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = -2; + head_check.total_ref_mod = -2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double drop failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Drop multiple refs, then add until we go positive again. */ + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 10; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 12; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = 2; + head_check.total_ref_mod = 2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("add and drop to positive failed"); + goto out; + } + + node_check.action = BTRFS_ADD_DELAYED_REF; + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Add a bunch of refs with different roots and parents, then drop them + * all, make sure everything is properly merged. 
+ */ + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 50; i++) { + if (!(i % 2)) { + ref.parent = 0; + ref.ref_root = FAKE_ROOT_OBJECTID + i; + } else { + ref.parent = FAKE_PARENT + (i * fs_info->nodesize); + } + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 50; i++) { + if (!(i % 2)) { + ref.parent = 0; + ref.ref_root = FAKE_ROOT_OBJECTID + i; + } else { + ref.parent = FAKE_PARENT + (i * fs_info->nodesize); + } + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = 0; + head_check.total_ref_mod = 0; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("add and drop multiple failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + ret = 0; +out: + if (!IS_ERR_OR_NULL(head)) + btrfs_unselect_ref_head(&trans->transaction->delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +/* + * Basic test to validate we always get the add operations first followed by any + * delete operations. + */ +static int select_delayed_refs_test(struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head = NULL; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = BTRFS_REF_METADATA, + .action = BTRFS_DROP_DELAYED_REF, + .parent = 0, + .ref_root = FAKE_ROOT_OBJECTID, + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + }; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 0, + .total_ref_mod = 0, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .action = BTRFS_ADD_DELAYED_REF, + .type = BTRFS_TREE_BLOCK_REF_KEY, + .parent = 0, + .owner = FAKE_LEVEL, + .offset = 0, + }; + int ret; + + /* Add the drop first. */ + btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false); + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + /* + * Now add the add, and make it a different root so it's logically later + * in the rb tree. 
+ */ + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 1; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + head = NULL; + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("head check failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.root = FAKE_ROOT_OBJECTID + 1; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Now we're going to do the same thing, but we're going to have an add + * that gets deleted because of a merge, and make sure we still have + * another add in place. + */ + ref.action = BTRFS_DROP_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 1; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_DROP_DELAYED_REF; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 2; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + head = NULL; + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("head check failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_ADD_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID + 2; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + ret 
= 0; +out: + if (head) + btrfs_unselect_ref_head(delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) +{ + struct btrfs_transaction *transaction; + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info; + int ret; + + test_msg("running delayed refs tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + transaction = kmalloc(sizeof(*transaction), GFP_KERNEL); + if (!transaction) { + test_std_err(TEST_ALLOC_TRANSACTION); + ret = -ENOMEM; + goto out_free_fs_info; + } + btrfs_init_dummy_trans(&trans, fs_info); + btrfs_init_dummy_transaction(transaction, fs_info); + trans.transaction = transaction; + + ret = simple_tests(&trans); + if (!ret) { + test_msg("running delayed refs merge tests on metadata refs"); + ret = merge_tests(&trans, BTRFS_REF_METADATA); + } + + if (!ret) { + test_msg("running delayed refs merge tests on data refs"); + ret = merge_tests(&trans, BTRFS_REF_DATA); + } + + if (!ret) + ret = select_delayed_refs_test(&trans); + +out_free_fs_info: + btrfs_free_dummy_fs_info(fs_info); + return ret; +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dc0b837efd5dfb..15312013f2a34d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -795,8 +795,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, if (num_bytes) btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL); if (delayed_refs_bytes) - btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info, - delayed_refs_bytes); + btrfs_space_info_free_bytes_may_use(trans_rsv->space_info, delayed_refs_bytes); reserve_fail: btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); return ERR_PTR(ret); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 148d8cefa40eef..dfeee033f31fb9 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1527,6 +1527,11 @@ static int check_extent_item(struct extent_buffer *leaf, dref_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_extent_data_ref_count(leaf, dref); break; /* Contains parent bytenr and ref count */ @@ -1539,6 +1544,11 @@ static int check_extent_item(struct extent_buffer *leaf, inline_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_shared_data_ref_count(leaf, sref); break; case BTRFS_EXTENT_OWNER_REF_KEY: @@ -1611,8 +1621,18 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, { u32 expect_item_size = 0; - if (key->type == BTRFS_SHARED_DATA_REF_KEY) + if (key->type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + + sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref); + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data backref count, should have non-zero value"); + return -EUCLEAN; + } + expect_item_size = sizeof(struct btrfs_shared_data_ref); + } if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { generic_err(leaf, slot, @@ -1689,6 +1709,11 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 11ed523e528ec2..abea8f2f497e2b 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2651,3 +2651,127 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
 	}
 	spin_unlock(&fs_info->zone_active_bgs_lock);
 }
+
+/*
+ * Reset the zones of unused block groups to reclaim space accounted in
+ * @space_info->bytes_zone_unusable.
+ *
+ * @space_info:	the space to work on
+ * @num_bytes:	the number of bytes to reclaim
+ *
+ * This function resets the zones of a block group, so we can reuse the region
+ * without removing the block group. On the other hand, btrfs_delete_unused_bgs()
+ * just removes a block group and frees up the underlying zones. So, we still
+ * need to allocate a new block group to reuse the zones.
+ *
+ * Resetting is faster than deleting/recreating a block group. It is similar
+ * to freeing the logical space in regular mode. However, we cannot change
+ * the block group's profile with this operation.
+ */
+int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes)
+{
+	struct btrfs_fs_info *fs_info = space_info->fs_info;
+	const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT;
+
+	if (!btrfs_is_zoned(fs_info))
+		return 0;
+
+	while (num_bytes > 0) {
+		struct btrfs_chunk_map *map;
+		struct btrfs_block_group *bg = NULL;
+		bool found = false;
+		u64 reclaimed = 0;
+
+		/*
+		 * Here, we choose a fully zone_unusable block group. It's
+		 * technically possible to reset a partly zone_unusable block
+		 * group, which still has some free space left. However,
+		 * handling that needs to cope with the allocation side, which
+		 * makes the logic more complex. So, let's handle the easy case
+		 * for now.
+		 */
+		spin_lock(&fs_info->unused_bgs_lock);
+		list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) {
+			if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags)
+				continue;
+
+			/*
+			 * Use trylock to avoid locking order violation. In
+			 * btrfs_reclaim_bgs_work(), the lock order is
+			 * &bg->lock -> &fs_info->unused_bgs_lock. We skip a
+			 * block group if we cannot take its lock.
+			 */
+			if (!spin_trylock(&bg->lock))
+				continue;
+			if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) {
+				spin_unlock(&bg->lock);
+				continue;
+			}
+			spin_unlock(&bg->lock);
+			found = true;
+			break;
+		}
+		if (!found) {
+			spin_unlock(&fs_info->unused_bgs_lock);
+			return 0;
+		}
+
+		list_del_init(&bg->bg_list);
+		btrfs_put_block_group(bg);
+		spin_unlock(&fs_info->unused_bgs_lock);
+
+		/*
+		 * Since the block group is fully zone_unusable and we cannot
+		 * allocate from this block group anymore, we don't need to set
+		 * this block group read-only.
+		 */
+
+		down_read(&fs_info->dev_replace.rwsem);
+		map = bg->physical_map;
+		for (int i = 0; i < map->num_stripes; i++) {
+			struct btrfs_io_stripe *stripe = &map->stripes[i];
+			unsigned int nofs_flags;
+			int ret;
+
+			nofs_flags = memalloc_nofs_save();
+			ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET,
+					       stripe->physical >> SECTOR_SHIFT,
+					       zone_size_sectors);
+			memalloc_nofs_restore(nofs_flags);
+
+			if (ret) {
+				up_read(&fs_info->dev_replace.rwsem);
+				return ret;
+			}
+		}
+		up_read(&fs_info->dev_replace.rwsem);
+
+		spin_lock(&space_info->lock);
+		spin_lock(&bg->lock);
+		ASSERT(!btrfs_is_block_group_used(bg));
+		if (bg->ro) {
+			spin_unlock(&bg->lock);
+			spin_unlock(&space_info->lock);
+			continue;
+		}
+
+		reclaimed = bg->alloc_offset;
+		bg->zone_unusable = bg->length - bg->zone_capacity;
+		bg->alloc_offset = 0;
+		/*
+		 * This holds because we currently only reset block groups
+		 * that were fully used and then entirely freed.
+		 */
+		ASSERT(reclaimed == bg->zone_capacity);
+		bg->free_space_ctl->free_space += reclaimed;
+		space_info->bytes_zone_unusable -= reclaimed;
+		spin_unlock(&bg->lock);
+		btrfs_return_free_space(space_info, reclaimed);
+		spin_unlock(&space_info->lock);
+
+		if (num_bytes <= reclaimed)
+			break;
+		num_bytes -= reclaimed;
+	}
+
+	return 0;
+}
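The RESET_ZONES flush state added to the tracepoint table at the end of this series suggests the helper above is driven from the space flushing state machine. A sketch of what that dispatch could look like, assuming a flush_space()-style switch over flush states; the actual caller is not part of these hunks, so the surrounding function here is hypothetical:

/*
 * Hypothetical excerpt of a flush-state dispatcher. On zoned filesystems
 * the RESET_ZONES state reclaims zone_unusable space by resetting fully
 * unused block groups in place; on non-zoned builds the stub in zoned.h
 * (below) simply returns 0, so the state is a no-op.
 */
static int flush_one_state(struct btrfs_space_info *space_info,
			   u64 num_bytes, int state)
{
	int ret = 0;

	switch (state) {
	case RESET_ZONES:
		ret = btrfs_reset_unused_block_groups(space_info, num_bytes);
		break;
	default:
		break;
	}
	return ret;
}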
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 7612e657260530..9672bf4c333552 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -96,6 +96,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
 int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
 				struct btrfs_space_info *space_info, bool do_finish);
 void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
+int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
 {
@@ -265,6 +266,12 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
 
 static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { }
 
+static inline int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info,
+						  u64 num_bytes)
+{
+	return 0;
+}
+
 #endif
 
 static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 4df93ca9b7a8d4..549ab3b4196180 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -100,7 +100,8 @@ struct find_free_extent_ctl;
 	EM( ALLOC_CHUNK,		"ALLOC_CHUNK")		\
 	EM( ALLOC_CHUNK_FORCE,		"ALLOC_CHUNK_FORCE")	\
 	EM( RUN_DELAYED_IPUTS,		"RUN_DELAYED_IPUTS")	\
-	EMe(COMMIT_TRANS,		"COMMIT_TRANS")
+	EM( COMMIT_TRANS,		"COMMIT_TRANS")		\
+	EMe(RESET_ZONES,		"RESET_ZONES")
 
 /*
  * First define the enums in the above macros to be exported to userspace via
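One detail worth noting in the tracepoint hunk: EMe() marks the final entry of the table, emitting it without the trailing comma and line continuation, which is why COMMIT_TRANS switches from EMe() to EM() once RESET_ZONES takes over the last slot. The list macro is expanded twice, roughly as sketched below; this is condensed from the pattern used in include/trace/events/btrfs.h, assuming the table being edited above is the FLUSH_STATES list:

/* First expansion: make each enum value known to the tracing core. */
#undef EM
#undef EMe
#define EM(a, b)	TRACE_DEFINE_ENUM(a);
#define EMe(a, b)	TRACE_DEFINE_ENUM(a);

FLUSH_STATES

/*
 * Second expansion: turn each entry into a { value, "name" } pair so
 * __print_symbolic() can render the numeric flush state symbolically in
 * trace output. The EMe() entry must come last because it omits the
 * trailing comma.
 */
#undef EM
#undef EMe
#define EM(a, b)	{a, b},
#define EMe(a, b)	{a, b}

#define show_flush_state(state)			\
	__print_symbolic(state, FLUSH_STATES)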