Merge tag 'bcachefs-2023-11-29' of https://evilpiepirate.org/git/bcachefs

Pull more bcachefs bugfixes from Kent Overstreet: - bcache & bcachefs were broken with CFI enabled; patch for closures to fix type punning - mark erasure coding as extra-experimental; there are incompatible disk space accounting changes coming for erasure coding, and I'm still seeing checksum errors in some tests - several fixes for durability-related issues (durability is a device specific setting where we can tell bcachefs that data on a given device should be counted as replicated x times) - a fix for a rare livelock when a btree node merge then updates a parent node that is almost full - fix a race in the device removal path, where dropping a pointer in a btree node to a device would be clobbered by an in flight btree write updating the btree node key on completion - fix one SRCU lock hold time warning in the btree gc code - there's still a bunch more of these to fix - fix a rare race where we'd start copygc before initializing the "are we rw" percpu refcount; copygc would think we were already ro and die immediately * tag 'bcachefs-2023-11-29' of https://evilpiepirate.org/git/bcachefs: (23 commits) bcachefs: Extra kthread_should_stop() calls for copygc bcachefs: Convert gc_alloc_start() to for_each_btree_key2() bcachefs: Fix race between btree writes and metadata drop bcachefs: move journal seq assertion bcachefs: -EROFS doesn't count as move_extent_start_fail bcachefs: trace_move_extent_start_fail() now includes errcode bcachefs: Fix split_race livelock bcachefs: Fix bucket data type for stripe buckets bcachefs: Add missing validation for jset_entry_data_usage bcachefs: Fix zstd compress workspace size bcachefs: bpos is misaligned on big endian bcachefs: Fix ec + durability calculation bcachefs: Data update path won't accidentaly grow replicas bcachefs: deallocate_extra_replicas() bcachefs: Proper refcounting for journal_keys bcachefs: preserve device path as device name bcachefs: Fix an endianness conversion bcachefs: Start gc, copygc, rebalance threads after 
initing writes ref bcachefs: Don't stop copygc thread on device resize bcachefs: Make sure bch2_move_ratelimit() also waits for move_ops ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2023-12-02 06:02:16 +0900
committer: Linus Torvalds <torvalds@linux-foundation.org> 2023-12-02 06:02:16 +0900
commit: e6861be452a53a5de3e1a048eabd811a05a44915 (patch)
tree: 065281129fb5e7b1ed51d6298fbd1faa0902063d /fs
parent: 994d5c58e50e91bb02c7be4a91d5186292a895c8 (diff)
parent: 415e5107b0dce0e5407ae4a46700cd7e8859e252 (diff)
download: linux-e6861be452a53a5de3e1a048eabd811a05a44915.tar.gz
linux-e6861be452a53a5de3e1a048eabd811a05a44915.tar.bz2
linux-e6861be452a53a5de3e1a048eabd811a05a44915.zip
36 files changed, 394 insertions, 228 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index c08c2c7d6fbb..fddc7be58022 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -33,6 +33,18 @@ config BCACHEFS_QUOTA
 	depends on BCACHEFS_FS
 	select QUOTACTL
 
+config BCACHEFS_ERASURE_CODING
+	bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)"
+	depends on BCACHEFS_FS
+	select QUOTACTL
+	help
+	This enables the "erasure_code" filesystem and inode option, which
+	organizes data into reed-solomon stripes instead of ordinary
+	replication.
+
+	WARNING: this feature is still undergoing on disk format changes, and
+	should only be enabled for testing purposes.
+
 config BCACHEFS_POSIX_ACL
 	bool "bcachefs POSIX ACL support"
 	depends on BCACHEFS_FS
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index b85c7765272f..1ba0eeb7552a 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -1297,6 +1297,30 @@ out:
 	return wp;
 }
 
+static noinline void
+deallocate_extra_replicas(struct bch_fs *c,
+			  struct open_buckets *ptrs,
+			  struct open_buckets *ptrs_no_use,
+			  unsigned extra_replicas)
+{
+	struct open_buckets ptrs2 = { 0 };
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, ptrs, ob, i) {
+		unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+
+		if (d && d <= extra_replicas) {
+			extra_replicas -= d;
+			ob_push(c, ptrs_no_use, ob);
+		} else {
+			ob_push(c, &ptrs2, ob);
+		}
+	}
+
+	*ptrs = ptrs2;
+}
+
 /*
  * Get us an open_bucket we can allocate from, return with it locked:
  */
@@ -1321,6 +1345,9 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
 	int ret;
 	int i;
 
+	if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
+		erasure_code = false;
+
 	BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
 
 	BUG_ON(!nr_replicas || !nr_replicas_required);
@@ -1382,6 +1409,9 @@ alloc_done:
 	if (ret)
 		goto err;
 
+	if (nr_effective > nr_replicas)
+		deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
+
 	/* Free buckets we didn't use: */
 	open_bucket_for_each(c, &wp->ptrs, ob, i)
 		open_bucket_free_unused(c, ob);
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 403aa3389fcc..dfa22f9d9a1d 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -638,6 +638,8 @@ struct journal_keys {
 	size_t			gap;
 	size_t			nr;
 	size_t			size;
+	atomic_t		ref;
+	bool			initial_ref_held;
 };
 
 struct btree_trans_buf {
@@ -929,7 +931,7 @@ struct bch_fs {
 	mempool_t		compression_bounce[2];
 	mempool_t		compress_workspace[BCH_COMPRESSION_TYPE_NR];
 	mempool_t		decompress_workspace;
-	ZSTD_parameters		zstd_params;
+	size_t			zstd_workspace_size;
 
 	struct crypto_shash	*sha256;
 	struct crypto_sync_skcipher *chacha20;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 0a750953ff92..1ab1f08d763b 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -151,7 +151,11 @@ struct bpos {
 #else
 #error edit for your odd byteorder.
 #endif
-} __packed __aligned(4);
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+__aligned(4)
+#endif
+;
 
 #define KEY_INODE_MAX			((__u64)~0ULL)
 #define KEY_OFFSET_MAX			((__u64)~0ULL)
@@ -1528,7 +1532,7 @@ struct bch_sb_field_disk_groups {
 	x(move_extent_write,				36)	\
 	x(move_extent_finish,				37)	\
 	x(move_extent_fail,				38)	\
-	x(move_extent_alloc_mem_fail,			39)	\
+	x(move_extent_start_fail,			39)	\
 	x(copygc,					40)	\
 	x(copygc_wait,					41)	\
 	x(gc_gens_end,					42)	\
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 0b5d09c8475d..30ab78a24517 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1541,8 +1541,8 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
 		rcu_assign_pointer(ca->buckets_gc, buckets);
 	}
 
-	for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
-			   BTREE_ITER_PREFETCH, k, ret) {
+	ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN,
+				  BTREE_ITER_PREFETCH, k, ({
 		ca = bch_dev_bkey_exists(c, k.k->p.inode);
 		g = gc_bucket(ca, k.k->p.offset);
 
@@ -1561,8 +1561,9 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
 			g->stripe		= a->stripe;
 			g->stripe_redundancy	= a->stripe_redundancy;
 		}
-	}
-	bch2_trans_iter_exit(trans, &iter);
+
+		0;
+	}));
 err:
 	bch2_trans_put(trans);
 	if (ret)
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 37d896edb06e..57c20390e10e 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1358,10 +1358,9 @@ static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *
 	return offset;
 }
 
-static void btree_node_read_all_replicas_done(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
 {
-	struct btree_node_read_all *ra =
-		container_of(cl, struct btree_node_read_all, cl);
+	closure_type(ra, struct btree_node_read_all, cl);
 	struct bch_fs *c = ra->c;
 	struct btree *b = ra->b;
 	struct printbuf buf = PRINTBUF;
@@ -1567,7 +1566,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
 
 	if (sync) {
 		closure_sync(&ra->cl);
-		btree_node_read_all_replicas_done(&ra->cl);
+		btree_node_read_all_replicas_done(&ra->cl.work);
 	} else {
 		continue_at(&ra->cl, btree_node_read_all_replicas_done,
 			    c->io_complete_wq);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 6fa90bcd7016..8e0fe65f6101 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -2981,7 +2981,8 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
 	trans->fn_idx		= fn_idx;
 	trans->locking_wait.task = current;
 	trans->journal_replay_not_finished =
-		!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
+		unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
+		atomic_inc_not_zero(&c->journal_keys.ref);
 	closure_init_stack(&trans->ref);
 
 	s = btree_trans_stats(trans);
@@ -3098,6 +3099,9 @@ void bch2_trans_put(struct btree_trans *trans)
 			kfree(trans->fs_usage_deltas);
 	}
 
+	if (unlikely(trans->journal_replay_not_finished))
+		bch2_journal_keys_put(c);
+
 	if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
 		mempool_free(trans->mem, &c->btree_trans_mem_pool);
 	else
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 58a981bcf3aa..ec52f50d249d 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -80,6 +80,8 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree
 	struct journal_keys *keys = &c->journal_keys;
 	unsigned iters = 0;
 	struct journal_key *k;
+
+	BUG_ON(*idx > keys->nr);
 search:
 	if (!*idx)
 		*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
@@ -189,10 +191,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
 		/* Since @keys was full, there was no gap: */
 		memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
 		kvfree(keys->d);
-		*keys = new_keys;
+		keys->d		= new_keys.d;
+		keys->nr	= new_keys.nr;
+		keys->size	= new_keys.size;
 
 		/* And now the gap is at the end: */
-		keys->gap = keys->nr;
+		keys->gap	= keys->nr;
 	}
 
 	journal_iters_move_gap(c, keys->gap, idx);
@@ -415,10 +419,16 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
 		cmp_int(l->journal_offset, r->journal_offset);
 }
 
-void bch2_journal_keys_free(struct journal_keys *keys)
+void bch2_journal_keys_put(struct bch_fs *c)
 {
+	struct journal_keys *keys = &c->journal_keys;
 	struct journal_key *i;
 
+	BUG_ON(atomic_read(&keys->ref) <= 0);
+
+	if (!atomic_dec_and_test(&keys->ref))
+		return;
+
 	move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
 	keys->gap = keys->nr;
 
@@ -429,6 +439,8 @@ void bch2_journal_keys_free(struct journal_keys *keys)
 	kvfree(keys->d);
 	keys->d = NULL;
 	keys->nr = keys->gap = keys->size = 0;
+
+	bch2_journal_entries_free(c);
 }
 
 static void __journal_keys_sort(struct journal_keys *keys)
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
index 5d64e7e22f26..8ca4c100b2e3 100644
--- a/fs/bcachefs/btree_journal_iter.h
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -49,7 +49,15 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
 						struct bch_fs *,
 						struct btree *);
 
-void bch2_journal_keys_free(struct journal_keys *);
+void bch2_journal_keys_put(struct bch_fs *);
+
+static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
+{
+	if (c->journal_keys.initial_ref_held)
+		bch2_journal_keys_put(c);
+	c->journal_keys.initial_ref_held = false;
+}
+
 void bch2_journal_entries_free(struct bch_fs *);
 
 int bch2_journal_keys_sort(struct bch_fs *);
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 76f27bc9fa24..6697417273aa 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -778,9 +778,9 @@ static void btree_interior_update_work(struct work_struct *work)
 	}
 }
 
-static void btree_update_set_nodes_written(struct closure *cl)
+static CLOSURE_CALLBACK(btree_update_set_nodes_written)
 {
-	struct btree_update *as = container_of(cl, struct btree_update, cl);
+	closure_type(as, struct btree_update, cl);
 	struct bch_fs *c = as->c;
 
 	mutex_lock(&c->btree_interior_update_lock);
@@ -1071,8 +1071,12 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 			break;
 		}
 
+		/*
+		 * Always check for space for two keys, even if we won't have to
+		 * split at prior level - it might have been a merge instead:
+		 */
 		if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
-					BKEY_BTREE_PTR_U64s_MAX * (1 + split)))
+						BKEY_BTREE_PTR_U64s_MAX * 2))
 			break;
 
 		split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
@@ -2266,6 +2270,10 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
 
 	BUG_ON(!btree_node_hashed(b));
 
+	struct bch_extent_ptr *ptr;
+	bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
+			    !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
+
 	ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
 					 commit_flags, skip_triggers);
 out:
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 58d8c6ffd955..5a91d3189fcf 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -854,8 +854,12 @@ static int __mark_pointer(struct btree_trans *trans,
 		return ret;
 
 	*dst_sectors += sectors;
-	*bucket_data_type = *dirty_sectors || *cached_sectors
-		? ptr_data_type : 0;
+
+	if (!*dirty_sectors && !*cached_sectors)
+		*bucket_data_type = 0;
+	else if (*bucket_data_type != BCH_DATA_stripe)
+		*bucket_data_type = ptr_data_type;
+
 	return 0;
 }
 
@@ -2091,8 +2095,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	bucket_gens->first_bucket = ca->mi.first_bucket;
 	bucket_gens->nbuckets	= nbuckets;
 
-	bch2_copygc_stop(c);
-
 	if (resize) {
 		down_write(&c->gc_lock);
 		down_write(&ca->bucket_lock);
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index a8b148ec2a2b..51af8ea230ed 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -354,8 +354,7 @@ static int attempt_compress(struct bch_fs *c,
 		 */
 		unsigned level = min((compression.level * 3) / 2, zstd_max_clevel());
 		ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max);
-		ZSTD_CCtx *ctx = zstd_init_cctx(workspace,
-			zstd_cctx_workspace_bound(&params.cParams));
+		ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size);
 
 		/*
 		 * ZSTD requires that when we decompress we pass in the exact
@@ -371,7 +370,7 @@ static int attempt_compress(struct bch_fs *c,
 		size_t len = zstd_compress_cctx(ctx,
 				dst + 4,	dst_len - 4 - 7,
 				src,		src_len,
-				&c->zstd_params);
+				&params);
 		if (zstd_is_error(len))
 			return 0;
 
@@ -572,6 +571,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 	size_t decompress_workspace_size = 0;
 	ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
 						 c->opts.encoded_extent_max);
+
+	/*
+	 * ZSTD is lying: if we allocate the size of the workspace it says it
+	 * requires, it returns memory allocation errors
+	 */
+	c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
+
 	struct {
 		unsigned			feature;
 		enum bch_compression_type	type;
@@ -585,13 +591,11 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 			zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
 			zlib_inflate_workspacesize(), },
 		{ BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
-			zstd_cctx_workspace_bound(&params.cParams),
+			c->zstd_workspace_size,
 			zstd_dctx_workspace_bound() },
 	}, *i;
 	bool have_compressed = false;
 
-	c->zstd_params = params;
-
 	for (i = compression_types;
 	     i < compression_types + ARRAY_SIZE(compression_types);
 	     i++)
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 5ed66202c226..71aa5e59787b 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -356,7 +356,7 @@ void bch2_data_update_exit(struct data_update *update)
 	bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
 }
 
-void bch2_update_unwritten_extent(struct btree_trans *trans,
+static void bch2_update_unwritten_extent(struct btree_trans *trans,
 				  struct data_update *update)
 {
 	struct bch_fs *c = update->op.c;
@@ -436,7 +436,51 @@ void bch2_update_unwritten_extent(struct btree_trans *trans,
 	}
 }
 
+int bch2_extent_drop_ptrs(struct btree_trans *trans,
+			  struct btree_iter *iter,
+			  struct bkey_s_c k,
+			  struct data_update_opts data_opts)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_i *n;
+	int ret;
+
+	n = bch2_bkey_make_mut_noupdate(trans, k);
+	ret = PTR_ERR_OR_ZERO(n);
+	if (ret)
+		return ret;
+
+	while (data_opts.kill_ptrs) {
+		unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
+		struct bch_extent_ptr *ptr;
+
+		bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
+		data_opts.kill_ptrs ^= 1U << drop;
+	}
+
+	/*
+	 * If the new extent no longer has any pointers, bch2_extent_normalize()
+	 * will do the appropriate thing with it (turning it into a
+	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
+	 */
+	bch2_extent_normalize(c, bkey_i_to_s(n));
+
+	/*
+	 * Since we're not inserting through an extent iterator
+	 * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+	 * we aren't using the extent overwrite path to delete, we're
+	 * just using the normal key deletion path:
+	 */
+	if (bkey_deleted(&n->k))
+		n->k.size = 0;
+
+	return bch2_trans_relock(trans) ?:
+		bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+		bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
 int bch2_data_update_init(struct btree_trans *trans,
+			  struct btree_iter *iter,
 			  struct moving_context *ctxt,
 			  struct data_update *m,
 			  struct write_point_specifier wp,
@@ -452,7 +496,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 	const struct bch_extent_ptr *ptr;
 	unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
 	unsigned ptrs_locked = 0;
-	int ret;
+	int ret = 0;
 
 	bch2_bkey_buf_init(&m->k);
 	bch2_bkey_buf_reassemble(&m->k, c, k);
@@ -478,6 +522,8 @@ int bch2_data_update_init(struct btree_trans *trans,
 	bkey_for_each_ptr(ptrs, ptr)
 		percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref);
 
+	unsigned durability_have = 0, durability_removing = 0;
+
 	i = 0;
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 		bool locked;
@@ -489,8 +535,11 @@ int bch2_data_update_init(struct btree_trans *trans,
 				reserve_sectors += k.k->size;
 
 			m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
-		} else if (!p.ptr.cached) {
+			durability_removing += bch2_extent_ptr_desired_durability(c, &p);
+		} else if (!p.ptr.cached &&
+			   !((1U << i) & m->data_opts.kill_ptrs)) {
 			bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+			durability_have += bch2_extent_ptr_durability(c, &p);
 		}
 
 		/*
@@ -529,6 +578,29 @@ int bch2_data_update_init(struct btree_trans *trans,
 		i++;
 	}
 
+	/*
+	 * If current extent durability is less than io_opts.data_replicas,
+	 * we're not trying to rereplicate the extent up to data_replicas here -
+	 * unless extra_replicas was specified
+	 *
+	 * Increasing replication is an explicit operation triggered by
+	 * rereplicate, currently, so that users don't get an unexpected -ENOSPC
+	 */
+	if (durability_have >= io_opts.data_replicas) {
+		m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
+		m->data_opts.rewrite_ptrs = 0;
+		/* if iter == NULL, it's just a promote */
+		if (iter)
+			ret = bch2_extent_drop_ptrs(trans, iter, k, data_opts);
+		goto done;
+	}
+
+	m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
+		m->data_opts.extra_replicas;
+	m->op.nr_replicas_required = m->op.nr_replicas;
+
+	BUG_ON(!m->op.nr_replicas);
+
 	if (reserve_sectors) {
 		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
 				m->data_opts.extra_replicas
@@ -538,14 +610,11 @@ int bch2_data_update_init(struct btree_trans *trans,
 			goto err;
 	}
 
-	m->op.nr_replicas += m->data_opts.extra_replicas;
-	m->op.nr_replicas_required = m->op.nr_replicas;
-
-	BUG_ON(!m->op.nr_replicas);
+	if (bkey_extent_is_unwritten(k)) {
+		bch2_update_unwritten_extent(trans, m);
+		goto done;
+	}
 
-	/* Special handling required: */
-	if (bkey_extent_is_unwritten(k))
-		return -BCH_ERR_unwritten_extent_update;
 	return 0;
 err:
 	i = 0;
@@ -560,6 +629,9 @@ err:
 	bch2_bkey_buf_exit(&m->k, c);
 	bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
 	return ret;
+done:
+	bch2_data_update_exit(m);
+	return ret ?: -BCH_ERR_data_update_done;
 }
 
 void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index 9dc17b9d8379..991095bbd469 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -32,9 +32,14 @@ int bch2_data_update_index_update(struct bch_write_op *);
 void bch2_data_update_read_done(struct data_update *,
 				struct bch_extent_crc_unpacked);
 
+int bch2_extent_drop_ptrs(struct btree_trans *,
+			  struct btree_iter *,
+			  struct bkey_s_c,
+			  struct data_update_opts);
+
 void bch2_data_update_exit(struct data_update *);
-void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *);
-int bch2_data_update_init(struct btree_trans *, struct moving_context *,
+int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
+			  struct moving_context *,
 			  struct data_update *,
 			  struct write_point_specifier,
 			  struct bch_io_opts, struct data_update_opts,
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 68a1a96bb7ca..ae7910bf2228 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -162,7 +162,7 @@
 	x(BCH_ERR_fsck,			fsck_repair_unimplemented)		\
 	x(BCH_ERR_fsck,			fsck_repair_impossible)			\
 	x(0,				restart_recovery)			\
-	x(0,				unwritten_extent_update)		\
+	x(0,				data_update_done)			\
 	x(EINVAL,			device_state_not_allowed)		\
 	x(EINVAL,			member_info_missing)			\
 	x(EINVAL,			mismatched_block_size)			\
@@ -210,6 +210,7 @@
 	x(BCH_ERR_invalid_sb,		invalid_sb_members)			\
 	x(BCH_ERR_invalid_sb,		invalid_sb_disk_groups)			\
 	x(BCH_ERR_invalid_sb,		invalid_sb_replicas)			\
+	x(BCH_ERR_invalid_sb,		invalid_replicas_entry)			\
 	x(BCH_ERR_invalid_sb,		invalid_sb_journal)			\
 	x(BCH_ERR_invalid_sb,		invalid_sb_journal_seq_blacklist)	\
 	x(BCH_ERR_invalid_sb,		invalid_sb_crypt)			\
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index a864de231b69..f6c92df55270 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -649,37 +649,31 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
 	return replicas;
 }
 
-unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
+static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
 {
-	struct bch_dev *ca;
-
 	if (p->ptr.cached)
 		return 0;
 
-	ca = bch_dev_bkey_exists(c, p->ptr.dev);
-
-	return ca->mi.durability +
-		(p->has_ec
-		 ? p->ec.redundancy
-		 : 0);
+	return p->has_ec
+		? p->ec.redundancy + 1
+		: ca->mi.durability;
 }
 
-unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
+unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
 {
-	struct bch_dev *ca;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
 
-	if (p->ptr.cached)
-		return 0;
+	return __extent_ptr_durability(ca, p);
+}
 
-	ca = bch_dev_bkey_exists(c, p->ptr.dev);
+unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
+{
+	struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
 
 	if (ca->mi.state == BCH_MEMBER_STATE_failed)
 		return 0;
 
-	return ca->mi.durability +
-		(p->has_ec
-		 ? p->ec.redundancy
-		 : 0);
+	return __extent_ptr_durability(ca, p);
 }
 
 unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 5b42a76c4796..9a479e4de6b3 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -35,9 +35,9 @@ static void bio_check_or_release(struct bio *bio, bool check_dirty)
 	}
 }
 
-static void bch2_dio_read_complete(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_dio_read_complete)
 {
-	struct dio_read *dio = container_of(cl, struct dio_read, cl);
+	closure_type(dio, struct dio_read, cl);
 
 	dio->req->ki_complete(dio->req, dio->ret);
 	bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
@@ -325,9 +325,9 @@ static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
 	return 0;
 }
 
-static void bch2_dio_write_flush_done(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
 {
-	struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
+	closure_type(dio, struct dio_write, op.cl);
 	struct bch_fs *c = dio->op.c;
 
 	closure_debug_destroy(cl);
diff --git a/fs/bcachefs/fs.c b/fs/bcach
author	Linus Torvalds <torvalds@linux-foundation.org>	2023-12-02 06:02:16 +0900
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-12-02 06:02:16 +0900
commit	e6861be452a53a5de3e1a048eabd811a05a44915 (patch)
tree	065281129fb5e7b1ed51d6298fbd1faa0902063d /fs
parent	994d5c58e50e91bb02c7be4a91d5186292a895c8 (diff)
parent	415e5107b0dce0e5407ae4a46700cd7e8859e252 (diff)
download	linux-e6861be452a53a5de3e1a048eabd811a05a44915.tar.gz linux-e6861be452a53a5de3e1a048eabd811a05a44915.tar.bz2 linux-e6861be452a53a5de3e1a048eabd811a05a44915.zip