31 files changed, 941 insertions, 400 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 4e4a448f6931..6e161f8ffe8d 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -639,6 +639,16 @@ int bch2_alloc_read(struct bch_fs *c)
 				continue;
 			}
 
+			if (k.k->p.offset < ca->mi.first_bucket) {
+				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket));
+				continue;
+			}
+
+			if (k.k->p.offset >= ca->mi.nbuckets) {
+				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+				continue;
+			}
+
 			struct bch_alloc_v4 a;
 			*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
 			0;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 84832c2d4df9..5004f6ba997c 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -678,7 +678,8 @@ struct bch_sb_field_ext {
 	x(disk_accounting_v2,		BCH_VERSION(1,  9))		\
 	x(disk_accounting_v3,		BCH_VERSION(1, 10))		\
 	x(disk_accounting_inum,		BCH_VERSION(1, 11))		\
-	x(rebalance_work_acct_fix,	BCH_VERSION(1, 12))
+	x(rebalance_work_acct_fix,	BCH_VERSION(1, 12))		\
+	x(inode_has_child_snapshots,	BCH_VERSION(1, 13))
 
 enum bcachefs_metadata_version {
 	bcachefs_metadata_version_min = 9,
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 771154e3a291..94bbd8505582 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1224,17 +1224,20 @@ int bch2_gc_gens(struct bch_fs *c)
 	u64 b, start_time = local_clock();
 	int ret;
 
-	/*
-	 * Ideally we would be using state_lock and not gc_gens_lock here, but that
-	 * introduces a deadlock in the RO path - we currently take the state
-	 * lock at the start of going RO, thus the gc thread may get stuck:
-	 */
 	if (!mutex_trylock(&c->gc_gens_lock))
 		return 0;
 
 	trace_and_count(c, gc_gens_start, c);
 
-	down_read(&c->state_lock);
+	/*
+	 * We have to use trylock here. Otherwise, we would
+	 * introduce a deadlock in the RO path - we take the
+	 * state lock at the start of going RO.
+	 */
+	if (!down_read_trylock(&c->state_lock)) {
+		mutex_unlock(&c->gc_gens_lock);
+		return 0;
+	}
 
 	for_each_member_device(c, ca) {
 		struct bucket_gens *gens = bucket_gens(ca);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 1c1448b52207..cf933409d385 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1838,10 +1838,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
 	struct btree_trans *trans = bch2_trans_get(c);
 
 	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-	__btree_node_write_done(c, b);
-	six_unlock_read(&b->c.lock);
 
+	/* we don't need transaction context anymore after we got the lock. */
 	bch2_trans_put(trans);
+	__btree_node_write_done(c, b);
+	six_unlock_read(&b->c.lock);
 }
 
 static void btree_node_write_work(struct work_struct *work)
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index bfe9f0c1e1be..0883cf6e1a3e 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -2381,9 +2381,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
 		else
 			iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
 
-		if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
-			     ? bkey_gt(iter_pos, end)
-			     : bkey_ge(iter_pos, end)))
+		if (unlikely(iter->flags & BTREE_ITER_all_snapshots	? bpos_gt(iter_pos, end) :
+			     iter->flags & BTREE_ITER_is_extents	? bkey_ge(iter_pos, end) :
+									  bkey_gt(iter_pos, end)))
 			goto end;
 
 		break;
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 78e63ad7d380..31a58bf46fdb 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -857,6 +857,14 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
 	for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\
 					  SPOS_MAX, _flags, _k, _ret)
 
+#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id,	\
+					     _start, _flags, _k, _ret)	\
+	for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),	\
+				  (_start), (_flags));			\
+	     (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags),	\
+	     !((_ret) = bkey_err(_k)) && (_k).k;			\
+	     bch2_btree_iter_rewind(&(_iter)))
+
 #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret)	\
 	for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
 
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 1e694fedc5da..a7aedb134e9f 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -171,6 +171,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
 	if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
 		return;
 
+	if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
+		return;
+
 	rcu_read_lock();
 	struct found_btree_node n = {
 		.btree_id	= BTREE_NODE_ID(bn),
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 462b1a2fe1ad..a6ee0beee6b0 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -80,6 +80,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc
 					if (ptr2 == ptr)
 						break;
 
+					ca = bch2_dev_have_ref(c, ptr2->dev);
 					bucket = PTR_BUCKET_POS(ca, ptr2);
 					bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
 				}
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index 9f3133e3e7e5..e309fb78529b 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -242,6 +242,14 @@ void bch2_accounting_swab(struct bkey_s k)
 		*p = swab64(*p);
 }
 
+static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
+					    struct disk_accounting_pos acc)
+{
+	unsafe_memcpy(r, &acc.replicas,
+		      replicas_entry_bytes(&acc.replicas),
+		      "variable length struct");
+}
+
 static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
 {
 	struct disk_accounting_pos acc_k;
@@ -249,9 +257,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc
 
 	switch (acc_k.type) {
 	case BCH_DISK_ACCOUNTING_replicas:
-		unsafe_memcpy(r, &acc_k.replicas,
-			      replicas_entry_bytes(&acc_k.replicas),
-			      "variable length struct");
+		__accounting_to_replicas(r, acc_k);
 		return true;
 	default:
 		return false;
@@ -608,6 +614,81 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
 	return ret;
 }
 
+static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
+					      struct disk_accounting_pos acc,
+					      u64 *v, unsigned nr)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0, invalid_dev = -1;
+
+	switch (acc.type) {
+	case BCH_DISK_ACCOUNTING_replicas: {
+		struct bch_replicas_padded r;
+		__accounting_to_replicas(&r.e, acc);
+
+		for (unsigned i = 0; i < r.e.nr_devs; i++)
+			if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
+			    !bch2_dev_exists(c, r.e.devs[i])) {
+				invalid_dev = r.e.devs[i];
+				goto invalid_device;
+			}
+
+		/*
+		 * All replicas entry checks except for invalid device are done
+		 * in bch2_accounting_validate
+		 */
+		BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));
+
+		if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
+				trans, accounting_replicas_not_marked,
+				"accounting not marked in superblock replicas\n  %s",
+				(printbuf_reset(&buf),
+				 bch2_accounting_key_to_text(&buf, &acc),
+				 buf.buf))) {
+			/*
+			 * We're not RW yet and still single threaded, dropping
+			 * and retaking lock is ok:
+			 */
+			percpu_up_write(&c->mark_lock);
+			ret = bch2_mark_replicas(c, &r.e);
+			if (ret)
+				goto fsck_err;
+			percpu_down_write(&c->mark_lock);
+		}
+		break;
+	}
+
+	case BCH_DISK_ACCOUNTING_dev_data_type:
+		if (!bch2_dev_exists(c, acc.dev_data_type.dev)) {
+			invalid_dev = acc.dev_data_type.dev;
+			goto invalid_device;
+		}
+		break;
+	}
+
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+invalid_device:
+	if (fsck_err(trans, accounting_to_invalid_device,
+		     "accounting entry points to invalid device %i\n  %s",
+		     invalid_dev,
+		     (printbuf_reset(&buf),
+		      bch2_accounting_key_to_text(&buf, &acc),
+		      buf.buf))) {
+		for (unsigned i = 0; i < nr; i++)
+			v[i] = -v[i];
+
+		ret = commit_do(trans, NULL, NULL, 0,
+				bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?:
+			-BCH_ERR_remove_disk_accounting_entry;
+	} else {
+		ret = -BCH_ERR_remove_disk_accounting_entry;
+	}
+	goto fsck_err;
+}
+
 /*
  * At startup time, initialize the in memory accounting from the btree (and
  * journal)
@@ -666,44 +747,42 @@ int bch2_accounting_read(struct bch_fs *c)
 	}
 	keys->gap = keys->nr = dst - keys->data;
 
-	percpu_down_read(&c->mark_lock);
-	for (unsigned i = 0; i < acc->k.nr; i++) {
-		u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
-		bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
+	percpu_down_write(&c->mark_lock);
+	unsigned i = 0;
+	while (i < acc->k.nr) {
+		unsigned idx = inorder_to_eytzinger0(i, acc->k.nr);
 
-		if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters))
-			continue;
+		struct disk_accounting_pos acc_k;
+		bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos);
 
-		struct bch_replicas_padded r;
-		if (!accounting_to_replicas(&r.e, acc->k.data[i].pos))
-			continue;
+		u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+		bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false);
 
 		/*
-		 * If the replicas entry is invalid it'll get cleaned up by
-		 * check_allocations:
+		 * If the entry counters are zeroed, it should be treated as
+		 * nonexistent - it might point to an invalid device.
+		 *
+		 * Remove it, so that if it's re-added it gets re-marked in the
+		 * superblock:
 		 */
-		if (bch2_replicas_entry_validate(&r.e, c, &buf))
+		ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters)
+			? -BCH_ERR_remove_disk_accounting_entry
+			: bch2_disk_accounting_validate_late(trans, acc_k,
+							v, acc->k.data[idx].nr_counters);
+
+		if (ret == -BCH_ERR_remove_disk_accounting_entry) {
+			free_percpu(acc->k.data[idx].v[0]);
+			free_percpu(acc->k.data[idx].v[1]);
+			darray_remove_item(&acc->k, &acc->k.data[idx]);
+			eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+					accounting_pos_cmp, NULL);
+			ret = 0;
 			continue;
-
-		struct disk_accounting_pos k;
-		bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
-
-		if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
-				trans, accounting_replicas_not_marked,
-				"accounting not marked in superblock replicas\n  %s",
-				(printbuf_reset(&buf),
-				 bch2_accounting_key_to_text(&buf, &k),
-				 buf.buf))) {
-			/*
-			 * We're not RW yet and still single threaded, dropping
-			 * and retaking lock is ok:
-			 */
-			percpu_up_read(&c->mark_lock);
-			ret = bch2_mark_replicas(c, &r.e);
-			if (ret)
-				goto fsck_err;
-			percpu_down_read(&c->mark_lock);
 		}
+
+		if (ret)
+			goto fsck_err;
+		i++;
 	}
 
 	preempt_disable();
@@ -742,7 +821,7 @@ int bch2_accounting_read(struct bch_fs *c)
 	}
 	preempt_enable();
 fsck_err:
-	percpu_up_read(&c->mark_lock);
+	percpu_up_write(&c->mark_lock);
 err:
 	printbuf_exit(&buf);
 	bch2_trans_put(trans);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 1587c6e1866a..e410cfe37b1a 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -124,6 +124,11 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
 			 "incorrect value size (%zu < %u)",
 			 bkey_val_u64s(k.k), stripe_val_u64s(s));
 
+	bkey_fsck_err_on(s->csum_granularity_bits >= 64,
+			 c, stripe_csum_granularity_bad,
+			 "invalid csum granularity (%u >= 64)",
+			 s->csum_granularity_bits);
+
 	ret = bch2_bkey_ptrs_validate(c, k, flags);
 fsck_err:
 	return ret;
@@ -145,7 +150,11 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
 		   nr_data,
 		   s.nr_redundant);
 	bch2_prt_csum_type(out, s.csum_type);
-	prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
+	prt_str(out, " gran ");
+	if (s.csum_granularity_bits < 64)
+		prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits);
+	else
+		prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits);
 
 	if (s.disk_label) {
 		prt_str(out, " label");
@@ -1197,47 +1206,62 @@ void bch2_do_stripe_deletes(struct bch_fs *c)
 /* stripe creation: */
 
 static int ec_stripe_key_update(struct btree_trans *trans,
-				struct bkey_i_stripe *new,
-				bool create)
+				struct bkey_i_stripe *old,
+				struct bkey_i_stripe *new)
 {
 	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret;
+	bool create = !old;
 
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
-			       new->k.p, BTREE_ITER_intent);
-	ret = bkey_err(k);
+	struct btree_iter iter;
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
+					       new->k.p, BTREE_ITER_intent);
+	int ret = bkey_err(k);
 	if (ret)
 		goto err;
 
-	if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
-		bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
-				     create ? "creating" : "updating",
-				     bch2_bkey_types[k.k->type]);
+	if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe),
+				    c, "error %s stripe: got existing key type %s",
+				    create ? "creating" : "updating",
+				    bch2_bkey_types[k.k->type])) {
 		ret = -EINVAL;
 		goto err;
 	}
 
 	if (k.k->type == KEY_TYPE_stripe) {
-		const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
-		unsigned i;
+		const struct bch_stripe *v = bkey_s_c_to_stripe(k).v;
 
-		if (old->nr_blocks != new->v.nr_blocks) {
-			bch_err(c, "error updating stripe: nr_blocks does not match");
-			ret = -EINVAL;
-			goto err;
-		}
+		BUG_ON(old->v.nr_blocks != new->v.nr_blocks);
+		BUG_ON(old->v.nr_blocks != v->nr_blocks);
 
-		for (i = 0; i < new->v.nr_blocks; i++) {
-			unsigned v = stripe_blockcount_get(old, i);
+		for (unsigned i = 0; i < new->v.nr_blocks; i++) {
+			unsigned sectors = stripe_blockcount_get(v, i);
 
-			BUG_ON(v &&
-			       (old->ptrs[i].dev != new->v.ptrs[i].dev ||
-				old->ptrs[i].gen != new->v.ptrs[i].gen ||
-				old->ptrs[i].offset != new->v.ptrs[i].offset));
+			if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) {
+				struct printbuf buf = PRINTBUF;
 
-			stripe_blockcount_set(&new->v, i, v);
+				prt_printf(&buf, "stripe changed nonempty block %u", i);
+				prt_str(&buf, "\nold: ");
+				bch2_bkey_val_to_text(&buf, c, k);
+				prt_str(&buf, "\nnew: ");
+				bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i));
+				bch2_fs_inconsistent(c, "%s", buf.buf);
+				printbuf_exit(&buf);
+				ret = -EINVAL;
+				goto err;
+			}
+
+			/*
+			 * If the stripe ptr changed underneath us, it must have
+			 * been dev_remove_stripes() -> * invalidate_stripe_to_dev()
+			 */
+			if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) {
+				BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID);
+
+				if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]))
+					new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID;
+			}
+
+			stripe_blockcount_set(&new->v, i, sectors);
 		}
 	}
 
@@ -1499,8 +1523,10 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 			    BCH_TRANS_COMMIT_no_check_rw|
 			    BCH_TRANS_COMMIT_no_enospc,
 			    ec_stripe_key_update(trans,
-					bkey_i_to_stripe(&s->new_stripe.key),
-					!s->have_existing_stripe));
+					s->have_existing_stripe
+					? bkey_i_to_stripe(&s->existing_stripe.key)
+					: NULL,
+					bkey_i_to_stripe(&s->new_stripe.key)));
 	bch_err_msg(c, ret, "creating stripe key");
 	if (ret) {
 		goto err;
@@ -1876,7 +1902,15 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
 	bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
 
 	for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
-		__clear_bit(v->ptrs[i].dev, devs.d);
+		/*
+		 * Note: we don't yet repair invalid blocks (failed/removed
+		 * devices) when reusing stripes - we still need a codepath to
+		 * walk backpointers and update all extents that point to that
+		 * block when updating the stripe
+		 */
+		if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
+			__clear_bit(v->ptrs[i].dev, devs.d);
+
 		if (i < h->s->nr_data)
 			nr_have_data++;
 		else
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 60b7875adada..649263516ab1 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -268,7 +268,8 @@
 	x(BCH_ERR_nopromote,		nopromote_no_writes)			\
 	x(BCH_ERR_nopromote,		nopromote_enomem)			\
 	x(0,				invalid_snapshot_node)			\
-	x(0,				option_needs_open_fs)
+	x(0,				option_needs_open_fs)			\
+	x(0,				remove_disk_accounting_entry)
 
 enum bch_errcode {
 	BCH_ERR_START		= 2048,
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index ed5001dd662e..923a5f1849a8 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -695,6 +695,16 @@ void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
 int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c,
 			    enum bch_validate_flags);
 
+static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
+				      struct bch_extent_ptr ptr2)
+{
+	return (ptr1.cached	== ptr2.cached &&
+		ptr1.unwritten	== ptr2.unwritten &&
+		ptr1.offset	== ptr2.offset &&
+		ptr1.dev	== ptr2.dev &&
+		ptr1.dev	== ptr2.dev);
+}
+
 void bch2_ptr_swab(struct bkey_s);
 
 const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index ee1c0325f313..6d3a05ae5da8 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -369,6 +369,7 @@ static noinline void bch2_dio_write_flush(struct dio_write *dio)
 
 static __always_inline long bch2_dio_write_done(struct dio_write *dio)
 {
+	struct bch_fs *c = dio->op.c;
 	struct kiocb *req = dio->req;
 	struct bch_inode_info *inode = dio->inode;
 	bool sync = dio->sync;
@@ -387,7 +388,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio)
 	ret = dio->op.error ?: ((long) dio->written << 9);
 	bio_put(&dio->op.wbio.bio);
 
-	bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write);
+	bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
 
 	/* inode->i_dio_count is our ref on inode and thus bch_fs */
 	inode_dio_end(&inode->v);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 02969dff165d..d7bd28bdaffc 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -157,6 +157,20 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
 	return a.subvol == b.subvol && a.inum == b.inum;
 }
 
+static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
+{
+	const subvol_inum *inum = data;
+
+	return jhash(&inum->inum, sizeof(inum->inum), seed);
+}
+
+static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
+{
+	const struct bch_inode_info *inode = data;
+
+	return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed);
+}
+
 static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
 				 const void *obj)
 {
@@ -170,11 +184,91 @@ static const struct rhashtable_params bch2_vfs_inodes_params = {
 	.head_offset		= offsetof(struct bch_inode_info, hash),
 	.key_offset		= offsetof(struct bch_inode_info, ei_inum),
 	.key_len		= sizeof(subvol_inum),
+	.hashfn			= bch2_vfs_inode_hash_fn,
+	.obj_hashfn		= bch2_vfs_inode_obj_hash_fn,
 	.obj_cmpfn		= bch2_vfs_inode_cmp_fn,
 	.automatic_shrinking	= true,
 };
 
-struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
+int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
+{
+	struct bch_fs *c = trans->c;
+	struct rhashtable *ht = &c->vfs_inodes_table;
+	subvol_inum inum = (subvol_inum) { .inum = p.offset };
+	DARRAY(u32) subvols;
+	int ret = 0;
+
+	if (!test_bit(BCH_FS_started, &c->flags))
+		return false;
+
+	darray_init(&subvols);
+restart_from_top:
+
+	/*
+	 * Tweaked version of __rhashtable_lookup(); we need to get a list of
+	 * subvolumes in which the given inode number is open.
+	 *
+	 * For this to work, we don't include the subvolume ID in the key that
+	 * we hash - all inodes with the same inode number regardless of
+	 * subvolume will hash to the same slot.
+	 *
+	 * This will be less than ideal if the same file is ever open
+	 * simultaneously in many different snapshots:
+	 */
+	rcu_read_lock();
+	struct rhash_lock_head __rcu *const *bkt;
+	struct rhash_head *he;
+	unsigned int hash;
+	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+restart:
+	hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params);
+	bkt = rht_bucket(tbl, hash);
+	do {
+		struct bch_inode_info *inode;
+
+		rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
+			if (inode->ei_inum.inum == inum.inum) {
+				ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
+						      GFP_NOWAIT|__GFP_NOWARN);
+				if (ret) {
+					rcu_read_unlock();
+					ret = darray_make_room(&subvols, 1);
+					if (ret)
+						goto err;
+					subvols.nr = 0;
+					goto restart_from_top;
+				}
+			}
+		}
+		/* An object might have been moved to a different hash chain,
+		 * while we walk along it - better check and retry.
+		 */
+	} while (he != RHT_NULLS_MARKER(bkt));
+
+	/* Ensure we see any new tables. */
+	smp_rmb();
+
+	tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+	if (unlikely(tbl))
+		goto restart;
+	rcu_read_unlock();
+
+	darray_for_each(subvols, i) {
+		u32 snap;
+		ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
+		if (ret)
+			goto err;
+
+		ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
+		if (ret)
+			break;
+	}
+err:
+	darray_exit(&subvols);
+	return ret;
+}
+
+static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
 {
 	return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
 }
@@ -184,7 +278,8 @@ static void __wait_on_freeing_inode(struct bch_fs *c,
 				    subvol_inum inum)
 {
 	wait_queue_head_t *wq;
-	DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
+	struct wait_bit_queue_entry wait;
+
 	wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
 	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 	spin_unlock(&inode->v.i_lock);
@@ -252,7 +347,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
 
 	set_bit(EI_INODE_HASHED, &inode->ei_flags);
 retry:
-	if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
+	if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
+					&inode->ei_inum,
 					&inode->hash,
 					bch2_vfs_inodes_params))) {
 		old = bch2_inode_hash_find(c, trans, inode->ei_inum);
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index da74ecc236e7..59f9f7ae728d 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -54,8 +54,6 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode)
 	return inode->ei_inum;
 }
 
-struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
-
 /*
  * Set if we've gotten a btree error for this inode, and thus the vfs inode and
  * btree inode may be inconsistent:
@@ -148,6 +146,8 @@ struct bch_inode_info *
 __bch2_create(struct mnt_idmap *, struct bch_inode_info *,
 	      struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
 
+int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p);
+
 int bch2_fs_quota_transfer(struct bch_fs *,
 			   struct bch_inode_info *,
 			   struct bch_qid,
@@ -198,10 +198,7 @@ int bch2_vfs_init(void);
 
 #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)	({ do {} while (0); })
 
-static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
-{
-	return NULL;
-}
+static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; }
 
 static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
 					       snapshot_id_list *s) {}
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index b8a6ceb0cc7a..a1087fd292e4 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -326,17 +326,54 @@ err:
 	return ret;
 }
 
+static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
+{
+	if (inode->bi_inum == BCACHEFS_ROOT_INO &&
+	    inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
+		return false;
+
+	return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
+}
+
+static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
+					SPOS(d_pos.inode, d_pos.offset, snapshot),
+					BTREE_ITER_intent|
+					BTREE_ITER_with_updates);
+	int ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	if (bpos_eq(k.k->p, d_pos)) {
+		/*
+		 * delet_at() doesn't work because the update path doesn't
+		 * internally use BTREE_ITER_with_updates yet
+		 */
+		struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+		ret = PTR_ERR_OR_ZERO(k);
+		if (ret)
+			goto err;
+
+		bkey_init(&k->k);
+		k->k.type = KEY_TYPE_whiteout;
+		k->k.p = iter.pos;