// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
*
* Code for managing the extent btree and dynamically updating the writeback
* dirty sector count.
 */

#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "buckets.h"
#include "checksum.h"
#include "compress.h"
#include "debug.h"
#include "disk_groups.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "journal.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "trace.h"
#include "util.h"
static unsigned bch2_crc_field_size_max[] = {
[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
[BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
};

static void bch2_extent_crc_pack(union bch_extent_crc *,
struct bch_extent_crc_unpacked,
enum bch_extent_entry_type);
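
/* Return the IO failure record for @dev, or NULL if none has been recorded: */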
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
unsigned dev)
{
	struct bch_dev_io_failures *i;

	for (i = f->devs; i < f->devs + f->nr; i++)
		if (i->dev == dev)
			return i;

	return NULL;
}
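
/*
 * Record an IO failure for @p's device, so that subsequent calls to
 * bch2_bkey_pick_read_device() prefer other replicas or read strategies:
 */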
void bch2_mark_io_failure(struct bch_io_failures *failed,
struct extent_ptr_decoded *p)
{
	struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);

if (!f) {
BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
f = &failed->devs[failed->nr++];
f->dev = p->ptr.dev;
f->idx = p->idx;
f->nr_failed = 1;
f->nr_retries = 0;
} else if (p->idx != f->idx) {
f->idx = p->idx;
f->nr_failed = 1;
f->nr_retries = 0;
} else {
f->nr_failed++;
}
}
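
/* Current read latency for @dev, or S64_MAX if the device is no longer present: */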
static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
{
struct bch_dev *ca = bch2_dev_rcu(c, dev);
return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
}

/*
* returns true if p1 is better than p2:
*/
static inline bool ptr_better(struct bch_fs *c,
const struct extent_ptr_decoded p1,
const struct extent_ptr_decoded p2)
{
if (likely(!p1.idx && !p2.idx)) {
u64 l1 = dev_latency(c, p1.ptr.dev);
u64 l2 = dev_latency(c, p2.ptr.dev);
/* Pick at random, biased in favor of the faster device: */
return bch2_rand_range(l1 + l2) > l1;
}
if (bch2_force_reconstruct_read)
return p1.idx > p2.idx;
return p1.idx < p2.idx;
}

/*
 * This picks a non-stale pointer to read from, preferring devices that
 * haven't had IO errors recorded in @failed (which may be NULL). Returns 1 if
 * a pointer was picked, 0 if there's nothing to read (a hole, an unwritten
 * extent, or only stale cached pointers), and -EIO if the extent has dirty
 * pointers but none of them could be read:
 */
int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
struct bch_io_failures *failed,
struct extent_ptr_decoded *pick)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
struct bch_dev_io_failures *f;
	int ret = 0;

	if (k.k->type == KEY_TYPE_error)
		return -EIO;

	rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
/*
* Unwritten extent: no need to actually read, treat it as a
* hole and return 0s:
*/
if (p.ptr.unwritten) {
ret = 0;
break;
}
/*
* If there are any dirty pointers it's an error if we can't
* read:
*/
if (!ret && !p.ptr.cached)
ret = -EIO;
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue;
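
		/*
		 * p.idx selects the read strategy for this pointer: 0 reads
		 * the replica directly, 1 reconstructs it via erasure coding.
		 * If this device has already failed, retry the same strategy
		 * while retries remain, otherwise advance to the next one:
		 */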
f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
if (f)
p.idx = f->nr_failed < f->nr_retries
? f->idx
: f->idx + 1;
if (!p.idx && !ca)
p.idx++;
if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
p.idx++;
if (!p.idx && !bch2_dev_is_readable(ca))
p.idx++;
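
		/* Out of read strategies for this pointer: */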
if (p.idx >= (unsigned) p.has_ec + 1)
continue;
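
		/* If we already have a candidate, only switch to a better one: */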
if (ret > 0 && !ptr_better(c, p, *pick))
continue;
*pick = p;
ret = 1;
}
rcu_read_unlock();
return ret;
}

/* KEY_TYPE_btree_ptr: */
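
/*
 * Validate a KEY_TYPE_btree_ptr key: check that the value isn't too big and
 * that its pointers are valid:
 */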
int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bch_validate_flags flags,
struct printbuf *err)
{
int ret = 0;
bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err,
btree_ptr_val_too_big,
"value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
return ret;
}

void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
bch2_bkey_ptrs_to_text(out, c, k);
}

int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bch_validate_flags flags,
struct printbuf *err)
{
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
int ret = 0;
bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
c, err, btree_ptr_v2_val_too_big,
"value too big (%zu > %zu)",
bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
c, err, btree_ptr_v2_min_key_bad,
"min_key > key");
if (flags & BCH_VALIDATE_write)
bkey_fsck_err_on(!bp.v->sectors_written,
c, err, btree_ptr_v2_written_0,
"sectors_written == 0");
ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
return ret;
}

void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
prt_printf(out, "seq %llx written %u min_key %s",
le64_to_cpu(bp.v->seq),
le16_to_cpu(bp.v->sectors_written),
BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
bch2_bpos_to_text(out, bp.v->min_key);
prt_printf(out, " ");
bch2_bkey_ptrs_to_text(out, c, k);
}
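
/*
 * Compat: on extent btrees prior to
 * bcachefs_metadata_version_inode_btree_change, min_key was stored as the
 * predecessor of its current value; convert when reading or writing old
 * format btree nodes:
 */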
void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
unsigned big_endian, int write,
struct bkey_s k)
{
struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
if (version < bcachefs_metadata_version_inode_btree_change &&
btree_id_is_extents(btree_id) &&
!bkey_eq(bp.v->min_key, POS_MIN))
bp.v->min_key = write
? bpos_nosnap_predecessor(bp.v->min_key)
: bpos_nosnap_successor(bp.v->min_key);
}

/* KEY_TYPE_extent: */
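
/* Attempt to merge @r into @l; returns true if the extents were merged: */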
bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
{
struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l);
struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
union bch_extent_entry *en_l;
const union bch_extent_entry *en_r;
struct extent_ptr_decoded lp, rp;
bool use_right_ptr;
en_l = l_ptrs.start;
en_r = r_ptrs.start;
while (en_l <