// SPDX-License-Identifier: GPL-2.0
/*
* Some low level IO code, and hacks for various block layer limitations
*
* Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
* Copyright 2012 Google, Inc.
*/
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "async_objs.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "enumerated_ref.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "reflink.h"
#include "subvolume.h"
#include "trace.h"
#include <linux/moduleparam.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio,
		 "Simulate read corruption at the given ratio, for testing error paths (debug builds only)");
#endif
static bool bch2_poison_extents_on_checksum_error;
module_param_named(poison_extents_on_checksum_error,
bch2_poison_extents_on_checksum_error, bool, 0644);
MODULE_PARM_DESC(poison_extents_on_checksum_error,
"Extents with checksum errors are marked as poisoned - unsafe without read fua support");
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
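/*
 * Decide whether reads to @target should avoid it because its devices are
 * congested: each device's congestion counter is decayed by the time since it
 * was last bumped, then we return true with probability proportional to the
 * target's average congestion.
 */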
static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
const struct bch_devs_mask *devs;
unsigned d, nr = 0, total = 0;
u64 now = local_clock(), last;
s64 congested;
struct bch_dev *ca;
if (!target)
return false;
guard(rcu)();
devs = bch2_target_to_mask(c, target) ?:
&c->rw_devs[BCH_DATA_user];
for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
ca = rcu_dereference(c->devs[d]);
if (!ca)
continue;
congested = atomic_read(&ca->congested);
last = READ_ONCE(ca->congested_last);
if (time_after64(now, last))
congested -= (now - last) >> 12;
total += max(congested, 0LL);
nr++;
}
return get_random_u32_below(nr * CONGESTED_MAX) < total;
}
#else
static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
return false;
}
#endif
/* Cache promotion on read */
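/*
 * In-flight promotes are tracked in a hash table keyed by extent position, so
 * that we never run two promotes for the same extent at the same time.
 */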
static const struct rhashtable_params bch_promote_params = {
.head_offset = offsetof(struct promote_op, hash),
.key_offset = offsetof(struct promote_op, pos),
.key_len = sizeof(struct bpos),
.automatic_shrinking = true,
};
static inline bool have_io_error(struct bch_io_failures *failed)
{
return failed && failed->nr;
}
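/*
 * If @rbio is the read side of a data update, return the data_update it's
 * embedded in; otherwise return NULL.
 */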
static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
{
EBUG_ON(rbio->split);
return rbio->data_update
? container_of(rbio, struct data_update, rbio)
: NULL;
}
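/*
 * Returns true if @orig is a data update read and the pointer to @dev is one
 * of the pointers that update is already rewriting.
 */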
static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
{
struct data_update *u = rbio_data_update(orig);
if (!u)
return false;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
if (ptr->dev == dev &&
u->data_opts.rewrite_ptrs & BIT(i))
return true;
i++;
}
return false;
}
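/*
 * Decide whether to promote (or, after an IO error, rewrite) the extent being
 * read: plain promotes require the may_promote flag, an extent not already on
 * the promote target, a written extent and an uncongested target; either way
 * we bail if a promote for this position is already in flight.
 */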
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
struct bpos pos,
struct bch_io_opts opts,
unsigned flags,
struct bch_io_failures *failed)
{
if (!have_io_error(failed)) {
BUG_ON(!opts.promote_target);
if (!(flags & BCH_READ_may_promote))
return bch_err_throw(c, nopromote_may_not);
if (bch2_bkey_has_target(c, k, opts.promote_target))
return bch_err_throw(c, nopromote_already_promoted);
if (bkey_extent_is_unwritten(k))
return bch_err_throw(c, nopromote_unwritten);
if (bch2_target_congested(c, opts.promote_target))
return bch_err_throw(c, nopromote_congested);
}
if (rhashtable_lookup_fast(&c->promote_table, &pos,
bch_promote_params))
return bch_err_throw(c, nopromote_in_flight);
return 0;
}
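/*
 * Tear down a promote op: drop it from the in-flight table, release the data
 * update and the write ref, and free it.
 */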
static noinline void promote_free(struct bch_read_bio *rbio)
{
struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
struct bch_fs *c = rbio->c;
int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
bch_promote_params);
BUG_ON(ret);
async_object_list_del(c, promote, op->list_idx);
async_object_list_del(c, rbio, rbio->list_idx);
bch2_data_update_exit(&op->write);
enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
kfree_rcu(op, rcu);
}
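/* Write completion: account the promote's elapsed time, then tear it down. */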
static void promote_done(struct bch_write_op *wop)
{
struct promote_op *op = container_of(wop, struct promote_op, write.op);
struct bch_fs *c = op->write.rbio.c;
bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
promote_free(&op->write.rbio);
}
static void promote_start_work(struct work_struct *work)
{
struct promote_op *op = container_of(work, struct promote_op, work);
bch2_data_update_read_done(&op->write);
}
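/* The read has completed; kick off the write side from a workqueue. */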
static noinline void promote_start(struct bch_read_bio *rbio)
{
struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);
INIT_WORK(&op->work, promote_start_work);
queue_work(rbio->c->write_ref_wq, &op->work);
}
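/*
 * Allocate and initialize a promote: register it in the in-flight table and
 * set up the data update. Without a prior IO error this writes an extra
 * cached copy to the promote target; after an IO error it instead rewrites
 * the pointers that failed.
 */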
static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
enum btree_id btree_id,
struct bkey_s_c k,
struct bpos pos,
struct extent_ptr_decoded *pick,
unsigned sectors,
struct bch_read_bio *orig,
struct bch_io_failures *failed)
{
struct bch_fs *c = trans->c;
int ret;
struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };
if (!have_io_error(failed)) {
update_opts.target = orig->opts.promote_target;
update_opts.extra_replicas = 1;
update_opts.write_flags |= BCH_WRITE_cached;
update_opts.write_flags |= BCH_WRITE_only_specified_devs;
} else {
update_opts.target = orig->opts.foreground_target;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
if (bch2_dev_io_failures(failed, ptr->dev) &&
!ptr_being_rewritten(orig, ptr->dev))
update_opts.rewrite_ptrs |= ptr_bit;
ptr_bit <<= 1;
}
if (!update_opts.rewrite_ptrs)
return NULL;
}
if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote))
return ERR_PTR(-BCH_ERR_nopromote_no_writes);
struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
if (!op) {
ret = bch_err_throw(c, nopromote_enomem);
goto err_put;
}
op->start_time = local_clock();
op->pos = pos;
if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
bch_promote_params)) {
ret = bch_err_throw(c, nopromote_in_flight);
goto err;
}
ret = async_object_list_add(c, promote, op, &op->list_idx);
if (ret < 0)
goto err_remove_hash;
ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
writepoint_hashed((unsigned long) current),
&orig->opts,
update_opts,
btree_id, k);
op->write.type = BCH_DATA_UPDATE_promote;
/*
* possible errors: -BCH_ERR_nocow_lock_blocked,
* -BCH_ERR_ENOSPC_disk_reservation:
*/
if (ret)
goto err_remove_list;
rbio_init_fragment(&op->write.rbio.bio, orig);
op->write.rbio.bounce = true;
op->write.rbio.promote = true;
op->write.op.end_io = promote_done;
return &op->write.rbio;
err_remove_list:
async_object_list_del(c, promote, op->list_idx);
err_remove_hash:
BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
bch_promote_params));
err:
	bio_free_pages(&op->write.op.wbio.bio);
	/* op may have been visible in the promote table, so free via rcu: */
	kfree_rcu(op, rcu);
err_put:
	enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}