// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "trace.h"
#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <linux/swap.h>
const char * const bch2_btree_node_flags[] = {
"typebit",
"typebit",
"typebit",
#define x(f) [BTREE_NODE_##f] = #f,
BTREE_FLAGS()
#undef x
NULL
};
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
unsigned reserve = 16;
if (!c->btree_roots_known[0].b)
reserve += 8;
for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (r->b)
reserve += min_t(unsigned, 1, r->b->c.level) * 8;
}
c->btree_cache.nr_reserve = reserve;
}
static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
size_t can_free = list->nr;
if (!list->idx)
can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
return can_free;
}
static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
BUG_ON(!list_empty(&b->list));
if (b->c.lock.readers)
list_add(&b->list, &bc->freed_pcpu);
else
list_add(&b->list, &bc->freed_nonpcpu);
}
static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
{
BUG_ON(!list_empty(&b->list));
BUG_ON(!b->data);
bc->nr_freeable++;
list_add(&b->list, &bc->freeable);
}
void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
mutex_lock(&bc->lock);
__bch2_btree_node_to_freelist(bc, b);
mutex_unlock(&bc->lock);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
void __btree_node_data_free(struct btree *b)
{
BUG_ON(!list_empty(&b->list));
BUG_ON(btree_node_hashed(b));
/*
* This should really be done in slub/vmalloc, but we're using the
* kmalloc_large() path, so we're working around a slub bug by doing
* this here:
*/
if (b->data)
mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
if (b->aux_data)
mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
EBUG_ON(btree_node_write_in_flight(b));
clear_btree_node_just_written(b);
kvfree(b->data);
b->data = NULL;
#ifdef __KERNEL__
kvfree(b->aux_data);
#else
munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
b->aux_data = NULL;
}
static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
BUG_ON(list_empty(&b->list));
list_del_init(&b->list);
__btree_node_data_free(b);
--bc->nr_freeable;
btree_node_to_freedlist(bc, b);
}
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
const void *obj)
{
const struct btree *b = obj;
const u64 *v = arg->key;
return b->hash_val == *v ? 0 : 1;
}
static const struct rhashtable_params bch_btree_cache_params = {
.head_offset = offsetof(struct btree, hash),
.key_offset = offsetof(struct btree, hash_val),
.key_len = sizeof(u64),
.obj_cmpfn = bch2_btree_cache_cmp_fn,
.automatic_shrinking = true,
};
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
b->data = kvmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
#ifdef __KERNEL__
b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
PROT_READ|PROT_WRITE|PROT_EXEC,
MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
if (b->aux_data == MAP_FAILED)
b->aux_data = NULL;
#endif
if (!b->aux_data) {
kvfree(b->data);
b->data = NULL;
return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
}
return 0;
}
static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
struct btree *b;
b = kzalloc(sizeof(struct btree), gfp);
if (!b)
return NULL;
bkey_btree_ptr_init(&b->key);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
b->byte_order = ilog2(c->opts.btree_node_size);
return b;
}
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL);
if (!b)
return NULL;
if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
kfree(b);
return NULL;
}
bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
return b;
}
static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
{
struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
u64 mask = bc->pinned_nodes_mask[!!b->c.level];
return ((mask & BIT_ULL(b->c.btree_id)) &&
bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
}
void bch2_node_pin(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
mutex_lock(&bc->lock);
if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
set_btree_node_pinned(b);
list_move(&b->list, &bc->live[1].list);
bc->live[0].nr--;
bc->live[1].nr++;
}
mutex_unlock(&bc->lock);
}
void bch2_btree_cache_unpin(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b, *n;
mutex_lock(&bc->lock);
c->btree_cache.pinned_nodes_mask[0] = 0;
c->btree_cache.pinned_nodes_mask[1] = 0;
list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
clear_btree_node_pinned(b);
list_move(&b->list, &bc->live[0].list);
bc->live[0].nr++;
bc->live[1].nr--;
}
mutex_unlock(&bc->lock);
}
/* Btree in memory cache - hash table */
void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
lockdep_assert_held(&bc->lock);
int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
BUG_ON(ret);
/* Cause future lookups for this node to fail: */
b->hash_val = 0;
if (b->c.btree_id < BTREE_ID_NR)
--bc->nr_by_btree[b->c.btree_id];
--bc->live[btree_node_pinned(b)].nr;
list_del_init(&b->list);
}
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
__bch2_btree_node_hash_remove(bc, b);
__bch2_btree_node_to_freelist(bc, b);
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
BUG_ON(!list_empty(&b->list));
BUG_ON(b->hash_val);
b->hash_val = btree_ptr_hash_val(&b->key);
int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
bch_btree_cache_params);
if (ret)
return ret;
if (b->c.btree_id < BTREE_ID_NR)
bc->nr_by_btree[b->c.btree_id]++;
bool p = __btree_node_pinned(bc, b);
mod_bit(BTREE_NODE_pinned, &b->flags, p);
list_add_tail(&b->list, &bc->live[p].list);
bc->live[p].nr++;
return 0;
}
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
unsigned level, enum btree_id id)
{
b->c.level = level;
b->c.btree_id = id;
mutex_lock(&bc->lock);
int ret = __bch2_btree_node_hash_insert(bc, b);
mutex_unlock(&bc->lock);
return ret;
}
void bch2_btree_node_update_key_early(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_i *new)
{
struct bch_fs *c = trans->c;
struct btree *b;
struct bkey_buf tmp;
int ret;
bch2_bkey_buf_init(&tmp);
bch2_bkey_buf_reassemble(&tmp, c, old);
b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
if (!IS_ERR_OR_NULL(b)) {
mutex_lock(&c->btree_cache.lock);
__bch2_btree_node_hash_remove(&c->btree_cache, b);
bkey_copy(&b->key, new);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
BUG_ON(ret);
mutex_unlock(&c->btree_cache.lock);
six_unlock_read(&b->c.lock);
}
bch2_bkey_buf_exit(&tmp, c);
}
__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
const struct bkey_i *k)
{
u64 v = btree_ptr_hash_val(k);
return rhashtable_lookup_fast(&bc->table,
|