/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FORMAT_H
#define _BCACHEFS_FORMAT_H
/*
* bcachefs on disk data structures
*
* OVERVIEW:
*
* There are three main types of on disk data structures in bcachefs (this is
* reduced from 5 in bcache)
*
* - superblock
* - journal
* - btree
*
* The btree is the primary structure; most metadata exists as keys in the
* various btrees. There are only a small number of btrees, they're not
* sharded - we have one btree for extents, another for inodes, et cetera.
*
* SUPERBLOCK:
*
* The superblock contains the location of the journal, the list of devices in
* the filesystem, and in general any metadata we need in order to decide
* whether we can start a filesystem, or that we need prior to reading the
* journal/btree roots.
*
* The superblock is extensible, and most of the contents of the superblock are
* in variable length, type tagged fields; see struct bch_sb_field.
*
* Backup superblocks do not reside in a fixed location; also, superblocks do
* not have a fixed size. To locate backup superblocks we have struct
* bch_sb_layout; we store a copy of this inside every superblock, and also
* before the first superblock.
*
* JOURNAL:
*
* The journal primarily records btree updates in the order they occurred;
* journal replay consists of just iterating over all the keys in the open
* journal entries and re-inserting them into the btrees.
*
* The journal also contains entry types for the btree roots, and blacklisted
* journal sequence numbers (see journal_seq_blacklist.c).
*
* BTREE:
*
* bcachefs btrees are copy on write b+ trees, where nodes are big (typically
* 128k-256k) and log structured. We use struct btree_node for writing the first
* entry in a given node (offset 0), and struct btree_node_entry for all
* subsequent writes.
*
* After the header, btree node entries contain a list of keys in sorted order.
* Values are stored inline with the keys; since values are variable length (and
* keys effectively are variable length too, due to packing) we can't do random
* access without building up additional in memory tables in the btree node read
* path.
*
* BTREE KEYS (struct bkey):
*
* The various btrees share a common format for the key - so as to avoid
* switching in fastpath lookup/comparison code - but define their own
* structures for the key values.
*
* The size of a key/value pair is stored as a u8 in units of u64s, so the max
* size is just under 2k (255 * 8 = 2040 bytes). The common part also contains
* a type tag for the value, and a format field indicating whether the key is
* packed or not (and also meant to allow adding new key fields in the future,
* if desired).
*
* bkeys, when stored within a btree node, may also be packed. In that case, the
* bkey_format in that node is used to unpack it. Packed bkeys mean that we can
* be generous with field sizes in the common part of the key format (64 bit
* inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
*/
#include <asm/types.h>
#include <asm/byteorder.h>
#include <linux/kernel.h>
#include <linux/uuid.h>
/*
* When building as part of the kernel, make __uuid_t an alias for uuid_t.
* NOTE(review): presumably non-kernel builds get __uuid_t from their own
* headers - confirm.
*/
#ifdef __KERNEL__
typedef uuid_t __uuid_t;
#endif
/*
 * LE_BITMASK() - define accessors for a bitfield stored inside a little endian
 * integer member of a structure.
 *
 * @_bits:	width of the containing integer (16, 32 or 64); selects the
 *		__le<N>_to_cpu()/__cpu_to_le<N>() helpers and the __u<N> type
 * @name:	base name for everything that gets defined
 * @type:	structure type that contains the integer
 * @field:	member of @type holding the little endian integer
 * @offset:	index of the first bit of the bitfield
 * @end:	index one past the last bit of the bitfield
 *
 * Defines:
 *   name##_OFFSET	- @offset
 *   name##_BITS	- width of the bitfield, @end - @offset
 *   name##_MAX		- largest value the bitfield can hold
 *   name(k)		- read the bitfield from @k->field, as a __u64
 *   SET_##name(k, v)	- store @v into the bitfield, leaving the other bits of
 *			  @k->field untouched; bits of @v that do not fit in
 *			  the bitfield are silently discarded
 *
 * The masks are built by shifting a 64 bit value by (@end - @offset), so that
 * width must be smaller than 64.
 *
 * @offset and @end are parenthesized at every use so that expression
 * arguments are not regrouped by the surrounding arithmetic.
 */
#define LE_BITMASK(_bits, name, type, field, offset, end)		\
static const unsigned	name##_OFFSET = (offset);			\
static const unsigned	name##_BITS = ((end) - (offset));		\
static const __u##_bits	name##_MAX = (1ULL << ((end) - (offset))) - 1;	\
									\
static inline __u64 name(const type *k)					\
{									\
	return (__le##_bits##_to_cpu(k->field) >> (offset)) &		\
		~(~0ULL << ((end) - (offset)));				\
}									\
									\
static inline void SET_##name(type *k, __u64 v)				\
{									\
	__u##_bits new = __le##_bits##_to_cpu(k->field);		\
									\
	new &= ~(~(~0ULL << ((end) - (offset))) << (offset));		\
	new |= (v & ~(~0ULL << ((end) - (offset)))) << (offset);	\
	k->field = __cpu_to_le##_bits(new);				\
}
/*
 * Width-specific front ends for LE_BITMASK(): each one fixes the width of the
 * little endian integer that holds the bitfield and forwards the remaining
 * arguments unchanged.
 */
#define LE16_BITMASK(name, type, field, offset, end)			\
	LE_BITMASK(16, name, type, field, offset, end)
#define LE32_BITMASK(name, type, field, offset, end)			\
	LE_BITMASK(32, name, type, field, offset, end)
#define LE64_BITMASK(name, type, field, offset, end)			\
	LE_BITMASK(64, name, type, field, offset, end)
/*
* Description of a packed key layout. bkeys stored within a btree node may be
* packed, in which case the bkey_format of that node is used to unpack them.
*/
struct bkey_format {
/* NOTE(review): presumably the size of a packed key, in u64s - confirm */
__u8 key_u64s;
/* NOTE(review): presumably how many slots of the arrays below are used - confirm */
__u8 nr_fields;
/* One unused slot for now: */
__u8 bits_per_field[6];
/* Stored little endian; same number of slots as bits_per_field */
__le64 field_offset[6];
};
/* Btree keys - all units are in sectors */
/*
* Position of a key within a btree: the (inode, offset, snapshot) triple.
*
* The struct is packed (20 bytes) with 4 byte alignment. The members are
* declared in opposite order on little and big endian machines, so that in
* both cases inode occupies the most significant end and snapshot the least
* significant end when the whole struct is read as one integer.
*/
struct bpos {
/*
* Word order matches machine byte order - btree code treats a bpos as a
* single large integer, for search/comparison purposes
*
* Note that wherever a bpos is embedded in another on disk data
* structure, it has to be byte swabbed when reading in metadata that
* wasn't written in native endian order:
*/
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u32 snapshot;
__u64 offset; /* Points to end of extent - sectors */
__u64 inode;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
__u64 inode;
__u64 offset; /* Points to end of extent - sectors */
__u32 snapshot;
#else
#error edit for your odd byteorder.
#endif
} __attribute__((packed, aligned(4)));
/*
 * Largest value representable in each key field: all bits set, in the type of
 * the field (64 bit inode and offset, 32 bit snapshot and size).
 */
#define KEY_INODE_MAX			(~(__u64) 0)
#define KEY_OFFSET_MAX			(~(__u64) 0)
#define KEY_SNAPSHOT_MAX		(~(__u32) 0)
#define KEY_SIZE_MAX			(~(__u32) 0)
/*
 * POS() - build a struct bpos from an inode number and an offset.
 *
 * The snapshot member of the returned position is always 0.
 */
static inline struct bpos POS(__u64 inode, __u64 offset)
{
	return (struct bpos) {
		.inode		= inode,
		.offset		= offset,
		.snapshot	= 0,
	};
}

/* Lowest and highest positions POS() can produce; snapshot is 0 in both */
#define POS_MIN				POS(0, 0)
#define POS_MAX				POS(KEY_INODE_MAX, KEY_OFFSET_MAX)
/*
* Empty placeholder struct, for container_of().
*
* The only member is a zero length array, so it contributes no storage.
* NOTE(review): presumably embedded in each concrete value type so that code
* can convert between the generic and the specific type - confirm against the
* value struct definitions.
*/
struct bch_val {
__u64 __nothing[0];
};
/*
 * Version number of a key: 96 bits, split into a 64 bit low word and a 32 bit
 * high word.
 *
 * Packed (12 bytes) with 4 byte alignment. As with struct bpos, the members
 * are declared in opposite order on little and big endian machines, so that
 * hi sits at the most significant end in both cases.
 */
struct bversion {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	__u64		lo;
	__u32		hi;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	__u32		hi;
	__u64		lo;
#else
/* Fail the build instead of silently producing an empty struct */
#error edit for your odd byteorder.
#endif
} __attribute__((packed, aligned(4)));
/*
 * Btree key with every field at its full width (contrast struct bkey_packed).
 *
 * Packed, with 8 byte alignment. With the member types as defined in this
 * file the struct is 40 bytes: three header bytes (u64s, format/needs_whiteout
 * and type), one pad byte, a 12 byte version, a 4 byte size and a 20 byte
 * position.
 *
 * After the three header bytes the members are declared in opposite order on
 * little and big endian machines, in the same way as in struct bpos.
 */
struct bkey {
	/* Size of combined key and value, in u64s */
	__u8		u64s;

	/* Format of key (0 for format local to btree node) */
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8		format:7,
			needs_whiteout:1;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u8		needs_whiteout:1,
			format:7;
#else
#error edit for your odd byteorder.
#endif

	/* Type of the value */
	__u8		type;

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	__u8		pad[1];

	struct bversion	version;
	__u32		size;		/* extent size, in sectors */
	struct bpos	p;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	struct bpos	p;
	__u32		size;		/* extent size, in sectors */
	struct bversion	version;

	__u8		pad[1];
#else
/* Fail the build instead of silently dropping the members above */
#error edit for your odd byteorder.
#endif
} __attribute__((packed, aligned(8)));
/*
* A bkey in packed form, as it may be stored within a btree node; the
* bkey_format of that node is used to unpack it.
*
* The first three bytes (u64s, format/needs_whiteout, type) are declared the
* same way as in struct bkey.
*/
struct bkey_packed {
/*
* Zero length array at the very start of the struct.
* NOTE(review): presumably used to address the key as an array of u64s -
* confirm.
*/
__u64 _data[0];
/* Size of combined key and value, in u64s */
__u8 u64s;
/* Format of key (0 for format local to btree node) */
/*
* XXX: next incompat on disk format change, switch format and
* needs_whiteout - bkey_packed() will be cheaper if format is the high
* bits of the bitfield
*/
/*
* No #else/#error branch here: struct bkey, which this struct needs for the
* sizeof() below, uses the same conditional and already fails the build when
* neither macro is defined.
*/
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u8 format:7,
needs_whiteout:1;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u8 needs_whiteout:1,
format:7;
#endif
/* Type of the value */
__u8 type;
/*
* Zero length marker placed right after the three header bytes.
* NOTE(review): presumably where the packed key fields begin - confirm.
*/
__u8 key_start[0];
/*
* We copy bkeys with struct assignment in various places, and while
* that shouldn't be done with packed bkeys we can't disallow it in C,
* and it's legal to cast a bkey to a bkey_packed - so padding it out
* to the same size as struct bkey should hopefully be safest.
*
* The 3 accounts for the header bytes declared above (u64s, the
* format/needs_whiteout byte, and type).
*/
__u8 pad[sizeof(struct bkey) - 3];
} __attribute__((packed, aligned(8)));
#define BKEY_U64s (sizeof(s
|