diff options
92 files changed, 4582 insertions, 834 deletions
diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 7940da9bc6c1..f49aeef62d0c 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -74,7 +74,7 @@ sequentially and type id is assigned to each recognized type starting from id #define BTF_KIND_ARRAY 3 /* Array */ #define BTF_KIND_STRUCT 4 /* Struct */ #define BTF_KIND_UNION 5 /* Union */ - #define BTF_KIND_ENUM 6 /* Enumeration */ + #define BTF_KIND_ENUM 6 /* Enumeration up to 32-bit values */ #define BTF_KIND_FWD 7 /* Forward */ #define BTF_KIND_TYPEDEF 8 /* Typedef */ #define BTF_KIND_VOLATILE 9 /* Volatile */ @@ -87,6 +87,7 @@ sequentially and type id is assigned to each recognized type starting from id #define BTF_KIND_FLOAT 16 /* Floating point */ #define BTF_KIND_DECL_TAG 17 /* Decl Tag */ #define BTF_KIND_TYPE_TAG 18 /* Type Tag */ + #define BTF_KIND_ENUM64 19 /* Enumeration up to 64-bit values */ Note that the type section encodes debug info, not just pure types. ``BTF_KIND_FUNC`` is not a type, and it represents a defined subprogram. @@ -101,10 +102,10 @@ Each type contains the following common data:: * bits 24-28: kind (e.g. int, ptr, array...etc) * bits 29-30: unused * bit 31: kind_flag, currently used by - * struct, union and fwd + * struct, union, fwd, enum and enum64. */ __u32 info; - /* "size" is used by INT, ENUM, STRUCT and UNION. + /* "size" is used by INT, ENUM, STRUCT, UNION and ENUM64. * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, @@ -281,10 +282,10 @@ modes exist: ``struct btf_type`` encoding requirement: * ``name_off``: 0 or offset to a valid C identifier - * ``info.kind_flag``: 0 + * ``info.kind_flag``: 0 for unsigned, 1 for signed * ``info.kind``: BTF_KIND_ENUM * ``info.vlen``: number of enum values - * ``size``: 4 + * ``size``: 1/2/4/8 ``btf_type`` is followed by ``info.vlen`` number of ``struct btf_enum``.:: @@ -297,6 +298,10 @@ The ``btf_enum`` encoding: * ``name_off``: offset to a valid C identifier * ``val``: any value +If the original enum value is signed and the size is less than 4, +that value will be sign extended into 4 bytes. If the size is 8, +the value will be truncated into 4 bytes. + 2.2.7 BTF_KIND_FWD ~~~~~~~~~~~~~~~~~~ @@ -493,7 +498,7 @@ the attribute is applied to a ``struct``/``union`` member or a ``func`` argument, and ``btf_decl_tag.component_idx`` should be a valid index (starting from 0) pointing to a member or an argument. -2.2.17 BTF_KIND_TYPE_TAG +2.2.18 BTF_KIND_TYPE_TAG ~~~~~~~~~~~~~~~~~~~~~~~~ ``struct btf_type`` encoding requirement: @@ -516,6 +521,32 @@ type_tag, then zero or more const/volatile/restrict/typedef and finally the base type. The base type is one of int, ptr, array, struct, union, enum, func_proto and float types. +2.2.19 BTF_KIND_ENUM64 +~~~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 or offset to a valid C identifier + * ``info.kind_flag``: 0 for unsigned, 1 for signed + * ``info.kind``: BTF_KIND_ENUM64 + * ``info.vlen``: number of enum values + * ``size``: 1/2/4/8 + +``btf_type`` is followed by ``info.vlen`` number of ``struct btf_enum64``.:: + + struct btf_enum64 { + __u32 name_off; + __u32 val_lo32; + __u32 val_hi32; + }; + +The ``btf_enum64`` encoding: + * ``name_off``: offset to a valid C identifier + * ``val_lo32``: lower 32-bit value for a 64-bit value + * ``val_hi32``: high 32-bit value for a 64-bit value + +If the original enum value is signed and the size is less than 8, +that value will be sign extended into 8 bytes. + 3. BTF Kernel API ================= diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 1de6a57c7e1e..9e27fbdb2206 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -127,7 +127,7 @@ BPF_XOR | BPF_K | BPF_ALU64 means:: Byte swap instructions ---------------------- -The byte swap instructions use an instruction class of ``BFP_ALU`` and a 4-bit +The byte swap instructions use an instruction class of ``BPF_ALU`` and a 4-bit code field of ``BPF_END``. The byte swap instructions operate on the destination register diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 9e457156ad4d..6a1c9fca5260 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -712,22 +712,6 @@ static inline void emit_alu_r(const u8 dst, const u8 src, const bool is64, } } -/* ALU operation (32 bit) - * dst = dst (op) src - */ -static inline void emit_a32_alu_r(const s8 dst, const s8 src, - struct jit_ctx *ctx, const bool is64, - const bool hi, const u8 op) { - const s8 *tmp = bpf2a32[TMP_REG_1]; - s8 rn, rd; - - rn = arm_bpf_get_reg32(src, tmp[1], ctx); - rd = arm_bpf_get_reg32(dst, tmp[0], ctx); - /* ALU operation */ - emit_alu_r(rd, rn, is64, hi, op, ctx); - arm_bpf_put_reg32(dst, rd, ctx); -} - /* ALU operation (64 bit) */ static inline void emit_a32_alu_r64(const bool is64, const s8 dst[], const s8 src[], struct jit_ctx *ctx, diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index 2a3715bf29fe..d926e0f7ef57 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -69,6 +69,7 @@ struct rv_jit_context { struct bpf_prog *prog; u16 *insns; /* RV insns */ int ninsns; + int body_len; int epilogue_offset; int *offset; /* BPF to RV */ int nexentries; diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index be743d700aa7..737baf8715da 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -44,7 +44,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) unsigned int prog_size = 0, extable_size = 0; bool tmp_blinded = false, extra_pass = false; struct bpf_prog *tmp, *orig_prog = prog; - int pass = 0, prev_ninsns = 0, i; + int pass = 0, prev_ninsns = 0, prologue_len, i; struct rv_jit_data *jit_data; struct rv_jit_context *ctx; @@ -95,6 +95,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) prog = orig_prog; goto out_offset; } + ctx->body_len = ctx->ninsns; bpf_jit_build_prologue(ctx); ctx->epilogue_offset = ctx->ninsns; bpf_jit_build_epilogue(ctx); @@ -161,6 +162,11 @@ skip_init_ctx: if (!prog->is_func || extra_pass) { bpf_jit_binary_lock_ro(jit_data->header); + prologue_len = ctx->epilogue_offset - ctx->body_len; + for (i = 0; i < prog->len; i++) + ctx->offset[i] = ninsns_rvoff(prologue_len + + ctx->offset[i]); + bpf_prog_fill_jited_linfo(prog, ctx->offset); out_offset: kfree(ctx->offset); kfree(jit_data); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2b914a56a2c5..0edd7d2c0064 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -5,6 +5,7 @@ #define _LINUX_BPF_H 1 #include <uapi/linux/bpf.h> +#include <uapi/linux/filter.h> #include <linux/workqueue.h> #include <linux/file.h> @@ -22,8 +23,10 @@ #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/percpu-refcount.h> +#include <linux/stddef.h> #include <linux/bpfptr.h> #include <linux/btf.h> +#include <linux/rcupdate_trace.h> struct bpf_verifier_env; struct bpf_verifier_log; @@ -398,6 +401,9 @@ enum bpf_type_flag { /* DYNPTR points to a ringbuf record. */ DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), + /* Size is known at compile time. */ + MEM_FIXED_SIZE = BIT(10 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; @@ -461,6 +467,8 @@ enum bpf_arg_type { * all bytes or clear them in error case. */ ARG_PTR_TO_UNINIT_MEM = MEM_UNINIT | ARG_PTR_TO_MEM, + /* Pointer to valid memory of size known at compile time. */ + ARG_PTR_TO_FIXED_SIZE_MEM = MEM_FIXED_SIZE | ARG_PTR_TO_MEM, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. @@ -526,6 +534,14 @@ struct bpf_func_proto { u32 *arg5_btf_id; }; u32 *arg_btf_id[5]; + struct { + size_t arg1_size; + size_t arg2_size; + size_t arg3_size; + size_t arg4_size; + size_t arg5_size; + }; + size_t arg_size[5]; }; int *ret_btf_id; /* return value btf_id */ bool (*allowed)(const struct bpf_prog *prog); @@ -1084,6 +1100,40 @@ struct bpf_prog_aux { }; }; +struct bpf_prog { + u16 pages; /* Number of allocated pages */ + u16 jited:1, /* Is our filter JIT'ed? */ + jit_requested:1,/* archs need to JIT the prog */ + gpl_compatible:1, /* Is filter GPL compatible? */ + cb_access:1, /* Is control block accessed? */ + dst_needed:1, /* Do we need dst entry? */ + blinding_requested:1, /* needs constant blinding */ + blinded:1, /* Was blinded */ + is_func:1, /* program is a bpf function */ + kprobe_override:1, /* Do we override a kprobe? */ + has_callchain_buf:1, /* callchain buffer allocated? */ + enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ + call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ + call_get_func_ip:1, /* Do we call get_func_ip() */ + tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */ + enum bpf_prog_type type; /* Type of BPF program */ + enum bpf_attach_type expected_attach_type; /* For some prog types */ + u32 len; /* Number of filter blocks */ + u32 jited_len; /* Size of jited insns in bytes */ + u8 tag[BPF_TAG_SIZE]; + struct bpf_prog_stats __percpu *stats; + int __percpu *active; + unsigned int (*bpf_func)(const void *ctx, + const struct bpf_insn *insn); + struct bpf_prog_aux *aux; /* Auxiliary fields */ + struct sock_fprog_kern *orig_prog; /* Original BPF program */ + /* Instructions for interpreter */ + union { + DECLARE_FLEX_ARRAY(struct sock_filter, insns); + DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi); + }; +}; + struct bpf_array_aux { /* Programs with direct jumps into programs part of this array. */ struct list_head poke_progs; @@ -1336,6 +1386,8 @@ extern struct bpf_empty_prog_array bpf_empty_prog_array; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); +/* Use when traversal over the bpf_prog_array uses tasks_trace rcu */ +void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs); bool bpf_prog_array_is_empty(struct bpf_prog_array *array); int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, @@ -1427,6 +1479,55 @@ bpf_prog_run_array(const struct bpf_prog_array *array, return ret; } +/* Notes on RCU design for bpf_prog_arrays containing sleepable programs: + * + * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array + * overall. As a result, we must use the bpf_prog_array_free_sleepable + * in order to use the tasks_trace rcu grace period. + * + * When a non-sleepable program is inside the array, we take the rcu read + * section and disable preemption for that program alone, so it can access + * rcu-protected dynamically sized maps. + */ +static __always_inline u32 +bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu, + const void *ctx, bpf_prog_run_fn run_prog) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; + u32 ret = 1; + + might_fault(); + + rcu_read_lock_trace(); + migrate_disable(); + + array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held()); + if (unlikely(!array)) + goto out; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + item = &array->items[0]; + while ((prog = READ_ONCE(item->prog))) { + if (!prog->aux->sleepable) + rcu_read_lock() |
