| author | Jakub Kicinski <kuba@kernel.org> | 2022-12-12 11:27:41 -0800 |
|---|---|---|
| committer | Jakub Kicinski <kuba@kernel.org> | 2022-12-12 11:27:42 -0800 |
| commit | 26f708a28454df2062a63fd869e983c379f50ff0 | |
| tree | e9580092e7d69af3f9d5add0cd331bad2a6bf708 /kernel | |
| parent | b2b509fb5a1e6af1e630a755b32c4658099df70b | |
| parent | 99523094de48df65477cbbb9d8027f4bc4701794 | |
| download | linux-26f708a28454df2062a63fd869e983c379f50ff0.{tar.gz, tar.bz2, zip} | |
Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Alexei Starovoitov says:
====================
pull-request: bpf-next 2022-12-11
We've added 74 non-merge commits during the last 11 day(s) which contain
a total of 88 files changed, 3362 insertions(+), 789 deletions(-).
The main changes are:
1) Decouple prune and jump points handling in the verifier, from Andrii.
2) Do not rely on ALLOW_ERROR_INJECTION for fmod_ret, from Benjamin.
Merged from hid tree.
3) Do not zero-extend kfunc return values. Necessary fix for 32-bit archs,
from Björn.
4) Don't use rcu_users to refcount in task kfuncs, from David.
5) Three reg_state->id fixes in the verifier, from Eduard.
6) Optimize bpf_mem_alloc by reusing elements from free_by_rcu, from Hou.
7) Refactor dynptr handling in the verifier, from Kumar.
8) Remove the "/sys" mount and umount dance in {open,close}_netns
in bpf selftests, from Martin.
9) Enable sleepable support for cgrp local storage, from Yonghong.
* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (74 commits)
selftests/bpf: test case for relaxed prunning of active_lock.id
selftests/bpf: Add pruning test case for bpf_spin_lock
bpf: use check_ids() for active_lock comparison
selftests/bpf: verify states_equal() maintains idmap across all frames
bpf: states_equal() must build idmap for all function frames
selftests/bpf: test cases for regsafe() bug skipping check_id()
bpf: regsafe() must not skip check_ids()
docs/bpf: Add documentation for BPF_MAP_TYPE_SK_STORAGE
selftests/bpf: Add test for dynptr reinit in user_ringbuf callback
bpf: Use memmove for bpf_dynptr_{read,write}
bpf: Move PTR_TO_STACK alignment check to process_dynptr_func
bpf: Rework check_func_arg_reg_off
bpf: Rework process_dynptr_func
bpf: Propagate errors from process_* checks in check_func_arg
bpf: Refactor ARG_PTR_TO_DYNPTR checks into process_dynptr_func
bpf: Skip rcu_barrier() if rcu_trace_implies_rcu_gp() is true
bpf: Reuse freed element in free_by_rcu during allocation
selftests/bpf: Bring test_offload.py back to life
bpf: Fix comment error in fixup_kfunc_call function
bpf: Do not zero-extend kfunc return values
...
====================
Link: https://lore.kernel.org/r/20221212024701.73809-1-alexei.starovoitov@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
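One of the fixes in the shortlog above, "bpf: Use memmove for bpf_dynptr_{read,write}", swaps memcpy() for memmove() because two dynptrs created by bpf_dynptr_from_mem() can describe overlapping regions of the same map value (see the helpers.c hunk in the diff below). The following minimal userspace sketch shows why an overlapping copy requires memmove(); the buffer, offset, and length are illustrative and not taken from the kernel sources.

```c
/* Illustrative sketch of the overlap problem addressed by the
 * "Use memmove for bpf_dynptr_{read,write}" change. Two dynptrs built
 * with bpf_dynptr_from_mem() may point at overlapping parts of the same
 * map value; copying between them with memcpy() is undefined behaviour,
 * while memmove() handles the overlap. Buffer contents are made up.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char map_value[16] = "abcdefghijklmno";

	/* "Read" 8 bytes starting at offset 4 into a destination that
	 * overlaps the source (offset 0 of the same buffer). memcpy()
	 * would be undefined here; memmove() copies correctly.
	 */
	memmove(map_value, map_value + 4, 8);

	printf("%s\n", map_value);	/* prints "efghijklijklmno" */
	return 0;
}
```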
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/bpf/bpf_cgrp_storage.c | 3 |
| -rw-r--r-- | kernel/bpf/bpf_inode_storage.c | 4 |
| -rw-r--r-- | kernel/bpf/bpf_lsm.c | 16 |
| -rw-r--r-- | kernel/bpf/bpf_task_storage.c | 4 |
| -rw-r--r-- | kernel/bpf/btf.c | 32 |
| -rw-r--r-- | kernel/bpf/helpers.c | 118 |
| -rw-r--r-- | kernel/bpf/memalloc.c | 31 |
| -rw-r--r-- | kernel/bpf/verifier.c | 696 |
8 files changed, 605 insertions, 299 deletions
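Change 6 in the summary above ("Optimize bpf_mem_alloc by reusing elements from free_by_rcu") shows up in the memalloc.c hunk below, where alloc_bulk() now pops an element off the per-CPU free_by_rcu list before falling back to __alloc(). The sketch below illustrates that pop-or-allocate pattern in plain C; the list and node types are simplified stand-ins, not the kernel's llist or struct bpf_mem_cache API.

```c
/* Simplified sketch of "reuse freed elements before allocating", the
 * pattern alloc_bulk() adopts in the memalloc.c hunk below. The free
 * list here is an ordinary singly-linked list, not the kernel's
 * llist/bpf_mem_cache machinery.
 */
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
};

static struct node *free_by_rcu;	/* elements waiting to be reused */

static struct node *cache_alloc(void)
{
	struct node *obj = free_by_rcu;

	if (obj) {
		/* Reuse an element already returned to the cache. */
		free_by_rcu = obj->next;
		return obj;
	}
	/* Fall back to a fresh allocation, as __alloc() does. */
	return malloc(sizeof(*obj));
}

static void cache_free(struct node *obj)
{
	obj->next = free_by_rcu;
	free_by_rcu = obj;
}

int main(void)
{
	struct node *a = cache_alloc();

	cache_free(a);
	/* The second allocation reuses the node freed above. */
	printf("reused: %s\n", cache_alloc() == a ? "yes" : "no");
	return 0;
}
```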
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 309403800f82..6cdf6d9ed91d 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -211,7 +211,6 @@ BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgro return ret; } -BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_local_storage_map) const struct bpf_map_ops cgrp_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -222,7 +221,7 @@ const struct bpf_map_ops cgrp_storage_map_ops = { .map_update_elem = bpf_cgrp_storage_update_elem, .map_delete_elem = bpf_cgrp_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &cgroup_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = cgroup_storage_ptr, }; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 6a1d4d22816a..05f4c66c9089 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -213,8 +213,6 @@ static void inode_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(map, &inode_cache, NULL); } -BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct, - bpf_local_storage_map) const struct bpf_map_ops inode_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -225,7 +223,7 @@ const struct bpf_map_ops inode_storage_map_ops = { .map_update_elem = bpf_fd_inode_storage_update_elem, .map_delete_elem = bpf_fd_inode_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &inode_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = inode_storage_ptr, }; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index ae0267f150b5..9ea42a45da47 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -345,11 +345,27 @@ BTF_ID(func, bpf_lsm_task_to_inode) BTF_ID(func, bpf_lsm_userns_create) BTF_SET_END(sleepable_lsm_hooks) +BTF_SET_START(untrusted_lsm_hooks) +BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_prog_alloc_security) +BTF_ID(func, bpf_lsm_bpf_prog_free_security) +BTF_ID(func, bpf_lsm_file_alloc_security) +BTF_ID(func, bpf_lsm_file_free_security) +BTF_ID(func, bpf_lsm_sk_alloc_security) +BTF_ID(func, bpf_lsm_sk_free_security) +BTF_ID(func, bpf_lsm_task_free) +BTF_SET_END(untrusted_lsm_hooks) + bool bpf_lsm_is_sleepable_hook(u32 btf_id) { return btf_id_set_contains(&sleepable_lsm_hooks, btf_id); } +bool bpf_lsm_is_trusted(const struct bpf_prog *prog) +{ + return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id); +} + const struct bpf_prog_ops lsm_prog_ops = { }; diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 8e832db8151a..1e486055a523 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -324,7 +324,7 @@ static void task_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy); } -BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map) +BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) const struct bpf_map_ops task_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -335,7 +335,7 @@ const struct bpf_map_ops task_storage_map_ops = { .map_update_elem = bpf_pid_task_storage_update_elem, 
.map_delete_elem = bpf_pid_task_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &task_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = task_storage_ptr, }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d11cbf8cece7..f7dd8af06413 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -19,6 +19,7 @@ #include <linux/bpf_verifier.h> #include <linux/btf.h> #include <linux/btf_ids.h> +#include <linux/bpf_lsm.h> #include <linux/skmsg.h> #include <linux/perf_event.h> #include <linux/bsearch.h> @@ -205,6 +206,7 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_STRUCT_OPS, BTF_KFUNC_HOOK_TRACING, BTF_KFUNC_HOOK_SYSCALL, + BTF_KFUNC_HOOK_FMODRET, BTF_KFUNC_HOOK_MAX, }; @@ -5829,6 +5831,7 @@ static bool prog_args_trusted(const struct bpf_prog *prog) case BPF_PROG_TYPE_TRACING: return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER; case BPF_PROG_TYPE_LSM: + return bpf_lsm_is_trusted(prog); case BPF_PROG_TYPE_STRUCT_OPS: return true; default: @@ -7606,11 +7609,14 @@ u32 *btf_kfunc_id_set_contains(const struct btf *btf, return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); } -/* This function must be invoked only from initcalls/module init functions */ -int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, - const struct btf_kfunc_id_set *kset) +u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id) +{ + return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id); +} + +static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, + const struct btf_kfunc_id_set *kset) { - enum btf_kfunc_hook hook; struct btf *btf; int ret; @@ -7629,13 +7635,29 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, if (IS_ERR(btf)) return PTR_ERR(btf); - hook = bpf_prog_type_to_kfunc_hook(prog_type); ret = btf_populate_kfunc_set(btf, hook, kset->set); btf_put(btf); return ret; } + +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *kset) +{ + enum btf_kfunc_hook hook; + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + return __register_btf_kfunc_id_set(hook, kset); +} EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset) +{ + return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset); +} +EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set); + s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) { struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a5a511430f2a..af30c6cbd65d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1404,7 +1404,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } @@ -1414,7 +1414,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ ptr->size |= type << DYNPTR_TYPE_SHIFT; } -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } @@ -1438,7 +1438,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, 
sizeof(*ptr)); } -static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) { u32 size = bpf_dynptr_get_size(ptr); @@ -1483,7 +1483,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, u32, offset, u64, flags) { int err; @@ -1495,7 +1495,11 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src if (err) return err; - memcpy(dst, src->data + src->offset + offset, len); + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. + */ + memmove(dst, src->data + src->offset + offset, len); return 0; } @@ -1506,12 +1510,12 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_DYNPTR, + .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len, u64, flags) { int err; @@ -1523,7 +1527,11 @@ BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, if (err) return err; - memcpy(dst->data + dst->offset + offset, src, len); + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. + */ + memmove(dst->data + dst->offset + offset, src, len); return 0; } @@ -1532,14 +1540,14 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) { int err; @@ -1560,7 +1568,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; @@ -1833,8 +1841,59 @@ struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) */ struct task_struct *bpf_task_acquire(struct task_struct *p) { - refcount_inc(&p->rcu_users); - return p; + return get_task_struct(p); +} + +/** + * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task + * acquired by this kfunc which is not stored in a map as a kptr, must be + * released by calling bpf_task_release(). + * @p: The task on which a reference is being acquired. 
+ */ +struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) +{ + /* For the time being this function returns NULL, as it's not currently + * possible to safely acquire a reference to a task with RCU protection + * using get_task_struct() and put_task_struct(). This is due to the + * slightly odd mechanics of p->rcu_users, and how task RCU protection + * works. + * + * A struct task_struct is refcounted by two different refcount_t + * fields: + * + * 1. p->usage: The "true" refcount field which tracks a task's + * lifetime. The task is freed as soon as this + * refcount drops to 0. + * + * 2. p->rcu_users: An "RCU users" refcount field which is statically + * initialized to 2, and is co-located in a union with + * a struct rcu_head field (p->rcu). p->rcu_users + * essentially encapsulates a single p->usage + * refcount, and when p->rcu_users goes to 0, an RCU + * callback is scheduled on the struct rcu_head which + * decrements the p->usage refcount. + * + * There are two important implications to this task refcounting logic + * described above. The first is that + * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as + * after the refcount goes to 0, the RCU callback being scheduled will + * cause the memory backing the refcount to again be nonzero due to the + * fields sharing a union. The other is that we can't rely on RCU to + * guarantee that a task is valid in a BPF program. This is because a + * task could have already transitioned to being in the TASK_DEAD + * state, had its rcu_users refcount go to 0, and its rcu callback + * invoked in which it drops its single p->usage reference. At this + * point the task will be freed as soon as the last p->usage reference + * goes to 0, without waiting for another RCU gp to elapse. The only + * way that a BPF program can guarantee that a task is valid is in this + * scenario is to hold a p->usage refcount itself. + * + * Until we're able to resolve this issue, either by pulling + * p->rcu_users and p->rcu out of the union, or by getting rid of + * p->usage and just using p->rcu_users for refcounting, we'll just + * return NULL here. + */ + return NULL; } /** @@ -1845,33 +1904,15 @@ struct task_struct *bpf_task_acquire(struct task_struct *p) */ struct task_struct *bpf_task_kptr_get(struct task_struct **pp) { - struct task_struct *p; - - rcu_read_lock(); - p = READ_ONCE(*pp); - - /* Another context could remove the task from the map and release it at - * any time, including after we've done the lookup above. This is safe - * because we're in an RCU read region, so the task is guaranteed to - * remain valid until at least the rcu_read_unlock() below. + /* We must return NULL here until we have clarity on how to properly + * leverage RCU for ensuring a task's lifetime. See the comment above + * in bpf_task_acquire_not_zero() for more details. */ - if (p && !refcount_inc_not_zero(&p->rcu_users)) - /* If the task had been removed from the map and freed as - * described above, refcount_inc_not_zero() will return false. - * The task will be freed at some point after the current RCU - * gp has ended, so just return NULL to the user. - */ - p = NULL; - rcu_read_unlock(); - - return p; + return NULL; } /** - * bpf_task_release - Release the reference acquired on a struct task_struct *. - * If this kfunc is invoked in an RCU read region, the task_struct is - * guaranteed to not be freed until the current grace period has ended, even if - * its refcount drops to 0. + * bpf_task_release - Release the reference acquired on a task. 
* @p: The task on which a reference is being released. */ void bpf_task_release(struct task_struct *p) @@ -1879,7 +1920,7 @@ void bpf_task_release(struct task_struct *p) if (!p) return; - put_task_struct_rcu_user(p); + put_task_struct(p); } #ifdef CONFIG_CGROUPS @@ -1927,7 +1968,7 @@ struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp) } /** - * bpf_cgroup_release - Release the reference acquired on a struct cgroup *. + * bpf_cgroup_release - Release the reference acquired on a cgroup. * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to * not be freed until the current grace period has ended, even if its refcount * drops to 0. @@ -2013,6 +2054,7 @@ BTF_ID_FLAGS(func, bpf_list_push_back) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) #ifdef CONFIG_CGROUPS diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 8f0d65f2474a..ebcc3dd0fa19 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -171,9 +171,24 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) memcg = get_memcg(c); old_memcg = set_active_memcg(memcg); for (i = 0; i < cnt; i++) { - obj = __alloc(c, node); - if (!obj) - break; + /* + * free_by_rcu is only manipulated by irq work refill_work(). + * IRQ works on the same CPU are called sequentially, so it is + * safe to use __llist_del_first() here. If alloc_bulk() is + * invoked by the initial prefill, there will be no running + * refill_work(), so __llist_del_first() is fine as well. + * + * In most cases, objects on free_by_rcu are from the same CPU. + * If some objects come from other CPUs, it doesn't incur any + * harm because NUMA_NO_NODE means the preference for current + * numa node and it is not a guarantee. + */ + obj = __llist_del_first(&c->free_by_rcu); + if (!obj) { + obj = __alloc(c, node); + if (!obj) + break; + } if (IS_ENABLED(CONFIG_PREEMPT_RT)) /* In RT irq_work runs in per-cpu kthread, so disable * interrupts to avoid preemption and interrupts and @@ -449,9 +464,17 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma) { /* waiting_for_gp lists was drained, but __free_rcu might * still execute. Wait for it now before we freeing percpu caches. + * + * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(), + * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used + * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(), + * so if call_rcu(head, __free_rcu) is skipped due to + * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by + * using rcu_trace_implies_rcu_gp() as well. 
*/ rcu_barrier_tasks_trace(); - rcu_barrier(); + if (!rcu_trace_implies_rcu_gp()) + rcu_barrier(); free_mem_alloc_no_barrier(ma); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4e7f1d085e53..a5255a0dcbb6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -451,6 +451,11 @@ static bool reg_type_not_null(enum bpf_reg_type type) type == PTR_TO_SOCK_COMMON; } +static bool type_is_ptr_alloc_obj(u32 type) +{ + return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; +} + static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) { struct btf_record *rec = NULL; @@ -458,7 +463,7 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) if (reg->type == PTR_TO_MAP_VALUE) { rec = reg->map_ptr->record; - } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) { + } else if (type_is_ptr_alloc_obj(reg->type)) { meta = btf_find_struct_meta(reg->btf, reg->btf_id); if (meta) rec = meta->record; @@ -587,7 +592,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", - [PTR_TO_DYNPTR] = "dynptr_ptr", + [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { @@ -720,6 +725,28 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type) return type == BPF_DYNPTR_TYPE_RINGBUF; } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type, + bool first_slot); + +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); + +static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1, + struct bpf_reg_state *sreg2, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(sreg1, type, true); + __mark_dynptr_reg(sreg2, type, false); +} + +static void mark_dynptr_cb_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(reg, type, true); +} + + static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx) { @@ -741,9 +768,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL; - state->stack[spi].spilled_ptr.dynptr.first_slot = true; - state->stack[spi].spilled_ptr.dynptr.type = type; - state->stack[spi - 1].spilled_ptr.dynptr.type = type; + mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr, + &state->stack[spi - 1].spilled_ptr, type); if (dynptr_type_refcounted(type)) { /* The id is used to track proper releasing */ @@ -751,8 +777,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (id < 0) return id; - state->stack[spi].spilled_ptr.id = id; - state->stack[spi - 1].spilled_ptr.id = id; + state->stack[spi].spilled_ptr.ref_obj_id = id; + state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } return 0; @@ -774,25 +800,23 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re } /* Invalidate any slices associated with this dynptr */ - if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - release_reference(env, state->stack[spi].spilled_ptr.id); - state->stack[spi].spilled_ptr.id = 0; - state->stack[spi - 1].spilled_ptr.id = 0; - } - - state->stack[spi].spilled_ptr.dynptr.first_slot = false; - state->stack[spi].spilled_ptr.dynptr.type = 0; - state->stack[spi - 1].spilled_ptr.dynptr.type = 0; + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) + WARN_ON_ONCE(release_reference(env, 
state->stack[spi].spilled_ptr.ref_obj_id)); + __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); + __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); return 0; } static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); - int i; + int spi, i; + + if (reg->type == CONST_PTR_TO_DYNPTR) + return false; + spi = get_spi(reg->off); if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) return true; @@ -805,13 +829,17 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ return true; } -bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; int i; + /* This already represents first slot of initialized bpf_dynptr */ + if (reg->type == CONST_PTR_TO_DYNPTR) + return true; + + spi = get_spi(reg->off); if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || !state->stack[spi].spilled_ptr.dynptr.first_slot) return false; @@ -825,21 +853,24 @@ bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, return true; } -bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type) +static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type dynptr_type; - int spi = get_spi(reg->off); + int spi; /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ if (arg_type == ARG_PTR_TO_DYNPTR) return true; dynptr_type = arg_to_dynptr_type(arg_type); - - return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; + if (reg->type == CONST_PTR_TO_DYNPTR) { + return reg->dynptr.type == dynptr_type; + } else { + spi = get_spi(reg->off); + return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; + } } /* The reg state of a pointer or a bounded scalar was saved when @@ -1351,9 +1382,6 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void __mark_reg_not_init(const struct bpf_verifier_env *env, - struct bpf_reg_state *reg); - /* This helper doesn't clear reg->id */ static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { @@ -1416,6 +1444,19 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, __mark_reg_known_zero(regs + regno); } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, + bool first_slot) +{ + /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for + * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply + * set it unconditionally as it is ignored for STACK_DYNPTR anyway. 
+ */ + __mark_reg_known_zero(reg); + reg->type = CONST_PTR_TO_DYNPTR; + reg->dynptr.type = type; + reg->dynptr.first_slot = first_slot; +} + static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) { if (base_type(reg->type) == PTR_TO_MAP_VALUE) { @@ -2525,6 +2566,16 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return 0; } +static void mark_jmp_point(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].jmp_point = true; +} + +static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].jmp_point; +} + /* for any branch, call, exit record the history of jmps in the given state */ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur) @@ -2533,6 +2584,9 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_idx_pair *p; size_t alloc_size; + if (!is_jmp_point(env, env->insn_idx)) + return 0; + cnt++; alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); p = krealloc(cur->jmp_history, alloc_size, GFP_USER); @@ -4275,7 +4329,7 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg) return true; /* If a register is not referenced, it is trusted if it has the - * MEM_ALLOC, MEM_RCU or PTR_TRUSTED type modifiers, and no others. Some of the + * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the * other type modifiers may be safe, but we elect to take an opt-in * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are * not. @@ -4287,6 +4341,11 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg) !bpf_type_has_unsafe_modifiers(reg->type); } +static bool is_rcu_reg(const struct bpf_reg_state *reg) +{ + return reg->type & MEM_RCU; +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -4703,6 +4762,18 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, u32 btf_id; int ret; + if (!env->allow_ptr_leaks) { + verbose(env, + "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", + tname); + return -EPERM; + } + if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) { + verbose(env, + "Cannot access kernel 'struct %s' from non-GPL compatible program\n", + tname); + return -EINVAL; + } if (off < 0) { verbose(env, "R%d is ptr_%s invalid negative access: off=%d\n", @@ -4773,14 +4844,16 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (flag & MEM_RCU) { /* Mark value register as MEM_RCU only if it is protected by - * bpf_rcu_read_lock() and the ptr reg is trusted. MEM_RCU + * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU * itself can already indicate trustedness inside the rcu - * read lock region. Also mark it as PTR_TRUSTED. + * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since + * it could be null in some cases. */ - if (!env->cur_state->active_rcu_lock || !is_trusted_reg(reg)) + if (!env->cur_state->active_rcu_lock || + !(is_trusted_reg(reg) || is_rcu_reg(reg))) flag &= ~MEM_RCU; else - flag |= PTR_TRUSTED; + flag |= PTR_MAYBE_NULL; } else if (reg->type & MEM_RCU) { /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively. 
@@ -4823,9 +4896,9 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id); tname = btf_name_by_offset(btf_vmlinux, t->name_off); - if (!env->allow_ptr_to_map_access) { + if (!env->allow_ptr_leaks) { verbose(env, - "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", + "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", tname); return -EPERM; } @@ -5726,7 +5799,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, cur->active_lock.ptr = NULL; cur->active_lock.id = 0; - for (i = 0; i < fstate->acquired_refs; i++) { + for (i = fstate->acquired_refs - 1; i >= 0; i--) { int err; /* Complain on error because this reference state cannot @@ -5822,6 +5895,119 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return 0; } +/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK + * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR. + * + * In both cases we deal with the first 8 bytes, but need to mark the next 8 + * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of + * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object. + * + * Mutability of bpf_dynptr is at two levels, one is at the level of struct + * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct + * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can + * mutate the view of the dynptr and also possibly destroy it. In the latter + * case, it cannot mutate the bpf_dynptr itself but it can still mutate the + * memory that dynptr points to. + * + * The verifier will keep track both levels of mutation (bpf_dynptr's in + * reg->type and the memory's in reg->dynptr.type), but there is no support for + * readonly dynptr view yet, hence only the first case is tracked and checked. + * + * This is consistent with how C applies the const modifier to a struct object, + * where the pointer itself inside bpf_dynptr becomes const but not what it + * points to. + * + * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument + * type, and declare it as 'const struct bpf_dynptr *' in their prototype. + */ +int process_dynptr_func(struct bpf_verifier_env *env, int regno, + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + + /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an + * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): + */ + if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { + verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); + return -EFAULT; + } + /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to + * check_func_arg_reg_off's logic. We only need to check offset + * alignment for PTR_TO_STACK. + */ + if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) { + verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off); + return -EINVAL; + } + /* MEM_UNINIT - Points to memory that is an appropriate candidate for + * constructing a mutable bpf_dynptr object. + * + * Currently, this is only possible with PTR_TO_STACK + * pointing to a region of at least 16 bytes which doesn't + * contain an existing bpf_dynptr. + * + * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be + * mutated or destroyed. However, the memory it points to + * may be mutated. 
+ * + * None - Points to a initialized dynptr that can be mutated and + * destroyed, including mutation of the memory it points + * to. + */ + if (arg_type & MEM_UNINIT) {
