author     Jakub Kicinski <kuba@kernel.org>    2022-08-17 20:29:34 -0700
committer  Jakub Kicinski <kuba@kernel.org>    2022-08-17 20:29:36 -0700
commit     3f5f728a7296b5d5b87117d85d3020cc9640f6dd (patch)
tree       0f2ef5085128150fe676767c3e34c66e204dc4cc
parent     fd78d07c7c35de260eb89f1be4a1e7487b8092ad (diff)
parent     df78da27260c915039b348b164bbc53fa372ba70 (diff)
Merge https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Andrii Nakryiko says:
====================
bpf-next 2022-08-17
We've added 45 non-merge commits during the last 14 day(s) which contain
a total of 61 files changed, 986 insertions(+), 372 deletions(-).
The main changes are:
1) New bpf_ktime_get_tai_ns() BPF helper to access CLOCK_TAI, from Kurt
Kanzenbach and Jesper Dangaard Brouer (see the usage sketch after this list).
2) A few clean-ups and improvements for libbpf 1.0, from Andrii Nakryiko.
3) Expose crash_kexec() as kfunc for BPF programs, from Artem Savkov.
4) Add ability to define sleepable-only kfuncs, from Benjamin Tissoires (a
registration sketch follows the diffstat below).
5) Teach libbpf's bpf_prog_load() and bpf_map_create() to gracefully handle
unsupported names on old kernels, from Hangbin Liu.
6) Allow opting out from auto-attaching BPF programs by libbpf's BPF skeleton,
from Hao Luo (see the user-space sketch at the end of this page).
7) Relax libbpf's requirement for shared libs to be marked executable, from
Hengqi Chen.
8) Improve bpf_iter internals handling of error returns, from Hao Luo.
9) A few accommodations in libbpf to support GCC-BPF quirks, from James Hilliard.
10) Fix BPF verifier logic around tracking dynptr ref_obj_id, from Joanne Koong.
11) bpftool improvements to handle full BPF program names better, from Manu
Bretelle.
12) bpftool fixes around libcap use, from Quentin Monnet.
13) BPF map internals clean ups and improvements around memory allocations,
from Yafang Shao.
14) Allow using cgroup_get_from_file() on cgroupv1, allowing the BPF cgroup
iterator to work on cgroupv1, from Yosry Ahmed.
15) BPF verifier internal clean ups, from Dave Marchevsky and Joanne Koong.
16) Various fixes and clean ups for selftests/bpf and vmtest.sh, from Daniel
Xu, Artem Savkov, Joanne Koong, Andrii Nakryiko, Shibin Koikkara Reeny.
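
To make item 1 concrete, here is a minimal sketch (not part of the series
itself) of a BPF program stamping events with CLOCK_TAI. The attach point and
the bpf_printk() output are illustrative choices; it assumes a kernel and
libbpf recent enough to carry the bpf_ktime_get_tai_ns() helper definition.

// SPDX-License-Identifier: GPL-2.0
/* Sketch: timestamp sched_switch events with CLOCK_TAI. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("tracepoint/sched/sched_switch")
int stamp_switch(void *ctx)
{
	/* NMI-safe read of TAI time: wall-clock derived, but without the
	 * leap-second discontinuities of CLOCK_REALTIME. */
	__u64 tai_ns = bpf_ktime_get_tai_ns();

	bpf_printk("sched_switch at TAI %llu ns", tai_ns);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";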
* https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (45 commits)
selftests/bpf: Few fixes for selftests/bpf built in release mode
libbpf: Clean up deprecated and legacy aliases
libbpf: Streamline bpf_attr and perf_event_attr initialization
libbpf: Fix potential NULL dereference when parsing ELF
selftests/bpf: Tests libbpf autoattach APIs
libbpf: Allows disabling auto attach
selftests/bpf: Fix attach point for non-x86 arches in test_progs/lsm
libbpf: Making bpf_prog_load() ignore name if kernel doesn't support
selftests/bpf: Update CI kconfig
selftests/bpf: Add connmark read test
selftests/bpf: Add existing connection bpf_*_ct_lookup() test
bpftool: Clear errno after libcap's checks
bpf: Clear up confusion in bpf_skb_adjust_room()'s documentation
bpftool: Fix a typo in a comment
libbpf: Add names for auxiliary maps
bpf: Use bpf_map_area_alloc consistently on bpf map creation
bpf: Make __GFP_NOWARN consistent in bpf map creation
bpf: Use bpf_map_area_free instread of kvfree
bpf: Remove unneeded memset in queue_stack_map creation
libbpf: preserve errno across pr_warn/pr_info/pr_debug
...
====================
Link: https://lore.kernel.org/r/20220817215656.1180215-1-andrii@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
61 files changed, 986 insertions, 372 deletions
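
Before the diff itself, a short sketch of what items 3 and 4 enable on the
kernel side. The kfunc bpf_sample_sleepable() is an invented placeholder, not
part of this series; the registration pattern mirrors the tracing_btf_ids hunk
added to kernel/bpf/helpers.c below.

/* Sketch: exposing a sleepable kfunc to tracing programs. */
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>

noinline u64 bpf_sample_sleepable(void)
{
	might_sleep();	/* may block, hence KF_SLEEPABLE below */
	return 0;
}

BTF_SET8_START(sample_btf_ids)
BTF_ID_FLAGS(func, bpf_sample_sleepable, KF_SLEEPABLE)
BTF_SET8_END(sample_btf_ids)

static const struct btf_kfunc_id_set sample_kfunc_set = {
	.owner	= THIS_MODULE,
	.set	= &sample_btf_ids,
};

static int __init sample_kfunc_init(void)
{
	/* The verifier will reject calls to bpf_sample_sleepable() from
	 * programs not loaded with BPF_F_SLEEPABLE; KF_DESTRUCTIVE kfuncs
	 * such as crash_kexec() additionally require CAP_SYS_BOOT. */
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &sample_kfunc_set);
}
late_initcall(sample_kfunc_init);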
diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index c0b7dae6dbf5..781731749e55 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -146,6 +146,21 @@ that operate (change some property, perform some operation) on an object that
 was obtained using an acquire kfunc. Such kfuncs need an unchanged pointer to
 ensure the integrity of the operation being performed on the expected object.
 
+2.4.6 KF_SLEEPABLE flag
+-----------------------
+
+The KF_SLEEPABLE flag is used for kfuncs that may sleep. Such kfuncs can only
+be called by sleepable BPF programs (BPF_F_SLEEPABLE).
+
+2.4.7 KF_DESTRUCTIVE flag
+--------------------------
+
+The KF_DESTRUCTIVE flag is used to indicate functions calling which is
+destructive to the system. For example such a call can result in system
+rebooting or panicking. Due to this additional restrictions apply to these
+calls. At the moment they only require CAP_SYS_BOOT capability, but more can be
+added later.
+
 2.5 Registering the kfuncs
 --------------------------
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 20c26aed7896..a627a02cf8ab 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2349,6 +2349,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto;
 extern const struct bpf_func_proto bpf_tail_call_proto;
 extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
 extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto;
+extern const struct bpf_func_proto bpf_ktime_get_tai_ns_proto;
 extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
 extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index cdb376d53238..ad93c2d9cc1c 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -49,6 +49,8 @@
  * for this case.
  */
 #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */
+#define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
+#define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
 
 struct btf;
 struct btf_member;
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 3cd3a6e631aa..b2b9de70d9f4 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -86,10 +86,6 @@ extern spinlock_t nf_conntrack_expect_lock;
 
 /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
 
-#if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
-    (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) || \
-    IS_ENABLED(CONFIG_NF_CT_NETLINK))
-
 static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout)
 {
 	if (timeout > INT_MAX)
@@ -101,6 +97,4 @@ int __nf_ct_change_timeout(struct nf_conn *ct, u64 cta_timeout);
 void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off);
 int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status);
 
-#endif
-
 #endif /* _NF_CONNTRACK_CORE_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7bf9ba1329be..934a2a8beb87 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2573,10 +2573,12 @@ union bpf_attr {
 *		There are two supported modes at this time:
 *
 *		* **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
- *		  (room space is added or removed below the layer 2 header).
+ *		  (room space is added or removed between the layer 2 and
+ *		  layer 3 headers).
 *
 *		* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
- *		  (room space is added or removed below the layer 3 header).
+ *		  (room space is added or removed between the layer 3 and
+ *		  layer 4 headers).
 *
 *		The following flags are supported at this time:
 *
@@ -3008,8 +3010,18 @@ union bpf_attr {
 *		**BPF_F_USER_STACK**
 *			Collect a user space stack instead of a kernel stack.
 *		**BPF_F_USER_BUILD_ID**
- *			Collect buildid+offset instead of ips for user stack,
- *			only valid if **BPF_F_USER_STACK** is also specified.
+ *			Collect (build_id, file_offset) instead of ips for user
+ *			stack, only valid if **BPF_F_USER_STACK** is also
+ *			specified.
+ *
+ *			*file_offset* is an offset relative to the beginning
+ *			of the executable or shared object file backing the vma
+ *			which the *ip* falls in. It is *not* an offset relative
+ *			to that object's base address. Accordingly, it must be
+ *			adjusted by adding (sh_addr - sh_offset), where
+ *			sh_{addr,offset} correspond to the executable section
+ *			containing *file_offset* in the object, for comparisons
+ *			to symbols' st_value to be valid.
 *
 *		**bpf_get_stack**\ () can collect up to
 *		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
@@ -5331,6 +5343,18 @@ union bpf_attr {
 *		**-EACCES** if the SYN cookie is not valid.
 *
 *		**-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
+ *
+ * u64 bpf_ktime_get_tai_ns(void)
+ *	Description
+ *		A nonsettable system-wide clock derived from wall-clock time but
+ *		ignoring leap seconds. This clock does not experience
+ *		discontinuities and backwards jumps caused by NTP inserting leap
+ *		seconds as CLOCK_REALTIME does.
+ *
+ *		See: **clock_gettime**\ (**CLOCK_TAI**)
+ *	Return
+ *		Current *ktime*.
+ *
 */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5541,6 +5565,7 @@ union bpf_attr {
 	FN(tcp_raw_gen_syncookie_ipv6),	\
 	FN(tcp_raw_check_syncookie_ipv4),	\
 	FN(tcp_raw_check_syncookie_ipv6),	\
+	FN(ktime_get_tai_ns),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 24b755eca0b3..97bb57493ed5 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -202,6 +202,11 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 	}
 stop:
 	offs = seq->count;
+	if (IS_ERR(p)) {
+		seq->op->stop(seq, NULL);
+		err = PTR_ERR(p);
+		goto done;
+	}
 	/* bpf program called if !p */
 	seq->op->stop(seq, p);
 	if (!p) {
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 8ce40fd869f6..4ee2e7286c23 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -582,7 +582,7 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
 	synchronize_rcu();
 
 	kvfree(smap->buckets);
-	kfree(smap);
+	bpf_map_area_free(smap);
 }
 
 int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
@@ -610,7 +610,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
 	unsigned int i;
 	u32 nbuckets;
 
-	smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
+	smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
 	if (!smap)
 		return ERR_PTR(-ENOMEM);
 	bpf_map_init_from_attr(&smap->map, attr);
@@ -623,7 +623,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
 	smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
 				 GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
 	if (!smap->buckets) {
-		kfree(smap);
+		bpf_map_area_free(smap);
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7e64447659f3..903719b89238 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5864,26 +5864,25 @@ again:
 }
 
 static int __get_type_size(struct btf *btf, u32 btf_id,
-			   const struct btf_type **bad_type)
+			   const struct btf_type **ret_type)
 {
 	const struct btf_type *t;
 
+	*ret_type = btf_type_by_id(btf, 0);
 	if (!btf_id)
 		/* void */
 		return 0;
 	t = btf_type_by_id(btf, btf_id);
 	while (t && btf_type_is_modifier(t))
 		t = btf_type_by_id(btf, t->type);
-	if (!t) {
-		*bad_type = btf_type_by_id(btf, 0);
+	if (!t)
 		return -EINVAL;
-	}
+	*ret_type = t;
 	if (btf_type_is_ptr(t))
 		/* kernel size of pointer. Not BPF's size of pointer*/
 		return sizeof(void *);
 	if (btf_type_is_int(t) || btf_is_any_enum(t))
 		return t->size;
-	*bad_type = t;
 	return -EINVAL;
 }
 
@@ -6175,6 +6174,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
 	bool rel = false, kptr_get = false, trusted_arg = false;
+	bool sleepable = false;
 	struct bpf_verifier_log *log = &env->log;
 	u32 i, nargs, ref_id, ref_obj_id = 0;
 	bool is_kfunc = btf_is_kernel(btf);
@@ -6212,6 +6212,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		rel = kfunc_flags & KF_RELEASE;
 		kptr_get = kfunc_flags & KF_KPTR_GET;
 		trusted_arg = kfunc_flags & KF_TRUSTED_ARGS;
+		sleepable = kfunc_flags & KF_SLEEPABLE;
 	}
 
 	/* check that BTF function arguments match actual types that the
@@ -6419,6 +6420,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			func_name);
 		return -EINVAL;
 	}
+
+	if (sleepable && !env->prog->aux->sleepable) {
+		bpf_log(log, "kernel function %s is sleepable but the program is not\n",
+			func_name);
+		return -EINVAL;
+	}
+
 	/* returns argument register number > 0 in case of reference release kfunc */
 	return rel ? ref_regno : 0;
 }
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c1e10d088dbb..639437f36928 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2623,6 +2623,7 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
 const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
 const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
 const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak;
 
 const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index f4860ac756cd..b5ba34ddd4b6 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -97,7 +97,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	    attr->map_flags & ~BPF_F_NUMA_NODE)
 		return ERR_PTR(-EINVAL);
 
-	cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT);
+	cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE);
 	if (!cmap)
 		return ERR_PTR(-ENOMEM);
 
@@ -118,7 +118,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	return &cmap->map;
 free_cmap:
-	kfree(cmap);
+	bpf_map_area_free(cmap);
 	return ERR_PTR(err);
 }
 
@@ -623,7 +623,7 @@ static void cpu_map_free(struct bpf_map *map)
 		__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
 	}
 	bpf_map_area_free(cmap->cpu_map);
-	kfree(cmap);
+	bpf_map_area_free(cmap);
 }
 
 /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index a0e02b009487..f9a87dcc5535 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -163,13 +163,13 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	if (!capable(CAP_NET_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT);
+	dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE);
 	if (!dtab)
 		return ERR_PTR(-ENOMEM);
 
 	err = dev_map_init_map(dtab, attr);
 	if (err) {
-		kfree(dtab);
+		bpf_map_area_free(dtab);
 		return ERR_PTR(err);
 	}
 
@@ -240,7 +240,7 @@ static void dev_map_free(struct bpf_map *map)
 		bpf_map_area_free(dtab->netdev_map);
 	}
 
-	kfree(dtab);
+	bpf_map_area_free(dtab);
 }
 
 static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 6c530a5e560a..b301a63afa2f 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -491,7 +491,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	struct bpf_htab *htab;
 	int err, i;
 
-	htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
+	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
 	if (!htab)
 		return ERR_PTR(-ENOMEM);
 
@@ -575,7 +575,7 @@ free_map_locked:
 	bpf_map_area_free(htab->buckets);
 free_htab:
 	lockdep_unregister_key(&htab->lockdep_key);
-	kfree(htab);
+	bpf_map_area_free(htab);
 	return ERR_PTR(err);
 }
 
@@ -1492,7 +1492,7 @@ static void htab_map_free(struct bpf_map *map)
 	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
 		free_percpu(htab->map_locked[i]);
 	lockdep_unregister_key(&htab->lockdep_key);
-	kfree(htab);
+	bpf_map_area_free(htab);
 }
 
 static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1f961f9982d2..3c1b9bbcf971 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -198,6 +198,18 @@ const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
 	.ret_type	= RET_INTEGER,
 };
 
+BPF_CALL_0(bpf_ktime_get_tai_ns)
+{
+	/* NMI safe access to clock tai */
+	return ktime_get_tai_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = {
+	.func		= bpf_ktime_get_tai_ns,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+
 BPF_CALL_0(bpf_get_current_pid_tgid)
 {
 	struct task_struct *task = current;
@@ -1617,6 +1629,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_ktime_get_ns_proto;
 	case BPF_FUNC_ktime_get_boot_ns:
 		return &bpf_ktime_get_boot_ns_proto;
+	case BPF_FUNC_ktime_get_tai_ns:
+		return &bpf_ktime_get_tai_ns_proto;
 	case BPF_FUNC_ringbuf_output:
 		return &bpf_ringbuf_output_proto;
 	case BPF_FUNC_ringbuf_reserve:
@@ -1711,3 +1725,21 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return NULL;
 	}
 }
+
+BTF_SET8_START(tracing_btf_ids)
+#ifdef CONFIG_KEXEC_CORE
+BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
+#endif
+BTF_SET8_END(tracing_btf_ids)
+
+static const struct btf_kfunc_id_set tracing_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &tracing_btf_ids,
+};
+
+static int __init kfunc_init(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &tracing_kfunc_set);
+}
+
+late_initcall(kfunc_init);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 49ef0ce040c7..098cf336fae6 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -313,8 +313,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 		/* max_entries is not used and enforced to be 0 */
 		return ERR_PTR(-EINVAL);
 
-	map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
-			   __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node);
+	map = bpf_map_area_alloc(sizeof(struct bpf_cgroup_storage_map), numa_node);
 	if (!map)
 		return ERR_PTR(-ENOMEM);
 
@@ -346,7 +345,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map)
 	WARN_ON(!RB_EMPTY_ROOT(&
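
Finally, a user-space sketch of item 6 (opting out of skeleton auto-attach).
"myprog" and "my_handler" are placeholder names for a skeleton generated with
bpftool gen skeleton; bpf_program__set_autoattach() is the API added by the
"libbpf: Allows disabling auto attach" commit above.

/* Sketch: load a skeleton but keep one program from auto-attaching. */
#include "myprog.skel.h"

int main(void)
{
	struct myprog *skel = myprog__open();
	int err = 1;

	if (!skel)
		return 1;

	/* Exclude this one program from myprog__attach(); it can still be
	 * attached manually later with bpf_program__attach(). */
	bpf_program__set_autoattach(skel->progs.my_handler, false);

	if (myprog__load(skel))
		goto out;
	if (myprog__attach(skel))	/* attaches everything else */
		goto out;
	err = 0;
	/* ... */
out:
	myprog__destroy(skel);
	return err;
}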