diff options
Diffstat (limited to 'kernel/bpf')
| -rw-r--r-- | kernel/bpf/Makefile | 5 | ||||
| -rw-r--r-- | kernel/bpf/bpf_iter.c | 5 | ||||
| -rw-r--r-- | kernel/bpf/bpf_local_storage.c | 4 | ||||
| -rw-r--r-- | kernel/bpf/bpf_lsm.c | 23 | ||||
| -rw-r--r-- | kernel/bpf/bpf_task_storage.c | 8 | ||||
| -rw-r--r-- | kernel/bpf/cgroup.c | 157 | ||||
| -rw-r--r-- | kernel/bpf/cgroup_iter.c | 282 | ||||
| -rw-r--r-- | kernel/bpf/hashtab.c | 168 | ||||
| -rw-r--r-- | kernel/bpf/helpers.c | 48 | ||||
| -rw-r--r-- | kernel/bpf/memalloc.c | 634 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 15 | ||||
| -rw-r--r-- | kernel/bpf/trampoline.c | 8 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 94 |
13 files changed, 1258 insertions, 193 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 057ba8e01e70..341c94f208f4 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o @@ -24,6 +24,9 @@ endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif +ifeq ($(CONFIG_CGROUPS),y) +obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o +endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 97bb57493ed5..5dc307bdeaeb 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -694,19 +694,24 @@ struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) { + struct bpf_run_ctx run_ctx, *old_run_ctx; int ret; if (prog->aux->sleepable) { rcu_read_lock_trace(); migrate_disable(); might_fault(); + old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); + bpf_reset_run_ctx(old_run_ctx); migrate_enable(); rcu_read_unlock_trace(); } else { rcu_read_lock(); migrate_disable(); + old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); + bpf_reset_run_ctx(old_run_ctx); migrate_enable(); rcu_read_unlock(); } diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 4ee2e7286c23..802fc15b0d73 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -555,11 +555,11 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem, map_node))) { if (busy_counter) { migrate_disable(); - __this_cpu_inc(*busy_counter); + this_cpu_inc(*busy_counter); } bpf_selem_unlink(selem, false); if (busy_counter) { - __this_cpu_dec(*busy_counter); + this_cpu_dec(*busy_counter); migrate_enable(); } cond_resched_rcu(); diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index fa71d58b7ded..4fd845bc5a12 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -41,17 +41,21 @@ BTF_SET_END(bpf_lsm_hooks) */ BTF_SET_START(bpf_lsm_current_hooks) /* operate on freshly allocated sk without any cgroup association */ +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_sk_alloc_security) BTF_ID(func, bpf_lsm_sk_free_security) +#endif BTF_SET_END(bpf_lsm_current_hooks) /* List of LSM hooks that trigger while the socket is properly locked. */ BTF_SET_START(bpf_lsm_locked_sockopt_hooks) +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_socket_sock_rcv_skb) BTF_ID(func, bpf_lsm_sock_graft) BTF_ID(func, bpf_lsm_inet_csk_clone) BTF_ID(func, bpf_lsm_inet_conn_established) +#endif BTF_SET_END(bpf_lsm_locked_sockopt_hooks) /* List of LSM hooks that trigger while the socket is _not_ locked, @@ -59,8 +63,10 @@ BTF_SET_END(bpf_lsm_locked_sockopt_hooks) * in the early init phase. */ BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks) +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_socket_post_create) BTF_ID(func, bpf_lsm_socket_socketpair) +#endif BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks) #ifdef CONFIG_CGROUP_BPF @@ -189,6 +195,14 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto = { static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + if (prog->expected_attach_type == BPF_LSM_CGROUP) { + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + } + switch (func_id) { case BPF_FUNC_inode_storage_get: return &bpf_inode_storage_get_proto; @@ -212,15 +226,6 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; case BPF_FUNC_get_attach_cookie: return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; - case BPF_FUNC_get_local_storage: - return prog->expected_attach_type == BPF_LSM_CGROUP ? - &bpf_get_local_storage_proto : NULL; - case BPF_FUNC_set_retval: - return prog->expected_attach_type == BPF_LSM_CGROUP ? - &bpf_set_retval_proto : NULL; - case BPF_FUNC_get_retval: - return prog->expected_attach_type == BPF_LSM_CGROUP ? - &bpf_get_retval_proto : NULL; #ifdef CONFIG_NET case BPF_FUNC_setsockopt: if (prog->expected_attach_type != BPF_LSM_CGROUP) diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index e9014dc62682..6f290623347e 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -26,20 +26,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy); static void bpf_task_storage_lock(void) { migrate_disable(); - __this_cpu_inc(bpf_task_storage_busy); + this_cpu_inc(bpf_task_storage_busy); } static void bpf_task_storage_unlock(void) { - __this_cpu_dec(bpf_task_storage_busy); + this_cpu_dec(bpf_task_storage_busy); migrate_enable(); } static bool bpf_task_storage_trylock(void) { migrate_disable(); - if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) { - __this_cpu_dec(bpf_task_storage_busy); + if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) { + this_cpu_dec(bpf_task_storage_busy); migrate_enable(); return false; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4a400cd63731..00c7f864900e 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1529,6 +1529,37 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, return ret; } +BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) +{ + /* flags argument is not used now, + * but provides an ability to extend the API. + * verifier checks that its value is correct. + */ + enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + struct bpf_cgroup_storage *storage; + struct bpf_cg_run_ctx *ctx; + void *ptr; + + /* get current cgroup storage from BPF run context */ + ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + storage = ctx->prog_item->cgroup_storage[stype]; + + if (stype == BPF_CGROUP_STORAGE_SHARED) + ptr = &READ_ONCE(storage->buf)->data[0]; + else + ptr = this_cpu_ptr(storage->percpu_buf); + + return (unsigned long)ptr; +} + +const struct bpf_func_proto bpf_get_local_storage_proto = { + .func = bpf_get_local_storage, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_0(bpf_get_retval) { struct bpf_cg_run_ctx *ctx = @@ -1560,32 +1591,26 @@ const struct bpf_func_proto bpf_set_retval_proto = { }; static const struct bpf_func_proto * -cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_retval: - return &bpf_get_retval_proto; - case BPF_FUNC_set_retval: - return &bpf_set_retval_proto; default: return bpf_base_func_proto(func_id); } } -static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - return cgroup_base_func_proto(func_id, prog); -} - static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -2098,11 +2123,17 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - case BPF_FUNC_strtol: - return &bpf_strtol_proto; - case BPF_FUNC_strtoul: - return &bpf_strtoul_proto; case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: @@ -2113,8 +2144,10 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sysctl_set_new_value_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return cgroup_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -2235,6 +2268,16 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = { static const struct bpf_func_proto * cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_get_netns_cookie: @@ -2256,8 +2299,10 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return cgroup_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -2422,3 +2467,69 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = { const struct bpf_prog_ops cg_sockopt_prog_ops = { }; + +/* Common helpers for cgroup hooks. */ +const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; + case BPF_FUNC_get_retval: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + return NULL; + default: + return &bpf_get_retval_proto; + } + case BPF_FUNC_set_retval: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + return NULL; + default: + return &bpf_set_retval_proto; + } + default: + return NULL; + } +} + +/* Common helpers for cgroup hooks with valid process context. */ +const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif + default: + return NULL; + } +} diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c new file mode 100644 index 000000000000..0d200a993489 --- /dev/null +++ b/kernel/bpf/cgroup_iter.c @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Google */ +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/cgroup.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> + +#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ + +/* cgroup_iter provides four modes of traversal to the cgroup hierarchy. + * + * 1. Walk the descendants of a cgroup in pre-order. + * 2. Walk the descendants of a cgroup in post-order. + * 3. Walk the ancestors of a cgroup. + * 4. Show the given cgroup only. + * + * For walking descendants, cgroup_iter can walk in either pre-order or + * post-order. For walking ancestors, the iter walks up from a cgroup to + * the root. + * + * The iter program can terminate the walk early by returning 1. Walk + * continues if prog returns 0. + * + * The prog can check (seq->num == 0) to determine whether this is + * the first element. The prog may also be passed a NULL cgroup, + * which means the walk has completed and the prog has a chance to + * do post-processing, such as outputting an epilogue. + * + * Note: the iter_prog is called with cgroup_mutex held. + * + * Currently only one session is supported, which means, depending on the + * volume of data bpf program intends to send to user space, the number + * of cgroups that can be walked is limited. For example, given the current + * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each + * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can + * be walked is 512. This is a limitation of cgroup_iter. If the output data + * is larger than the kernel buffer size, after all data in the kernel buffer + * is consumed by user space, the subsequent read() syscall will signal + * EOPNOTSUPP. In order to work around, the user may have to update their + * program to reduce the volume of data sent to output. For example, skip + * some uninteresting cgroups. + */ + +struct bpf_iter__cgroup { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct cgroup *, cgroup); +}; + +struct cgroup_iter_priv { + struct cgroup_subsys_state *start_css; + bool visited_all; + bool terminate; + int order; +}; + +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct cgroup_iter_priv *p = seq->private; + + mutex_lock(&cgroup_mutex); + + /* cgroup_iter doesn't support read across multiple sessions. */ + if (*pos > 0) { + if (p->visited_all) + return NULL; + + /* Haven't visited all, but because cgroup_mutex has dropped, + * return -EOPNOTSUPP to indicate incomplete iteration. + */ + return ERR_PTR(-EOPNOTSUPP); + } + + ++*pos; + p->terminate = false; + p->visited_all = false; + if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) + return css_next_descendant_pre(NULL, p->start_css); + else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) + return css_next_descendant_post(NULL, p->start_css); + else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */ + return p->start_css; +} + +static int __cgroup_iter_seq_show(struct seq_file *seq, + struct cgroup_subsys_state *css, int in_stop); + +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v) +{ + struct cgroup_iter_priv *p = seq->private; + + mutex_unlock(&cgroup_mutex); + + /* pass NULL to the prog for post-processing */ + if (!v) { + __cgroup_iter_seq_show(seq, NULL, true); + p->visited_all = true; + } +} + +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v; + struct cgroup_iter_priv *p = seq->private; + + ++*pos; + if (p->terminate) + return NULL; + + if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) + return css_next_descendant_pre(curr, p->start_css); + else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) + return css_next_descendant_post(curr, p->start_css); + else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) + return curr->parent; + else /* BPF_CGROUP_ITER_SELF_ONLY */ + return NULL; +} + +static int __cgroup_iter_seq_show(struct seq_file *seq, + struct cgroup_subsys_state *css, int in_stop) +{ + struct cgroup_iter_priv *p = seq->private; + struct bpf_iter__cgroup ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + /* cgroup is dead, skip this element */ + if (css && cgroup_is_dead(css->cgroup)) + return 0; + + ctx.meta = &meta; + ctx.cgroup = css ? css->cgroup : NULL; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + /* if prog returns > 0, terminate after this element. */ + if (ret != 0) + p->terminate = true; + + return 0; +} + +static int cgroup_iter_seq_show(struct seq_file *seq, void *v) +{ + return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v, + false); +} + +static const struct seq_operations cgroup_iter_seq_ops = { + .start = cgroup_iter_seq_start, + .next = cgroup_iter_seq_next, + .stop = cgroup_iter_seq_stop, + .show = cgroup_iter_seq_show, +}; + +BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup) + +static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux) +{ + struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv; + struct cgroup *cgrp = aux->cgroup.start; + + p->start_css = &cgrp->self; + p->terminate = false; + p->visited_all = false; + p->order = aux->cgroup.order; + return 0; +} + +static const struct bpf_iter_seq_info cgroup_iter_seq_info = { + .seq_ops = &cgroup_iter_seq_ops, + .init_seq_private = cgroup_iter_seq_init, + .seq_priv_size = sizeof(struct cgroup_iter_priv), +}; + +static int bpf_iter_attach_cgroup(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + int fd = linfo->cgroup.cgroup_fd; + u64 id = linfo->cgroup.cgroup_id; + int order = linfo->cgroup.order; + struct cgroup *cgrp; + + if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && + order != BPF_CGROUP_ITER_DESCENDANTS_POST && + order != BPF_CGROUP_ITER_ANCESTORS_UP && + order != BPF_CGROUP_ITER_SELF_ONLY) + return -EINVAL; + + if (fd && id) + return -EINVAL; + + if (fd) + cgrp = cgroup_get_from_fd(fd); + else if (id) + cgrp = cgroup_get_from_id(id); + else /* walk the entire hierarchy by default. */ + cgrp = cgroup_get_from_path("/"); + + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + aux->cgroup.start = cgrp; + aux->cgroup.order = order; + return 0; +} + +static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux) +{ + cgroup_put(aux->cgroup.start); +} + +static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + char *buf; + + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + seq_puts(seq, "cgroup_path:\t<unknown>\n"); + goto show_order; + } + + /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path + * will print nothing. + * + * Path is in the calling process's cgroup namespace. + */ + cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + seq_printf(seq, "cgroup_path:\t%s\n", buf); + kfree(buf); + +show_order: + if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE) + seq_puts(seq, "order: descendants_pre\n"); + else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST) + seq_puts(seq, "order: descendants_post\n"); + else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) + seq_puts(seq, "order: ancestors_up\n"); + else /* BPF_CGROUP_ITER_SELF_ONLY */ + seq_puts(seq, "order: self_only\n"); +} + +static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info) +{ + info->iter.cgroup.order = aux->cgroup.order; + info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start); + return 0; +} + +DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta, + struct cgroup *cgroup) + +static struct bpf_iter_reg bpf_cgroup_reg_info = { + .target = "cgroup", + .feature = BPF_ITER_RESCHED, + .attach_target = bpf_iter_attach_cgroup, + .detach_target = bpf_iter_detach_cgroup, + .show_fdinfo = bpf_iter_cgroup_show_fdinfo, + .fill_link_info = bpf_iter_cgroup_fill_link_info, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__cgroup, cgroup), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &cgroup_iter_seq_info, +}; + +static int __init bpf_cgroup_iter_init(void) +{ + bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0]; + return bpf_iter_reg_target(&bpf_cgroup_reg_info); +} + +late_initcall(bpf_cgroup_iter_init); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b301a63afa2f..0fe3f136cbbe 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -14,6 +14,7 @@ #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" +#include <linux/bpf_mem_alloc.h> #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ @@ -92,6 +93,8 @@ struct bucket { struct bpf_htab { struct bpf_map map; + struct bpf_mem_alloc ma; + struct bpf_mem_alloc pcpu_ma; struct bucket *buckets; void *elems; union { @@ -99,7 +102,12 @@ struct bpf_htab { struct bpf_lru lru; }; struct htab_elem *__percpu *extra_elems; - atomic_t count; /* number of elements in this hashtable */ + /* number of elements in non-preallocated hashtable are kept + * in either pcount or count + */ + struct percpu_counter pcount; + atomic_t count; + bool use_percpu_counter; u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; @@ -114,14 +122,14 @@ struct htab_elem { struct { void *padding; union { - struct bpf_htab *htab; struct pcpu_freelist_node fnode; struct htab_elem *batch_flink; }; }; }; union { - struct rcu_head rcu; + /* pointer to per-cpu pointer */ + void *ptr_to_pptr; struct bpf_lru_node lru_node; }; u32 hash; @@ -162,17 +170,25 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab, unsigned long *pflags) { unsigned long flags; + bool use_raw_lock; hash = hash & HASHTAB_MAP_LOCK_MASK; - migrate_disable(); + use_raw_lock = htab_use_raw_lock(htab); + if (use_raw_lock) + preempt_disable(); + else + migrate_disable(); if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { __this_cpu_dec(*(htab->map_locked[hash])); - migrate_enable(); + if (use_raw_lock) + preempt_enable(); + else + migrate_enable(); return -EBUSY; } - if (htab_use_raw_lock(htab)) + if (use_raw_lock) raw_spin_lock_irqsave(&b->raw_lock, flags); else spin_lock_irqsave(&b->lock, flags); @@ -185,13 +201,18 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab, struct bucket *b, u32 hash, unsigned long flags) { + bool use_raw_lock = htab_use_raw_lock(htab); + hash = hash & HASHTAB_MAP_LOCK_MASK; - if (htab_use_raw_lock(htab)) + if (use_raw_lock) raw_spin_unlock_irqrestore(&b->raw_lock, flags); else spin_unlock_irqrestore(&b->lock, flags); __this_cpu_dec(*(htab->map_locked[hash])); - migrate_enable(); + if (use_raw_lock) + preempt_enable(); + else + migrate_enable(); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); @@ -428,8 +449,6 @@ static int htab_map_alloc_check(union bpf_attr *attr) bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); int numa_node = bpf_map_attr_numa_node(attr); - BUILD_BUG_ON(offsetof(struct htab_elem, htab) != - offsetof(struct htab_elem, hash_node.pprev)); BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); @@ -550,6 +569,29 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab_init_buckets(htab); +/* compute_batch_value() computes batch value as num_online_cpus() * 2 + * and __percpu_counter_compare() needs + * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus() + * for percpu_counter to be faster than atomic_t. In practice the average bpf + * hash map size is 10k, which means that a system with 64 cpus will fill + * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore + * define our own batch count as 32 then 10k hash map can be filled up to 80%: + * 10k - 8k > 32 _batch_ * 64 _cpus_ + * and __percpu_counter_compare() will still be fast. At that point hash map + * collisions will dominate its performance anyway. Assume that hash map filled + * to 50+% isn't going to be O(1) and use the following formula to choose + * between percpu_counter and atomic_t. + */ +#define PERCPU_COUNTER_BATCH 32 + if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH) + htab->use_percpu_counter = true; + + if (htab->use_percpu_counter) { + err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL); + if (err) + goto free_map_locked; + } + if (prealloc) { err = prealloc_init(htab); if (err) @@ -563,6 +605,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (err) goto free_prealloc; } + } else { + err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false); + if (err) + goto free_map_locked; + if (percpu) { + err = bpf_mem_alloc_init(&htab->pcpu_ma, + round_up(htab->map.value_size, 8), true); + if (err) + goto free_map_locked; + } } return &htab->map; @@ -573,6 +625,8 @@ free_map_locked: for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); + bpf_mem_alloc_destroy(&htab->pcpu_ma); + bpf_mem_alloc_destroy(&htab->ma); free_htab: lockdep_unregister_key(&htab->lockdep_key); bpf_map_area_free(htab); @@ -847,17 +901,9 @@ find_first_elem: static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) - free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); + bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); check_and_free_fields(htab, l); - kfree(l); -} - -static void htab_elem_free_rcu(struct rcu_head *head) -{ - struct htab_elem *l = container_of(head, struct htab_elem, rcu); - struct bpf_htab *htab = l->htab; - - htab_elem_free(htab, l); + bpf_mem_cache_free(&htab->ma, l); } static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) @@ -871,6 +917,31 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) } } +static bool is_map_full(struct bpf_htab *htab) +{ + if (htab->use_percpu_counter) + return __percpu_counter_compare(&htab->pcount, htab->map.max_entries, + PERCPU_COUNTER_BATCH) >= 0; + return atomic_read(&htab->count) >= htab->map.max_entries; +} + +static void inc_elem_count(struct bpf_htab *htab) +{ + if (htab->use_percpu_counter) + percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH); + else + atomic_inc(&htab->count); +} + +static void dec_elem_count(struct bpf_htab *htab) +{ + if (htab->use_percpu_counter) + percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH); + else + atomic_dec(&htab->count); +} + + static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { htab_put_fd_value(htab, l); @@ -879,9 +950,8 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) check_and_free_fields(htab, l); __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { - atomic_dec(&htab->count); - l->htab = htab; - call_rcu(&l->rcu, htab_elem_free_rcu); + dec_elem_count(htab); + htab_elem_free(htab, l); } } @@ -906,13 +976,12 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, void *value, bool onallcpus) { - /* When using prealloc and not setting the initial value on all cpus, - * zero-fill element values for other cpus (just as what happens when - * not using prealloc). Otherwise, bpf program has no way to ensure + /* When not setting the initial value on all cpus, zero-fill element + * values for other cpus. Otherwise, bpf program has no way to ensure * known initial values for cpus other than current one * (onallcpus=false always when coming from bpf prog). */ - if (htab_is_prealloc(htab) && !onallcpus) { + if (!onallcpus) { u32 size = round_up(htab->map.value_size, 8); int current_cpu = raw_smp_processor_id(); int cpu; @@ -963,19 +1032,16 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = container_of(l, struct htab_elem, fnode); } } else { - if (atomic_inc_return(&htab->count) > htab->map.max_entries) - if (!old_elem) { + if (is_map_full(htab)) + if (!old_elem) /* when map is full and update() is replacing * old element, it's ok to allocate, since * old element will be freed immediately. * Otherwise return an error */ - l_new = ERR_PTR(-E2BIG); - goto dec_count; - } - l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, - GFP_NOWAIT | __GFP_NOWARN, - htab->map.numa_node); + return ERR_PTR(-E2BIG); + inc_elem_count(htab); + l_new = bpf_mem_cache_alloc(&htab->ma); if (!l_new) { l_new = ERR_PTR(-ENOMEM); goto dec_count; @@ -986,18 +1052,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, memcpy(l_new->key, key, key_size); if (percpu) { - size = round_up(size, 8); if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { /* alloc_percpu zero-fills */ - pptr = bpf_map_alloc_percpu(&htab->map, size, 8, - GFP_NOWAIT | __GFP_NOWARN); + pptr = bpf_mem_cache_alloc(&htab->pcpu_ma); if (!pptr) { - kfree(l_new); + bpf_mem_cache_free(&htab->ma, l_new); l_new = ERR_PTR(-ENOMEM); goto dec_count; } + l_new->ptr_to_pptr = pptr; + pptr = *(void **)pptr; } pcpu_init_value(htab, pptr, value, onallcpus); @@ -1016,7 +1082,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new->hash = hash; return l_new; dec_count: - atomic_dec(&htab->count); + dec_elem_count(htab); return l_new; } @@ -1416,6 +1482,10 @@ static void delete_all_elements(struct bpf_htab *htab) { int i; |
