// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
*/
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include <linux/filter.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/kprobes.h>
#include <linux/syscalls.h>
#include <linux/error-injection.h>
#include <asm/tlb.h>
#include "trace_probe.h"
#include "trace.h"
#define bpf_event_rcu_dereference(p) \
rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
#ifdef CONFIG_MODULES
struct bpf_trace_module {
struct module *module;
struct list_head list;
};
static LIST_HEAD(bpf_trace_modules);
static DEFINE_MUTEX(bpf_module_mutex);
static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
{
struct bpf_raw_event_map *btp, *ret = NULL;
struct bpf_trace_module *btm;
unsigned int i;
mutex_lock(&bpf_module_mutex);
list_for_each_entry(btm, &bpf_trace_modules, list) {
for (i = 0; i < btm->module->num_bpf_raw_events; ++i) {
btp = &btm->module->bpf_raw_events[i];
if (!strcmp(btp->tp->name, name)) {
if (try_module_get(btm->module))
ret = btp;
goto out;
}
}
}
out:
mutex_unlock(&bpf_module_mutex);
return ret;
}
#else
static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
{
return NULL;
}
#endif /* CONFIG_MODULES */
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
/**
* trace_call_bpf - invoke BPF program
* @call: tracepoint event
* @ctx: opaque context pointer
*
* kprobe handlers execute BPF programs via this helper.
* Can be used from static tracepoints in the future.
*
* Return: BPF programs always return an integer which is interpreted by
* kprobe handler as:
* 0 - return from kprobe (event is filtered out)
* 1 - store kprobe event into ring buffer
* Other values are reserved and currently alias to 1
*/
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
unsigned int ret;
if (in_nmi()) /* not supported yet */
return 1;
preempt_disable();
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
/*
* since some bpf program is already running on this cpu,
* don't call into another bpf program (same or different)
* and don't send kprobe event into ring-buffer,
* so return zero here
*/
ret = 0;
goto out;
}
/*
* Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
* to all call sites, we did a bpf_prog_array_valid() there to check
* whether call->prog_array is empty or not, which is
* a heurisitc to speed up execution.
*
* If bpf_prog_array_valid() fetched prog_array was
* non-NULL, we go into trace_call_bpf() and do the actual
* proper rcu_dereference() under RCU lock.
* If it turns out that prog_array is NULL then, we bail out.
* For the opposite, if the bpf_prog_array_valid() fetched pointer
* was NULL, you'll skip the prog_array with the risk of missing
* out of events when it was updated in between this and the
* rcu_dereference() which is accepted risk.
*/
ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
out:
__this_cpu_dec(bpf_prog_active);
preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(trace_call_bpf);
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
{
regs_set_return_value(regs, rc);
override_function_with_return(regs);
return 0;
}
static const struct bpf_func_proto bpf_override_return_proto = {
.func = bpf_override_return,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
};
#endif
BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
{
int ret;
ret = probe_kernel_read(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
return ret;
}
static const struct bpf_func_proto bpf_probe_read_proto = {
.func = bpf_probe_read,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
u32, size)
{
/*
* Ensure we're in user context which is safe for the helper to
* run. This helper has no business in a kthread.
*
* access_ok() should prevent writing to non-user memory, but in
* some situations (nommu, temporary switch, etc) access_ok() does
* not provide enough validation, hence the check on KERNEL_DS.
*
* nmi_uaccess_okay() ensures the probe is not run in an interim
* state, when the task or mm are switched. This is specifically
* required to prevent the use of temporary mm.
*/
if (unlikely(in_interrupt() ||
current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
if (unlikely(uaccess_kernel()))
return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
if (!access_ok(unsafe_ptr, size))
return -EPERM;
return probe_kernel_write(unsafe_ptr, src, size);
}
static const struct bpf_func_proto bpf_probe_write_user_proto = {
.func = bpf_probe_write_user,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
.arg2_type = ARG_PTR_TO_MEM,
.arg3_type = ARG_CONST_SIZE,
};
static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
{
pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
current->comm, task_pid_nr(current));
return &bpf_probe_write_user_proto;
}
/*
* Only limited trace_printk() conversion specifiers allowed:
* %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s
*/
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
u64, arg2, u64, arg3)
{
bool str_seen = false;
int mod[3] = {};
int fmt_cnt = 0;
u64 unsafe_addr;
char buf[64];
int i;
/*
* bpf_check()->check_func_arg()->check_stack_boundary()
* guarantees that fmt points to bpf program stack,
* fmt_size bytes of it were initialized and fmt_size > 0
*/
if (fmt[--fmt_size] != 0)
return -EINVAL;
/* check format string for allowed specifiers */
for (i = 0; i < fmt_size; i++) {
if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
return -EINVAL;
if (fmt[i] != '%')
continue;
if (fmt_cnt >= 3)
return -EINVAL;
/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
i++;
if (fmt[i] == 'l') {
mod[fmt_cnt]++;
i++;
} else if (fmt[i] == 'p' || fmt[i] == 's') {
mod[fmt_cnt]++;
/* disallow any further format extensions */
if (fmt[i + 1] != 0 &&
!isspace(fmt[i + 1]) &&
!ispunct(fmt[i + 1]))
return -EINVAL;
fmt_cnt++;
if (fmt[i] == 's') {
if (str_seen)
/* allow only one '%s' per fmt string */
return -EINVAL;
str_seen = true;
switch (fmt_cnt) {
case 1:
unsafe_addr = arg1;
arg1 = (long) buf;
break;
case 2:
unsafe_addr = arg2;
arg2 = (long) buf;
break;
case 3:
unsafe_addr = arg3;
arg3 = (long) buf;
break;
}
buf[0] = 0;
strncpy_from_unsafe(buf,
(void *) (long) unsafe_addr,
sizeof(buf));
}
continue;
}
if (fmt[i] == 'l') {
mod[fmt_cnt]++;
i++;
}
if (fmt[i] != 'i' && fmt[i] != 'd' &&
fmt[i] != 'u' && fmt[i] != 'x')
return -EINVAL;
fmt_cnt++;
}
/* Horrid workaround for getting va_list handling working with different
* argument type combinations generically for 32 and 64 bit archs.
*/
#define __BPF_TP_EMIT() __BPF_ARG3_TP()
#define __BPF_TP(...) \
__trace_printk(0 /* Fake ip */, \
fmt, ##__VA_ARGS__)
#define __BPF_ARG1_TP(...) \
((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
? __BPF_TP(arg1, ##__VA_ARGS__) \
: ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \
? __BPF_TP((long)arg1, ##__VA_ARGS__) \
: __BPF_TP((u32)arg
|