diff options
| author | Namhyung Kim <namhyung@kernel.org> | 2022-05-18 15:47:21 -0700 |
|---|---|---|
| committer | Arnaldo Carvalho de Melo <acme@redhat.com> | 2022-05-26 12:36:57 -0300 |
| commit | edc41a1099c2d08ccfd4ed7d59688501e3749015 (patch) | |
| tree | 30b5f519a0987aa8ad07440587bf933b643278a2 /tools/perf/util/bpf_skel | |
| parent | 303ead45c4459df9e38d4f2bd38ef6238c5898de (diff) | |
| download | linux-edc41a1099c2d08ccfd4ed7d59688501e3749015.tar.gz linux-edc41a1099c2d08ccfd4ed7d59688501e3749015.tar.bz2 linux-edc41a1099c2d08ccfd4ed7d59688501e3749015.zip | |
perf record: Enable off-cpu analysis with BPF
Add --off-cpu option to enable the off-cpu profiling with BPF. It'd
use a bpf_output event and rename it to "offcpu-time". Samples will
be synthesized at the end of the record session using data from a BPF
map which contains the aggregated off-cpu time at context switches.
So it needs root privilege to get the off-cpu profiling.
Each sample will have a separate user stacktrace so it will skip
kernel threads. The sample ip will be set from the stacktrace and
other sample data will be updated accordingly. Currently it only
handles some basic sample types.
The sample timestamp is set to a dummy value just not to bother with
other events during the sorting. So it has a very big initial value
and increase it on processing each samples.
Good thing is that it can be used together with regular profiling like
cpu cycles. If you don't want to that, you can use a dummy event to
enable off-cpu profiling only.
Example output:
$ sudo perf record --off-cpu perf bench sched messaging -l 1000
$ sudo perf report --stdio --call-graph=no
# Total Lost Samples: 0
#
# Samples: 41K of event 'cycles'
# Event count (approx.): 42137343851
...
# Samples: 1K of event 'offcpu-time'
# Event count (approx.): 587990831640
#
# Children Self Command Shared Object Symbol
# ........ ........ ............... .................. .........................
#
81.66% 0.00% sched-messaging libc-2.33.so [.] __libc_start_main
81.66% 0.00% sched-messaging perf [.] cmd_bench
81.66% 0.00% sched-messaging perf [.] main
81.66% 0.00% sched-messaging perf [.] run_builtin
81.43% 0.00% sched-messaging perf [.] bench_sched_messaging
40.86% 40.86% sched-messaging libpthread-2.33.so [.] __read
37.66% 37.66% sched-messaging libpthread-2.33.so [.] __write
2.91% 2.91% sched-messaging libc-2.33.so [.] __poll
...
As you can see it spent most of off-cpu time in read and write in
bench_sched_messaging(). The --call-graph=no was added just to make
the output concise here.
It uses perf hooks facility to control BPF program during the record
session rather than adding new BPF/off-cpu specific calls.
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Acked-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Hao Luo <haoluo@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: bpf@vger.kernel.org
Link: https://lore.kernel.org/r/20220518224725.742882-3-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Diffstat (limited to 'tools/perf/util/bpf_skel')
| -rw-r--r-- | tools/perf/util/bpf_skel/off_cpu.bpf.c | 139 |
1 files changed, 139 insertions, 0 deletions
diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c new file mode 100644 index 000000000000..5173ed882fdf --- /dev/null +++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2022 Google +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +/* task->flags for off-cpu analysis */ +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ + +/* task->state for off-cpu analysis */ +#define TASK_INTERRUPTIBLE 0x0001 +#define TASK_UNINTERRUPTIBLE 0x0002 + +#define MAX_STACKS 32 +#define MAX_ENTRIES 102400 + +struct tstamp_data { + __u32 stack_id; + __u32 state; + __u64 timestamp; +}; + +struct offcpu_key { + __u32 pid; + __u32 tgid; + __u32 stack_id; + __u32 state; +}; + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(__u32)); + __uint(value_size, MAX_STACKS * sizeof(__u64)); + __uint(max_entries, MAX_ENTRIES); +} stacks SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct tstamp_data); +} tstamp SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct offcpu_key)); + __uint(value_size, sizeof(__u64)); + __uint(max_entries, MAX_ENTRIES); +} off_cpu SEC(".maps"); + +/* old kernel task_struct definition */ +struct task_struct___old { + long state; +} __attribute__((preserve_access_index)); + +int enabled = 0; + +/* + * Old kernel used to call it task_struct->state and now it's '__state'. + * Use BPF CO-RE "ignored suffix rule" to deal with it like below: + * + * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes + */ +static inline int get_task_state(struct task_struct *t) +{ + if (bpf_core_field_exists(t->__state)) + return BPF_CORE_READ(t, __state); + + /* recast pointer to capture task_struct___old type for compiler */ + struct task_struct___old *t_old = (void *)t; + + /* now use old "state" name of the field */ + return BPF_CORE_READ(t_old, state); +} + +SEC("tp_btf/sched_switch") +int on_switch(u64 *ctx) +{ + __u64 ts; + int state; + __u32 stack_id; + struct task_struct *prev, *next; + struct tstamp_data *pelem; + + if (!enabled) + return 0; + + prev = (struct task_struct *)ctx[1]; + next = (struct task_struct *)ctx[2]; + state = get_task_state(prev); + + ts = bpf_ktime_get_ns(); + + if (prev->flags & PF_KTHREAD) + goto next; + if (state != TASK_INTERRUPTIBLE && + state != TASK_UNINTERRUPTIBLE) + goto next; + + stack_id = bpf_get_stackid(ctx, &stacks, + BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK); + + pelem = bpf_task_storage_get(&tstamp, prev, NULL, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!pelem) + goto next; + + pelem->timestamp = ts; + pelem->state = state; + pelem->stack_id = stack_id; + +next: + pelem = bpf_task_storage_get(&tstamp, next, NULL, 0); + + if (pelem && pelem->timestamp) { + struct offcpu_key key = { + .pid = next->pid, + .tgid = next->tgid, + .stack_id = pelem->stack_id, + .state = pelem->state, + }; + __u64 delta = ts - pelem->timestamp; + __u64 *total; + + total = bpf_map_lookup_elem(&off_cpu, &key); + if (total) + *total += delta; + else + bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY); + + /* prevent to reuse the timestamp later */ + pelem->timestamp = 0; + } + + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; |
