author    Linus Torvalds <torvalds@linux-foundation.org>  2022-08-14 09:22:11 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2022-08-14 09:22:11 -0700
commit    96f86ff08332d88defd35c330fc6dae219b9e264 (patch)
tree      1122e9ac32022cc7c5a365c2ab6ddf116fbd3b15 /tools/perf/util/arm-spe.c
parent    d785610f052d7456497cdec2a2406f6d4b16569f (diff)
parent    7391db6459388d47d657aad633cb55fc04a8d4fb (diff)
Merge tag 'perf-tools-fixes-for-v6.0-2022-08-13' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux
Pull more perf tool updates from Arnaldo Carvalho de Melo:
- 'perf c2c' now supports ARM64, adjusting its output to cope with
  differences from x86_64. Now go find false sharing on ARM64 (at
  least Neoverse) as well!
- Refactor the JSON processing, making the output more compact and thus
reducing the size of the resulting perf binary
- Improvements for 'perf offcpu' profiling, including tracking child
processes
- Update Intel JSON metrics and events files for broadwellde,
broadwellx, cascadelakex, haswellx, icelakex, ivytown, jaketown,
knightslanding, sapphirerapids, skylakex and snowridgex
- Add 'perf stat' JSON output and a 'perf test' entry for it
- Ignore memfd and anonymous mmap events if a jitdump file is present
- Refactor 'perf test' shell tests allowing subdirs
- Fix an error handling path in 'parse_perf_probe_command()'
- Fixes for the guest Intel PT tracing patchkit in the 1st batch of
this merge window
- Print debuginfod queries if -v option is used, to explain delays in
processing when debuginfo servers are enabled to fetch DSOs with
richer symbol tables
- Improve error message for 'perf record -p not_existing_pid'
- Fix openssl and libbpf feature detection
- Add PMU pai_crypto event description for IBM z16 on 'perf list'
- Fix typos and duplicated words on comments in various places
* tag 'perf-tools-fixes-for-v6.0-2022-08-13' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux: (81 commits)
perf test: Refactor shell tests allowing subdirs
perf vendor events: Update events for snowridgex
perf vendor events: Update events and metrics for skylakex
perf vendor events: Update metrics for sapphirerapids
perf vendor events: Update events for knightslanding
perf vendor events: Update metrics for jaketown
perf vendor events: Update metrics for ivytown
perf vendor events: Update events and metrics for icelakex
perf vendor events: Update events and metrics for haswellx
perf vendor events: Update events and metrics for cascadelakex
perf vendor events: Update events and metrics for broadwellx
perf vendor events: Update metrics for broadwellde
perf jevents: Fold strings optimization
perf jevents: Compress the pmu_events_table
perf metrics: Copy entire pmu_event in find metric
perf pmu-events: Hide the pmu_events
perf pmu-events: Don't assume pmu_event is an array
perf pmu-events: Move test events/metrics to JSON
perf test: Use full metric resolution
perf pmu-events: Hide pmu_events_map
...
Diffstat (limited to 'tools/perf/util/arm-spe.c')
| -rw-r--r-- | tools/perf/util/arm-spe.c | 130 |
1 file changed, 114 insertions, 16 deletions
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index d040406f3314..22dcfe07e886 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -34,6 +34,7 @@
 #include "arm-spe-decoder/arm-spe-decoder.h"
 #include "arm-spe-decoder/arm-spe-pkt-decoder.h"
+#include "../../arch/arm64/include/asm/cputype.h"
 
 #define MAX_TIMESTAMP (~0ULL)
 
 struct arm_spe {
@@ -45,6 +46,7 @@ struct arm_spe {
 	struct perf_session		*session;
 	struct machine			*machine;
 	u32				pmu_type;
+	u64				midr;
 
 	struct perf_tsc_conversion	tc;
 
@@ -387,35 +389,128 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
 	return arm_spe_deliver_synth_event(spe, speq, event, &sample);
 }
 
-static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
+static const struct midr_range neoverse_spe[] = {
+	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
+	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
+	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
+	{},
+};
+
+static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
+						union perf_mem_data_src *data_src)
 {
-	union perf_mem_data_src	data_src = { 0 };
+	/*
+	 * Even though four levels of cache hierarchy are possible, no known
+	 * production Neoverse systems currently include more than three levels,
+	 * so for the time being we assume three exist. If a production system
+	 * is built with four, then this function would have to be changed to
+	 * detect the number of levels for reporting.
+	 */
 
-	if (record->op == ARM_SPE_LD)
-		data_src.mem_op = PERF_MEM_OP_LOAD;
-	else if (record->op == ARM_SPE_ST)
-		data_src.mem_op = PERF_MEM_OP_STORE;
-	else
-		return 0;
+	/*
+	 * We have no data on the hit level or data source for stores in the
+	 * Neoverse SPE records.
+	 */
+	if (record->op & ARM_SPE_ST) {
+		data_src->mem_lvl = PERF_MEM_LVL_NA;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
+		data_src->mem_snoop = PERF_MEM_SNOOP_NA;
+		return;
+	}
+
+	switch (record->source) {
+	case ARM_SPE_NV_L1D:
+		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
+		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
+		break;
+	case ARM_SPE_NV_L2:
+		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
+		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
+		break;
+	case ARM_SPE_NV_PEER_CORE:
+		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
+		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
+		break;
+	/*
+	 * We don't know if this is L1, L2 but we do know it was a cache-2-cache
+	 * transfer, so set SNOOPX_PEER
+	 */
+	case ARM_SPE_NV_LOCAL_CLUSTER:
+	case ARM_SPE_NV_PEER_CLUSTER:
+		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
+		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
+		break;
+	/*
+	 * System cache is assumed to be L3
+	 */
+	case ARM_SPE_NV_SYS_CACHE:
+		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
+		data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
+		break;
+	/*
+	 * We don't know what level it hit in, except it came from the other
+	 * socket
+	 */
+	case ARM_SPE_NV_REMOTE:
+		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
+		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
+		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
+		break;
+	case ARM_SPE_NV_DRAM:
+		data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
+		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
+		break;
+	default:
+		break;
+	}
+}
 
+static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record,
+					       union perf_mem_data_src *data_src)
+{
 	if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
-		data_src.mem_lvl = PERF_MEM_LVL_L3;
+		data_src->mem_lvl = PERF_MEM_LVL_L3;
 
 		if (record->type & ARM_SPE_LLC_MISS)
-			data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
 		else
-			data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
 	} else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
-		data_src.mem_lvl = PERF_MEM_LVL_L1;
+		data_src->mem_lvl = PERF_MEM_LVL_L1;
 
 		if (record->type & ARM_SPE_L1D_MISS)
-			data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
 		else
-			data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
 	}
 
 	if (record->type & ARM_SPE_REMOTE_ACCESS)
-		data_src.mem_lvl |= PERF_MEM_LVL_REM_CCE1;
+		data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
+}
+
+static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
+{
+	union perf_mem_data_src	data_src = { 0 };
+	bool is_neoverse = is_midr_in_range(midr, neoverse_spe);
+
+	if (record->op == ARM_SPE_LD)
+		data_src.mem_op = PERF_MEM_OP_LOAD;
+	else if (record->op == ARM_SPE_ST)
+		data_src.mem_op = PERF_MEM_OP_STORE;
+	else
+		return 0;
+
+	if (is_neoverse)
+		arm_spe__synth_data_source_neoverse(record, &data_src);
+	else
+		arm_spe__synth_data_source_generic(record, &data_src);
 
 	if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
 		data_src.mem_dtlb = PERF_MEM_TLB_WK;
@@ -436,7 +531,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
 	u64 data_src;
 	int err;
 
-	data_src = arm_spe__synth_data_source(record);
+	data_src = arm_spe__synth_data_source(record, spe->midr);
 
 	if (spe->sample_flc) {
 		if (record->type & ARM_SPE_L1D_MISS) {
@@ -1178,6 +1273,8 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
 	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
 	size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX;
 	struct perf_record_time_conv *tc = &session->time_conv;
+	const char *cpuid = perf_env__cpuid(session->evlist->env);
+	u64 midr = strtol(cpuid, NULL, 16);
 	struct arm_spe *spe;
 	int err;
 
@@ -1197,6 +1294,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
 	spe->machine = &session->machines.host; /* No kvm support */
 	spe->auxtrace_type = auxtrace_info->type;
 	spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
+	spe->midr = midr;
 
 	spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
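The core of the arm-spe.c change above is picking a Neoverse-specific data-source decoder by matching the session's MIDR against the neoverse_spe[] range table. As a rough illustration of what that match boils down to, here is a minimal, self-contained C sketch. It is not the kernel's is_midr_in_range() implementation; the helper midr_is_neoverse() and the standalone main() are hypothetical, but the MIDR_EL1 field layout (implementer in bits [31:24], part number in bits [15:4]) and the Arm Neoverse part numbers used are the architecturally documented ones.

/* Hypothetical standalone sketch of the MIDR-based Neoverse check that
 * arm_spe__synth_data_source() performs via is_midr_in_range(). */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MIDR_IMPLEMENTER(midr)	(((midr) >> 24) & 0xff)	/* bits [31:24] */
#define MIDR_PARTNUM(midr)	(((midr) >> 4) & 0xfff)	/* bits [15:4]  */

#define ARM_CPU_IMP_ARM			0x41
#define ARM_CPU_PART_NEOVERSE_N1	0xd0c
#define ARM_CPU_PART_NEOVERSE_N2	0xd49
#define ARM_CPU_PART_NEOVERSE_V1	0xd40

/* Rough equivalent of matching against the neoverse_spe[] table:
 * MIDR_ALL_VERSIONS() means any variant/revision of the listed parts,
 * so only implementer and part number matter here. */
static int midr_is_neoverse(uint64_t midr)
{
	static const uint16_t parts[] = {
		ARM_CPU_PART_NEOVERSE_N1,
		ARM_CPU_PART_NEOVERSE_N2,
		ARM_CPU_PART_NEOVERSE_V1,
	};
	size_t i;

	if (MIDR_IMPLEMENTER(midr) != ARM_CPU_IMP_ARM)
		return 0;

	for (i = 0; i < sizeof(parts) / sizeof(parts[0]); i++)
		if (MIDR_PARTNUM(midr) == parts[i])
			return 1;
	return 0;
}

int main(void)
{
	/* perf stores the arm64 cpuid as a hex MIDR string; the patch parses
	 * it with strtol(cpuid, NULL, 16) into spe->midr. The value below is
	 * an example Neoverse N1 MIDR, not taken from a real session. */
	const char *cpuid = "0x00000000410fd0c0";
	uint64_t midr = strtoull(cpuid, NULL, 16);

	printf("neoverse: %s\n", midr_is_neoverse(midr) ? "yes" : "no");
	return 0;
}

On a matching part, the decoder trusts the record's Neoverse-specific source field (the ARM_SPE_NV_* values) to fill in mem_lvl_num and the snoop flags; on any other CPU it falls back to the coarser L1D/LLC access and miss bits, as the pre-existing generic path did.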
