summaryrefslogtreecommitdiff
path: root/tools/perf/util/arm-spe.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-08-14 09:22:11 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-08-14 09:22:11 -0700
commit96f86ff08332d88defd35c330fc6dae219b9e264 (patch)
tree1122e9ac32022cc7c5a365c2ab6ddf116fbd3b15 /tools/perf/util/arm-spe.c
parentd785610f052d7456497cdec2a2406f6d4b16569f (diff)
parent7391db6459388d47d657aad633cb55fc04a8d4fb (diff)
downloadlinux-96f86ff08332d88defd35c330fc6dae219b9e264.tar.gz
linux-96f86ff08332d88defd35c330fc6dae219b9e264.tar.bz2
linux-96f86ff08332d88defd35c330fc6dae219b9e264.zip
Merge tag 'perf-tools-fixes-for-v6.0-2022-08-13' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux
Pull more perf tool updates from Arnaldo Carvalho de Melo: - 'perf c2c' now supports ARM64, adjust its output to cope with differences with what is in x86_64. Now go find false sharing on ARM64 (at least Neoverse) as well! - Refactor the JSON processing, making the output more compact and thus reducing the size of the resulting perf binary - Improvements for 'perf offcpu' profiling, including tracking child processes - Update Intel JSON metrics and events files for broadwellde, broadwellx, cascadelakex, haswellx, icelakex, ivytown, jaketown, knightslanding, sapphirerapids, skylakex and snowridgex - Add 'perf stat' JSON output and a 'perf test' entry for it - Ignore memfd and anonymous mmap events if jitdump present - Refactor 'perf test' shell tests allowing subdirs - Fix an error handling path in 'parse_perf_probe_command()' - Fixes for the guest Intel PT tracing patchkit in the 1st batch of this merge window - Print debuginfod queries if -v option is used, to explain delays in processing when debuginfo servers are enabled to fetch DSOs with richer symbol tables - Improve error message for 'perf record -p not_existing_pid' - Fix openssl and libbpf feature detection - Add PMU pai_crypto event description for IBM z16 on 'perf list' - Fix typos and duplicated words on comments in various places * tag 'perf-tools-fixes-for-v6.0-2022-08-13' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux: (81 commits) perf test: Refactor shell tests allowing subdirs perf vendor events: Update events for snowridgex perf vendor events: Update events and metrics for skylakex perf vendor events: Update metrics for sapphirerapids perf vendor events: Update events for knightslanding perf vendor events: Update metrics for jaketown perf vendor events: Update metrics for ivytown perf vendor events: Update events and metrics for icelakex perf vendor events: Update events and metrics for haswellx perf vendor events: Update events and metrics for cascadelakex perf vendor events: Update events and metrics for broadwellx perf vendor events: Update metrics for broadwellde perf jevents: Fold strings optimization perf jevents: Compress the pmu_events_table perf metrics: Copy entire pmu_event in find metric perf pmu-events: Hide the pmu_events perf pmu-events: Don't assume pmu_event is an array perf pmu-events: Move test events/metrics to JSON perf test: Use full metric resolution perf pmu-events: Hide pmu_events_map ...
Diffstat (limited to 'tools/perf/util/arm-spe.c')
-rw-r--r--tools/perf/util/arm-spe.c130
1 files changed, 114 insertions, 16 deletions
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index d040406f3314..22dcfe07e886 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -34,6 +34,7 @@
#include "arm-spe-decoder/arm-spe-decoder.h"
#include "arm-spe-decoder/arm-spe-pkt-decoder.h"
+#include "../../arch/arm64/include/asm/cputype.h"
#define MAX_TIMESTAMP (~0ULL)
struct arm_spe {
@@ -45,6 +46,7 @@ struct arm_spe {
struct perf_session *session;
struct machine *machine;
u32 pmu_type;
+ u64 midr;
struct perf_tsc_conversion tc;
@@ -387,35 +389,128 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
return arm_spe_deliver_synth_event(spe, speq, event, &sample);
}
-static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
+static const struct midr_range neoverse_spe[] = {
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
+ {},
+};
+
+static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
+ union perf_mem_data_src *data_src)
{
- union perf_mem_data_src data_src = { 0 };
+ /*
+ * Even though four levels of cache hierarchy are possible, no known
+ * production Neoverse systems currently include more than three levels
+ * so for the time being we assume three exist. If a production system
+ * is built with four the this function would have to be changed to
+ * detect the number of levels for reporting.
+ */
- if (record->op == ARM_SPE_LD)
- data_src.mem_op = PERF_MEM_OP_LOAD;
- else if (record->op == ARM_SPE_ST)
- data_src.mem_op = PERF_MEM_OP_STORE;
- else
- return 0;
+ /*
+ * We have no data on the hit level or data source for stores in the
+ * Neoverse SPE records.
+ */
+ if (record->op & ARM_SPE_ST) {
+ data_src->mem_lvl = PERF_MEM_LVL_NA;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
+ data_src->mem_snoop = PERF_MEM_SNOOP_NA;
+ return;
+ }
+
+ switch (record->source) {
+ case ARM_SPE_NV_L1D:
+ data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
+ data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
+ break;
+ case ARM_SPE_NV_L2:
+ data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
+ data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
+ break;
+ case ARM_SPE_NV_PEER_CORE:
+ data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
+ data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
+ break;
+ /*
+ * We don't know if this is L1, L2 but we do know it was a cache-2-cache
+ * transfer, so set SNOOPX_PEER
+ */
+ case ARM_SPE_NV_LOCAL_CLUSTER:
+ case ARM_SPE_NV_PEER_CLUSTER:
+ data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
+ data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
+ break;
+ /*
+ * System cache is assumed to be L3
+ */
+ case ARM_SPE_NV_SYS_CACHE:
+ data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
+ data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
+ break;
+ /*
+ * We don't know what level it hit in, except it came from the other
+ * socket
+ */
+ case ARM_SPE_NV_REMOTE:
+ data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
+ data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
+ data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
+ break;
+ case ARM_SPE_NV_DRAM:
+ data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
+ data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
+ break;
+ default:
+ break;
+ }
+}
+static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record,
+ union perf_mem_data_src *data_src)
+{
if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
- data_src.mem_lvl = PERF_MEM_LVL_L3;
+ data_src->mem_lvl = PERF_MEM_LVL_L3;
if (record->type & ARM_SPE_LLC_MISS)
- data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+ data_src->mem_lvl |= PERF_MEM_LVL_MISS;
else
- data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+ data_src->mem_lvl |= PERF_MEM_LVL_HIT;
} else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
- data_src.mem_lvl = PERF_MEM_LVL_L1;
+ data_src->mem_lvl = PERF_MEM_LVL_L1;
if (record->type & ARM_SPE_L1D_MISS)
- data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+ data_src->mem_lvl |= PERF_MEM_LVL_MISS;
else
- data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+ data_src->mem_lvl |= PERF_MEM_LVL_HIT;
}
if (record->type & ARM_SPE_REMOTE_ACCESS)
- data_src.mem_lvl |= PERF_MEM_LVL_REM_CCE1;
+ data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
+}
+
+static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
+{
+ union perf_mem_data_src data_src = { 0 };
+ bool is_neoverse = is_midr_in_range(midr, neoverse_spe);
+
+ if (record->op == ARM_SPE_LD)
+ data_src.mem_op = PERF_MEM_OP_LOAD;
+ else if (record->op == ARM_SPE_ST)
+ data_src.mem_op = PERF_MEM_OP_STORE;
+ else
+ return 0;
+
+ if (is_neoverse)
+ arm_spe__synth_data_source_neoverse(record, &data_src);
+ else
+ arm_spe__synth_data_source_generic(record, &data_src);
if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
data_src.mem_dtlb = PERF_MEM_TLB_WK;
@@ -436,7 +531,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
u64 data_src;
int err;
- data_src = arm_spe__synth_data_source(record);
+ data_src = arm_spe__synth_data_source(record, spe->midr);
if (spe->sample_flc) {
if (record->type & ARM_SPE_L1D_MISS) {
@@ -1178,6 +1273,8 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX;
struct perf_record_time_conv *tc = &session->time_conv;
+ const char *cpuid = perf_env__cpuid(session->evlist->env);
+ u64 midr = strtol(cpuid, NULL, 16);
struct arm_spe *spe;
int err;
@@ -1197,6 +1294,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
spe->machine = &session->machines.host; /* No kvm support */
spe->auxtrace_type = auxtrace_info->type;
spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
+ spe->midr = midr;
spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);