diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-19 12:33:28 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-19 12:33:28 -0700 |
| commit | a90f1cd105c6c5c246f07ca371d873d35b78c7d9 (patch) | |
| tree | 8af5dbc44c60883d2c860c45042700ea63555105 /tools | |
| parent | a76056285f5d64740b461d70b062225ba80f0ac2 (diff) | |
| parent | 256d218ec6aea99855dc5c54af550fcff96fc732 (diff) | |
| download | linux-a90f1cd105c6c5c246f07ca371d873d35b78c7d9.tar.gz linux-a90f1cd105c6c5c246f07ca371d873d35b78c7d9.tar.bz2 linux-a90f1cd105c6c5c246f07ca371d873d35b78c7d9.zip | |
Merge tag 'turbostat-for-Linux-6.10-merge-window' of git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux
Pull turbostat updates from Len Brown:
- Survive sparse die IDs seen in Linux-6.9
- Handle clustered-uncore topology in new/upcoming hardware
- For non-root use, add ability to see software C-state counters
- Enable reading core and package hardware C-state counters via perf,
and prefer perf over MSR driver access for these counters
* tag 'turbostat-for-Linux-6.10-merge-window' of git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux:
tools/power turbostat: version 2024.05.10
tools/power turbostat: Ignore pkg_cstate_limit when it is not available
tools/power turbostat: Fix order of strings in pkg_cstate_limit_strings
tools/power turbostat: Read Package-cstates via perf
tools/power turbostat: Read Core-cstates via perf
tools/power turbostat: Avoid possible memory corruption due to sparse topology IDs
tools/power turbostat: Add columns for clustered uncore frequency
tools/power turbostat: Enable non-privileged users to read sysfs counters
tools/power turbostat: Replace _Static_assert with BUILD_BUG_ON
tools/power turbostat: Add ARL-H support
tools/power turbostat: Enhance ARL/LNL support
tools/power turbostat: Survive sparse die_id
tools/power turbostat: Remember global max_die_id
tools/power turbostat: Harden probe_intel_uncore_frequency()
tools/power turbostat: Add "snapshot:" Makefile target
Diffstat (limited to 'tools')
| -rw-r--r-- | tools/power/x86/turbostat/Makefile | 27 | ||||
| -rw-r--r-- | tools/power/x86/turbostat/turbostat.8 | 4 | ||||
| -rw-r--r-- | tools/power/x86/turbostat/turbostat.c | 1169 |
3 files changed, 899 insertions, 301 deletions
diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index 92e139b9c792..2d6dce2c8f77 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -3,6 +3,8 @@ CC = $(CROSS_COMPILE)gcc BUILD_OUTPUT := $(CURDIR) PREFIX ?= /usr DESTDIR ?= +DAY := $(shell date +%Y.%m.%d) +SNAPSHOT = turbostat-$(DAY) ifeq ("$(origin O)", "command line") BUILD_OUTPUT := $(O) @@ -22,9 +24,30 @@ override CFLAGS += -D_FORTIFY_SOURCE=2 .PHONY : clean clean : @rm -f $(BUILD_OUTPUT)/turbostat + @rm -f $(SNAPSHOT).tar.gz install : turbostat - install -d $(DESTDIR)$(PREFIX)/bin + install -d $(DESTDIR)$(PREFIX)/bin install $(BUILD_OUTPUT)/turbostat $(DESTDIR)$(PREFIX)/bin/turbostat - install -d $(DESTDIR)$(PREFIX)/share/man/man8 + install -d $(DESTDIR)$(PREFIX)/share/man/man8 install -m 644 turbostat.8 $(DESTDIR)$(PREFIX)/share/man/man8 + +snapshot: turbostat + @rm -rf $(SNAPSHOT) + @mkdir $(SNAPSHOT) + @cp turbostat Makefile turbostat.c turbostat.8 ../../../../arch/x86/include/asm/intel-family.h $(SNAPSHOT) + + @sed -e 's/^#include <linux\/bits.h>/#include "bits.h"/' ../../../../arch/x86/include/asm/msr-index.h > $(SNAPSHOT)/msr-index.h + @echo '#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))' >> $(SNAPSHOT)/msr-index.h + @echo "#define BIT(x) (1 << (x))" > $(SNAPSHOT)/bits.h + @echo "#define BIT_ULL(nr) (1ULL << (nr))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK(h, l) (((~0UL) << (l)) & (~0UL >> (sizeof(long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + + @echo PWD=. 
> $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DMSRHEADER='\"msr-index.h\"'" >> $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DINTEL_FAMILY_HEADER='\"intel-family.h\"'" >> $(SNAPSHOT)/Makefile + @sed -e's/.*MSRHEADER.*//' -e's/.*INTEL_FAMILY_HEADER.*//' Makefile >> $(SNAPSHOT)/Makefile + + @rm -f $(SNAPSHOT).tar.gz + tar cvzf $(SNAPSHOT).tar.gz $(SNAPSHOT) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 0d3672e5d9ed..8d37acd39201 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -155,7 +155,9 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBRAM_%\fP percent of the interval that RAPL throttling was active on DRAM. .PP -\fBUncMHz\fP uncore MHz, instantaneous sample. +\fBUncMHz\fP per-package uncore MHz, instantaneous sample. +.PP +\fBUMHz1.0\fP per-package uncore MHz for domain=1 and fabric_cluster=0, instantaneous sample. System summary is the average of all packages. .SH TOO MUCH INFORMATION EXAMPLE By default, turbostat dumps all possible information -- a system configuration header, followed by columns for all counters. This is ideal for remote debugging, use the "--out" option to save everything to a text file, and get that file to the expert helping you debug. 
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 98256468e248..8cdf41906e98 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -38,6 +38,7 @@ #include <stdbool.h> #include <assert.h> #include <linux/kernel.h> +#include <linux/build_bug.h> #define UNUSED(x) (void)(x) @@ -58,15 +59,22 @@ #define MAX_NOFILE 0x8000 enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; -enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; -enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; +enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M }; +enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; +enum cstate_source { CSTATE_SOURCE_NONE, CSTATE_SOURCE_PERF, CSTATE_SOURCE_MSR }; + +struct sysfs_path { + char path[PATH_BYTES]; + int id; + struct sysfs_path *next; +}; struct msr_counter { unsigned int msr_num; char name[NAME_BYTES]; - char path[PATH_BYTES]; + struct sysfs_path *sp; unsigned int width; enum counter_type type; enum counter_format format; @@ -78,64 +86,64 @@ struct msr_counter { }; struct msr_counter bic[] = { - { 0x0, "usec", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Time_Of_Day_Seconds", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Package", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Node", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Avg_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Busy%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Bzy_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "TSC_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "IRQ", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL, 0 }, - { 0x0, "sysfs", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c1", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c3", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c6", "", 0, 0, 0, NULL, 0 }, - { 0x0, 
"CPU%c7", "", 0, 0, 0, NULL, 0 }, - { 0x0, "ThreadC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreTmp", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreCnt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgTmp", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX%rc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc2", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc3", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc7", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc8", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc9", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pk%pc10", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%LPI", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SYS%LPI", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CorWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgCnt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAMWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PKG_%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAM_%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Cor_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAM_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Mod%c6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Totl%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Any%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPUGFX%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Core", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU", "", 0, 0, 0, NULL, 0 }, - { 0x0, "APIC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "X2APIC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Die", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXAMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "IPC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreThr", "", 0, 0, 0, NULL, 0 }, - { 0x0, "UncMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAM%mc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAMMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAMAMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "usec", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Time_Of_Day_Seconds", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, 
"Package", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Node", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Avg_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Busy%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Bzy_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "TSC_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "IRQ", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SMI", NULL, 32, 0, FORMAT_DELTA, NULL, 0 }, + { 0x0, "sysfs", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c1", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c3", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c7", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "ThreadC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreTmp", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreCnt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgTmp", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX%rc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc2", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc3", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc7", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc8", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc9", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pk%pc10", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%LPI", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SYS%LPI", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CorWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgCnt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAMWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PKG_%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAM_%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Cor_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAM_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Mod%c6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Totl%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Any%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPUGFX%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, 
"Core", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "APIC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "X2APIC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Die", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXAMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "IPC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreThr", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "UncMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAM%mc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -216,6 +224,28 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) #define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) +/* + * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit: + * If you change the values, note they are used both in comparisons + * (>= PCL__7) and to index pkg_cstate_limit_strings[]. 
+ */ +#define PCLUKN 0 /* Unknown */ +#define PCLRSV 1 /* Reserved */ +#define PCL__0 2 /* PC0 */ +#define PCL__1 3 /* PC1 */ +#define PCL__2 4 /* PC2 */ +#define PCL__3 5 /* PC3 */ +#define PCL__4 6 /* PC4 */ +#define PCL__6 7 /* PC6 */ +#define PCL_6N 8 /* PC6 No Retention */ +#define PCL_6R 9 /* PC6 Retention */ +#define PCL__7 10 /* PC7 */ +#define PCL_7S 11 /* PC7 Shrink */ +#define PCL__8 12 /* PC8 */ +#define PCL__9 13 /* PC9 */ +#define PCL_10 14 /* PC10 */ +#define PCLUNL 15 /* Unlimited */ + struct amperf_group_fd; char *proc_stat = "/proc/stat"; @@ -299,6 +329,9 @@ struct gfx_sysfs_info { static struct gfx_sysfs_info gfx_info[GFX_MAX]; int get_msr(int cpu, off_t offset, unsigned long long *msr); +int add_counter(unsigned int msr_num, char *path, char *name, + unsigned int width, enum counter_scope scope, + enum counter_type type, enum counter_format format, int flags, int package_num); /* Model specific support Start */ @@ -663,6 +696,23 @@ static const struct platform_features adl_features = { .enable_tsc_tweak = 1, }; +static const struct platform_features arl_features = { + .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, + .has_config_tdp = 1, + .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC10, + .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, + .has_msr_core_c1_res = 1, + .has_ext_cst_msrs = 1, + .trl_msrs = TRL_BASE, + .tcc_offset_bits = 6, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, +}; + static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, @@ -905,8 +955,10 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, - { INTEL_FAM6_ARROWLAKE, &cnl_features }, - { INTEL_FAM6_LUNARLAKE_M, &cnl_features }, + { 
INTEL_FAM6_ARROWLAKE_H, &arl_features }, + { INTEL_FAM6_ARROWLAKE_U, &arl_features }, + { INTEL_FAM6_ARROWLAKE, &arl_features }, + { INTEL_FAM6_LUNARLAKE_M, &arl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, @@ -979,8 +1031,9 @@ char *progname; #define CPU_SUBSET_MAXCPUS 1024 /* need to use before probe... */ cpu_set_t *cpu_present_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; -#define MAX_ADDED_COUNTERS 8 #define MAX_ADDED_THREAD_COUNTERS 24 +#define MAX_ADDED_CORE_COUNTERS 8 +#define MAX_ADDED_PACKAGE_COUNTERS 16 #define BITMASK_SIZE 32 /* Indexes used to map data read from perf and MSRs into global variables */ @@ -1022,6 +1075,7 @@ struct rapl_counter_info_t { /* struct rapl_counter_info_t for each RAPL domain */ struct rapl_counter_info_t *rapl_counter_info_perdomain; +unsigned int rapl_counter_info_perdomain_size; #define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1) @@ -1152,6 +1206,161 @@ struct rapl_counter { double scale; }; +/* Indexes used to map data read from perf and MSRs into global variables */ +enum ccstate_rci_index { + CCSTATE_RCI_INDEX_C1_RESIDENCY = 0, + CCSTATE_RCI_INDEX_C3_RESIDENCY = 1, + CCSTATE_RCI_INDEX_C6_RESIDENCY = 2, + CCSTATE_RCI_INDEX_C7_RESIDENCY = 3, + PCSTATE_RCI_INDEX_C2_RESIDENCY = 4, + PCSTATE_RCI_INDEX_C3_RESIDENCY = 5, + PCSTATE_RCI_INDEX_C6_RESIDENCY = 6, + PCSTATE_RCI_INDEX_C7_RESIDENCY = 7, + PCSTATE_RCI_INDEX_C8_RESIDENCY = 8, + PCSTATE_RCI_INDEX_C9_RESIDENCY = 9, + PCSTATE_RCI_INDEX_C10_RESIDENCY = 10, + NUM_CSTATE_COUNTERS, +}; + +struct cstate_counter_info_t { + unsigned long long data[NUM_CSTATE_COUNTERS]; + enum cstate_source source[NUM_CSTATE_COUNTERS]; + unsigned long long msr[NUM_CSTATE_COUNTERS]; + int fd_perf_core; + int fd_perf_pkg; +}; + +struct cstate_counter_info_t 
*ccstate_counter_info; +unsigned int ccstate_counter_info_size; + +#define CSTATE_COUNTER_FLAG_COLLECT_PER_CORE (1u << 0) +#define CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD ((1u << 1) | CSTATE_COUNTER_FLAG_COLLECT_PER_CORE) +#define CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY (1u << 2) + +struct cstate_counter_arch_info { + int feature_mask; /* Mask for testing if the counter is supported on host */ + const char *perf_subsys; + const char *perf_name; + unsigned long long msr; + unsigned int rci_index; /* Maps data from perf counters to global variables */ + unsigned long long bic; + unsigned long long flags; + int pkg_cstate_limit; +}; + +static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { + { + .feature_mask = CC1, + .perf_subsys = "cstate_core", + .perf_name = "c1-residency", + .msr = MSR_CORE_C1_RES, + .rci_index = CCSTATE_RCI_INDEX_C1_RESIDENCY, + .bic = BIC_CPU_c1, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = CC3, + .perf_subsys = "cstate_core", + .perf_name = "c3-residency", + .msr = MSR_CORE_C3_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C3_RESIDENCY, + .bic = BIC_CPU_c3, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = CC6, + .perf_subsys = "cstate_core", + .perf_name = "c6-residency", + .msr = MSR_CORE_C6_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C6_RESIDENCY, + .bic = BIC_CPU_c6, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = CC7, + .perf_subsys = "cstate_core", + .perf_name = "c7-residency", + .msr = MSR_CORE_C7_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C7_RESIDENCY, + .bic = BIC_CPU_c7, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = PC2, + .perf_subsys = "cstate_pkg", + .perf_name = "c2-residency", + 
.msr = MSR_PKG_C2_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C2_RESIDENCY, + .bic = BIC_Pkgpc2, + .flags = 0, + .pkg_cstate_limit = PCL__2, + }, + { + .feature_mask = PC3, + .perf_subsys = "cstate_pkg", + .perf_name = "c3-residency", + .msr = MSR_PKG_C3_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C3_RESIDENCY, + .bic = BIC_Pkgpc3, + .flags = 0, + .pkg_cstate_limit = PCL__3, + }, + { + .feature_mask = PC6, + .perf_subsys = "cstate_pkg", + .perf_name = "c6-residency", + .msr = MSR_PKG_C6_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C6_RESIDENCY, + .bic = BIC_Pkgpc6, + .flags = 0, + .pkg_cstate_limit = PCL__6, + }, + { + .feature_mask = PC7, + .perf_subsys = "cstate_pkg", + .perf_name = "c7-residency", + .msr = MSR_PKG_C7_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C7_RESIDENCY, + .bic = BIC_Pkgpc7, + .flags = 0, + .pkg_cstate_limit = PCL__7, + }, + { + .feature_mask = PC8, + .perf_subsys = "cstate_pkg", + .perf_name = "c8-residency", + .msr = MSR_PKG_C8_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C8_RESIDENCY, + .bic = BIC_Pkgpc8, + .flags = 0, + .pkg_cstate_limit = PCL__8, + }, + { + .feature_mask = PC9, + .perf_subsys = "cstate_pkg", + .perf_name = "c9-residency", + .msr = MSR_PKG_C9_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C9_RESIDENCY, + .bic = BIC_Pkgpc9, + .flags = 0, + .pkg_cstate_limit = PCL__9, + }, + { + .feature_mask = PC10, + .perf_subsys = "cstate_pkg", + .perf_name = "c10-residency", + .msr = MSR_PKG_C10_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C10_RESIDENCY, + .bic = BIC_Pkgpc10, + .flags = 0, + .pkg_cstate_limit = PCL_10, + }, +}; + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -1181,7 +1390,7 @@ struct core_data { struct rapl_counter core_energy; /* MSR_CORE_ENERGY_STAT */ unsigned int core_id; unsigned long long core_throt_cnt; - unsigned long long counter[MAX_ADDED_COUNTERS]; + unsigned long long counter[MAX_ADDED_CORE_COUNTERS]; } *core_even, *core_odd; struct pkg_data { @@ -1214,7 +1423,7 @@ struct pkg_data { 
struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned int uncore_mhz; - unsigned long long counter[MAX_ADDED_COUNTERS]; + unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS]; } *package_even, *package_odd; #define ODD_COUNTERS thread_odd, core_odd, package_odd @@ -1357,36 +1566,42 @@ struct sys_counters { struct msr_counter *pp; } sys; -void free_sys_counters(void) +static size_t free_msr_counters_(struct msr_counter **pp) { - struct msr_counter *p = sys.tp, *pnext = NULL; + struct msr_counter *p = NULL; + size_t num_freed = 0; - while (p) { - pnext = p->next; - free(p); - p = pnext; - } + while (*pp) { + p = *pp; - p = sys.cp, pnext = NULL; - while (p) { - pnext = p->next; - free(p); - p = pnext; - } + if (p->msr_num != 0) { + *pp = p->next; + + free(p); + ++num_freed; - p = sys.pp, pnext = NULL; - while (p) { - pnext = p->next; - free(p); - p = pnext; + continue; + } + + pp = &p->next; } - sys.added_thread_counters = 0; - sys.added_core_counters = 0; - sys.added_package_counters = 0; - sys.tp = NULL; - sys.cp = NULL; - sys.pp = NULL; + return num_freed; +} + +/* + * Free all added counters accessed via msr. 
+ */ +static void free_sys_msr_counters(void) +{ + /* Thread counters */ + sys.added_thread_counters -= free_msr_counters_(&sys.tp); + + /* Core counters */ + sys.added_core_counters -= free_msr_counters_(&sys.cp); + + /* Package counters */ + sys.added_package_counters -= free_msr_counters_(&sys.pp); } struct system_summary { @@ -1415,6 +1630,9 @@ struct topo_params { int allowed_cpus; int allowed_cores; int max_cpu_num; + int max_core_id; + int max_package_id; + int max_die_id; int max_node_num; int nodes_per_pkg; int cores_per_node; @@ -1529,23 +1747,12 @@ int get_msr_fd(int cpu) static void bic_disable_msr_access(void) { - const unsigned long bic_msrs = - BIC_SMI | - BIC_CPU_c1 | - BIC_CPU_c3 | - BIC_CPU_c6 | - BIC_CPU_c7 | - BIC_Mod_c6 | - BIC_CoreTmp | - BIC_Totl_c0 | - BIC_Any_c0 | - BIC_GFX_c0 | - BIC_CPUGFX | - BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp; + const unsigned long bic_msrs = BIC_SMI | BIC_Mod_c6 | BIC_CoreTmp | + BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp; bic_enabled &= ~bic_msrs; - free_sys_counters(); + free_sys_msr_counters(); } static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) @@ -1928,13 +2135,15 @@ void print_header(char *delim) if (mp->format == FORMAT_RAW) { if (mp->width == 64) outp += sprintf(outp, "%s%18.18s", delim, mp->name); - else + else if (mp->width == 32) outp += sprintf(outp, "%s%10.10s", delim, mp->name); + else + outp += sprintf(outp, "%s%7.7s", delim, mp->name); } else { if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns) outp += sprintf(outp, "%s%8s", delim, mp->name); else - outp += sprintf(outp, "%s%s", delim, mp->name); + outp += sprintf(outp, "%s%7.7s", delim, mp->name); } } @@ -1966,7 +2175,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { outp += sprintf(outp, "tADDED [%d] %8s 
msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - t->counter[i], mp->path); + t->counter[i], mp->sp->path); } } @@ -1987,7 +2196,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { outp += sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - c->counter[i], mp->path); + c->counter[i], mp->sp->path); } outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us); } @@ -2023,7 +2232,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { outp += sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - p->counter[i], mp->path); + p->counter[i], mp->sp->path); } } @@ -2388,7 +2597,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->counter[i]); } else if (mp->format == FORMAT_PERCENT) { outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i] / tsc); - } + } else if (mp->type == COUNTER_K2M) + outp += sprintf(outp, "%s%d", (printed++ ? 
delim : ""), (unsigned int)p->counter[i] / 1000); } done: @@ -2498,6 +2708,8 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) old->counter[i] = new->counter[i]; + else if (mp->format == FORMAT_AVERAGE) + old->counter[i] = new->counter[i]; else old->counter[i] = new->counter[i] - old->counter[i]; } @@ -2970,7 +3182,7 @@ unsigned long long snapshot_sysfs_counter(char *path) return counter; } -int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) +int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp, char *counter_path) { if (mp->msr_num != 0) { assert(!no_msr); @@ -2980,25 +3192,40 @@ int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) char path[128 + PATH_BYTES]; if (mp->flags & SYSFS_PERCPU) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->path); + sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->sp->path); *counterp = snapshot_sysfs_counter(path); } else { - *counterp = snapshot_sysfs_counter(mp->path); + *counterp = snapshot_sysfs_counter(counter_path); } } return 0; } -unsigned long long get_uncore_mhz(int package, int die) +unsigned long long get_legacy_uncore_mhz(int package) { char path[128]; + int die; + static int warn_once; + + /* + * for this package, use the first die_id that exists + */ + for (die = 0; die <= topo.max_die_id; ++die) { - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", package, - die); + sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", + package, die); - return (snapshot_sysfs_counter(path) / 1000); + if (access(path, R_OK) == 0) + return (snapshot_sysfs_counter(path) / 1000); + } + if (!warn_once) { + warnx("BUG: %s: No %s", __func__, path); + warn_once = 1; + } + + return 0; } int get_epb(int cpu) @@ -3361,6 +3588,17 @@ size_t 
rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) return ret; } +static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t *cci) +{ + size_t ret = 0; + + for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) + if (cci->source[i] == CSTATE_SOURCE_PERF) + ++ret; + + return ret; +} + void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx) { rc->raw_value = rci->data[idx]; @@ -3368,15 +3606,18 @@ void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci rc->scale = rci->scale[idx]; } -int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data *p) +int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct pkg_data *p) { unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; - struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain]; + struct rapl_counter_info_t *rci; if (debug) fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain); assert(rapl_counter_info_perdomain); + assert(domain < rapl_counter_info_perdomain_size); + + rci = &rapl_counter_info_perdomain[domain]; /* * If we have any perf counters to read, read them all now, in bulk @@ -3432,7 +3673,7 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data } } - _Static_assert(NUM_RAPL_COUNTERS == 7); + BUILD_BUG_ON(NUM_RAPL_COUNTERS != 7); write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG); write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES); write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM); @@ -3444,6 +3685,154 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data return 0; } +char *find_sysfs_path_by_id(struct sysfs_path *sp, int id) +{ + while (sp) { + if (sp->id == id) + return (sp->path); + sp = sp->next; + } + if (debug) + warnx("%s: id%d not found", __func__, id); + return NULL; +} + +int get_cstate_counters(unsigned int cpu, struct thread_data 
*t, struct core_data *c, struct pkg_data *p) +{ + /* + * Overcommit memory a little bit here, + * but skip calculating exact sizes for the buffers. + */ + unsigned long long perf_data[NUM_CSTATE_COUNTERS]; + unsigned long long perf_data_core[NUM_CSTATE_COUNTERS + 1]; + unsigned long long perf_data_pkg[NUM_CSTATE_COUNTERS + 1]; + + struct cstate_counter_info_t *cci; + + if (debug) + fprintf(stderr, "%s: cpu%d\n", __func__, cpu); + + assert(ccstate_counter_info); + assert(cpu <= ccstate_counter_info_size); + + memset(perf_data, 0, sizeof(perf_data)); + memset(perf_data_core, 0, sizeof(perf_data_core)); + memset(perf_data_pkg, 0, sizeof(perf_data_pkg)); + + cci = &ccstate_counter_info[cpu]; + + /* + * If we have any perf counters to read, read them all now, in bulk + */ + const size_t num_perf_counters = cstate_counter_info_count_perf(cci); + ssize_t expected_read_size = num_perf_counters * sizeof(unsigned long long); + ssize_t actual_read_size_core = 0, actual_read_size_pkg = 0; + + if (cci->fd_perf_core != -1) { + /* Each descriptor read begins with number of counters read. */ + expected_read_size += sizeof(unsigned long long); + + actual_read_size_core = read(cci->fd_perf_core, &perf_data_core[0], sizeof(perf_data_core)); + + if (actual_read_size_core <= 0) + err(-1, "%s: read perf %s: %ld", __func__, "core", actual_read_size_core); + } + + if (cci->fd_perf_pkg != -1) { + /* Each descriptor read begins with number of counters read. 
*/ + expected_read_size += sizeof(unsigned long long); + + actual_read_size_pkg = read(cci->fd_perf_pkg, &perf_data_pkg[0], sizeof(perf_data_pkg)); + + if (actual_read_size_pkg <= 0) + err(-1, "%s: read perf %s: %ld", __func__, "pkg", actual_read_size_pkg); + } + + const ssize_t actual_read_size_total = actual_read_size_core + actual_read_size_pkg; + + if (actual_read_size_total != expected_read_size) + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, actual_read_size_total); + + /* + * Copy ccstate and pcstate data into unified buffer. + * + * Skip first element from core and pkg buffers. + * Kernel puts there how many counters were read. + */ + const size_t num_core_counters = perf_data_core[0]; + const size_t num_pkg_counters = perf_data_pkg[0]; + + assert(num_perf_counters == num_core_counters + num_pkg_counters); + + /* Copy ccstate perf data */ + memcpy(&perf_data[0], &perf_data_core[1], num_core_counters * sizeof(unsigned long long)); + + /* Copy pcstate perf data */ + memcpy(&perf_data[num_core_counters], &perf_data_pkg[1], num_pkg_counters * sizeof(unsigned long long)); + + for (unsigned int i = 0, pi = 0; i < NUM_CSTATE_COUNTERS; ++i) { + switch (cci->source[i]) { + case CSTATE_SOURCE_NONE: + break; + + case CSTATE_SOURCE_PERF: + assert(pi < ARRAY_SIZE(perf_data)); + assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1); + + if (debug) { + fprintf(stderr, "cstate via %s %u: %llu\n", "perf", i, perf_data[pi]); + } + + cci->data[i] = perf_data[pi]; + + ++pi; + break; + + case CSTATE_SOURCE_MSR: + assert(!no_msr); + if (get_msr(cpu, cci->msr[i], &cci->data[i])) + return -13 - i; + + if (debug) { + fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", "msr", cci->msr[i], i, cci->data[i]); + } + + break; + } + } + + /* + * Helper to write the data only if the source of + * the counter for the current cpu is not none. + * + * Otherwise we would overwrite core data with 0 (default value), + * when invoked for the thread sibling. 
+ */ +#define PERF_COUNTER_WRITE_DATA(out_counter, index) do { \ + if (cci->source[index] != CSTATE_SOURCE_NONE) \ + out_counter = cci->data[index]; \ +} while (0) + + BUILD_BUG_ON(NUM_CSTATE_COUNTERS != 11); + + PERF_COUNTER_WRITE_DATA(t->c1, CCSTATE_RCI_INDEX_C1_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c3, CCSTATE_RCI_INDEX_C3_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c6, CCSTATE_RCI_INDEX_C6_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c7, CCSTATE_RCI_INDEX_C7_RESIDENCY); + + PERF_COUNTER_WRITE_DATA(p->pc2, PCSTATE_RCI_INDEX_C2_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc3, PCSTATE_RCI_INDEX_C3_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc6, PCSTATE_RCI_INDEX_C6_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc7, PCSTATE_RCI_INDEX_C7_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc8, PCSTATE_RCI_INDEX_C8_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc9, PCSTATE_RCI_INDEX_C9_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc10, PCSTATE_RCI_INDEX_C10_RESIDENCY); + +#undef PERF_COUNTER_WRITE_DATA + + return 0; +} + /* * get_counters(...) * migrate to cpu @@ -3499,13 +3888,11 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -5; t->smi_count = msr & 0xFFFFFFFF; } - if (DO_BIC(BIC_CPU_c1) && platform->has_msr_core_c1_res) { - if (get_msr(cpu, MSR_CORE_C1_RES, &t->c1)) - return -6; - } + + get_cstate_counters(cpu, t, c, p); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &t->counter[i])) + if (get_mp(cpu, mp, &t->counter[i], mp->sp->path)) return -10; } @@ -3519,31 +3906,14 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return status; } - if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) { - if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3)) - return -6; - } - - if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !platform->has_msr_knl_core_c6_residency) { |
