summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2023-11-01 09:34:55 -1000
committer Linus Torvalds <torvalds@linux-foundation.org> 2023-11-01 09:34:55 -1000
commit 56ec8e4cd8cbff3c96c53cd8303bba924613b5ce (patch)
tree 72d38b1c2a6d8be1a25eba18b7fd97d12bc7ab81 /drivers
parent 7d461b291e65938f15f56fe58da2303b07578a76 (diff)
parent 14dcf78a6c042dd9421b11485b394c6273568bca (diff)
download linux-56ec8e4cd8cbff3c96c53cd8303bba924613b5ce.tar.gz
linux-56ec8e4cd8cbff3c96c53cd8303bba924613b5ce.tar.bz2
linux-56ec8e4cd8cbff3c96c53cd8303bba924613b5ce.zip
Merge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux
Pull arm64 updates from Catalin Marinas: "No major architecture features this time around, just some new HWCAP definitions, support for the Ampere SoC PMUs and a few fixes/cleanups. The bulk of the changes is reworking of the CPU capability checking code (cpus_have_cap() etc). - Major refactoring of the CPU capability detection logic resulting in the removal of the cpus_have_const_cap() function and migrating the code to "alternative" branches where possible - Backtrace/kgdb: use IPIs and pseudo-NMI - Perf and PMU: - Add support for Ampere SoC PMUs - Multi-DTC improvements for larger CMN configurations with multiple Debug & Trace Controllers - Rework the Arm CoreSight PMU driver to allow separate registration of vendor backend modules - Fixes: add missing MODULE_DEVICE_TABLE to the amlogic perf driver; use device_get_match_data() in the xgene driver; fix NULL pointer dereference in the hisi driver caused by calling cpuhp_state_remove_instance(); use-after-free in the hisi driver - HWCAP updates: - FEAT_SVE_B16B16 (BFloat16) - FEAT_LRCPC3 (release consistency model) - FEAT_LSE128 (128-bit atomic instructions) - SVE: remove a couple of pseudo registers from the cpufeature code. There is logic in place already to detect mismatched SVE features - Miscellaneous: - Reduce the default swiotlb size (currently 64MB) if no ZONE_DMA bouncing is needed. 
The buffer is still required for small kmalloc() buffers - Fix module PLT counting with !RANDOMIZE_BASE - Restrict CPU_BIG_ENDIAN to LLVM IAS 15.x or newer - Move synchronisation code out of the set_ptes() loop - More compact cpufeature displaying enabled cores - Kselftest updates for the new CPU features" * tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux: (83 commits) arm64: Restrict CPU_BIG_ENDIAN to GNU as or LLVM IAS 15.x or newer arm64: module: Fix PLT counting when CONFIG_RANDOMIZE_BASE=n arm64, irqchip/gic-v3, ACPI: Move MADT GICC enabled check into a helper perf: hisi: Fix use-after-free when register pmu fails drivers/perf: hisi_pcie: Initialize event->cpu only on success drivers/perf: hisi_pcie: Check the type first in pmu::event_init() arm64: cpufeature: Change DBM to display enabled cores arm64: cpufeature: Display the set of cores with a feature perf/arm-cmn: Enable per-DTC counter allocation perf/arm-cmn: Rework DTC counters (again) perf/arm-cmn: Fix DTC domain detection drivers: perf: arm_pmuv3: Drop some unused arguments from armv8_pmu_init() drivers: perf: arm_pmuv3: Read PMMIR_EL1 unconditionally drivers/perf: hisi: use cpuhp_state_remove_instance_nocalls() for hisi_hns3_pmu uninit process clocksource/drivers/arm_arch_timer: limit XGene-1 workaround arm64: Remove system_uses_lse_atomics() arm64: Mark the 'addr' argument to set_ptes() and __set_pte_at() as unused drivers/perf: xgene: Use device_get_match_data() perf/amlogic: add missing MODULE_DEVICE_TABLE arm64/mm: Hoist synchronization out of set_ptes() loop ...
Diffstat (limited to 'drivers')
-rw-r--r-- drivers/acpi/processor_core.c 2
-rw-r--r-- drivers/clocksource/arm_arch_timer.c 36
-rw-r--r-- drivers/irqchip/irq-gic-v3.c 82
-rw-r--r-- drivers/perf/amlogic/meson_g12_ddr_pmu.c 1
-rw-r--r-- drivers/perf/arm-cmn.c 154
-rw-r--r-- drivers/perf/arm_cspmu/Kconfig 19
-rw-r--r-- drivers/perf/arm_cspmu/Makefile 8
-rw-r--r-- drivers/perf/arm_cspmu/ampere_cspmu.c 272
-rw-r--r-- drivers/perf/arm_cspmu/arm_cspmu.c 201
-rw-r--r-- drivers/perf/arm_cspmu/arm_cspmu.h 32
-rw-r--r-- drivers/perf/arm_cspmu/nvidia_cspmu.c 34
-rw-r--r-- drivers/perf/arm_cspmu/nvidia_cspmu.h 17
-rw-r--r-- drivers/perf/arm_pmuv3.c 46
-rw-r--r-- drivers/perf/hisilicon/hisi_pcie_pmu.c 9
-rw-r--r-- drivers/perf/hisilicon/hisi_uncore_pa_pmu.c 4
-rw-r--r-- drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c 4
-rw-r--r-- drivers/perf/hisilicon/hns3_pmu.c 8
-rw-r--r-- drivers/perf/xgene_pmu.c 37
18 files changed, 718 insertions, 248 deletions
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index 7dd6dbaa98c3..b203cfe28550 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -90,7 +90,7 @@ static int map_gicc_mpidr(struct acpi_subtable_header *entry,
struct acpi_madt_generic_interrupt *gicc =
container_of(entry, struct acpi_madt_generic_interrupt, header);
- if (!(gicc->flags & ACPI_MADT_ENABLED))
+ if (!acpi_gicc_is_usable(gicc))
return -ENODEV;
/* device_declaration means Device object in DSDT, in the
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 7dd2c615bce2..e054de92de91 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -836,8 +836,9 @@ static u64 __arch_timer_check_delta(void)
* Note that TVAL is signed, thus has only 31 of its
* 32 bits to express magnitude.
*/
- MIDR_ALL_VERSIONS(MIDR_CPU_MODEL(ARM_CPU_IMP_APM,
- APM_CPU_PART_POTENZA)),
+ MIDR_REV_RANGE(MIDR_CPU_MODEL(ARM_CPU_IMP_APM,
+ APM_CPU_PART_XGENE),
+ APM_CPU_VAR_POTENZA, 0x0, 0xf),
{},
};
@@ -917,7 +918,7 @@ static void arch_timer_evtstrm_enable(unsigned int divider)
#ifdef CONFIG_ARM64
/* ECV is likely to require a large divider. Use the EVNTIS flag. */
- if (cpus_have_const_cap(ARM64_HAS_ECV) && divider > 15) {
+ if (cpus_have_final_cap(ARM64_HAS_ECV) && divider > 15) {
cntkctl |= ARCH_TIMER_EVT_INTERVAL_SCALE;
divider -= 8;
}
@@ -955,6 +956,30 @@ static void arch_timer_configure_evtstream(void)
arch_timer_evtstrm_enable(max(0, lsb));
}
+static int arch_timer_evtstrm_starting_cpu(unsigned int cpu)
+{
+ arch_timer_configure_evtstream();
+ return 0;
+}
+
+static int arch_timer_evtstrm_dying_cpu(unsigned int cpu)
+{
+ cpumask_clear_cpu(smp_processor_id(), &evtstrm_available);
+ return 0;
+}
+
+static int __init arch_timer_evtstrm_register(void)
+{
+ if (!arch_timer_evt || !evtstrm_enable)
+ return 0;
+
+ return cpuhp_setup_state(CPUHP_AP_ARM_ARCH_TIMER_EVTSTRM_STARTING,
+ "clockevents/arm/arch_timer_evtstrm:starting",
+ arch_timer_evtstrm_starting_cpu,
+ arch_timer_evtstrm_dying_cpu);
+}
+core_initcall(arch_timer_evtstrm_register);
+
static void arch_counter_set_user_access(void)
{
u32 cntkctl = arch_timer_get_cntkctl();
@@ -1016,8 +1041,6 @@ static int arch_timer_starting_cpu(unsigned int cpu)
}
arch_counter_set_user_access();
- if (evtstrm_enable)
- arch_timer_configure_evtstream();
return 0;
}
@@ -1164,8 +1187,6 @@ static int arch_timer_dying_cpu(unsigned int cpu)
{
struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt);
- cpumask_clear_cpu(smp_processor_id(), &evtstrm_available);
-
arch_timer_stop(clk);
return 0;
}
@@ -1279,6 +1300,7 @@ out_unreg_notify:
out_free:
free_percpu(arch_timer_evt);
+ arch_timer_evt = NULL;
out:
return err;
}
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index f59ac9586b7b..68d11ccee441 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -79,6 +79,13 @@ static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key);
#define GIC_ESPI_NR GICD_TYPER_ESPIS(gic_data.rdists.gicd_typer)
/*
+ * There are 16 SGIs, though we only actually use 8 in Linux. The other 8 SGIs
+ * are potentially stolen by the secure side. Some code, especially code dealing
+ * with hwirq IDs, is simplified by accounting for all 16.
+ */
+#define SGI_NR 16
+
+/*
* The behaviours of RPR and PMR registers differ depending on the value of
* SCR_EL3.FIQ, and the behaviour of non-secure priority registers of the
* distributor and redistributors depends on whether security is enabled in the
@@ -99,7 +106,7 @@ static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key);
* - Figure 4-7 Secure read of the priority field for a Non-secure Group 1
* interrupt.
*/
-static DEFINE_STATIC_KEY_FALSE(supports_pseudo_nmis);
+DEFINE_STATIC_KEY_FALSE(supports_pseudo_nmis);
DEFINE_STATIC_KEY_FALSE(gic_nonsecure_priorities);
EXPORT_SYMBOL(gic_nonsecure_priorities);
@@ -125,8 +132,8 @@ EXPORT_SYMBOL(gic_nonsecure_priorities);
__priority; \
})
-/* ppi_nmi_refs[n] == number of cpus having ppi[n + 16] set as NMI */
-static refcount_t *ppi_nmi_refs;
+/* rdist_nmi_refs[n] == number of cpus having the rdist interrupt n set as NMI */
+static refcount_t *rdist_nmi_refs;
static struct gic_kvm_info gic_v3_kvm_info __initdata;
static DEFINE_PER_CPU(bool, has_rss);
@@ -270,17 +277,6 @@ static void gic_redist_wait_for_rwp(void)
gic_do_wait_for_rwp(gic_data_rdist_rd_base(), GICR_CTLR_RWP);
}
-#ifdef CONFIG_ARM64
-
-static u64 __maybe_unused gic_read_iar(void)
-{
- if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_23154))
- return gic_read_iar_cavium_thunderx();
- else
- return gic_read_iar_common();
-}
-#endif
-
static void gic_enable_redist(bool enable)
{
void __iomem *rbase;
@@ -519,9 +515,22 @@ static u32 __gic_get_ppi_index(irq_hw_number_t hwirq)
}
}
-static u32 gic_get_ppi_index(struct irq_data *d)
+static u32 __gic_get_rdist_index(irq_hw_number_t hwirq)
+{
+ switch (__get_intid_range(hwirq)) {
+ case SGI_RANGE:
+ case PPI_RANGE:
+ return hwirq;
+ case EPPI_RANGE:
+ return hwirq - EPPI_BASE_INTID + 32;
+ default:
+ unreachable();
+ }
+}
+
+static u32 gic_get_rdist_index(struct irq_data *d)
{
- return __gic_get_ppi_index(d->hwirq);
+ return __gic_get_rdist_index(d->hwirq);
}
static int gic_irq_nmi_setup(struct irq_data *d)
@@ -545,11 +554,14 @@ static int gic_irq_nmi_setup(struct irq_data *d)
/* desc lock should already be held */
if (gic_irq_in_rdist(d)) {
- u32 idx = gic_get_ppi_index(d);
+ u32 idx = gic_get_rdist_index(d);
- /* Setting up PPI as NMI, only switch handler for first NMI */
- if (!refcount_inc_not_zero(&ppi_nmi_refs[idx])) {
- refcount_set(&ppi_nmi_refs[idx], 1);
+ /*
+ * Setting up a percpu interrupt as NMI, only switch handler
+ * for first NMI
+ */
+ if (!refcount_inc_not_zero(&rdist_nmi_refs[idx])) {
+ refcount_set(&rdist_nmi_refs[idx], 1);
desc->handle_irq = handle_percpu_devid_fasteoi_nmi;
}
} else {
@@ -582,10 +594,10 @@ static void gic_irq_nmi_teardown(struct irq_data *d)
/* desc lock should already be held */
if (gic_irq_in_rdist(d)) {
- u32 idx = gic_get_ppi_index(d);
+ u32 idx = gic_get_rdist_index(d);
/* Tearing down NMI, only switch handler for last NMI */
- if (refcount_dec_and_test(&ppi_nmi_refs[idx]))
+ if (refcount_dec_and_test(&rdist_nmi_refs[idx]))
desc->handle_irq = handle_percpu_devid_irq;
} else {
desc->handle_irq = handle_fasteoi_irq;
@@ -1279,10 +1291,10 @@ static void gic_cpu_init(void)
rbase = gic_data_rdist_sgi_base();
/* Configure SGIs/PPIs as non-secure Group-1 */
- for (i = 0; i < gic_data.ppi_nr + 16; i += 32)
+ for (i = 0; i < gic_data.ppi_nr + SGI_NR; i += 32)
writel_relaxed(~0, rbase + GICR_IGROUPR0 + i / 8);
- gic_cpu_config(rbase, gic_data.ppi_nr + 16, gic_redist_wait_for_rwp);
+ gic_cpu_config(rbase, gic_data.ppi_nr + SGI_NR, gic_redist_wait_for_rwp);
/* initialise system registers */
gic_cpu_sys_reg_init();
@@ -1952,12 +1964,13 @@ static void gic_enable_nmi_support(void)
return;
}
- ppi_nmi_refs = kcalloc(gic_data.ppi_nr, sizeof(*ppi_nmi_refs), GFP_KERNEL);
- if (!ppi_nmi_refs)
+ rdist_nmi_refs = kcalloc(gic_data.ppi_nr + SGI_NR,
+ sizeof(*rdist_nmi_refs), GFP_KERNEL);
+ if (!rdist_nmi_refs)
return;
- for (i = 0; i < gic_data.ppi_nr; i++)
- refcount_set(&ppi_nmi_refs[i], 0);
+ for (i = 0; i < gic_data.ppi_nr + SGI_NR; i++)
+ refcount_set(&rdist_nmi_refs[i], 0);
pr_info("Pseudo-NMIs enabled using %s ICC_PMR_EL1 synchronisation\n",
gic_has_relaxed_pmr_sync() ? "relaxed" : "forced");
@@ -2074,6 +2087,7 @@ static int __init gic_init_bases(phys_addr_t dist_phys_base,
gic_dist_init();
gic_cpu_init();
+ gic_enable_nmi_support();
gic_smp_init();
gic_cpu_pm_init();
@@ -2086,8 +2100,6 @@ static int __init gic_init_bases(phys_addr_t dist_phys_base,
gicv2m_init(handle, gic_data.domain);
}
- gic_enable_nmi_support();
-
return 0;
out_free:
@@ -2380,8 +2392,7 @@ gic_acpi_parse_madt_gicc(union acpi_subtable_headers *header,
u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2;
void __iomem *redist_base;
- /* GICC entry which has !ACPI_MADT_ENABLED is not unusable so skip */
- if (!(gicc->flags & ACPI_MADT_ENABLED))
+ if (!acpi_gicc_is_usable(gicc))
return 0;
redist_base = ioremap(gicc->gicr_base_address, size);
@@ -2431,7 +2442,7 @@ static int __init gic_acpi_match_gicc(union acpi_subtable_headers *header,
* If GICC is enabled and has valid gicr base address, then it means
* GICR base is presented via GICC
*/
- if ((gicc->flags & ACPI_MADT_ENABLED) && gicc->gicr_base_address) {
+ if (acpi_gicc_is_usable(gicc) && gicc->gicr_base_address) {
acpi_data.enabled_rdists++;
return 0;
}
@@ -2440,7 +2451,7 @@ static int __init gic_acpi_match_gicc(union acpi_subtable_headers *header,
* It's perfectly valid firmware can pass disabled GICC entry, driver
* should not treat as errors, skip the entry instead of probe fail.
*/
- if (!(gicc->flags & ACPI_MADT_ENABLED))
+ if (!acpi_gicc_is_usable(gicc))
return 0;
return -ENODEV;
@@ -2499,8 +2510,7 @@ static int __init gic_acpi_parse_virt_madt_gicc(union acpi_subtable_headers *hea
int maint_irq_mode;
static int first_madt = true;
- /* Skip unusable CPUs */
- if (!(gicc->flags & ACPI_MADT_ENABLED))
+ if (!acpi_gicc_is_usable(gicc))
return 0;
maint_irq_mode = (gicc->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
diff --git a/drivers/perf/amlogic/meson_g12_ddr_pmu.c b/drivers/perf/amlogic/meson_g12_ddr_pmu.c
index 8b643888d503..15d52ab3276a 100644
--- a/drivers/perf/amlogic/meson_g12_ddr_pmu.c
+++ b/drivers/perf/amlogic/meson_g12_ddr_pmu.c
@@ -377,6 +377,7 @@ static const struct of_device_id meson_ddr_pmu_dt_match[] = {
},
{}
};
+MODULE_DEVICE_TABLE(of, meson_ddr_pmu_dt_match);
static struct platform_driver g12_ddr_pmu_driver = {
.probe = g12_ddr_pmu_probe,
diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 6b50bc551984..014010d03588 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -112,7 +112,9 @@
#define CMN_DTM_PMEVCNTSR 0x240
-#define CMN_DTM_UNIT_INFO 0x0910
+#define CMN650_DTM_UNIT_INFO 0x0910
+#define CMN_DTM_UNIT_INFO 0x0960
+#define CMN_DTM_UNIT_INFO_DTC_DOMAIN GENMASK_ULL(1, 0)
#define CMN_DTM_NUM_COUNTERS 4
/* Want more local counters? Why not replicate the whole DTM! Ugh... */
@@ -279,16 +281,13 @@ struct arm_cmn_node {
u16 id, logid;
enum cmn_node_type type;
- int dtm;
- union {
- /* DN/HN-F/CXHA */
- struct {
- u8 val : 4;
- u8 count : 4;
- } occupid[SEL_MAX];
- /* XP */
- u8 dtc;
- };
+ u8 dtm;
+ s8 dtc;
+ /* DN/HN-F/CXHA */
+ struct {
+ u8 val : 4;
+ u8 count : 4;
+ } occupid[SEL_MAX];
union {
u8 event[4];
__le32 event_sel;
@@ -538,12 +537,12 @@ static int arm_cmn_map_show(struct seq_file *s, void *data)
seq_puts(s, "\n |");
for (x = 0; x < cmn->mesh_x; x++) {
- u8 dtc = cmn->xps[xp_base + x].dtc;
+ s8 dtc = cmn->xps[xp_base + x].dtc;
- if (dtc & (dtc - 1))
+ if (dtc < 0)
seq_puts(s, " DTC ?? |");
else
- seq_printf(s, " DTC %ld |", __ffs(dtc));
+ seq_printf(s, " DTC %d |", dtc);
}
seq_puts(s, "\n |");
for (x = 0; x < cmn->mesh_x; x++)
@@ -587,8 +586,7 @@ static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) {}
struct arm_cmn_hw_event {
struct arm_cmn_node *dn;
u64 dtm_idx[4];
- unsigned int dtc_idx;
- u8 dtcs_used;
+ s8 dtc_idx[CMN_MAX_DTCS];
u8 num_dns;
u8 dtm_offset;
bool wide_sel;
@@ -598,6 +596,10 @@ struct arm_cmn_hw_event {
#define for_each_hw_dn(hw, dn, i) \
for (i = 0, dn = hw->dn; i < hw->num_dns; i++, dn++)
+/* @i is the DTC number, @idx is the counter index on that DTC */
+#define for_each_hw_dtc_idx(hw, i, idx) \
+ for (int i = 0, idx; i < CMN_MAX_DTCS; i++) if ((idx = hw->dtc_idx[i]) >= 0)
+
static struct arm_cmn_hw_event *to_cmn_hw(struct perf_event *event)
{
BUILD_BUG_ON(sizeof(struct arm_cmn_hw_event) > offsetof(struct hw_perf_event, target));
@@ -1427,12 +1429,11 @@ static void arm_cmn_init_counter(struct perf_event *event)
{
struct arm_cmn *cmn = to_cmn(event->pmu);
struct arm_cmn_hw_event *hw = to_cmn_hw(event);
- unsigned int i, pmevcnt = CMN_DT_PMEVCNT(hw->dtc_idx);
u64 count;
- for (i = 0; hw->dtcs_used & (1U << i); i++) {
- writel_relaxed(CMN_COUNTER_INIT, cmn->dtc[i].base + pmevcnt);
- cmn->dtc[i].counters[hw->dtc_idx] = event;
+ for_each_hw_dtc_idx(hw, i, idx) {
+ writel_relaxed(CMN_COUNTER_INIT, cmn->dtc[i].base + CMN_DT_PMEVCNT(idx));
+ cmn->dtc[i].counters[idx] = event;
}
count = arm_cmn_read_dtm(cmn, hw, false);
@@ -1445,11 +1446,9 @@ static void arm_cmn_event_read(struct perf_event *event)
struct arm_cmn_hw_event *hw = to_cmn_hw(event);
u64 delta, new, prev;
unsigned long flags;
- unsigned int i;
- if (hw->dtc_idx == CMN_DT_NUM_COUNTERS) {
- i = __ffs(hw->dtcs_used);
- delta = arm_cmn_read_cc(cmn->dtc + i);
+ if (CMN_EVENT_TYPE(event) == CMN_TYPE_DTC) {
+ delta = arm_cmn_read_cc(cmn->dtc + hw->dtc_idx[0]);
local64_add(delta, &event->count);
return;
}
@@ -1459,8 +1458,8 @@ static void arm_cmn_event_read(struct perf_event *event)
delta = new - prev;
local_irq_save(flags);
- for (i = 0; hw->dtcs_used & (1U << i); i++) {
- new = arm_cmn_read_counter(cmn->dtc + i, hw->dtc_idx);
+ for_each_hw_dtc_idx(hw, i, idx) {
+ new = arm_cmn_read_counter(cmn->dtc + i, idx);
delta += new << 16;
}
local_irq_restore(flags);
@@ -1516,7 +1515,7 @@ static void arm_cmn_event_start(struct perf_event *event, int flags)
int i;
if (type == CMN_TYPE_DTC) {
- i = __ffs(hw->dtcs_used);
+ i = hw->dtc_idx[0];
writeq_relaxed(CMN_CC_INIT, cmn->dtc[i].base + CMN_DT_PMCCNTR);
cmn->dtc[i].cc_active = true;
} else if (type == CMN_TYPE_WP) {
@@ -1547,7 +1546,7 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags)
int i;
if (type == CMN_TYPE_DTC) {
- i = __ffs(hw->dtcs_used);
+ i = hw->dtc_idx[0];
cmn->dtc[i].cc_active = false;
} else if (type == CMN_TYPE_WP) {
int wp_idx = arm_cmn_wp_idx(event);
@@ -1571,7 +1570,7 @@ struct arm_cmn_val {
u8 dtm_count[CMN_MAX_DTMS];
u8 occupid[CMN_MAX_DTMS][SEL_MAX];
u8 wp[CMN_MAX_DTMS][4];
- int dtc_count;
+ int dtc_count[CMN_MAX_DTCS];
bool cycles;
};
@@ -1592,7 +1591,8 @@ static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
return;
}
- val->dtc_count++;
+ for_each_hw_dtc_idx(hw, dtc, idx)
+ val->dtc_count[dtc]++;
for_each_hw_dn(hw, dn, i) {
int wp_idx, dtm = dn->dtm, sel = hw->filter_sel;
@@ -1639,8 +1639,9 @@ static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event)
goto done;
}
- if (val->dtc_count == CMN_DT_NUM_COUNTERS)
- goto done;
+ for (i = 0; i < CMN_MAX_DTCS; i++)
+ if (val->dtc_count[i] == CMN_DT_NUM_COUNTERS)
+ goto done;
for_each_hw_dn(hw, dn, i) {
int wp_idx, wp_cmb, dtm = dn->dtm, sel = hw->filter_sel;
@@ -1733,12 +1734,19 @@ static int arm_cmn_event_init(struct perf_event *event)
hw->dn = arm_cmn_node(cmn, type);
if (!hw->dn)
return -EINVAL;
+
+ memset(hw->dtc_idx, -1, sizeof(hw->dtc_idx));
for (dn = hw->dn; dn->type == type; dn++) {
if (bynodeid && dn->id != nodeid) {
hw->dn++;
continue;
}
hw->num_dns++;
+ if (dn->dtc < 0)
+ memset(hw->dtc_idx, 0, cmn->num_dtcs);
+ else
+ hw->dtc_idx[dn->dtc] = 0;
+
if (bynodeid)
break;
}
@@ -1750,12 +1758,6 @@ static int arm_cmn_event_init(struct perf_event *event)
nodeid, nid.x, nid.y, nid.port, nid.dev, type);
return -EINVAL;
}
- /*
- * Keep assuming non-cycles events count in all DTC domains; turns out
- * it's hard to make a worthwhile optimisation around this, short of
- * going all-in with domain-local counter allocation as well.
- */
- hw->dtcs_used = (1U << cmn->num_dtcs) - 1;
return arm_cmn_validate_group(cmn, event);
}
@@ -1781,46 +1783,48 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
}
memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx));
- for (i = 0; hw->dtcs_used & (1U << i); i++)
- cmn->dtc[i].counters[hw->dtc_idx] = NULL;
+ for_each_hw_dtc_idx(hw, j, idx)
+ cmn->dtc[j].counters[idx] = NULL;
}
static int arm_cmn_event_add(struct perf_event *event, int flags)
{
struct arm_cmn *cmn = to_cmn(event->pmu);
struct arm_cmn_hw_event *hw = to_cmn_hw(event);
- struct arm_cmn_dtc *dtc = &cmn->dtc[0];
struct arm_cmn_node *dn;
enum cmn_node_type type = CMN_EVENT_TYPE(event);
- unsigned int i, dtc_idx, input_sel;
+ unsigned int input_sel, i = 0;
if (type == CMN_TYPE_DTC) {
- i = 0;
while (cmn->dtc[i].cycles)
if (++i == cmn->num_dtcs)
return -ENOSPC;
cmn->dtc[i].cycles = event;
- hw->dtc_idx = CMN_DT_NUM_COUNTERS;
- hw->dtcs_used = 1U << i;
+ hw->dtc_idx[0] = i;
if (flags & PERF_EF_START)
arm_cmn_event_start(event, 0);
return 0;
}
- /* Grab a free global counter first... */
- dtc_idx = 0;
- while (dtc->counters[dtc_idx])
- if (++dtc_idx == CMN_DT_NUM_COUNTERS)
- return -ENOSPC;
-
- hw->dtc_idx = dtc_idx;
+ /* Grab the global counters first... */
+ for_each_hw_dtc_idx(hw, j, idx) {
+ if (cmn->part == PART_CMN600 && j > 0) {
+ idx = hw->dtc_idx[0];
+ } else {
+ idx = 0;
+ while (cmn->dtc[j].counters[idx])
+ if (++idx == CMN_DT_NUM_COUNTERS)
+ goto free_dtms;
+ }
+ hw->dtc_idx[j] = idx;
+ }
- /* ...then the local counters to feed it. */
+ /* ...then the local counters to feed them */
for_each_hw_dn(hw, dn, i) {
struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset;
- unsigned int dtm_idx, shift;
+ unsigned int dtm_idx, shift, d = max_t(int, dn->dtc, 0);
u64 reg;
dtm_idx = 0;
@@ -1839,11 +1843,11 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
tmp = dtm->wp_event[wp_idx ^ 1];
if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) !=
- CMN_EVENT_WP_COMBINE(dtc->counters[tmp]))
+ CMN_EVENT_WP_COMBINE(cmn->dtc[d].counters[tmp]))
goto free_dtms;
input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx;
- dtm->wp_event[wp_idx] = dtc_idx;
+ dtm->wp_event[wp_idx] = hw->dtc_idx[d];
writel_relaxed(cfg, dtm->base + CMN_DTM_WPn_CONFIG(wp_idx));
} else {
struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);
@@ -1863,7 +1867,7 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
dtm->input_sel[dtm_idx] = input_sel;
shift = CMN__PMEVCNTn_GLOBAL_NUM_SHIFT(dtm_idx);
dtm->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift);
- dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift;
+ dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, hw->dtc_idx[d]) << shift;
dtm->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx);
reg = (u64)le32_to_cpu(dtm->pmu_config_high) << 32 | dtm->pmu_config_low;
writeq_relaxed(reg, dtm->base + CMN_DTM_PMU_CONFIG);
@@ -1891,7 +1895,7 @@ static void arm_cmn_event_del(struct perf_event *event, int flags)
arm_cmn_event_stop(event, PERF_EF_UPDATE);
if (type == CMN_TYPE_DTC)
- cmn->dtc[__ffs(hw->dtcs_used)].cycles = NULL;
+ cmn->dtc[hw->dtc_idx[0]].cycles = NULL;
else
arm_cmn_event_clear(cmn, event, hw->num_dns);
}
@@ -2072,7 +2076,6 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
{
struct arm_cmn_node *dn, *xp;
int dtc_idx = 0;
- u8 dtcs_present = (1 << cmn->num_dtcs) - 1;
cmn->dtc = devm_kcalloc(cmn->dev, cmn->num_dtcs, sizeof(cmn->dtc[0]), GFP_KERNEL);
if (!cmn->dtc)
@@ -2082,23 +2085,26 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
cmn->xps = arm_cmn_node(cmn, CMN_TYPE_XP);
+ if (cmn->part == PART_CMN600 && cmn->num_dtcs > 1) {
+ /* We do at least know that a DTC's XP must be in that DTC's domain */
+ dn = arm_cmn_node(cmn, CMN_TYPE_DTC);
+ for (int i = 0; i < cmn->num_dtcs; i++)
+ arm_cmn_node_to_xp(cmn, dn + i)->dtc = i;
+ }
+
for (dn = cmn->dns; dn->type; dn++) {
- if (dn->type == CMN_TYPE_XP) {
- dn->dtc &= dtcs_present;
+ if (dn->type == CMN_TYPE_XP)
continue;
- }
xp = arm_cmn_node_to_xp(cmn, dn);
+ dn->dtc = xp->dtc;
dn->dtm = xp->dtm;
if (cmn->multi_dtm)
dn->dtm += arm_cmn_nid(cmn, dn->id).port / 2;
if (dn->type == CMN_TYPE_DTC) {
- int err;
- /* We do at least know that a DTC's XP must be in that DTC's domain */
- if (xp->dtc == 0xf)
- xp->dtc = 1 << dtc_idx;
- err = arm_cmn_init_dtc(cmn, dn, dtc_idx++);
+ int err = arm_cmn_init_dtc(cmn, dn, dtc_idx++);
+
if (err)
return err;
}
@@ -2117,6 +2123,16 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
return 0;
}
+static unsigned int arm_cmn_dtc_domain(struct arm_cmn *cmn, void __iomem *xp_region)
+{
+ int offset = CMN_DTM_UNIT_INFO;
+
+ if (cmn->part == PART_CMN650 || cmn->part == PART_CI700)
+ offset = CMN650_DTM_UNIT_INFO;
+
+ return FIELD_GET(CMN_DTM_UNIT_INFO_DTC_DOMAIN, readl_relaxed(xp_region + offset));
+}
+
static void arm_cmn_init_node_info(struct arm_cmn *cmn, u32 offset, struct arm_cmn_node *node)
{
int level;
@@ -2246,9 +2262,9 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
cmn->mesh_x = xp->logid;
if (cmn->part == PART_CMN600)
- xp->dtc = 0xf;
+ xp->dtc = -1;
else
- xp->dtc = 1 << readl_relaxed(xp_region + CMN_DTM_UNIT_INFO);
+ xp->dtc = arm_cmn_dtc_domain(cmn, xp_region);
xp->dtm = dtm - cmn->dtms;
arm_cmn_init_dtm(dtm++, xp, 0);
diff --git a/drivers/perf/arm_cspmu/Kconfig b/drivers/perf/arm_cspmu/Kconfig
index 25d25ded0983..6f4e28fc84a2 100644
--- a/drivers/perf/arm_cspmu/Kconfig
+++ b/drivers/perf/arm_cspmu/Kconfig
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
#
-# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
config ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU
tristate "ARM Coresight Architecture PMU"
@@ -10,3 +10,20 @@ config ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU
based on ARM CoreSight PMU architecture. Note that this PMU
architecture does not have relationship with the ARM CoreSight
Self-Hosted Tracing.
+
+config NVIDIA_CORESIGHT_PMU_ARCH_SYSTEM_PMU
+ tristate "NVIDIA Coresight Architecture PMU"
+ depends on ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU
+ help
+ Provides NVIDIA specific attributes for performance monitoring unit
+ (PMU) devices based on ARM CoreSight PMU architecture.
+
+config AMPERE_CORESIGHT_PMU_ARCH_SYSTEM_PMU
+ tristate "Ampere Coresight Architecture PMU"
+ depends on ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU
+ help
+ Provides Ampere specific attributes for performance monitoring unit
+ (PMU) devices based on ARM CoreSight PMU architecture.
+
+ In the first phase, the driver enables support on MCU PMU used in
+ AmpereOne SoC family.
diff --git a/drivers/perf/arm_cspmu/Makefile b/drivers/perf/arm_cspmu/Makefile
index fedb17df982d..220a734efd54 100644
--- a/drivers/perf/arm_cspmu/Makefile
+++ b/drivers/perf/arm_cspmu/Makefile
@@ -1,6 +1,10 @@
-# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu_module.o
-arm_cspmu_module-y := arm_cspmu.o nvidia_cspmu.o
+
+arm_cspmu_module-y := arm_cspmu.o
+
+obj-$(CONFIG_NVIDIA_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += nvidia_cspmu.o
+obj-$(CONFIG_AMPERE_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += ampere_cspmu.o
diff --git a/drivers/perf/arm_cspmu/ampere_cspmu.c b/drivers/perf/arm_cspmu/ampere_cspmu.c
new file mode 100644
index 000000000000..f146a455e838
--- /dev/null
+++ b/drivers/perf/arm_cspmu/ampere_cspmu.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ampere SoC PMU (Performance Monitor Unit)
+ *
+ * Copyright (c) 2023, Ampere Computing LLC
+ */
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+
+#include "arm_cspmu.h"
+
+#define PMAUXR0 0xD80
+#define PMAUXR1 0xD84
+#define PMAUXR2 0xD88
+#define PMAUXR3 0xD8C
+
+#define to_ampere_cspmu_ctx(cspmu) ((struct ampere_cspmu_ctx *)(cspmu->impl.ctx))
+
+struct ampere_cspmu_ctx {
+ const char *name;
+ struct attribute **event_attr;
+ struct attribute **format_attr;
+};
+
+static DEFINE_IDA(mcu_pmu_ida);
+
+#define SOC_PMU_EVENT_ATTR_EXTRACTOR(_name, _config, _start, _end) \
+ static inline u32 get_##_name(const struct perf_event *event) \
+ { \
+ return FIELD_GET(GENMASK_ULL(_end, _start), \
+ event->attr._config); \
+ } \
+
+SOC_PMU_EVENT_ATTR_EXTRACTOR(event, config, 0, 8);
+SOC_PMU_EVENT_ATTR_EXTRACTOR(threshold, config1, 0, 7);
+SOC_PMU_EVENT_ATTR_EXTRACTOR(rank, config1, 8, 23);
+SOC_PMU_EVENT_ATTR_EXTRACTOR(bank, config1, 24, 55);
+
+static struct attribute *ampereone_mcu_pmu_event_attrs[] = {
+ ARM_CSPMU_EVENT_ATTR(cycle_count, 0x00),
+ ARM_CSPMU_EVENT_ATTR(act_sent, 0x01),
+ ARM_CSPMU_EVENT_ATTR(pre_sent, 0x02),
+ ARM_CSPMU_EVENT_ATTR(rd_sent, 0x03),
+ ARM_CSPMU_EVENT_ATTR(rda_sent, 0x04),
+ ARM_CSPMU_EVENT_ATTR(wr_sent, 0x05),
+ ARM_CSPMU_EVENT_ATTR(wra_sent, 0x06),
+ ARM_CSPMU_EVENT_ATTR(pd_entry_vld, 0x07),
+ ARM_CSPMU_EVENT_ATTR(sref_entry_vld, 0x08),
+ ARM_CSPMU_EVENT_ATTR(prea_sent, 0x09),
+ ARM_CSPMU_EVENT_ATTR(pre_sb_sent, 0x0a),
+ ARM_CSPMU_EVENT_ATTR(ref_sent, 0x0b),
+ ARM_CSPMU_EVENT_ATTR(rfm_sent, 0x0c),
+ ARM_CSPMU_EVENT_ATTR(ref_sb_sent, 0x0d),
+ ARM_CSPMU_EVENT_ATTR(rfm_sb_sent, 0x0e),
+ ARM_CSPMU_EVENT_ATTR(rd_rda_sent, 0x0f),
+ ARM_CSPMU_EVENT_ATTR(wr_wra_sent, 0x10),
+ ARM_CSPMU_EVENT_ATTR(raw_hazard, 0x11),
+ ARM_CSPMU_EVENT_ATTR(war_hazard, 0x12),
+ ARM_CSPMU_EVENT_ATTR(waw_hazard, 0x13),
+ ARM_CSPMU_EVENT_ATTR(rar_hazard, 0x14),
+ ARM_CSPMU_EVENT_ATTR(raw_war_waw_hazard, 0x15),
+ ARM_CSPMU_EVENT_ATTR(hprd_lprd_wr_req_vld, 0x16),
+ ARM_CSPMU_EVENT_ATTR(lprd_req_vld, 0x17),
+ ARM_CSPMU_EVENT_ATTR(hprd_req_vld, 0x18),
+ ARM_CSPMU_EVENT_ATTR(hprd_lprd_req_vld, 0x19),
+ ARM_CSPMU_EVENT_ATTR(prefetch_tgt, 0x1a),
+ ARM_CSPMU_EVENT_ATTR(wr_req_vld, 0x1b),
+ ARM_CSPMU_EVENT_ATTR(partial_wr_req_vld, 0x1c),
+ ARM_CSPMU_EVENT_ATTR(rd_retry, 0x1d),
+ ARM_CSPMU_EVENT_ATTR(wr_retry, 0x1e),
+ ARM_CSPMU_EVENT_ATTR(retry_gnt, 0x1f),
+ ARM_CSPMU_EVENT_ATTR(rank_change, 0x20),
+ ARM_CSPMU_EVENT_ATTR(dir_cha