summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'drivers')
-rw-r--r--drivers/acpi/cppc_acpi.c99
-rw-r--r--drivers/acpi/sleep.c26
-rw-r--r--drivers/base/arch_topology.c42
-rw-r--r--drivers/base/core.c3
-rw-r--r--drivers/base/power/runtime.c98
-rw-r--r--drivers/cpufreq/Kconfig.x8617
-rw-r--r--drivers/cpufreq/Makefile5
-rw-r--r--drivers/cpufreq/amd-pstate-trace.c2
-rw-r--r--drivers/cpufreq/amd-pstate-trace.h77
-rw-r--r--drivers/cpufreq/amd-pstate.c645
-rw-r--r--drivers/cpufreq/cpufreq.c9
-rw-r--r--drivers/cpufreq/cpufreq_conservative.c5
-rw-r--r--drivers/cpufreq/cpufreq_ondemand.c5
-rw-r--r--drivers/cpufreq/intel_pstate.c121
-rw-r--r--drivers/cpufreq/mediatek-cpufreq-hw.c33
-rw-r--r--drivers/cpufreq/qcom-cpufreq-hw.c39
-rw-r--r--drivers/cpuidle/governors/menu.c2
-rw-r--r--drivers/cpuidle/sysfs.c8
-rw-r--r--drivers/devfreq/Kconfig9
-rw-r--r--drivers/devfreq/Makefile1
-rw-r--r--drivers/devfreq/devfreq.c4
-rw-r--r--drivers/devfreq/sun8i-a33-mbus.c511
-rw-r--r--drivers/mmc/host/jz4740_mmc.c8
-rw-r--r--drivers/mmc/host/mxcmmc.c6
-rw-r--r--drivers/net/ethernet/realtek/r8169_main.c4
-rw-r--r--drivers/powercap/dtpm.c6
-rw-r--r--drivers/powercap/idle_inject.c2
-rw-r--r--drivers/powercap/intel_rapl_common.c61
-rw-r--r--drivers/thermal/cpufreq_cooling.c6
29 files changed, 1707 insertions, 147 deletions
diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index e9ac5ea1dfc9..a9d2de403c0c 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -118,6 +118,8 @@ static DEFINE_PER_CPU(struct cpc_desc *, cpc_desc_ptr);
*/
#define NUM_RETRIES 500ULL
+#define OVER_16BTS_MASK ~0xFFFFULL
+
#define define_one_cppc_ro(_name) \
static struct kobj_attribute _name = \
__ATTR(_name, 0444, show_##_name, NULL)
@@ -412,7 +414,7 @@ bool acpi_cpc_valid(void)
struct cpc_desc *cpc_ptr;
int cpu;
- for_each_possible_cpu(cpu) {
+ for_each_present_cpu(cpu) {
cpc_ptr = per_cpu(cpc_desc_ptr, cpu);
if (!cpc_ptr)
return false;
@@ -730,9 +732,26 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr)
goto out_free;
cpc_ptr->cpc_regs[i-2].sys_mem_vaddr = addr;
}
+ } else if (gas_t->space_id == ACPI_ADR_SPACE_SYSTEM_IO) {
+ if (gas_t->access_width < 1 || gas_t->access_width > 3) {
+ /*
+ * 1 = 8-bit, 2 = 16-bit, and 3 = 32-bit.
+ * SystemIO doesn't implement 64-bit
+ * registers.
+ */
+ pr_debug("Invalid access width %d for SystemIO register\n",
+ gas_t->access_width);
+ goto out_free;
+ }
+ if (gas_t->address & OVER_16BTS_MASK) {
+ /* SystemIO registers use 16-bit integer addresses */
+ pr_debug("Invalid IO port %llu for SystemIO register\n",
+ gas_t->address);
+ goto out_free;
+ }
} else {
if (gas_t->space_id != ACPI_ADR_SPACE_FIXED_HARDWARE || !cpc_ffh_supported()) {
- /* Support only PCC ,SYS MEM and FFH type regs */
+ /* Support only PCC, SystemMemory, SystemIO, and FFH type regs. */
pr_debug("Unsupported register type: %d\n", gas_t->space_id);
goto out_free;
}
@@ -907,7 +926,21 @@ static int cpc_read(int cpu, struct cpc_register_resource *reg_res, u64 *val)
}
*val = 0;
- if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM && pcc_ss_id >= 0)
+
+ if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) {
+ u32 width = 8 << (reg->access_width - 1);
+ acpi_status status;
+
+ status = acpi_os_read_port((acpi_io_address)reg->address,
+ (u32 *)val, width);
+ if (ACPI_FAILURE(status)) {
+ pr_debug("Error: Failed to read SystemIO port %llx\n",
+ reg->address);
+ return -EFAULT;
+ }
+
+ return 0;
+ } else if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM && pcc_ss_id >= 0)
vaddr = GET_PCC_VADDR(reg->address, pcc_ss_id);
else if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY)
vaddr = reg_res->sys_mem_vaddr;
@@ -946,7 +979,20 @@ static int cpc_write(int cpu, struct cpc_register_resource *reg_res, u64 val)
int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu);
struct cpc_reg *reg = &reg_res->cpc_entry.reg;
- if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM && pcc_ss_id >= 0)
+ if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) {
+ u32 width = 8 << (reg->access_width - 1);
+ acpi_status status;
+
+ status = acpi_os_write_port((acpi_io_address)reg->address,
+ (u32)val, width);
+ if (ACPI_FAILURE(status)) {
+ pr_debug("Error: Failed to write SystemIO port %llx\n",
+ reg->address);
+ return -EFAULT;
+ }
+
+ return 0;
+ } else if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM && pcc_ss_id >= 0)
vaddr = GET_PCC_VADDR(reg->address, pcc_ss_id);
else if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY)
vaddr = reg_res->sys_mem_vaddr;
@@ -1214,6 +1260,51 @@ out_err:
EXPORT_SYMBOL_GPL(cppc_get_perf_ctrs);
/**
+ * cppc_set_enable - Set to enable CPPC on the processor by writing the
+ * Continuous Performance Control package EnableRegister field.
+ * @cpu: CPU for which to enable CPPC register.
+ * @enable: 0 - disable, 1 - enable CPPC feature on the processor.
+ *
+ * Return: 0 for success, -ERRNO or -EIO otherwise.
+ */
+int cppc_set_enable(int cpu, bool enable)
+{
+ int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu);
+ struct cpc_register_resource *enable_reg;
+ struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu);
+ struct cppc_pcc_data *pcc_ss_data = NULL;
+ int ret = -EINVAL;
+
+ if (!cpc_desc) {
+ pr_debug("No CPC descriptor for CPU:%d\n", cpu);
+ return -EINVAL;
+ }
+
+ enable_reg = &cpc_desc->cpc_regs[ENABLE];
+
+ if (CPC_IN_PCC(enable_reg)) {
+
+ if (pcc_ss_id < 0)
+ return -EIO;
+
+ ret = cpc_write(cpu, enable_reg, enable);
+ if (ret)
+ return ret;
+
+ pcc_ss_data = pcc_data[pcc_ss_id];
+
+ down_write(&pcc_ss_data->pcc_lock);
+ /* after writing CPC, transfer the ownership of PCC to platfrom */
+ ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE);
+ up_write(&pcc_ss_data->pcc_lock);
+ return ret;
+ }
+
+ return cpc_write(cpu, enable_reg, enable);
+}
+EXPORT_SYMBOL_GPL(cppc_set_enable);
+
+/**
* cppc_set_perf - Set a CPU's performance controls.
* @cpu: CPU for which to set performance controls.
* @perf_ctrls: ptr to cppc_perf_ctrls. See cppc_acpi.h
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 4b8454f26ca1..a60ff5dfed3a 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -874,11 +874,11 @@ static inline void acpi_sleep_syscore_init(void) {}
#ifdef CONFIG_HIBERNATION
static unsigned long s4_hardware_signature;
static struct acpi_table_facs *facs;
-static bool nosigcheck;
+static int sigcheck = -1; /* Default behaviour is just to warn */
-void __init acpi_no_s4_hw_signature(void)
+void __init acpi_check_s4_hw_signature(int check)
{
- nosigcheck = true;
+ sigcheck = check;
}
static int acpi_hibernation_begin(pm_message_t stage)
@@ -1004,12 +1004,28 @@ static void acpi_sleep_hibernate_setup(void)
hibernation_set_ops(old_suspend_ordering ?
&acpi_hibernation_ops_old : &acpi_hibernation_ops);
sleep_states[ACPI_STATE_S4] = 1;
- if (nosigcheck)
+ if (!sigcheck)
return;
acpi_get_table(ACPI_SIG_FACS, 1, (struct acpi_table_header **)&facs);
- if (facs)
+ if (facs) {
+ /*
+ * s4_hardware_signature is the local variable which is just
+ * used to warn about mismatch after we're attempting to
+ * resume (in violation of the ACPI specification.)
+ */
s4_hardware_signature = facs->hardware_signature;
+
+ if (sigcheck > 0) {
+ /*
+ * If we're actually obeying the ACPI specification
+ * then the signature is written out as part of the
+ * swsusp header, in order to allow the boot kernel
+ * to gracefully decline to resume.
+ */
+ swsusp_hardware_signature = facs->hardware_signature;
+ }
+ }
}
#else /* !CONFIG_HIBERNATION */
static inline void acpi_sleep_hibernate_setup(void) {}
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index ff16a36a908b..976154140f0b 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -22,6 +22,7 @@
static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
static struct cpumask scale_freq_counters_mask;
static bool scale_freq_invariant;
+static DEFINE_PER_CPU(u32, freq_factor) = 1;
static bool supports_scale_freq_counters(const struct cpumask *cpus)
{
@@ -155,15 +156,49 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
DEFINE_PER_CPU(unsigned long, thermal_pressure);
-void topology_set_thermal_pressure(const struct cpumask *cpus,
- unsigned long th_pressure)
+/**
+ * topology_update_thermal_pressure() - Update thermal pressure for CPUs
+ * @cpus : The related CPUs for which capacity has been reduced
+ * @capped_freq : The maximum allowed frequency that CPUs can run at
+ *
+ * Update the value of thermal pressure for all @cpus in the mask. The
+ * cpumask should include all (online+offline) affected CPUs, to avoid
+ * operating on stale data when hot-plug is used for some CPUs. The
+ * @capped_freq reflects the currently allowed max CPUs frequency due to
+ * thermal capping. It might be also a boost frequency value, which is bigger
+ * than the internal 'freq_factor' max frequency. In such case the pressure
+ * value should simply be removed, since this is an indication that there is
+ * no thermal throttling. The @capped_freq must be provided in kHz.
+ */
+void topology_update_thermal_pressure(const struct cpumask *cpus,
+ unsigned long capped_freq)
{
+ unsigned long max_capacity, capacity, th_pressure;
+ u32 max_freq;
int cpu;
+ cpu = cpumask_first(cpus);
+ max_capacity = arch_scale_cpu_capacity(cpu);
+ max_freq = per_cpu(freq_factor, cpu);
+
+ /* Convert to MHz scale which is used in 'freq_factor' */
+ capped_freq /= 1000;
+
+ /*
+ * Handle properly the boost frequencies, which should simply clean
+ * the thermal pressure value.
+ */
+ if (max_freq <= capped_freq)
+ capacity = max_capacity;
+ else
+ capacity = mult_frac(max_capacity, capped_freq, max_freq);
+
+ th_pressure = max_capacity - capacity;
+
for_each_cpu(cpu, cpus)
WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
}
-EXPORT_SYMBOL_GPL(topology_set_thermal_pressure);
+EXPORT_SYMBOL_GPL(topology_update_thermal_pressure);
static ssize_t cpu_capacity_show(struct device *dev,
struct device_attribute *attr,
@@ -217,7 +252,6 @@ static void update_topology_flags_workfn(struct work_struct *work)
update_topology = 0;
}
-static DEFINE_PER_CPU(u32, freq_factor) = 1;
static u32 *raw_capacity;
static int free_raw_capacity(void)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index fd034d742447..b191bd17de89 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -485,8 +485,7 @@ static void device_link_release_fn(struct work_struct *work)
/* Ensure that all references to the link object have been dropped. */
device_link_synchronize_removal();
- while (refcount_dec_not_one(&link->rpm_active))
- pm_runtime_put(link->supplier);
+ pm_runtime_release_supplier(link, true);
put_device(link->consumer);
put_device(link->supplier);
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index d504cd4ab3cb..2f3cce17219b 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -305,19 +305,40 @@ static int rpm_get_suppliers(struct device *dev)
return 0;
}
+/**
+ * pm_runtime_release_supplier - Drop references to device link's supplier.
+ * @link: Target device link.
+ * @check_idle: Whether or not to check if the supplier device is idle.
+ *
+ * Drop all runtime PM references associated with @link to its supplier device
+ * and if @check_idle is set, check if that device is idle (and so it can be
+ * suspended).
+ */
+void pm_runtime_release_supplier(struct device_link *link, bool check_idle)
+{
+ struct device *supplier = link->supplier;
+
+ /*
+ * The additional power.usage_count check is a safety net in case
+ * the rpm_active refcount becomes saturated, in which case
+ * refcount_dec_not_one() would return true forever, but it is not
+ * strictly necessary.
+ */
+ while (refcount_dec_not_one(&link->rpm_active) &&
+ atomic_read(&supplier->power.usage_count) > 0)
+ pm_runtime_put_noidle(supplier);
+
+ if (check_idle)
+ pm_request_idle(supplier);
+}
+
static void __rpm_put_suppliers(struct device *dev, bool try_to_suspend)
{
struct device_link *link;
list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
- device_links_read_lock_held()) {
-
- while (refcount_dec_not_one(&link->rpm_active))
- pm_runtime_put_noidle(link->supplier);
-
- if (try_to_suspend)
- pm_request_idle(link->supplier);
- }
+ device_links_read_lock_held())
+ pm_runtime_release_supplier(link, try_to_suspend);
}
static void rpm_put_suppliers(struct device *dev)
@@ -742,13 +763,15 @@ static int rpm_resume(struct device *dev, int rpmflags)
trace_rpm_resume_rcuidle(dev, rpmflags);
repeat:
- if (dev->power.runtime_error)
+ if (dev->power.runtime_error) {
retval = -EINVAL;
- else if (dev->power.disable_depth == 1 && dev->power.is_suspended
- && dev->power.runtime_status == RPM_ACTIVE)
- retval = 1;
- else if (dev->power.disable_depth > 0)
- retval = -EACCES;
+ } else if (dev->power.disable_depth > 0) {
+ if (dev->power.runtime_status == RPM_ACTIVE &&
+ dev->power.last_status == RPM_ACTIVE)
+ retval = 1;
+ else
+ retval = -EACCES;
+ }
if (retval)
goto out;
@@ -1410,8 +1433,10 @@ void __pm_runtime_disable(struct device *dev, bool check_resume)
/* Update time accounting before disabling PM-runtime. */
update_pm_runtime_accounting(dev);
- if (!dev->power.disable_depth++)
+ if (!dev->power.disable_depth++) {
__pm_runtime_barrier(dev);
+ dev->power.last_status = dev->power.runtime_status;
+ }
out:
spin_unlock_irq(&dev->power.lock);
@@ -1428,23 +1453,23 @@ void pm_runtime_enable(struct device *dev)
spin_lock_irqsave(&dev->power.lock, flags);
- if (dev->power.disable_depth > 0) {
- dev->power.disable_depth--;
-
- /* About to enable runtime pm, set accounting_timestamp to now */
- if (!dev->power.disable_depth)
- dev->power.accounting_timestamp = ktime_get_mono_fast_ns();
- } else {
+ if (!dev->power.disable_depth) {
dev_warn(dev, "Unbalanced %s!\n", __func__);
+ goto out;
}
- WARN(!dev->power.disable_depth &&
- dev->power.runtime_status == RPM_SUSPENDED &&
- !dev->power.ignore_children &&
- atomic_read(&dev->power.child_count) > 0,
- "Enabling runtime PM for inactive device (%s) with active children\n",
- dev_name(dev));
+ if (--dev->power.disable_depth > 0)
+ goto out;
+
+ dev->power.last_status = RPM_INVALID;
+ dev->power.accounting_timestamp = ktime_get_mono_fast_ns();
+
+ if (dev->power.runtime_status == RPM_SUSPENDED &&
+ !dev->power.ignore_children &&
+ atomic_read(&dev->power.child_count) > 0)
+ dev_warn(dev, "Enabling runtime PM for inactive device with active children\n");
+out:
spin_unlock_irqrestore(&dev->power.lock, flags);
}
EXPORT_SYMBOL_GPL(pm_runtime_enable);
@@ -1640,6 +1665,7 @@ EXPORT_SYMBOL_GPL(__pm_runtime_use_autosuspend);
void pm_runtime_init(struct device *dev)
{
dev->power.runtime_status = RPM_SUSPENDED;
+ dev->power.last_status = RPM_INVALID;
dev->power.idle_notification = false;
dev->power.disable_depth = 1;
@@ -1722,8 +1748,6 @@ void pm_runtime_get_suppliers(struct device *dev)
void pm_runtime_put_suppliers(struct device *dev)
{
struct device_link *link;
- unsigned long flags;
- bool put;
int idx;
idx = device_links_read_lock();
@@ -1731,11 +1755,17 @@ void pm_runtime_put_suppliers(struct device *dev)
list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
device_links_read_lock_held())
if (link->supplier_preactivated) {
+ bool put;
+
link->supplier_preactivated = false;
- spin_lock_irqsave(&dev->power.lock, flags);
+
+ spin_lock_irq(&dev->power.lock);
+
put = pm_runtime_status_suspended(dev) &&
refcount_dec_not_one(&link->rpm_active);
- spin_unlock_irqrestore(&dev->power.lock, flags);
+
+ spin_unlock_irq(&dev->power.lock);
+
if (put)
pm_runtime_put(link->supplier);
}
@@ -1772,9 +1802,7 @@ void pm_runtime_drop_link(struct device_link *link)
return;
pm_runtime_drop_link_count(link->consumer);
-
- while (refcount_dec_not_one(&link->rpm_active))
- pm_runtime_put(link->supplier);
+ pm_runtime_release_supplier(link, true);
}
static bool pm_runtime_need_not_resume(struct device *dev)
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index 92701a18bdd9..55516043b656 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -34,6 +34,23 @@ config X86_PCC_CPUFREQ
If in doubt, say N.
+config X86_AMD_PSTATE
+ tristate "AMD Processor P-State driver"
+ depends on X86 && ACPI
+ select ACPI_PROCESSOR
+ select ACPI_CPPC_LIB if X86_64
+ select CPU_FREQ_GOV_SCHEDUTIL if SMP
+ help
+ This driver adds a CPUFreq driver which utilizes a fine grain
+ processor performance frequency control range instead of legacy
+ performance levels. _CPC needs to be present in the ACPI tables
+ of the system.
+
+ For details, take a look at:
+ <file:Documentation/admin-guide/pm/amd-pstate.rst>.
+
+ If in doubt, say N.
+
config X86_ACPI_CPUFREQ
tristate "ACPI Processor P-States driver"
depends on ACPI_PROCESSOR
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 48ee5859030c..285de70af877 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -17,6 +17,10 @@ obj-$(CONFIG_CPU_FREQ_GOV_ATTR_SET) += cpufreq_governor_attr_set.o
obj-$(CONFIG_CPUFREQ_DT) += cpufreq-dt.o
obj-$(CONFIG_CPUFREQ_DT_PLATDEV) += cpufreq-dt-platdev.o
+# Traces
+CFLAGS_amd-pstate-trace.o := -I$(src)
+amd_pstate-y := amd-pstate.o amd-pstate-trace.o
+
##################################################################################
# x86 drivers.
# Link order matters. K8 is preferred to ACPI because of firmware bugs in early
@@ -25,6 +29,7 @@ obj-$(CONFIG_CPUFREQ_DT_PLATDEV) += cpufreq-dt-platdev.o
# speedstep-* is preferred over p4-clockmod.
obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
+obj-$(CONFIG_X86_AMD_PSTATE) += amd_pstate.o
obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
diff --git a/drivers/cpufreq/amd-pstate-trace.c b/drivers/cpufreq/amd-pstate-trace.c
new file mode 100644
index 000000000000..891b696dcd69
--- /dev/null
+++ b/drivers/cpufreq/amd-pstate-trace.c
@@ -0,0 +1,2 @@
+#define CREATE_TRACE_POINTS
+#include "amd-pstate-trace.h"
diff --git a/drivers/cpufreq/amd-pstate-trace.h b/drivers/cpufreq/amd-pstate-trace.h
new file mode 100644
index 000000000000..647505957d4f
--- /dev/null
+++ b/drivers/cpufreq/amd-pstate-trace.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * amd-pstate-trace.h - AMD Processor P-state Frequency Driver Tracer
+ *
+ * Copyright (C) 2021 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Author: Huang Rui <ray.huang@amd.com>
+ */
+
+#if !defined(_AMD_PSTATE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _AMD_PSTATE_TRACE_H
+
+#include <linux/cpufreq.h>
+#include <linux/tracepoint.h>
+#include <linux/trace_events.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM amd_cpu
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE amd-pstate-trace
+
+#define TPS(x) tracepoint_string(x)
+
+TRACE_EVENT(amd_pstate_perf,
+
+ TP_PROTO(unsigned long min_perf,
+ unsigned long target_perf,
+ unsigned long capacity,
+ unsigned int cpu_id,
+ bool changed,
+ bool fast_switch
+ ),
+
+ TP_ARGS(min_perf,
+ target_perf,
+ capacity,
+ cpu_id,
+ changed,
+ fast_switch
+ ),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, min_perf)
+ __field(unsigned long, target_perf)
+ __field(unsigned long, capacity)
+ __field(unsigned int, cpu_id)
+ __field(bool, changed)
+ __field(bool, fast_switch)
+ ),
+
+ TP_fast_assign(
+ __entry->min_perf = min_perf;
+ __entry->target_perf = target_perf;
+ __entry->capacity = capacity;
+ __entry->cpu_id = cpu_id;
+ __entry->changed = changed;
+ __entry->fast_switch = fast_switch;
+ ),
+
+ TP_printk("amd_min_perf=%lu amd_des_perf=%lu amd_max_perf=%lu cpu_id=%u changed=%s fast_switch=%s",
+ (unsigned long)__entry->min_perf,
+ (unsigned long)__entry->target_perf,
+ (unsigned long)__entry->capacity,
+ (unsigned int)__entry->cpu_id,
+ (__entry->changed) ? "true" : "false",
+ (__entry->fast_switch) ? "true" : "false"
+ )
+);
+
+#endif /* _AMD_PSTATE_TRACE_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+
+#include <trace/define_trace.h>
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
new file mode 100644
index 000000000000..9ce75ed11f8e
--- /dev/null
+++ b/drivers/cpufreq/amd-pstate.c
@@ -0,0 +1,645 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * amd-pstate.c - AMD Processor P-state Frequency Driver
+ *
+ * Copyright (C) 2021 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Author: Huang Rui <ray.huang@amd.com>
+ *
+ * AMD P-State introduces a new CPU performance scaling design for AMD
+ * processors using the ACPI Collaborative Performance and Power Control (CPPC)
+ * feature which works with the AMD SMU firmware providing a finer grained
+ * frequency control range. It is to replace the legacy ACPI P-States control,
+ * allows a flexible, low-latency interface for the Linux kernel to directly
+ * communicate the performance hints to hardware.
+ *
+ * AMD P-State is supported on recent AMD Zen base CPU series include some of
+ * Zen2 and Zen3 processors. _CPC needs to be present in the ACPI tables of AMD
+ * P-State supported system. And there are two types of hardware implementations
+ * for AMD P-State: 1) Full MSR Solution and 2) Shared Memory Solution.
+ * X86_FEATURE_CPPC CPU feature flag is used to distinguish the different types.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+#include <linux/cpufreq.h>
+#include <linux/compiler.h>
+#include <linux/dmi.h>
+#include <linux/slab.h>
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/uaccess.h>
+#include <linux/static_call.h>
+
+#include <acpi/processor.h>
+#include <acpi/cppc_acpi.h>
+
+#include <asm/msr.h>
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
+#include "amd-pstate-trace.h"
+
+#define AMD_PSTATE_TRANSITION_LATENCY 0x20000
+#define AMD_PSTATE_TRANSITION_DELAY 500
+
+/*
+ * TODO: We need more time to fine tune processors with shared memory solution
+ * with community together.
+ *
+ * There are some performance drops on the CPU benchmarks which reports from
+ * Suse. We are co-working with them to fine tune the shared memory solution. So
+ * we disable it by default to go acpi-cpufreq on these processors and add a
+ * module parameter to be able to enable it manually for debugging.
+ */
+static bool shared_mem = false;
+module_param(shared_mem, bool, 0444);
+MODULE_PARM_DESC(shared_mem,
+ "enable amd-pstate on processors with shared memory solution (false = disabled (default), true = enabled)");
+
+static struct cpufreq_driver amd_pstate_driver;
+
+/**
+ * struct amd_cpudata - private CPU data for AMD P-State
+ * @cpu: CPU number
+ * @req: constraint request to apply
+ * @cppc_req_cached: cached performance request hints
+ * @highest_perf: the maximum performance an individual processor may reach,
+ * assuming ideal conditions
+ * @nominal_perf: the maximum sustained performance level of the processor,
+ * assuming ideal operating conditions
+ * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power
+ * savings are achieved
+ * @lowest_perf: the absolute lowest performance level of the processor
+ * @max_freq: the frequency that mapped to highest_perf
+ * @min_freq: the frequency that mapped to lowest_perf
+ * @nominal_freq: the frequency that mapped to nominal_perf
+ * @lowest_nonlinear_freq: the frequency that mapped to lowest_nonlinear_perf
+ * @boost_supported: check whether the Processor or SBIOS supports boost mode
+ *
+ * The amd_cpudata is key private data for each CPU thread in AMD P-State, and
+ * represents all the attributes and goals that AMD P-State requests at runtime.
+ */
+struct amd_cpudata {
+ int cpu;
+
+ struct freq_qos_request req[2];
+ u64 cppc_req_cached;
+
+ u32 highest_perf;
+ u32 nominal_perf;
+ u32 lowest_nonlinear_perf;
+ u32 lowest_perf;
+
+ u32 max_freq;
+ u32 min_freq;
+ u32 nominal_freq;
+ u32 lowest_nonlinear_freq;
+
+ bool boost_supported;
+};
+
+static inline int pstate_enable(bool enable)
+{
+ return wrmsrl_safe(MSR_AMD_CPPC_ENABLE, enable);
+}
+
+static int cppc_enable(bool enable)
+{
+ int cpu, ret = 0;
+
+ for_each_present_cpu(cpu) {
+ ret = cppc_set_enable(cpu, enable);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+
+DEFINE_STATIC_CALL(amd_pstate_enable, pstate_enable);
+
+static inline int amd_pstate_enable(bool enable)
+{
+ return static_call(amd_pstate_enable)(enable);
+}
+
+static int pstate_init_perf(struct amd_cpudata *cpudata)
+{
+ u64 cap1;
+
+ int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1,
+ &cap1);
+ if (ret)
+ return ret;
+
+ /*
+ * TODO: Introduce AMD specific power feature.
+ *
+ * CPPC entry doesn't indicate the highest performance in some ASICs.
+ */
+ WRITE_ONCE(cpudata->highest_perf, amd_get_highest_perf());
+
+ WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1));
+ WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1));
+ WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1));
+
+ return 0;
+}
+
+static int cppc_init_perf(struct amd_cpudata *cpudata)
+{
+ struct cppc_perf_caps cppc_perf;
+
+ int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf);
+ if (ret)
+ return ret;
+
+ WRITE_ONCE(cpudata->highest_perf, amd_get_highest_perf());
+
+ WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf);
+ WRITE_ONCE(cpudata->lowest_nonlinear_perf,
+ cppc_perf.lowest_nonlinear_perf);
+ WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf);
+
+ return 0;
+}
+
+DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf);
+
+static inline int amd_pstate_init_perf(struct amd_cpudata *cpudata)
+{
+ return static_call(amd_pstate_init_perf)(cpudata);
+}
+
+static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf,
+ u32 des_perf, u32 max_perf, bool fast_switch)
+{
+ if (fast_switch)
+ wrmsrl(MSR_AMD_CPPC_REQ, READ_ONCE(cpudata->cppc_req_cached));
+ else
+ wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
+ READ_ONCE(cpudata->cppc_req_cached));
+}
+
+static void cppc_update_perf(struct amd_cpudata *cpudata,
+ u32 min_perf, u32 des_perf,
+ u32 max_perf, bool fast_switch)
+{
+ struct cppc_perf_ctrls perf_ctrls;
+
+ perf_ctrls.max_perf = max_perf;
+ perf_ctrls.min_perf = min_perf;
+ perf_ctrls.desired_perf = des_perf;
+
+ cppc_set_perf(cpudata->cpu, &perf_ctrls);
+}
+
+DEFINE_STATIC_CALL(amd_pstate_update_perf, pstate_update_perf);
+
+static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata,
+ u32 min_perf, u32 des_perf,
+ u32 max_perf, bool fast_switch)
+{
+ static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf,
+ max_perf, fast_switch);
+}
+
+static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf,
+ u32 des_perf, u32 max_perf, bool fast_switch)
+{
+ u64 prev = READ_ONCE(cpudata->cppc_req_cached);
+ u64 value = prev;
+
+ value &= ~AMD_CPPC_MIN_PERF(~0L);
+ value |= AMD_CPPC_MIN_PERF(min_perf);
+
+ value &= ~AMD_CPPC_DES_PERF(~0L);
+ value |= AMD_CPPC_DES_PERF(des_perf);
+
+ value &= ~AMD_CPPC_MAX_PERF(~0L);
+ value |= AMD_CPPC_MAX_PERF(max_perf);
+
+ trace_amd_pstate_perf(min_perf, des_perf, max_perf,
+ cpudata->cpu, (value != prev), fast_switch);
+
+ if (value == prev)
+ return;
+
+ WRITE_ONCE(cpudata->cppc_req_cached, value);
+
+ amd_pstate_update_perf(cpudata, min_perf, des_perf,
+ max_perf, fast_switch);
+}
+
+static int amd_pstate_verify(struct cpufreq_policy_data *policy)
+{
+ cpufreq_verify_within_cpu_limits(policy);