summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-02-21 12:35:04 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2021-02-21 12:35:04 -0800
commit657bd90c93146a929c69cd43addf2804eb70c926 (patch)
treee643825c87070f83df58d37d4daf0417eb17e8c2
parent7b15c27e2f7b6d114770c2922b2c49d2e8f3867c (diff)
parentc5e6fc08feb2b88dc5dac2f3c817e1c2a4cafda4 (diff)
downloadlinux-657bd90c93146a929c69cd43addf2804eb70c926.tar.gz
linux-657bd90c93146a929c69cd43addf2804eb70c926.tar.bz2
linux-657bd90c93146a929c69cd43addf2804eb70c926.zip
Merge tag 'sched-core-2021-02-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "Core scheduler updates: - Add CONFIG_PREEMPT_DYNAMIC: this in its current form adds the preempt=none/voluntary/full boot options (default: full), to allow distros to build a PREEMPT kernel but fall back to close to PREEMPT_VOLUNTARY (or PREEMPT_NONE) runtime scheduling behavior via a boot time selection. There's also the /debug/sched_debug switch to do this runtime. This feature is implemented via runtime patching (a new variant of static calls). The scope of the runtime patching can be best reviewed by looking at the sched_dynamic_update() function in kernel/sched/core.c. ( Note that the dynamic none/voluntary mode isn't 100% identical, for example preempt-RCU is available in all cases, plus the preempt count is maintained in all models, which has runtime overhead even with the code patching. ) The PREEMPT_VOLUNTARY/PREEMPT_NONE models, used by the vast majority of distributions, are supposed to be unaffected. - Fix ignored rescheduling after rcu_eqs_enter(). This is a bug that was found via rcutorture triggering a hang. The bug is that rcu_idle_enter() may wake up a NOCB kthread, but this happens after the last generic need_resched() check. Some cpuidle drivers fix it by chance but many others don't. In true 2020 fashion the original bug fix has grown into a 5-patch scheduler/RCU fix series plus another 16 RCU patches to address the underlying issue of missed preemption events. These are the initial fixes that should fix current incarnations of the bug. 
- Clean up rbtree usage in the scheduler, by providing & using the following consistent set of rbtree APIs: partial-order; less() based: - rb_add(): add a new entry to the rbtree - rb_add_cached(): like rb_add(), but for a rb_root_cached total-order; cmp() based: - rb_find(): find an entry in an rbtree - rb_find_add(): find an entry, and add if not found - rb_find_first(): find the first (leftmost) matching entry - rb_next_match(): continue from rb_find_first() - rb_for_each(): iterate a sub-tree using the previous two - Improve the SMP/NUMA load-balancer: scan for an idle sibling in a single pass. This is a 4-commit series where each commit improves one aspect of the idle sibling scan logic. - Improve the cpufreq cooling driver by getting the effective CPU utilization metrics from the scheduler - Improve the fair scheduler's active load-balancing logic by reducing the number of active LB attempts & lengthen the load-balancing interval. This improves stress-ng mmapfork performance. - Fix CFS's estimated utilization (util_est) calculation bug that can result in too high utilization values Misc updates & fixes: - Fix the HRTICK reprogramming & optimization feature - Fix SCHED_SOFTIRQ raising race & warning in the CPU offlining code - Reduce dl_add_task_root_domain() overhead - Fix uprobes refcount bug - Process pending softirqs in flush_smp_call_function_from_idle() - Clean up task priority related defines, remove *USER_*PRIO and USER_PRIO() - Simplify the sched_init_numa() deduplication sort - Documentation updates - Fix EAS bug in update_misfit_status(), which degraded the quality of energy-balancing - Smaller cleanups" * tag 'sched-core-2021-02-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits) sched,x86: Allow !PREEMPT_DYNAMIC entry/kvm: Explicitly flush pending rcuog wakeup before last rescheduling point entry: Explicitly flush pending rcuog wakeup before last rescheduling point rcu/nocb: Trigger self-IPI on late deferred wake up before 
user resume rcu/nocb: Perform deferred wake up before last idle's need_resched() check rcu: Pull deferred rcuog wake up to rcu_eqs_enter() callers sched/features: Distinguish between NORMAL and DEADLINE hrtick sched/features: Fix hrtick reprogramming sched/deadline: Reduce rq lock contention in dl_add_task_root_domain() uprobes: (Re)add missing get_uprobe() in __find_uprobe() smp: Process pending softirqs in flush_smp_call_function_from_idle() sched: Harden PREEMPT_DYNAMIC static_call: Allow module use without exposing static_call_key sched: Add /debug/sched_preempt preempt/dynamic: Support dynamic preempt with preempt= boot option preempt/dynamic: Provide irqentry_exit_cond_resched() static call preempt/dynamic: Provide preempt_schedule[_notrace]() static calls preempt/dynamic: Provide cond_resched() and might_resched() static calls preempt: Introduce CONFIG_PREEMPT_DYNAMIC static_call: Provide DEFINE_STATIC_CALL_RET0() ...
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt7
-rw-r--r--Documentation/scheduler/schedutil.txt169
-rw-r--r--arch/Kconfig9
-rw-r--r--arch/powerpc/platforms/cell/spufs/sched.c2
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/include/asm/preempt.h48
-rw-r--r--arch/x86/include/asm/static_call.h7
-rw-r--r--arch/x86/kernel/static_call.c17
-rw-r--r--arch/x86/kvm/x86.c1
-rw-r--r--drivers/thermal/cpufreq_cooling.c69
-rw-r--r--include/asm-generic/vmlinux.lds.h5
-rw-r--r--include/linux/cgroup.h4
-rw-r--r--include/linux/entry-common.h4
-rw-r--r--include/linux/entry-kvm.h14
-rw-r--r--include/linux/kernel.h23
-rw-r--r--include/linux/rbtree.h206
-rw-r--r--include/linux/rcupdate.h2
-rw-r--r--include/linux/sched.h34
-rw-r--r--include/linux/sched/prio.h18
-rw-r--r--include/linux/static_call.h77
-rw-r--r--include/linux/static_call_types.h50
-rw-r--r--include/linux/topology.h1
-rw-r--r--init/Kconfig2
-rw-r--r--kernel/Kconfig.preempt19
-rw-r--r--kernel/entry/common.c17
-rw-r--r--kernel/events/core.c195
-rw-r--r--kernel/events/uprobes.c80
-rw-r--r--kernel/locking/rtmutex.c54
-rw-r--r--kernel/rcu/tree.c53
-rw-r--r--kernel/rcu/tree.h2
-rw-r--r--kernel/rcu/tree_plugin.h31
-rw-r--r--kernel/sched/core.c357
-rw-r--r--kernel/sched/cpufreq_schedutil.c108
-rw-r--r--kernel/sched/deadline.c94
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/fair.c322
-rw-r--r--kernel/sched/features.h2
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/rt.c2
-rw-r--r--kernel/sched/sched.h51
-rw-r--r--kernel/sched/topology.c99
-rw-r--r--kernel/smp.c4
-rw-r--r--kernel/static_call.c60
-rw-r--r--lib/timerqueue.c28
-rw-r--r--tools/include/linux/rbtree.h192
-rw-r--r--tools/include/linux/static_call_types.h50
-rw-r--r--tools/objtool/check.c17
-rw-r--r--tools/objtool/elf.c73
48 files changed, 1898 insertions, 785 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 36d6ce7cc886..b93aaa374266 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3903,6 +3903,13 @@
Format: {"off"}
Disable Hardware Transactional Memory
+ preempt= [KNL]
+ Select preemption mode if you have CONFIG_PREEMPT_DYNAMIC
+ none - Limited to cond_resched() calls
+ voluntary - Limited to cond_resched() and might_sleep() calls
+ full - Any section that isn't explicitly preempt disabled
+ can be preempted anytime.
+
print-fatal-signals=
[KNL] debug: print fatal signals
diff --git a/Documentation/scheduler/schedutil.txt b/Documentation/scheduler/schedutil.txt
new file mode 100644
index 000000000000..78f6b91e2291
--- /dev/null
+++ b/Documentation/scheduler/schedutil.txt
@@ -0,0 +1,169 @@
+
+
+NOTE: all this assumes a linear relation between frequency and work capacity,
+we know this is flawed, but it is the best workable approximation.
+
+
+PELT (Per Entity Load Tracking)
+-------------------------------
+
+With PELT we track some metrics across the various scheduler entities, from
+individual tasks to task-group slices to CPU runqueues. As the basis for this
+we use an Exponentially Weighted Moving Average (EWMA), each period (1024us)
+is decayed such that y^32 = 0.5. That is, the most recent 32ms contribute
+half, while the rest of history contributes the other half.
+
+Specifically:
+
+ ewma_sum(u) := u_0 + u_1*y + u_2*y^2 + ...
+
+ ewma(u) = ewma_sum(u) / ewma_sum(1)
+
+Since this is essentially a progression of an infinite geometric series, the
+results are composable, that is ewma(A) + ewma(B) = ewma(A+B). This property
+is key, since it gives the ability to recompose the averages when tasks move
+around.
+
+Note that blocked tasks still contribute to the aggregates (task-group slices
+and CPU runqueues), which reflects their expected contribution when they
+resume running.
+
+Using this we track 2 key metrics: 'running' and 'runnable'. 'Running'
+reflects the time an entity spends on the CPU, while 'runnable' reflects the
+time an entity spends on the runqueue. When there is only a single task these
+two metrics are the same, but once there is contention for the CPU 'running'
+will decrease to reflect the fraction of time each task spends on the CPU
+while 'runnable' will increase to reflect the amount of contention.
+
+For more detail see: kernel/sched/pelt.c
+
+
+Frequency- / CPU Invariance
+---------------------------
+
+Because consuming the CPU for 50% at 1GHz is not the same as consuming the CPU
+for 50% at 2GHz, nor is running 50% on a LITTLE CPU the same as running 50% on
+a big CPU, we allow architectures to scale the time delta with two ratios, one
+Dynamic Voltage and Frequency Scaling (DVFS) ratio and one microarch ratio.
+
+For simple DVFS architectures (where software is in full control) we trivially
+compute the ratio as:
+
+ f_cur
+ r_dvfs := -----
+ f_max
+
+For more dynamic systems where the hardware is in control of DVFS we use
+hardware counters (Intel APERF/MPERF, ARMv8.4-AMU) to provide us this ratio.
+For Intel specifically, we use:
+
+ APERF
+ f_cur := ----- * P0
+ MPERF
+
+ 4C-turbo; if available and turbo enabled
+ f_max := { 1C-turbo; if turbo enabled
+ P0; otherwise
+
+ f_cur
+ r_dvfs := min( 1, ----- )
+ f_max
+
+We pick 4C turbo over 1C turbo to make it slightly more sustainable.
+
+r_cpu is determined as the ratio of highest performance level of the current
+CPU vs the highest performance level of any other CPU in the system.
+
+ r_tot = r_dvfs * r_cpu
+
+The result is that the above 'running' and 'runnable' metrics become invariant
+of DVFS and CPU type. IOW, we can transfer and compare them between CPUs.
+
+For more detail see:
+
+ - kernel/sched/pelt.h:update_rq_clock_pelt()
+ - arch/x86/kernel/smpboot.c:"APERF/MPERF frequency ratio computation."
+ - Documentation/scheduler/sched-capacity.rst:"1. CPU Capacity + 2. Task utilization"
+
+
+UTIL_EST / UTIL_EST_FASTUP
+--------------------------
+
+Because periodic tasks have their averages decayed while they sleep, even
+though when running their expected utilization will be the same, they suffer a
+(DVFS) ramp-up after they are running again.
+
+To alleviate this (a default enabled option) UTIL_EST drives an Infinite
+Impulse Response (IIR) EWMA with the 'running' value on dequeue -- when it is
+highest. A further default enabled option UTIL_EST_FASTUP modifies the IIR
+filter to instantly increase and only decay on decrease.
+
+A further runqueue wide sum (of runnable tasks) is maintained of:
+
+ util_est := \Sum_t max( t_running, t_util_est_ewma )
+
+For more detail see: kernel/sched/fair.c:util_est_dequeue()
+
+
+UCLAMP
+------
+
+It is possible to set effective u_min and u_max clamps on each CFS or RT task;
+the runqueue keeps a max aggregate of these clamps for all running tasks.
+
+For more detail see: include/uapi/linux/sched/types.h
+
+
+Schedutil / DVFS
+----------------
+
+Every time the scheduler load tracking is updated (task wakeup, task
+migration, time progression) we call out to schedutil to update the hardware
+DVFS state.
+
+The basis is the CPU runqueue's 'running' metric, which per the above is
+the frequency invariant utilization estimate of the CPU. From this we compute
+a desired frequency like:
+
+ max( running, util_est ); if UTIL_EST
+ u_cfs := { running; otherwise
+
+ clamp( u_cfs + u_rt , u_min, u_max ); if UCLAMP_TASK
+ u_clamp := { u_cfs + u_rt; otherwise
+
+ u := u_clamp + u_irq + u_dl; [approx. see source for more detail]
+
+ f_des := min( f_max, 1.25 u * f_max )
+
+XXX IO-wait; when the update is due to a task wakeup from IO-completion we
+boost 'u' above.
+
+This frequency is then used to select a P-state/OPP or directly munged into a
+CPPC style request to the hardware.
+
+XXX: deadline tasks (Sporadic Task Model) allow us to calculate a hard f_min
+required to satisfy the workload.
+
+Because these callbacks are directly from the scheduler, the DVFS hardware
+interaction should be 'fast' and non-blocking. Schedutil supports
+rate-limiting DVFS requests for when hardware interaction is slow and
+expensive; this reduces effectiveness.
+
+For more information see: kernel/sched/cpufreq_schedutil.c
+
+
+NOTES
+-----
+
+ - On low-load scenarios, where DVFS is most relevant, the 'running' numbers
+ will closely reflect utilization.
+
+ - In saturated scenarios task movement will cause some transient dips,
+ suppose we have a CPU saturated with 4 tasks, then when we migrate a task
+ to an idle CPU, the old CPU will have a 'running' value of 0.75 while the
+ new CPU will gain 0.25. This is inevitable and time progression will
+ correct this. XXX do we still guarantee f_max due to no idle-time?
+
+ - Much of the above is about avoiding DVFS dips, and independent DVFS domains
+ having to re-learn / ramp-up when load shifts.
+
diff --git a/arch/Kconfig b/arch/Kconfig
index 87608c2fa027..4790a5f23d9f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1058,6 +1058,15 @@ config HAVE_STATIC_CALL_INLINE
bool
depends on HAVE_STATIC_CALL
+config HAVE_PREEMPT_DYNAMIC
+ bool
+ depends on HAVE_STATIC_CALL
+ depends on GENERIC_ENTRY
+ help
+ Select this if the architecture supports boot time preempt setting
+ on top of static calls. It is strongly advised to support inline
+ static call to avoid any overhead.
+
config ARCH_WANT_LD_ORPHAN_WARN
bool
help
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 9d06fffb1526..369206489895 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -72,7 +72,7 @@ static struct timer_list spuloadavg_timer;
#define DEF_SPU_TIMESLICE (100 * HZ / (1000 * SPUSCHED_TICK))
#define SCALE_PRIO(x, prio) \
- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
+ max(x * (MAX_PRIO - prio) / (NICE_WIDTH / 2), MIN_SPU_TIMESLICE)
/*
* scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7b934a591df2..595193bc2d31 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -224,6 +224,7 @@ config X86
select HAVE_STACK_VALIDATION if X86_64
select HAVE_STATIC_CALL
select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION
+ select HAVE_PREEMPT_DYNAMIC
select HAVE_RSEQ
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 69485ca13665..f8cb8af4de5c 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -5,6 +5,7 @@
#include <asm/rmwcc.h>
#include <asm/percpu.h>
#include <linux/thread_info.h>
+#include <linux/static_call_types.h>
DECLARE_PER_CPU(int, __preempt_count);
@@ -103,16 +104,45 @@ static __always_inline bool should_resched(int preempt_offset)
}
#ifdef CONFIG_PREEMPTION
- extern asmlinkage void preempt_schedule_thunk(void);
-# define __preempt_schedule() \
- asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT)
- extern asmlinkage void preempt_schedule(void);
- extern asmlinkage void preempt_schedule_notrace_thunk(void);
-# define __preempt_schedule_notrace() \
- asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT)
+extern asmlinkage void preempt_schedule(void);
+extern asmlinkage void preempt_schedule_thunk(void);
- extern asmlinkage void preempt_schedule_notrace(void);
-#endif
+#define __preempt_schedule_func preempt_schedule_thunk
+
+extern asmlinkage void preempt_schedule_notrace(void);
+extern asmlinkage void preempt_schedule_notrace_thunk(void);
+
+#define __preempt_schedule_notrace_func preempt_schedule_notrace_thunk
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+DECLARE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+
+#define __preempt_schedule() \
+do { \
+ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule); \
+ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \
+} while (0)
+
+DECLARE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+
+#define __preempt_schedule_notrace() \
+do { \
+ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule_notrace); \
+ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \
+} while (0)
+
+#else /* PREEMPT_DYNAMIC */
+
+#define __preempt_schedule() \
+ asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT);
+
+#define __preempt_schedule_notrace() \
+ asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT);
+
+#endif /* PREEMPT_DYNAMIC */
+
+#endif /* PREEMPTION */
#endif /* __ASM_PREEMPT_H */
diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index c37f11999d0c..cbb67b6030f9 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -37,4 +37,11 @@
#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop")
+
+#define ARCH_ADD_TRAMP_KEY(name) \
+ asm(".pushsection .static_call_tramp_key, \"a\" \n" \
+ ".long " STATIC_CALL_TRAMP_STR(name) " - . \n" \
+ ".long " STATIC_CALL_KEY_STR(name) " - . \n" \
+ ".popsection \n")
+
#endif /* _ASM_STATIC_CALL_H */
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index ca9a380d9c0b..9442c4136c38 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -11,14 +11,26 @@ enum insn_type {
RET = 3, /* tramp / site cond-tail-call */
};
+/*
+ * data16 data16 xorq %rax, %rax - a single 5 byte instruction that clears %rax
+ * The REX.W cancels the effect of any data16.
+ */
+static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 };
+
static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
{
+ const void *emulate = NULL;
int size = CALL_INSN_SIZE;
const void *code;
switch (type) {
case CALL:
code = text_gen_insn(CALL_INSN_OPCODE, insn, func);
+ if (func == &__static_call_return0) {
+ emulate = code;
+ code = &xor5rax;
+ }
+
break;
case NOP:
@@ -41,7 +53,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
if (unlikely(system_state == SYSTEM_BOOTING))
return text_poke_early(insn, code, size);
- text_poke_bp(insn, code, size, NULL);
+ text_poke_bp(insn, code, size, emulate);
}
static void __static_call_validate(void *insn, bool tail)
@@ -54,7 +66,8 @@ static void __static_call_validate(void *insn, bool tail)
return;
} else {
if (opcode == CALL_INSN_OPCODE ||
- !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5))
+ !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5) ||
+ !memcmp(insn, xor5rax, 5))
return;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b404e4d7dd8..b967c1c774a1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1782,6 +1782,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
+ xfer_to_guest_mode_prepare();
return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
xfer_to_guest_mode_work_pending();
}
diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index 612f063c1cfc..f5af2571f9b7 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -76,7 +76,9 @@ struct cpufreq_cooling_device {
struct em_perf_domain *em;
struct cpufreq_policy *policy;
struct list_head node;
+#ifndef CONFIG_SMP
struct time_in_idle *idle_time;
+#endif
struct freq_qos_request qos_req;
};
@@ -132,14 +134,25 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
}
/**
- * get_load() - get load for a cpu since last updated
- * @cpufreq_cdev: &struct cpufreq_cooling_device for this cpu
- * @cpu: cpu number
- * @cpu_idx: index of the cpu in time_in_idle*
+ * get_load() - get load for a cpu
+ * @cpufreq_cdev: struct cpufreq_cooling_device for the cpu
+ * @cpu: cpu number
+ * @cpu_idx: index of the cpu in time_in_idle array
*
* Return: The average load of cpu @cpu in percentage since this
* function was last called.
*/
+#ifdef CONFIG_SMP
+static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
+ int cpu_idx)
+{
+ unsigned long max = arch_scale_cpu_capacity(cpu);
+ unsigned long util;
+
+ util = sched_cpu_util(cpu, max);
+ return (util * 100) / max;
+}
+#else /* !CONFIG_SMP */
static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
int cpu_idx)
{
@@ -161,6 +174,7 @@ static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
return load;
}
+#endif /* CONFIG_SMP */
/**
* get_dynamic_power() - calculate the dynamic power
@@ -346,6 +360,36 @@ static inline bool em_is_sane(struct cpufreq_cooling_device *cpufreq_cdev,
}
#endif /* CONFIG_THERMAL_GOV_POWER_ALLOCATOR */
+#ifdef CONFIG_SMP
+static inline int allocate_idle_time(struct cpufreq_cooling_device *cpufreq_cdev)
+{
+ return 0;
+}
+
+static inline void free_idle_time(struct cpufreq_cooling_device *cpufreq_cdev)
+{
+}
+#else
+static int allocate_idle_time(struct cpufreq_cooling_device *cpufreq_cdev)
+{
+ unsigned int num_cpus = cpumask_weight(cpufreq_cdev->policy->related_cpus);
+
+ cpufreq_cdev->idle_time = kcalloc(num_cpus,
+ sizeof(*cpufreq_cdev->idle_time),
+ GFP_KERNEL);
+ if (!cpufreq_cdev->idle_time)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void free_idle_time(struct cpufreq_cooling_device *cpufreq_cdev)
+{
+ kfree(cpufreq_cdev->idle_time);
+ cpufreq_cdev->idle_time = NULL;
+}
+#endif /* CONFIG_SMP */
+
static unsigned int get_state_freq(struct cpufreq_cooling_device *cpufreq_cdev,
unsigned long state)
{
@@ -485,7 +529,7 @@ __cpufreq_cooling_register(struct device_node *np,
struct thermal_cooling_device *cdev;
struct cpufreq_cooling_device *cpufreq_cdev;
char dev_name[THERMAL_NAME_LENGTH];
- unsigned int i, num_cpus;
+ unsigned int i;
struct device *dev;
int ret;
struct thermal_cooling_device_ops *cooling_ops;
@@ -496,7 +540,6 @@ __cpufreq_cooling_register(struct device_node *np,
return ERR_PTR(-ENODEV);
}
-
if (IS_ERR_OR_NULL(policy)) {
pr_err("%s: cpufreq policy isn't valid: %p\n", __func__, policy);
return ERR_PTR(-EINVAL);
@@ -514,12 +557,10 @@ __cpufreq_cooling_register(struct device_node *np,
return ERR_PTR(-ENOMEM);
cpufreq_cdev->policy = policy;
- num_cpus = cpumask_weight(policy->related_cpus);
- cpufreq_cdev->idle_time = kcalloc(num_cpus,
- sizeof(*cpufreq_cdev->idle_time),
- GFP_KERNEL);
- if (!cpufreq_cdev->idle_time) {
- cdev = ERR_PTR(-ENOMEM);
+
+ ret = allocate_idle_time(cpufreq_cdev);
+ if (ret) {
+ cdev = ERR_PTR(ret);
goto free_cdev;
}
@@ -579,7 +620,7 @@ remove_qos_req:
remove_ida:
ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
free_idle_time:
- kfree(cpufreq_cdev->idle_time);
+ free_idle_time(cpufreq_cdev);
free_cdev:
kfree(cpufreq_cdev);
return cdev;
@@ -672,7 +713,7 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
thermal_cooling_device_unregister(cdev);
freq_qos_remove_request(&cpufreq_cdev->qos_req);
ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
- kfree(cpufreq_cdev->idle_time);
+ free_idle_time(cpufreq_cdev);
kfree(cpufreq_cdev);
}
EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 52dbd58f6810..a54e08d77789 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -403,7 +403,10 @@
. = ALIGN(8);