diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-23 18:17:09 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-23 18:17:09 -0700 |
| commit | a13dc4d40938787cfc9e520ed426df4ebf48b1aa (patch) | |
| tree | 10f047632aecf59f0c286427f2e45bd5b4c7c726 | |
| parent | 1de564b8c1a6f9f8bf3a106daa0be9f2cba7d045 (diff) | |
| parent | d936411dc9caeb3edb992e39c33d4d1d81ca8c08 (diff) | |
| download | linux-a13dc4d40938787cfc9e520ed426df4ebf48b1aa.tar.gz linux-a13dc4d40938787cfc9e520ed426df4ebf48b1aa.tar.bz2 linux-a13dc4d40938787cfc9e520ed426df4ebf48b1aa.zip | |
Merge tag 'x86_cleanups_for_v5.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 cleanups from Borislav Petkov:
- Serious sanitization and cleanup of the whole APERF/MPERF and
frequency invariance code along with removing the need for
unnecessary IPIs
- Finally remove a.out support
- The usual trivial cleanups and fixes all over x86
* tag 'x86_cleanups_for_v5.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (21 commits)
x86: Remove empty files
x86/speculation: Add missing srbds=off to the mitigations= help text
x86/prctl: Remove pointless task argument
x86/aperfperf: Make it correct on 32bit and UP kernels
x86/aperfmperf: Integrate the fallback code from show_cpuinfo()
x86/aperfmperf: Replace arch_freq_get_on_cpu()
x86/aperfmperf: Replace aperfmperf_get_khz()
x86/aperfmperf: Store aperf/mperf data for cpu frequency reads
x86/aperfmperf: Make parts of the frequency invariance code unconditional
x86/aperfmperf: Restructure arch_scale_freq_tick()
x86/aperfmperf: Put frequency invariance aperf/mperf data into a struct
x86/aperfmperf: Untangle Intel and AMD frequency invariance init
x86/aperfmperf: Separate AP/BP frequency invariance init
x86/smp: Move APERF/MPERF code where it belongs
x86/aperfmperf: Dont wake idle CPUs in arch_freq_get_on_cpu()
x86/process: Fix kernel-doc warning due to a changed function name
x86: Remove a.out support
x86/mm: Replace nodes_weight() with nodes_empty() where appropriate
x86: Replace cpumask_weight() with cpumask_empty() where appropriate
x86/pkeys: Remove __arch_set_user_pkey_access() declaration
...
28 files changed, 441 insertions, 865 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1169a4acfeb1..c63384ede958 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3147,6 +3147,7 @@ mds=off [X86] tsx_async_abort=off [X86] kvm.nx_huge_pages=off [X86] + srbds=off [X86,INTEL] no_entry_flush [PPC] no_uaccess_flush [PPC] diff --git a/MAINTAINERS b/MAINTAINERS index f468864fd268..496f5e281776 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7385,7 +7385,6 @@ L: linux-mm@kvack.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/execve F: arch/alpha/kernel/binfmt_loader.c -F: arch/x86/ia32/ia32_aout.c F: fs/*binfmt_*.c F: fs/exec.c F: include/linux/binfmts.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 84a3a52ed3a0..7fa8bdd0ed7e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2842,13 +2842,6 @@ config IA32_EMULATION 64-bit kernel. You should likely turn this on, unless you're 100% sure that you don't have any 32-bit programs left. -config IA32_AOUT - tristate "IA32 a.out support" - depends on IA32_EMULATION - depends on BROKEN - help - Support old a.out binaries in the 32bit emulation. - config X86_X32_ABI bool "x32 ABI for 64-bit mode" depends on X86_64 diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index 8e4d0391ff6c..e481056698de 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -5,7 +5,5 @@ obj-$(CONFIG_IA32_EMULATION) := ia32_signal.o -obj-$(CONFIG_IA32_AOUT) += ia32_aout.o - audit-class-$(CONFIG_AUDIT) := audit.o obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y) diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c deleted file mode 100644 index 9bd15241fadb..000000000000 --- a/arch/x86/ia32/ia32_aout.c +++ /dev/null @@ -1,325 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * a.out loader for x86-64 - * - * Copyright (C) 1991, 1992, 1996 Linus Torvalds - * Hacked together by Andi Kleen - */ - -#include <linux/module.h> - -#include <linux/time.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/a.out.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/stat.h> -#include <linux/fcntl.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/binfmts.h> -#include <linux/personality.h> -#include <linux/init.h> -#include <linux/jiffies.h> -#include <linux/perf_event.h> -#include <linux/sched/task_stack.h> - -#include <linux/uaccess.h> -#include <asm/cacheflush.h> -#include <asm/user32.h> -#include <asm/ia32.h> - -#undef WARN_OLD - -static int load_aout_binary(struct linux_binprm *); -static int load_aout_library(struct file *); - -static struct linux_binfmt aout_format = { - .module = THIS_MODULE, - .load_binary = load_aout_binary, - .load_shlib = load_aout_library, -}; - -static int set_brk(unsigned long start, unsigned long end) -{ - start = PAGE_ALIGN(start); - end = PAGE_ALIGN(end); - if (end <= start) - return 0; - return vm_brk(start, end - start); -} - - -/* - * create_aout_tables() parses the env- and arg-strings in new user - * memory and creates the pointer tables from them, and puts their - * addresses on the "stack", returning the new stack pointer value. - */ -static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) -{ - u32 __user *argv, *envp, *sp; - int argc = bprm->argc, envc = bprm->envc; - - sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p); - sp -= envc+1; - envp = sp; - sp -= argc+1; - argv = sp; - put_user((unsigned long) envp, --sp); - put_user((unsigned long) argv, --sp); - put_user(argc, --sp); - current->mm->arg_start = (unsigned long) p; - while (argc-- > 0) { - char c; - - put_user((u32)(unsigned long)p, argv++); - do { - get_user(c, p++); - } while (c); - } - put_user(0, argv); - current->mm->arg_end = current->mm->env_start = (unsigned long) p; - while (envc-- > 0) { - char c; - - put_user((u32)(unsigned long)p, envp++); - do { - get_user(c, p++); - } while (c); - } - put_user(0, envp); - current->mm->env_end = (unsigned long) p; - return sp; -} - -/* - * These are the functions used to load a.out style executables and shared - * libraries. There is no binary dependent code anywhere else. - */ -static int load_aout_binary(struct linux_binprm *bprm) -{ - unsigned long error, fd_offset, rlim; - struct pt_regs *regs = current_pt_regs(); - struct exec ex; - int retval; - - ex = *((struct exec *) bprm->buf); /* exec-header */ - if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && - N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || - N_TRSIZE(ex) || N_DRSIZE(ex) || - i_size_read(file_inode(bprm->file)) < - ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { - return -ENOEXEC; - } - - fd_offset = N_TXTOFF(ex); - - /* Check initial limits. This avoids letting people circumvent - * size limits imposed on them by creating programs with large - * arrays in the data or bss. - */ - rlim = rlimit(RLIMIT_DATA); - if (rlim >= RLIM_INFINITY) - rlim = ~0; - if (ex.a_data + ex.a_bss > rlim) - return -ENOMEM; - - /* Flush all traces of the currently running executable */ - retval = begin_new_exec(bprm); - if (retval) - return retval; - - /* OK, This is the point of no return */ - set_personality(PER_LINUX); - set_personality_ia32(false); - - setup_new_exec(bprm); - - regs->cs = __USER32_CS; - regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = - regs->r13 = regs->r14 = regs->r15 = 0; - - current->mm->end_code = ex.a_text + - (current->mm->start_code = N_TXTADDR(ex)); - current->mm->end_data = ex.a_data + - (current->mm->start_data = N_DATADDR(ex)); - current->mm->brk = ex.a_bss + - (current->mm->start_brk = N_BSSADDR(ex)); - - retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) - return retval; - - if (N_MAGIC(ex) == OMAGIC) { - unsigned long text_addr, map_size; - - text_addr = N_TXTADDR(ex); - map_size = ex.a_text+ex.a_data; - - error = vm_brk(text_addr & PAGE_MASK, map_size); - - if (error) - return error; - - error = read_code(bprm->file, text_addr, 32, - ex.a_text + ex.a_data); - if ((signed long)error < 0) - return error; - } else { -#ifdef WARN_OLD - static unsigned long error_time, error_time2; - if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && - (N_MAGIC(ex) != NMAGIC) && - time_after(jiffies, error_time2 + 5*HZ)) { - printk(KERN_NOTICE "executable not page aligned\n"); - error_time2 = jiffies; - } - - if ((fd_offset & ~PAGE_MASK) != 0 && - time_after(jiffies, error_time + 5*HZ)) { - printk(KERN_WARNING - "fd_offset is not page aligned. Please convert " - "program: %pD\n", - bprm->file); - error_time = jiffies; - } -#endif - - if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) { - error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - if (error) - return error; - - read_code(bprm->file, N_TXTADDR(ex), fd_offset, - ex.a_text+ex.a_data); - goto beyond_if; - } - - error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, - PROT_READ | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_32BIT, - fd_offset); - - if (error != N_TXTADDR(ex)) - return error; - - error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_32BIT, - fd_offset + ex.a_text); - if (error != N_DATADDR(ex)) - return error; - } - -beyond_if: - error = set_brk(current->mm->start_brk, current->mm->brk); - if (error) - return error; - - set_binfmt(&aout_format); - - current->mm->start_stack = - (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); - /* start thread */ - loadsegment(fs, 0); - loadsegment(ds, __USER32_DS); - loadsegment(es, __USER32_DS); - load_gs_index(0); - (regs)->ip = ex.a_entry; - (regs)->sp = current->mm->start_stack; - (regs)->flags = 0x200; - (regs)->cs = __USER32_CS; - (regs)->ss = __USER32_DS; - regs->r8 = regs->r9 = regs->r10 = regs->r11 = - regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; - return 0; -} - -static int load_aout_library(struct file *file) -{ - unsigned long bss, start_addr, len, error; - int retval; - struct exec ex; - loff_t pos = 0; - - retval = -ENOEXEC; - error = kernel_read(file, &ex, sizeof(ex), &pos); - if (error != sizeof(ex)) - goto out; - - /* We come in here for the regular a.out style of shared libraries */ - if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || - N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || - i_size_read(file_inode(file)) < - ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { - goto out; - } - - if (N_FLAGS(ex)) - goto out; - - /* For QMAGIC, the starting address is 0x20 into the page. We mask - this off to get the starting address for the page */ - - start_addr = ex.a_entry & 0xfffff000; - - if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { -#ifdef WARN_OLD - static unsigned long error_time; - if (time_after(jiffies, error_time + 5*HZ)) { - printk(KERN_WARNING - "N_TXTOFF is not page aligned. Please convert " - "library: %pD\n", - file); - error_time = jiffies; - } -#endif - retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - if (retval) - goto out; - - read_code(file, start_addr, N_TXTOFF(ex), - ex.a_text + ex.a_data); - retval = 0; - goto out; - } - /* Now use mmap to map the library into memory. */ - error = vm_mmap(file, start_addr, ex.a_text + ex.a_data, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_32BIT, - N_TXTOFF(ex)); - retval = error; - if (error != start_addr) - goto out; - - len = PAGE_ALIGN(ex.a_text + ex.a_data); - bss = ex.a_text + ex.a_data + ex.a_bss; - if (bss > len) { - retval = vm_brk(start_addr + len, bss - len); - if (retval) - goto out; - } - retval = 0; -out: - return retval; -} - -static int __init init_aout_binfmt(void) -{ - register_binfmt(&aout_format); - return 0; -} - -static void __exit exit_aout_binfmt(void) -{ - unregister_binfmt(&aout_format); -} - -module_init(init_aout_binfmt); -module_exit(exit_aout_binfmt); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 86e5e4e26fcb..e89772dc17f1 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -36,6 +36,8 @@ extern int _debug_hotplug_cpu(int cpu, int action); #endif #endif +extern void ap_init_aperfmperf(void); + int mwait_usable(const struct cpuinfo_x86 *); unsigned int x86_family(unsigned int sig); diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index c83b3020350a..6b0f31fb53f7 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -162,7 +162,6 @@ static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) } /* prctl */ -struct task_struct; -extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2); +extern long fpu_xstate_prctl(int option, unsigned long arg2); #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/arch/x86/include/asm/fpu/internal.h +++ /dev/null diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/arch/x86/include/asm/mmx.h +++ /dev/null diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 1d5f14aff5f6..2e6c04d8a45b 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -41,9 +41,6 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, return __arch_override_mprotect_pkey(vma, prot, pkey); } -extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); - #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3) #define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) @@ -118,11 +115,6 @@ int mm_pkey_free(struct mm_struct *mm, int pkey) return 0; } -extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); -extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); - static inline int vma_pkey(struct vm_area_struct *vma) { unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 647d71535ce3..12ef86b19910 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -42,7 +42,6 @@ void x86_configure_nx(void); extern int reboot_force; -long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long arg2); +long do_arch_prctl_common(int option, unsigned long arg2); #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 9619385bf749..458c891a8273 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -212,30 +212,19 @@ static inline long arch_scale_freq_capacity(int cpu) } #define arch_scale_freq_capacity arch_scale_freq_capacity -extern void arch_scale_freq_tick(void); -#define arch_scale_freq_tick arch_scale_freq_tick - extern void arch_set_max_freq_ratio(bool turbo_disabled); -void init_freq_invariance(bool secondary, bool cppc_ready); +extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled); #else -static inline void arch_set_max_freq_ratio(bool turbo_disabled) -{ -} -static inline void init_freq_invariance(bool secondary, bool cppc_ready) -{ -} +static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } +static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { } #endif +extern void arch_scale_freq_tick(void); +#define arch_scale_freq_tick arch_scale_freq_tick + #ifdef CONFIG_ACPI_CPPC_LIB void init_freq_invariance_cppc(void); #define arch_init_invariance_cppc init_freq_invariance_cppc - -bool amd_set_max_freq_ratio(u64 *ratio); -#else -static inline bool amd_set_max_freq_ratio(u64 *ratio) -{ - return false; -} #endif #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index df1644d9b3b6..8b8cbf22461a 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -50,20 +50,17 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) return err; } -bool amd_set_max_freq_ratio(u64 *ratio) +static void amd_set_max_freq_ratio(void) { struct cppc_perf_caps perf_caps; u64 highest_perf, nominal_perf; u64 perf_ratio; int rc; - if (!ratio) - return false; - rc = cppc_get_perf_caps(0, &perf_caps); if (rc) { pr_debug("Could not retrieve perf counters (%d)\n", rc); - return false; + return; } highest_perf = amd_get_highest_perf(); @@ -71,7 +68,7 @@ bool amd_set_max_freq_ratio(u64 *ratio) if (!highest_perf || !nominal_perf) { pr_debug("Could not retrieve highest or nominal performance\n"); - return false; + return; } perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); @@ -79,25 +76,27 @@ bool amd_set_max_freq_ratio(u64 *ratio) perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; if (!perf_ratio) { pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); - return false; + return; } - *ratio = perf_ratio; - arch_set_max_freq_ratio(false); - - return true; + freq_invariance_set_perf_ratio(perf_ratio, false); } static DEFINE_MUTEX(freq_invariance_lock); void init_freq_invariance_cppc(void) { - static bool secondary; + static bool init_done; - mutex_lock(&freq_invariance_lock); + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return; - init_freq_invariance(secondary, true); - secondary = true; + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + mutex_lock(&freq_invariance_lock); + if (!init_done) + amd_set_max_freq_ratio(); + init_done = true; mutex_unlock(&freq_invariance_lock); } diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 9ca008f9e9b1..1f60a2b27936 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -6,146 +6,446 @@ * Copyright (C) 2017 Intel Corp. * Author: Len Brown <len.brown@intel.com> */ - +#include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/ktime.h> #include <linux/math64.h> #include <linux/percpu.h> -#include <linux/cpufreq.h> -#include <linux/smp.h> -#include <linux/sched/isolation.h> #include <linux/rcupdate.h> +#include <linux/sched/isolation.h> +#include <linux/sched/topology.h> +#include <linux/smp.h> +#include <linux/syscore_ops.h> + +#include <asm/cpu.h> +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> #include "cpu.h" -struct aperfmperf_sample { - unsigned int khz; - atomic_t scfpending; - ktime_t time; - u64 aperf; - u64 mperf; +struct aperfmperf { + seqcount_t seq; + unsigned long last_update; + u64 acnt; + u64 mcnt; + u64 aperf; + u64 mperf; }; -static DEFINE_PER_CPU(struct aperfmperf_sample, samples); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { + .seq = SEQCNT_ZERO(cpu_samples.seq) +}; -#define APERFMPERF_CACHE_THRESHOLD_MS 10 -#define APERFMPERF_REFRESH_DELAY_MS 10 -#define APERFMPERF_STALE_THRESHOLD_MS 1000 +static void init_counter_refs(void) +{ + u64 aperf, mperf; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + this_cpu_write(cpu_samples.aperf, aperf); + this_cpu_write(cpu_samples.mperf, mperf); +} + +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) /* - * aperfmperf_snapshot_khz() - * On the current CPU, snapshot APERF, MPERF, and jiffies - * unless we already did it within 10ms - * calculate kHz, save snapshot + * APERF/MPERF frequency ratio computation. + * + * The scheduler wants to do frequency invariant accounting and needs a <1 + * ratio to account for the 'current' frequency, corresponding to + * freq_curr / freq_max. + * + * Since the frequency freq_curr on x86 is controlled by micro-controller and + * our P-state setting is little more than a request/hint, we need to observe + * the effective frequency 'BusyMHz', i.e. the average frequency over a time + * interval after discarding idle time. This is given by: + * + * BusyMHz = delta_APERF / delta_MPERF * freq_base + * + * where freq_base is the max non-turbo P-state. + * + * The freq_max term has to be set to a somewhat arbitrary value, because we + * can't know which turbo states will be available at a given point in time: + * it all depends on the thermal headroom of the entire package. We set it to + * the turbo level with 4 cores active. + * + * Benchmarks show that's a good compromise between the 1C turbo ratio + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, + * which would ignore the entire turbo range (a conspicuous part, making + * freq_curr/freq_max always maxed out). + * + * An exception to the heuristic above is the Atom uarch, where we choose the + * highest turbo level for freq_max since Atom's are generally oriented towards + * power efficiency. + * + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio + * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. */ -static void aperfmperf_snapshot_khz(void *dummy) + +DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); + +static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; +static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; + +void arch_set_max_freq_ratio(bool turbo_disabled) { - u64 aperf, aperf_delta; - u64 mperf, mperf_delta; - struct aperfmperf_sample *s = this_cpu_ptr(&samples); - unsigned long flags; + arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : + arch_turbo_freq_ratio; +} +EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); - local_irq_save(flags); - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - local_irq_restore(flags); +static bool __init turbo_disabled(void) +{ + u64 misc_en; + int err; + + err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + if (err) + return false; + + return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); +} + +static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + int err; + + err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ + *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ + + return true; +} + +#define X86_MATCH(model) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) + +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { + X86_MATCH(XEON_PHI_KNL), + X86_MATCH(XEON_PHI_KNM), + {} +}; + +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { + X86_MATCH(SKYLAKE_X), + {} +}; + +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { + X86_MATCH(ATOM_GOLDMONT), + X86_MATCH(ATOM_GOLDMONT_D), + X86_MATCH(ATOM_GOLDMONT_PLUS), + {} +}; + +static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, + int num_delta_fratio) +{ + int fratio, delta_fratio, found; + int err, i; + u64 msr; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + fratio = (msr >> 8) & 0xFF; + i = 16; + found = 0; + do { + if (found >= num_delta_fratio) { + *turbo_freq = fratio; + return true; + } + + delta_fratio = (msr >> (i + 5)) & 0x7; + + if (delta_fratio) { + found += 1; + fratio -= delta_fratio; + } + + i += 8; + } while (i < 64); + + return true; +} + +static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) +{ + u64 ratios, counts; + u32 group_size; + int err, i; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + if (err) + return false; + + for (i = 0; i < 64; i += 8) { + group_size = (counts >> i) & 0xFF; + if (group_size >= size) { + *turbo_freq = (ratios >> i) & 0xFF; + return true; + } + } + + return false; +} - aperf_delta = aperf - s->aperf; - mperf_delta = mperf - s->mperf; +static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + u64 msr; + int err; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ + + /* The CPU may have less than 4 cores */ + if (!*turbo_freq) + *turbo_freq = msr & 0xFF; /* 1C turbo */ + + return true; +} + +static bool __init intel_set_max_freq_ratio(void) +{ + u64 base_freq, turbo_freq; + u64 turbo_ratio; + if (slv_set_max_freq_ratio(&base_freq, &turbo_fr |
