From 315ad8780a129e82e2c5c65ee6e970d91a577acb Mon Sep 17 00:00:00 2001 From: Rong Xu Date: Sat, 2 Nov 2024 10:51:08 -0700 Subject: kbuild: Add AutoFDO support for Clang build Add the build support for using Clang's AutoFDO. Building the kernel with AutoFDO does not reduce the optimization level from the compiler. AutoFDO uses hardware sampling to gather information about the frequency of execution of different code paths within a binary. This information is then used to guide the compiler's optimization decisions, resulting in a more efficient binary. Experiments showed that the kernel can improve up to 10% in latency. The support requires a Clang compiler after LLVM 17. This submission is limited to x86 platforms that support PMU features like LBR on Intel machines and AMD Zen3 BRS. Support for SPE on ARM 1, and BRBE on ARM 1 is part of planned future work. Here is an example workflow for AutoFDO kernel: 1) Build the kernel on the host machine with LLVM enabled, for example, $ make menuconfig LLVM=1 Turn on AutoFDO build config: CONFIG_AUTOFDO_CLANG=y With a configuration that has LLVM enabled, use the following command: scripts/config -e AUTOFDO_CLANG After getting the config, build with $ make LLVM=1 2) Install the kernel on the test machine. 3) Run the load tests. The '-c' option in perf specifies the sample event period. We suggest using a suitable prime number, like 500009, for this purpose. For Intel platforms: $ perf record -e BR_INST_RETIRED.NEAR_TAKEN:k -a -N -b -c \ -o -- For AMD platforms: The supported system are: Zen3 with BRS, or Zen4 with amd_lbr_v2 For Zen3: $ cat proc/cpuinfo | grep " brs" For Zen4: $ cat proc/cpuinfo | grep amd_lbr_v2 $ perf record --pfm-events RETIRED_TAKEN_BRANCH_INSTRUCTIONS:k -a \ -N -b -c -o -- 4) (Optional) Download the raw perf file to the host machine. 5) To generate an AutoFDO profile, two offline tools are available: create_llvm_prof and llvm_profgen. The create_llvm_prof tool is part of the AutoFDO project and can be found on GitHub (https://github.com/google/autofdo), version v0.30.1 or later. The llvm_profgen tool is included in the LLVM compiler itself. It's important to note that the version of llvm_profgen doesn't need to match the version of Clang. It needs to be the LLVM 19 release or later, or from the LLVM trunk. $ llvm-profgen --kernel --binary= --perfdata= \ -o or $ create_llvm_prof --binary= --profile= \ --format=extbinary --out= Note that multiple AutoFDO profile files can be merged into one via: $ llvm-profdata merge -o ... 6) Rebuild the kernel using the AutoFDO profile file with the same config as step 1, (Note CONFIG_AUTOFDO_CLANG needs to be enabled): $ make LLVM=1 CLANG_AUTOFDO_PROFILE= Co-developed-by: Han Shen Signed-off-by: Han Shen Signed-off-by: Rong Xu Suggested-by: Sriraman Tallam Suggested-by: Krzysztof Pszeniczny Suggested-by: Nick Desaulniers Suggested-by: Stephane Eranian Tested-by: Yonghong Song Tested-by: Yabin Cui Tested-by: Nathan Chancellor Reviewed-by: Kees Cook Tested-by: Peter Jung Signed-off-by: Masahiro Yamada --- arch/Kconfig | 20 ++++++++++++++++++++ arch/x86/Kconfig | 1 + 2 files changed, 21 insertions(+) (limited to 'arch') diff --git a/arch/Kconfig b/arch/Kconfig index bd9f095d69fa..8dca3b5e6ef5 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -811,6 +811,26 @@ config LTO_CLANG_THIN If unsure, say Y. endchoice +config ARCH_SUPPORTS_AUTOFDO_CLANG + bool + +config AUTOFDO_CLANG + bool "Enable Clang's AutoFDO build (EXPERIMENTAL)" + depends on ARCH_SUPPORTS_AUTOFDO_CLANG + depends on CC_IS_CLANG && CLANG_VERSION >= 170000 + help + This option enables Clang’s AutoFDO build. When + an AutoFDO profile is specified in variable + CLANG_AUTOFDO_PROFILE during the build process, + Clang uses the profile to optimize the kernel. + + If no profile is specified, AutoFDO options are + still passed to Clang to facilitate the collection + of perf data for creating an AutoFDO profile in + subsequent builds. + + If unsure, say N. + config ARCH_SUPPORTS_CFI_CLANG bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 16354dfa6d96..9dc87661fb37 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -126,6 +126,7 @@ config X86 select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN select ARCH_SUPPORTS_RT + select ARCH_SUPPORTS_AUTOFDO_CLANG select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64 select ARCH_USE_MEMTEST -- cgit v1.2.3 From 52892ed6b03a14b961c1df783ed05763758abc73 Mon Sep 17 00:00:00 2001 From: Rong Xu Date: Tue, 26 Nov 2024 09:34:09 -0800 Subject: MIPS: Place __kernel_entry at the beginning of text section Mark __kernel_entry as ".head.text" and place HEAD_TEXT before TEXT_TEXT in the linker script. This ensures that __kernel_entry will be placed at the beginning of text section. Drop mips from scripts/head-object-list.txt. Signed-off-by: Rong Xu Reported-by: Chris Packham Closes: https://lore.kernel.org/lkml/c6719149-8531-4174-824e-a3caf4bc6d0e@alliedtelesis.co.nz/T/ Tested-by: Chris Packham Signed-off-by: Masahiro Yamada --- arch/mips/kernel/head.S | 1 + arch/mips/kernel/vmlinux.lds.S | 1 + 2 files changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/mips/kernel/head.S b/arch/mips/kernel/head.S index b825ed4476c7..e3ff6179c99f 100644 --- a/arch/mips/kernel/head.S +++ b/arch/mips/kernel/head.S @@ -59,6 +59,7 @@ #endif .endm + __HEAD #ifndef CONFIG_NO_EXCEPT_FILL /* * Reserved space for exception handlers. diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 9ff55cb80a64..2b708fac8d2c 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -61,6 +61,7 @@ SECTIONS /* read-only */ _text = .; /* Text and read-only data */ .text : { + HEAD_TEXT TEXT_TEXT SCHED_TEXT LOCK_TEXT -- cgit v1.2.3 From 0043ecea2399ffc8bfd99ed9dbbe766e7c79293c Mon Sep 17 00:00:00 2001 From: Rong Xu Date: Sat, 2 Nov 2024 10:51:10 -0700 Subject: vmlinux.lds.h: Adjust symbol ordering in text output section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the -ffunction-sections compiler option is enabled, each function is placed in a separate section named .text.function_name rather than putting all functions in a single .text section. However, using -function-sections can cause problems with the linker script. The comments included in include/asm-generic/vmlinux.lds.h note these issues.: “TEXT_MAIN here will match .text.fixup and .text.unlikely if dead code elimination is enabled, so these sections should be converted to use ".." first.” It is unclear whether there is a straightforward method for converting a suffix to "..". This patch modifies the order of subsections within the text output section. Specifically, it changes current order: .text.hot, .text, .text_unlikely, .text.unknown, .text.asan to the new order: .text.asan, .text.unknown, .text_unlikely, .text.hot, .text Here is the rationale behind the new layout: The majority of the code resides in three sections: .text.hot, .text, and .text.unlikely, with .text.unknown containing a negligible amount. .text.asan is only generated in ASAN builds. The primary goal is to group code segments based on their execution frequency (hotness). First, we want to place .text.hot adjacent to .text. Since we cannot put .text.hot after .text (Due to constraints with -ffunction-sections, placing .text.hot after .text is problematic), we need to put .text.hot before .text. Then it comes to .text.unlikely, we cannot put it after .text (same -ffunction-sections issue) . Therefore, we position .text.unlikely before .text.hot. .text.unknown and .tex.asan follow the same logic. This revised ordering effectively reverses the original arrangement (for .text.unlikely, .text.unknown, and .tex.asan), maintaining a similar level of affinity between sections. It also places .text.hot section at the beginning of a page to better utilize the TLB entry. Note that the limitation arises because the linker script employs glob patterns instead of regular expressions for string matching. While there is a method to maintain the current order using complex patterns, this significantly complicates the pattern and increases the likelihood of errors. This patch also changes vmlinux.lds.S for the sparc64 architecture to accommodate specific symbol placement requirements. Co-developed-by: Han Shen Signed-off-by: Han Shen Signed-off-by: Rong Xu Suggested-by: Sriraman Tallam Suggested-by: Krzysztof Pszeniczny Tested-by: Yonghong Song Tested-by: Yabin Cui Tested-by: Nathan Chancellor Reviewed-by: Kees Cook Signed-off-by: Masahiro Yamada --- arch/sparc/kernel/vmlinux.lds.S | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index d317a843f7ea..f1b86eb30340 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -48,6 +48,11 @@ SECTIONS { _text = .; HEAD_TEXT + ALIGN_FUNCTION(); +#ifdef CONFIG_SPARC64 + /* Match text section symbols in head_64.S first */ + *head_64.o(.text) +#endif TEXT_TEXT SCHED_TEXT LOCK_TEXT -- cgit v1.2.3 From d5dc95836147f2e25b134c0ca3a0bc1a5867ea29 Mon Sep 17 00:00:00 2001 From: Rong Xu Date: Sat, 2 Nov 2024 10:51:14 -0700 Subject: kbuild: Add Propeller configuration for kernel build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the build support for using Clang's Propeller optimizer. Like AutoFDO, Propeller uses hardware sampling to gather information about the frequency of execution of different code paths within a binary. This information is then used to guide the compiler's optimization decisions, resulting in a more efficient binary. The support requires a Clang compiler LLVM 19 or later, and the create_llvm_prof tool (https://github.com/google/autofdo/releases/tag/v0.30.1). This commit is limited to x86 platforms that support PMU features like LBR on Intel machines and AMD Zen3 BRS. Here is an example workflow for building an AutoFDO+Propeller optimized kernel: 1) Build the kernel on the host machine, with AutoFDO and Propeller build config CONFIG_AUTOFDO_CLANG=y CONFIG_PROPELLER_CLANG=y then $ make LLVM=1 CLANG_AUTOFDO_PROFILE=” is the profile collected when doing a non-Propeller AutoFDO build. This step builds a kernel that has the same optimization level as AutoFDO, plus a metadata section that records basic block information. This kernel image runs as fast as an AutoFDO optimized kernel. 2) Install the kernel on test/production machines. 3) Run the load tests. The '-c' option in perf specifies the sample event period. We suggest using a suitable prime number, like 500009, for this purpose. For Intel platforms: $ perf record -e BR_INST_RETIRED.NEAR_TAKEN:k -a -N -b -c \ -o -- For AMD platforms: The supported system are: Zen3 with BRS, or Zen4 with amd_lbr_v2 # To see if Zen3 support LBR: $ cat proc/cpuinfo | grep " brs" # To see if Zen4 support LBR: $ cat proc/cpuinfo | grep amd_lbr_v2 # If the result is yes, then collect the profile using: $ perf record --pfm-events RETIRED_TAKEN_BRANCH_INSTRUCTIONS:k -a \ -N -b -c -o -- 4) (Optional) Download the raw perf file to the host machine. 5) Generate Propeller profile: $ create_llvm_prof --binary= --profile= \ --format=propeller --propeller_output_module_name \ --out=_cc_profile.txt \ --propeller_symorder=_ld_profile.txt “create_llvm_prof” is the profile conversion tool, and a prebuilt binary for linux can be found on https://github.com/google/autofdo/releases/tag/v0.30.1 (can also build from source). "" can be something like "/home/user/dir/any_string". This command generates a pair of Propeller profiles: "_cc_profile.txt" and "_ld_profile.txt". 6) Rebuild the kernel using the AutoFDO and Propeller profile files. CONFIG_AUTOFDO_CLANG=y CONFIG_PROPELLER_CLANG=y and $ make LLVM=1 CLANG_AUTOFDO_PROFILE= \ CLANG_PROPELLER_PROFILE_PREFIX= Co-developed-by: Han Shen Signed-off-by: Han Shen Signed-off-by: Rong Xu Suggested-by: Sriraman Tallam Suggested-by: Krzysztof Pszeniczny Suggested-by: Nick Desaulniers Suggested-by: Stephane Eranian Tested-by: Yonghong Song Tested-by: Nathan Chancellor Reviewed-by: Kees Cook Signed-off-by: Masahiro Yamada --- arch/Kconfig | 19 +++++++++++++++++++ arch/x86/Kconfig | 1 + arch/x86/kernel/vmlinux.lds.S | 4 ++++ 3 files changed, 24 insertions(+) (limited to 'arch') diff --git a/arch/Kconfig b/arch/Kconfig index 8dca3b5e6ef5..00551f340dbe 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -831,6 +831,25 @@ config AUTOFDO_CLANG If unsure, say N. +config ARCH_SUPPORTS_PROPELLER_CLANG + bool + +config PROPELLER_CLANG + bool "Enable Clang's Propeller build" + depends on ARCH_SUPPORTS_PROPELLER_CLANG + depends on CC_IS_CLANG && CLANG_VERSION >= 190000 + help + This option enables Clang’s Propeller build. When the Propeller + profiles is specified in variable CLANG_PROPELLER_PROFILE_PREFIX + during the build process, Clang uses the profiles to optimize + the kernel. + + If no profile is specified, Propeller options are still passed + to Clang to facilitate the collection of perf data for creating + the Propeller profiles in subsequent builds. + + If unsure, say N. + config ARCH_SUPPORTS_CFI_CLANG bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9dc87661fb37..89b8fc452a7c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -127,6 +127,7 @@ config X86 select ARCH_SUPPORTS_LTO_CLANG_THIN select ARCH_SUPPORTS_RT select ARCH_SUPPORTS_AUTOFDO_CLANG + select ARCH_SUPPORTS_PROPELLER_CLANG if X86_64 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64 select ARCH_USE_MEMTEST diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index b8c5741d2fb4..cf22081601ed 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -443,6 +443,10 @@ SECTIONS STABS_DEBUG DWARF_DEBUG +#ifdef CONFIG_PROPELLER_CLANG + .llvm_bb_addr_map : { *(.llvm_bb_addr_map) } +#endif + ELF_DETAILS DISCARDS -- cgit v1.2.3 From 214c0eea43b2ea66bcd6467ea57e47ce8874191b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 10 Nov 2024 10:34:30 +0900 Subject: kbuild: add $(objtree)/ prefix to some in-kernel build artifacts $(objtree) refers to the top of the output directory of kernel builds. This commit adds the explicit $(objtree)/ prefix to build artifacts needed for building external modules. This change has no immediate impact, as the top-level Makefile currently defines: objtree := . This commit prepares for supporting the building of external modules in a different directory. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- arch/arm/Makefile | 4 ++-- arch/arm64/Makefile | 2 +- arch/powerpc/Makefile | 4 ++-- arch/riscv/Makefile | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm/Makefile b/arch/arm/Makefile index aafebf145738..00ca7886b18e 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -264,13 +264,13 @@ stack_protector_prepare: prepare0 -mstack-protector-guard=tls \ -mstack-protector-guard-offset=$(shell \ awk '{if ($$2 == "TSK_STACK_CANARY") print $$3;}'\ - include/generated/asm-offsets.h)) + $(objtree)/include/generated/asm-offsets.h)) else stack_protector_prepare: prepare0 $(eval SSP_PLUGIN_CFLAGS := \ -fplugin-arg-arm_ssp_per_task_plugin-offset=$(shell \ awk '{if ($$2 == "TSK_STACK_CANARY") print $$3;}'\ - include/generated/asm-offsets.h)) + $(objtree)/include/generated/asm-offsets.h)) $(eval KBUILD_CFLAGS += $(SSP_PLUGIN_CFLAGS)) $(eval GCC_PLUGINS_CFLAGS += $(SSP_PLUGIN_CFLAGS)) endif diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 9efd3f37c2fd..358c68565bfd 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -71,7 +71,7 @@ stack_protector_prepare: prepare0 -mstack-protector-guard-reg=sp_el0 \ -mstack-protector-guard-offset=$(shell \ awk '{if ($$2 == "TSK_STACK_CANARY") print $$3;}' \ - include/generated/asm-offsets.h)) + $(objtree)/include/generated/asm-offsets.h)) endif ifeq ($(CONFIG_ARM64_BTI_KERNEL),y) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index bbfe4a1f06ef..321b596d2550 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -402,9 +402,9 @@ prepare: stack_protector_prepare PHONY += stack_protector_prepare stack_protector_prepare: prepare0 ifdef CONFIG_PPC64 - $(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "PACA_CANARY") print $$3;}' include/generated/asm-offsets.h)) + $(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "PACA_CANARY") print $$3;}' $(objtree)/include/generated/asm-offsets.h)) else - $(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "TASK_CANARY") print $$3;}' include/generated/asm-offsets.h)) + $(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "TASK_CANARY") print $$3;}' $(objtree)/include/generated/asm-offsets.h)) endif endif diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index d469db9f46f4..a08cfeb6cbf9 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -129,7 +129,7 @@ stack_protector_prepare: prepare0 -mstack-protector-guard-reg=tp \ -mstack-protector-guard-offset=$(shell \ awk '{if ($$2 == "TSK_STACK_CANARY") print $$3;}' \ - include/generated/asm-offsets.h)) + $(objtree)/include/generated/asm-offsets.h)) endif # arch specific predefines for sparse -- cgit v1.2.3