68 files changed, 1657 insertions, 1016 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 50b9837e985b..b37c1c30c16f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2248,6 +2248,15 @@
 			The memory region may be marked as e820 type 12 (0xc)
 			and is NVDIMM or ADR memory.
 
+	memmap=<size>%<offset>-<oldtype>+<newtype>
+			[KNL,ACPI] Convert memory within the specified region
+			from <oldtype> to <newtype>. If "-<oldtype>" is left
+			out, the whole region will be marked as <newtype>,
+			even if previously unavailable. If "+<newtype>" is left
+			out, matching memory will be removed. Types are
+			specified as e820 types, e.g., 1 = RAM, 2 = reserved,
+			3 = ACPI, 12 = PRAM.
+
 	memory_corruption_check=0/1 [X86]
 			Some BIOSes seem to corrupt the first 64k of
 			memory when doing things like suspend/resume.
diff --git a/Documentation/x86/x86_64/5level-paging.txt b/Documentation/x86/x86_64/5level-paging.txt
index 087251a0d99c..2432a5ef86d9 100644
--- a/Documentation/x86/x86_64/5level-paging.txt
+++ b/Documentation/x86/x86_64/5level-paging.txt
@@ -20,12 +20,9 @@ Documentation/x86/x86_64/mm.txt
 
 CONFIG_X86_5LEVEL=y enables the feature.
 
-So far, a kernel compiled with the option enabled will be able to boot
-only on machines that supports the feature -- see for 'la57' flag in
-/proc/cpuinfo.
-
-The plan is to implement boot-time switching between 4- and 5-level paging
-in the future.
+A kernel with CONFIG_X86_5LEVEL=y is still able to boot on 4-level hardware.
+In this case, the additional page table level -- p4d -- will be folded at
+runtime.
 
 == User-space and large virtual address space ==
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cb5b5907dbd6..518b41b097dc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1461,6 +1461,8 @@ config X86_PAE
 
 config X86_5LEVEL
 	bool "Enable 5-level page tables support"
+	select DYNAMIC_MEMORY_LAYOUT
+	select SPARSEMEM_VMEMMAP
 	depends on X86_64
 	---help---
 	  5-level paging enables access to larger address space:
@@ -1469,8 +1471,8 @@ config X86_5LEVEL
 
 	  It will be supported by future Intel CPUs.
 
-	  Note: a kernel with this option enabled can only be booted
-	  on machines that support the feature.
+	  A kernel with the option enabled can be booted on machines that
+	  support 4- or 5-level paging.
 
 	  See Documentation/x86/x86_64/5level-paging.txt for more
 	  information.
@@ -1595,10 +1597,6 @@ config ARCH_HAVE_MEMORY_PRESENT
 	def_bool y
 	depends on X86_32 && DISCONTIGMEM
 
-config NEED_NODE_MEMMAP_SIZE
-	def_bool y
-	depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
-
 config ARCH_FLATMEM_ENABLE
 	def_bool y
 	depends on X86_32 && !NUMA
@@ -2174,10 +2172,17 @@ config PHYSICAL_ALIGN
 
 	  Don't change this unless you know what you are doing.
 
+config DYNAMIC_MEMORY_LAYOUT
+	bool
+	---help---
+	  This option makes base addresses of vmalloc and vmemmap as well as
+	  __PAGE_OFFSET movable during boot.
+
 config RANDOMIZE_MEMORY
 	bool "Randomize the kernel memory sections"
 	depends on X86_64
 	depends on RANDOMIZE_BASE
+	select DYNAMIC_MEMORY_LAYOUT
 	default RANDOMIZE_BASE
 	---help---
 	  Randomizes the base virtual address of kernel memory sections
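The memmap= syntax documented in the kernel-parameters.txt hunk above composes like the existing memmap= options. As a purely illustrative example (the size, offset and types below are invented for the example, not taken from the patch):

	memmap=4G%9G-1+12

This would convert 4G of existing RAM (e820 type 1) starting at the 9G physical offset into PRAM (type 12). Dropping "-1" would mark the region as PRAM regardless of its previous type, and dropping "+12" would instead remove matching RAM from the map.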
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f484ae0ece93..fa42f895fdde 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -78,7 +78,7 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
 vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
 vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
 ifdef CONFIG_X86_64
-	vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o
+	vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o
 	vmlinux-objs-y += $(obj)/mem_encrypt.o
 	vmlinux-objs-y += $(obj)/pgtable_64.o
 endif
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index fc313e29fe2c..fca012baba19 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -33,6 +33,7 @@
 #include <asm/processor-flags.h>
 #include <asm/asm-offsets.h>
 #include <asm/bootparam.h>
+#include "pgtable.h"
 
 /*
  * Locally defined symbols should be marked hidden:
@@ -304,55 +305,77 @@ ENTRY(startup_64)
 	/* Set up the stack */
 	leaq	boot_stack_end(%rbx), %rsp
 
-#ifdef CONFIG_X86_5LEVEL
 	/*
-	 * Check if we need to enable 5-level paging.
-	 * RSI holds real mode data and need to be preserved across
-	 * a function call.
+	 * At this point we are in long mode with 4-level paging enabled,
+	 * but we might want to enable 5-level paging or vice versa.
+	 *
+	 * The problem is that we cannot do it directly. Setting or clearing
+	 * CR4.LA57 in long mode would trigger #GP. So we need to switch off
+	 * long mode and paging first.
+	 *
+	 * We also need a trampoline in lower memory to switch over from
+	 * 4- to 5-level paging for cases when the bootloader puts the kernel
+	 * above 4G, but didn't enable 5-level paging for us.
+	 *
+	 * The same trampoline can be used to switch from 5- to 4-level paging
+	 * mode, like when starting 4-level paging kernel via kexec() when
+	 * original kernel worked in 5-level paging mode.
+	 *
+	 * For the trampoline, we need the top page table to reside in lower
+	 * memory as we don't have a way to load 64-bit values into CR3 in
+	 * 32-bit mode.
+	 *
+	 * We go through the trampoline even if we don't have to: if we're
+	 * already in a desired paging mode. This way the trampoline code gets
+	 * tested on every boot.
 	 */
-	pushq	%rsi
-	call	l5_paging_required
-	popq	%rsi
 
-	/* If l5_paging_required() returned zero, we're done here. */
-	cmpq	$0, %rax
-	je	lvl5
+	/* Make sure we have GDT with 32-bit code segment */
+	leaq	gdt(%rip), %rax
+	movq	%rax, gdt64+2(%rip)
+	lgdt	gdt64(%rip)
 
 	/*
-	 * At this point we are in long mode with 4-level paging enabled,
-	 * but we want to enable 5-level paging.
+	 * paging_prepare() sets up the trampoline and checks if we need to
+	 * enable 5-level paging.
 	 *
-	 * The problem is that we cannot do it directly. Setting LA57 in
-	 * long mode would trigger #GP. So we need to switch off long mode
-	 * first.
+	 * Address of the trampoline is returned in RAX.
+	 * Non-zero RDX on return means we need to enable 5-level paging.
 	 *
-	 * NOTE: This is not going to work if bootloader put us above 4G
-	 * limit.
-	 *
-	 * The first step is go into compatibility mode.
+	 * RSI holds real mode data and needs to be preserved across
+	 * this function call.
 	 */
+	pushq	%rsi
+	call	paging_prepare
+	popq	%rsi
 
-	/* Clear additional page table */
-	leaq	lvl5_pgtable(%rbx), %rdi
-	xorq	%rax, %rax
-	movq	$(PAGE_SIZE/8), %rcx
-	rep	stosq
+	/* Save the trampoline address in RCX */
+	movq	%rax, %rcx
 
 	/*
-	 * Setup current CR3 as the first and only entry in a new top level
-	 * page table.
+	 * Load the address of trampoline_return() into RDI.
+	 * It will be used by the trampoline to return to the main code.
 	 */
-	movq	%cr3, %rdi
-	leaq	0x7 (%rdi), %rax
-	movq	%rax, lvl5_pgtable(%rbx)
+	leaq	trampoline_return(%rip), %rdi
 
 	/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
 	pushq	$__KERNEL32_CS
-	leaq	compatible_mode(%rip), %rax
+	leaq	TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax
 	pushq	%rax
 	lretq
-lvl5:
-#endif
+trampoline_return:
+	/* Restore the stack, the 32-bit trampoline uses its own stack */
+	leaq	boot_stack_end(%rbx), %rsp
+
+	/*
+	 * cleanup_trampoline() would restore trampoline memory.
+	 *
+	 * RSI holds real mode data and needs to be preserved across
+	 * this function call.
+	 */
+	pushq	%rsi
+	call	cleanup_trampoline
+	popq	%rsi
 
 	/* Zero EFLAGS */
 	pushq	$0
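The assembly above relies on paging_prepare() handing back two values in RAX and RDX. A minimal C sketch of that contract, assuming a small two-member structure returned by value per the x86-64 System V ABI (the structure and member names here are illustrative, not quoted from the patch):

	struct paging_config {
		unsigned long trampoline_start;	/* returned in RAX: address of the low-memory trampoline */
		unsigned long l5_required;	/* returned in RDX: non-zero if 5-level paging must be enabled */
	};

	struct paging_config paging_prepare(void);
	void cleanup_trampoline(void);

Because a 16-byte structure of two integer members is returned in the RAX:RDX register pair, the caller can consume the two registers directly, which is exactly what the code above does before far-returning into the 32-bit trampoline.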
@@ -490,46 +513,82 @@ relocated:
 	jmp	*%rax
 
 	.code32
-#ifdef CONFIG_X86_5LEVEL
-compatible_mode:
-	/* Setup data and stack segments */
+/*
+ * This is the 32-bit trampoline that will be copied over to low memory.
+ *
+ * RDI contains the return address (might be above 4G).
+ * ECX contains the base address of the trampoline memory.
+ * Non-zero RDX means we need to enable 5-level paging.
+ */
+ENTRY(trampoline_32bit_src)
+	/* Set up data and stack segments */
 	movl	$__KERNEL_DS, %eax
 	movl	%eax, %ds
 	movl	%eax, %ss
 
+	/* Set up new stack */
+	leal	TRAMPOLINE_32BIT_STACK_END(%ecx), %esp
+
 	/* Disable paging */
 	movl	%cr0, %eax
 	btrl	$X86_CR0_PG_BIT, %eax
 	movl	%eax, %cr0
 
-	/* Point CR3 to 5-level paging */
-	leal	lvl5_pgtable(%ebx), %eax
-	movl	%eax, %cr3
+	/* Check what paging mode we want to be in after the trampoline */
+	cmpl	$0, %edx
+	jz	1f
 
-	/* Enable PAE and LA57 mode */
+	/* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */
+	movl	%cr4, %eax
+	testl	$X86_CR4_LA57, %eax
+	jnz	3f
+	jmp	2f
+1:
+	/* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */
 	movl	%cr4, %eax
-	orl	$(X86_CR4_PAE | X86_CR4_LA57), %eax
+	testl	$X86_CR4_LA57, %eax
+	jz	3f
+2:
+	/* Point CR3 to the trampoline's new top level page table */
+	leal	TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax
+	movl	%eax, %cr3
+3:
+	/* Enable PAE and LA57 (if required) paging modes */
+	movl	$X86_CR4_PAE, %eax
+	cmpl	$0, %edx
+	jz	1f
+	orl	$X86_CR4_LA57, %eax
+1:
 	movl	%eax, %cr4
 
-	/* Calculate address we are running at */
-	call	1f
-1:	popl	%edi
-	subl	$1b, %edi
+	/* Calculate address of paging_enabled() once we are executing in the trampoline */
+	leal	paging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax
 
-	/* Prepare stack for far return to Long Mode */
+	/* Prepare the stack for far return to Long Mode */
 	pushl	$__KERNEL_CS
-	leal	lvl5(%edi), %eax
-	push	%eax
+	pushl	%eax
 
-	/* Enable paging back */
+	/* Enable paging again */
 	movl	$(X86_CR0_PG | X86_CR0_PE), %eax
 	movl	%eax, %cr0
 
 	lret
-#endif
 
+	.code64
+paging_enabled:
+	/* Return from the trampoline */
+	jmp	*%rdi
+
+	/*
+	 * The trampoline code has a size limit.
+	 * Make sure we fail to compile if the trampoline code grows
+	 * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes.
+	 */
+	.org	trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
+
+	.code32
 no_longmode:
-	/* This isn't an x86-64 CPU so hang */
+	/* This isn't an x86-64 CPU, so hang intentionally; we cannot continue */
 1:
 	hlt
 	jmp     1b
@@ -537,6 +596,11 @@ no_longmode:
 #include "../../kernel/verify_cpu.S"
 
 	.data
+gdt64:
+	.word	gdt_end - gdt
+	.long	0
+	.word	0
+	.quad	0
 gdt:
 	.word	gdt_end - gdt
 	.long	gdt
@@ -585,7 +649,3 @@ boot_stack_end:
 	.balign	4096
 pgtable:
 	.fill BOOT_PGT_SIZE, 1, 0
-#ifdef CONFIG_X86_5LEVEL
-lvl5_pgtable:
-	.fill PAGE_SIZE, 1, 0
-#endif
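As a reading aid for trampoline_32bit_src above, here is a rough C rendition of its CR3/CR4 decision logic. It is a sketch only, with the register values modelled as plain parameters; X86_CR4_PAE is bit 5 and X86_CR4_LA57 is bit 12 of CR4:

	#define X86_CR4_PAE	(1UL << 5)
	#define X86_CR4_LA57	(1UL << 12)

	/* CR3 is pointed at the trampoline page table only when the paging depth changes */
	static int must_switch_cr3(unsigned long cr4, int want_5level)
	{
		return !!(cr4 & X86_CR4_LA57) != !!want_5level;
	}

	/* CR4 is rebuilt from scratch: PAE always, LA57 only when 5-level paging is wanted */
	static unsigned long cr4_for_mode(int want_5level)
	{
		return X86_CR4_PAE | (want_5level ? X86_CR4_LA57 : 0);
	}

Going through the trampoline unconditionally, but reloading CR3 only when the depth actually changes, is what lets the same code path serve 4-to-5, 5-to-4 and no-change boots.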
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 8199a6187251..66e42a098d70 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -46,6 +46,12 @@
 #define STATIC
 #include <linux/decompress/mm.h>
 
+#ifdef CONFIG_X86_5LEVEL
+unsigned int pgtable_l5_enabled __ro_after_init;
+unsigned int pgdir_shift __ro_after_init = 39;
+unsigned int ptrs_per_p4d __ro_after_init = 1;
+#endif
+
 extern unsigned long get_cmd_line_ptr(void);
 
 /* Simplified build-specific string for starting entropy. */
@@ -723,6 +729,14 @@ void choose_random_location(unsigned long input,
 		return;
 	}
 
+#ifdef CONFIG_X86_5LEVEL
+	if (__read_cr4() & X86_CR4_LA57) {
+		pgtable_l5_enabled = 1;
+		pgdir_shift = 48;
+		ptrs_per_p4d = 512;
+	}
+#endif
+
 	boot_params->hdr.loadflags |= KASLR_FLAG;
 
 	/* Prepare to add new identity pagetables on demand. */
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/kaslr_64.c
index b5e5e02f8cde..522d11431433 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/kaslr_64.c
@@ -16,13 +16,6 @@
 #define __pa(x)  ((unsigned long)(x))
 #define __va(x)  ((void *)((unsigned long)(x)))
 
-/*
- * The pgtable.h and mm/ident_map.c includes make use of the SME related
- * information which is not used in the compressed image support. Un-define
- * the SME support to avoid any compile and link errors.
- */
-#undef CONFIG_AMD_MEM_ENCRYPT
-
 /* No PAGE_TABLE_ISOLATION support needed either: */
 #undef CONFIG_PAGE_TABLE_ISOLATION
 
@@ -85,13 +78,14 @@ static struct x86_mapping_info mapping_info;
 /* Locates and clears a region for a new top level page table. */
 void initialize_identity_maps(void)
 {
-	unsigned long sev_me_mask = get_sev_encryption_mask();
+	/* If running as an SEV guest, the encryption mask is required. */
+	set_sev_encryption_mask();
 
 	/* Init mapping_info with run-time function/buffer pointers. */
 	mapping_info.alloc_pgt_page = alloc_pgt_page;
 	mapping_info.context = &pgt_data;
-	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask;
-	mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask;
+	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
+	mapping_info.kernpg_flag = _KERNPG_TABLE;
 
 	/*
 	 * It should be impossible for this not to already be true,
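The constants toggled in the kaslr.c hunk above follow directly from how x86-64 splits a virtual address: 12 bits of page offset plus 9 bits per page-table level. A small helper, purely illustrative and not part of the patch, makes the arithmetic explicit:

	/* Illustration only: why LA57 changes the constants used by the KASLR code above */
	static unsigned int pgdir_shift_for(int la57)
	{
		return la57 ? 48 : 39;		/* 12 + 4*9 with five levels, 12 + 3*9 with four */
	}

	static unsigned int ptrs_per_p4d_for(int la57)
	{
		return la57 ? 512 : 1;		/* the p4d level is folded to a single entry without LA57 */
	}

Reading CR4.LA57 rather than CPUID is the right check here: it reflects the paging mode the boot code actually enabled, not merely what the CPU could support.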
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
index 54f5f6625a73..eaa843a52907 100644
--- a/arch/x86/boot/compressed/mem_encrypt.S
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -88,9 +88,7 @@ ENTRY(get_sev_encryption_bit)
 ENDPROC(get_sev_encryption_bit)
 
 	.code64
-ENTRY(get_sev_encryption_mask)
-	xor	%rax, %rax
-
+ENTRY(set_sev_encryption_mask)
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 	push	%rbp
 	push	%rdx
@@ -101,9 +99,7 @@ ENTRY(get_sev_encryption_mask)
 	testl	%eax, %eax
 	jz	.Lno_sev_mask
 
-	xor	%rdx, %rdx
-	bts	%rax, %rdx		/* Create the encryption mask */
-	mov	%rdx, %rax		/* ... and return it */
+	bts	%rax, sme_me_mask(%rip)	/* Create the encryption mask */
 
 .Lno_sev_mask:
 	movq	%rbp, %rsp		/* Restore original stack pointer */
@@ -112,9 +108,16 @@ ENTRY(get_sev_encryption_mask)
 	pop	%rbp
 #endif
 
+	xor	%rax, %rax
 	ret
-ENDPROC(get_sev_encryption_mask)
+ENDPROC(set_sev_encryption_mask)
 
 	.data
 enc_bit:
 	.int	0xffffffff
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+	.balign	8
+GLOBAL(sme_me_mask)
+	.quad	0
+#endif
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 252fee320816..8dd1d5ccae58 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -14,6 +14,7 @@
 
 #include "misc.h"
 #include "error.h"
+#include "pgtable.h"
 #include "../string.h"
 #include "../voffset.h"
 
@@ -169,16 +170,6 @@ void __puthex(unsigned long value)
 	}
 }
 
-static bool l5_supported(void)
-{
-	/* Check if leaf 7 is supported. */
-	if (native_cpuid_eax(0) < 7)
-		return 0;
-
-	/* Check if la57 is supported. */
-	return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31));
-}
-
 #if CONFIG_X86_NEED_RELOCS
 static void handle_relocations(void *output, unsigned long output_len,
 			       unsigned long virt_addr)
@@ -376,12 +367,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
 	console_init();
 	debug_putstr("early console in extract_kernel\n");
 
-	if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) {
-		error("This linux kernel as configured requires 5-level paging\n"
-		      "This CPU does not support the required 'cr4.la57' feature\n"
-		      "Unable to boot - please use a kernel appropriate for your CPU\n");
-	}
-
 	free_mem_ptr     = heap;	/* Heap */
 	free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
 
@@ -392,6 +377,11 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
 	debug_putaddr(output_len);
 	debug_putaddr(kernel_total_size);
 
+#ifdef CONFIG_X86_64
+	/* Report address of 32-bit trampoline */
+	debug_putaddr(trampoline_32bit);
+#endif
+
 	/*
 	 * The memory hole needed for the kernel is the larger of either
 	 * the entire decompressed kernel plus relocation table, or the
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 9d323dc6b159..9e11be4cae19 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -12,6 +12,11 @@
 #undef CONFIG_PARAVIRT_SPINLOCKS
 #undef CONFIG_KASAN
 
+#ifdef CONFIG_X86_5LEVEL
+/* cpu_feature_enabled() cannot be used that early */
+#define pgtable_l5_enabled __pgtable_l5_enabled
+#endif
+
 #include <linux/linkage.h>
 #include <linux/screen_info.h>
 #include <linux/elf.h>
@@ -109,6 +114,6 @@ static inline void console_init(void)
 { }
 #endif
 
-unsigned long get_sev_encryption_mask(void);
+void set_sev_encryption_mask(void);
 
 #endif
diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
new file mode 100644
index 000000000000..91f75638f6e6
--- /dev/null
+++ b/arch/x86/boot/compressed/pgtable.h
@@ -0,0 +1,20 @@
+#ifndef BOOT_COMPRESSED_PAGETABLE_H
+#define BOOT_COMPRESSED_PAGETABLE_H
+
+#define TRAMPOLINE_32BIT_SIZE		(2 * PAGE_SIZE)
+
+#define TRAMPOLINE_32BIT_PGTABLE_OFFSET	0
+
+#define TRAMPOLINE_32BIT_CODE_OFFSET	PAGE_SIZE
+#define TRAMPOLINE_32BIT_CODE_SIZE	0x60
+
+#define TRAMPOLINE_32BIT_STACK_END	TRAMPOLINE_32BIT_SIZE
+
+#ifndef __ASSEMBLER__
+
+extern unsigned long *trampoline_32bit;
+
+extern void trampoline_32bit_src(void *return_ptr);
+
+#endif /* __ASSEMBLER__ */
+#endif /* BOOT_COMPRESSED_PAGETABLE_H */
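The new header above pins down a simple two-page trampoline area. Assuming 4 KiB pages (which the PAGE_SIZE-based offsets imply), the layout can be summarized with a few illustrative accessors; these are not code from the patch:

	#define PAGE_SIZE_4K				4096UL
	#define TRAMPOLINE_32BIT_PGTABLE_OFFSET		0			/* page 0: top-level page table used during the switch */
	#define TRAMPOLINE_32BIT_CODE_OFFSET		PAGE_SIZE_4K		/* page 1: copy of trampoline_32bit_src, at most 0x60 bytes */
	#define TRAMPOLINE_32BIT_STACK_END		(2 * PAGE_SIZE_4K)	/* the 32-bit stack grows down from the end of page 1 */

	static void *trampoline_pgtable(void *trampoline)
	{
		return (char *)trampoline + TRAMPOLINE_32BIT_PGTABLE_OFFSET;
	}

	static void *trampoline_code(void *trampoline)
	{
		return (char *)trampoline + TRAMPOLINE_32BIT_CODE_OFFSET;
	}

	static void *trampoline_stack_top(void *trampoline)
	{
		return (char *)trampoline + TRAMPOLINE_32BIT_STACK_END;
	}

The .org directive added to head_64.S enforces the 0x60-byte code budget at build time, and the debug_putaddr(trampoline_32bit) call added to misc.c reports where the two-page area was placed.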
