diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-04-02 15:45:30 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-04-02 15:45:30 -0700 |
| commit | d22fff81418edc92be534cad8d59da914049bf69 (patch) | |
| tree | 96b22b20bbc789a76e744bcfc11a7f0854b62ece | |
| parent | 986b37c0ae4f0a3f93d8974d03a9cbc1502dd377 (diff) | |
| parent | eaeb8e76cd5751e805f6e4a3fcec91d283e3b0c2 (diff) | |
| download | linux-d22fff81418edc92be534cad8d59da914049bf69.tar.gz linux-d22fff81418edc92be534cad8d59da914049bf69.tar.bz2 linux-d22fff81418edc92be534cad8d59da914049bf69.zip | |
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
- Extend the memmap= boot parameter syntax to allow the redeclaration
and dropping of existing ranges, and to support all e820 range types
(Jan H. Schönherr)
- Improve the W+X boot time security checks to remove false positive
warnings on Xen (Jan Beulich)
- Support booting as Xen PVH guest (Juergen Gross)
- Improved 5-level paging (LA57) support, in particular it's possible
now to have a single kernel image for both 4-level and 5-level
hardware (Kirill A. Shutemov)
- AMD hardware RAM encryption support (SME/SEV) fixes (Tom Lendacky)
- Preparatory commits for hardware-encrypted RAM support on Intel CPUs.
(Kirill A. Shutemov)
- Improved Intel-MID support (Andy Shevchenko)
- Show EFI page tables in page_tables debug files (Andy Lutomirski)
- ... plus misc fixes and smaller cleanups
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (56 commits)
x86/cpu/tme: Fix spelling: "configuation" -> "configuration"
x86/boot: Fix SEV boot failure from change to __PHYSICAL_MASK_SHIFT
x86/mm: Update comment in detect_tme() regarding x86_phys_bits
x86/mm/32: Remove unused node_memmap_size_bytes() & CONFIG_NEED_NODE_MEMMAP_SIZE logic
x86/mm: Remove pointless checks in vmalloc_fault
x86/platform/intel-mid: Add special handling for ACPI HW reduced platforms
ACPI, x86/boot: Introduce the ->reduced_hw_early_init() ACPI callback
ACPI, x86/boot: Split out acpi_generic_reduce_hw_init() and export
x86/pconfig: Provide defines and helper to run MKTME_KEY_PROG leaf
x86/pconfig: Detect PCONFIG targets
x86/tme: Detect if TME and MKTME is activated by BIOS
x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G
x86/boot/compressed/64: Use page table in trampoline memory
x86/boot/compressed/64: Use stack from trampoline memory
x86/boot/compressed/64: Make sure we have a 32-bit code segment
x86/mm: Do not use paravirtualized calls in native_set_p4d()
kdump, vmcoreinfo: Export pgtable_l5_enabled value
x86/boot/compressed/64: Prepare new top-level page table for trampoline
x86/boot/compressed/64: Set up trampoline memory
x86/boot/compressed/64: Save and restore trampoline memory
...
68 files changed, 1657 insertions, 1016 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 50b9837e985b..b37c1c30c16f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2248,6 +2248,15 @@ The memory region may be marked as e820 type 12 (0xc) and is NVDIMM or ADR memory. + memmap=<size>%<offset>-<oldtype>+<newtype> + [KNL,ACPI] Convert memory within the specified region + from <oldtype> to <newtype>. If "-<oldtype>" is left + out, the whole region will be marked as <newtype>, + even if previously unavailable. If "+<newtype>" is left + out, matching memory will be removed. Types are + specified as e820 types, e.g., 1 = RAM, 2 = reserved, + 3 = ACPI, 12 = PRAM. + memory_corruption_check=0/1 [X86] Some BIOSes seem to corrupt the first 64k of memory when doing things like suspend/resume. diff --git a/Documentation/x86/x86_64/5level-paging.txt b/Documentation/x86/x86_64/5level-paging.txt index 087251a0d99c..2432a5ef86d9 100644 --- a/Documentation/x86/x86_64/5level-paging.txt +++ b/Documentation/x86/x86_64/5level-paging.txt @@ -20,12 +20,9 @@ Documentation/x86/x86_64/mm.txt CONFIG_X86_5LEVEL=y enables the feature. -So far, a kernel compiled with the option enabled will be able to boot -only on machines that supports the feature -- see for 'la57' flag in -/proc/cpuinfo. - -The plan is to implement boot-time switching between 4- and 5-level paging -in the future. +Kernel with CONFIG_X86_5LEVEL=y still able to boot on 4-level hardware. +In this case additional page table level -- p4d -- will be folded at +runtime. == User-space and large virtual address space == diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cb5b5907dbd6..518b41b097dc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1461,6 +1461,8 @@ config X86_PAE config X86_5LEVEL bool "Enable 5-level page tables support" + select DYNAMIC_MEMORY_LAYOUT + select SPARSEMEM_VMEMMAP depends on X86_64 ---help--- 5-level paging enables access to larger address space: @@ -1469,8 +1471,8 @@ config X86_5LEVEL It will be supported by future Intel CPUs. - Note: a kernel with this option enabled can only be booted - on machines that support the feature. + A kernel with the option enabled can be booted on machines that + support 4- or 5-level paging. See Documentation/x86/x86_64/5level-paging.txt for more information. @@ -1595,10 +1597,6 @@ config ARCH_HAVE_MEMORY_PRESENT def_bool y depends on X86_32 && DISCONTIGMEM -config NEED_NODE_MEMMAP_SIZE - def_bool y - depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) - config ARCH_FLATMEM_ENABLE def_bool y depends on X86_32 && !NUMA @@ -2174,10 +2172,17 @@ config PHYSICAL_ALIGN Don't change this unless you know what you are doing. +config DYNAMIC_MEMORY_LAYOUT + bool + ---help--- + This option makes base addresses of vmalloc and vmemmap as well as + __PAGE_OFFSET movable during boot. + config RANDOMIZE_MEMORY bool "Randomize the kernel memory sections" depends on X86_64 depends on RANDOMIZE_BASE + select DYNAMIC_MEMORY_LAYOUT default RANDOMIZE_BASE ---help--- Randomizes the base virtual address of kernel memory sections diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f484ae0ece93..fa42f895fdde 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -78,7 +78,7 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 - vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o + vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o vmlinux-objs-y += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o endif diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index fc313e29fe2c..fca012baba19 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -33,6 +33,7 @@ #include <asm/processor-flags.h> #include <asm/asm-offsets.h> #include <asm/bootparam.h> +#include "pgtable.h" /* * Locally defined symbols should be marked hidden: @@ -304,55 +305,77 @@ ENTRY(startup_64) /* Set up the stack */ leaq boot_stack_end(%rbx), %rsp -#ifdef CONFIG_X86_5LEVEL /* - * Check if we need to enable 5-level paging. - * RSI holds real mode data and need to be preserved across - * a function call. + * At this point we are in long mode with 4-level paging enabled, + * but we might want to enable 5-level paging or vice versa. + * + * The problem is that we cannot do it directly. Setting or clearing + * CR4.LA57 in long mode would trigger #GP. So we need to switch off + * long mode and paging first. + * + * We also need a trampoline in lower memory to switch over from + * 4- to 5-level paging for cases when the bootloader puts the kernel + * above 4G, but didn't enable 5-level paging for us. + * + * The same trampoline can be used to switch from 5- to 4-level paging + * mode, like when starting 4-level paging kernel via kexec() when + * original kernel worked in 5-level paging mode. + * + * For the trampoline, we need the top page table to reside in lower + * memory as we don't have a way to load 64-bit values into CR3 in + * 32-bit mode. + * + * We go though the trampoline even if we don't have to: if we're + * already in a desired paging mode. This way the trampoline code gets + * tested on every boot. */ - pushq %rsi - call l5_paging_required - popq %rsi - /* If l5_paging_required() returned zero, we're done here. */ - cmpq $0, %rax - je lvl5 + /* Make sure we have GDT with 32-bit code segment */ + leaq gdt(%rip), %rax + movq %rax, gdt64+2(%rip) + lgdt gdt64(%rip) /* - * At this point we are in long mode with 4-level paging enabled, - * but we want to enable 5-level paging. + * paging_prepare() sets up the trampoline and checks if we need to + * enable 5-level paging. * - * The problem is that we cannot do it directly. Setting LA57 in - * long mode would trigger #GP. So we need to switch off long mode - * first. + * Address of the trampoline is returned in RAX. + * Non zero RDX on return means we need to enable 5-level paging. * - * NOTE: This is not going to work if bootloader put us above 4G - * limit. - * - * The first step is go into compatibility mode. + * RSI holds real mode data and needs to be preserved across + * this function call. */ + pushq %rsi + call paging_prepare + popq %rsi - /* Clear additional page table */ - leaq lvl5_pgtable(%rbx), %rdi - xorq %rax, %rax - movq $(PAGE_SIZE/8), %rcx - rep stosq + /* Save the trampoline address in RCX */ + movq %rax, %rcx /* - * Setup current CR3 as the first and only entry in a new top level - * page table. + * Load the address of trampoline_return() into RDI. + * It will be used by the trampoline to return to the main code. */ - movq %cr3, %rdi - leaq 0x7 (%rdi), %rax - movq %rax, lvl5_pgtable(%rbx) + leaq trampoline_return(%rip), %rdi /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ pushq $__KERNEL32_CS - leaq compatible_mode(%rip), %rax + leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax pushq %rax lretq -lvl5: -#endif +trampoline_return: + /* Restore the stack, the 32-bit trampoline uses its own stack */ + leaq boot_stack_end(%rbx), %rsp + + /* + * cleanup_trampoline() would restore trampoline memory. + * + * RSI holds real mode data and needs to be preserved across + * this function call. + */ + pushq %rsi + call cleanup_trampoline + popq %rsi /* Zero EFLAGS */ pushq $0 @@ -490,46 +513,82 @@ relocated: jmp *%rax .code32 -#ifdef CONFIG_X86_5LEVEL -compatible_mode: - /* Setup data and stack segments */ +/* + * This is the 32-bit trampoline that will be copied over to low memory. + * + * RDI contains the return address (might be above 4G). + * ECX contains the base address of the trampoline memory. + * Non zero RDX on return means we need to enable 5-level paging. + */ +ENTRY(trampoline_32bit_src) + /* Set up data and stack segments */ movl $__KERNEL_DS, %eax movl %eax, %ds movl %eax, %ss + /* Set up new stack */ + leal TRAMPOLINE_32BIT_STACK_END(%ecx), %esp + /* Disable paging */ movl %cr0, %eax btrl $X86_CR0_PG_BIT, %eax movl %eax, %cr0 - /* Point CR3 to 5-level paging */ - leal lvl5_pgtable(%ebx), %eax - movl %eax, %cr3 + /* Check what paging mode we want to be in after the trampoline */ + cmpl $0, %edx + jz 1f - /* Enable PAE and LA57 mode */ + /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */ + movl %cr4, %eax + testl $X86_CR4_LA57, %eax + jnz 3f + jmp 2f +1: + /* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */ movl %cr4, %eax - orl $(X86_CR4_PAE | X86_CR4_LA57), %eax + testl $X86_CR4_LA57, %eax + jz 3f +2: + /* Point CR3 to the trampoline's new top level page table */ + leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax + movl %eax, %cr3 +3: + /* Enable PAE and LA57 (if required) paging modes */ + movl $X86_CR4_PAE, %eax + cmpl $0, %edx + jz 1f + orl $X86_CR4_LA57, %eax +1: movl %eax, %cr4 - /* Calculate address we are running at */ - call 1f -1: popl %edi - subl $1b, %edi + /* Calculate address of paging_enabled() once we are executing in the trampoline */ + leal paging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax - /* Prepare stack for far return to Long Mode */ + /* Prepare the stack for far return to Long Mode */ pushl $__KERNEL_CS - leal lvl5(%edi), %eax - push %eax + pushl %eax - /* Enable paging back */ + /* Enable paging again */ movl $(X86_CR0_PG | X86_CR0_PE), %eax movl %eax, %cr0 lret -#endif + .code64 +paging_enabled: + /* Return from the trampoline */ + jmp *%rdi + + /* + * The trampoline code has a size limit. + * Make sure we fail to compile if the trampoline code grows + * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes. + */ + .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE + + .code32 no_longmode: - /* This isn't an x86-64 CPU so hang */ + /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: hlt jmp 1b @@ -537,6 +596,11 @@ no_longmode: #include "../../kernel/verify_cpu.S" .data +gdt64: + .word gdt_end - gdt + .long 0 + .word 0 + .quad 0 gdt: .word gdt_end - gdt .long gdt @@ -585,7 +649,3 @@ boot_stack_end: .balign 4096 pgtable: .fill BOOT_PGT_SIZE, 1, 0 -#ifdef CONFIG_X86_5LEVEL -lvl5_pgtable: - .fill PAGE_SIZE, 1, 0 -#endif diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 8199a6187251..66e42a098d70 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -46,6 +46,12 @@ #define STATIC #include <linux/decompress/mm.h> +#ifdef CONFIG_X86_5LEVEL +unsigned int pgtable_l5_enabled __ro_after_init; +unsigned int pgdir_shift __ro_after_init = 39; +unsigned int ptrs_per_p4d __ro_after_init = 1; +#endif + extern unsigned long get_cmd_line_ptr(void); /* Simplified build-specific string for starting entropy. */ @@ -723,6 +729,14 @@ void choose_random_location(unsigned long input, return; } +#ifdef CONFIG_X86_5LEVEL + if (__read_cr4() & X86_CR4_LA57) { + pgtable_l5_enabled = 1; + pgdir_shift = 48; + ptrs_per_p4d = 512; + } +#endif + boot_params->hdr.loadflags |= KASLR_FLAG; /* Prepare to add new identity pagetables on demand. */ diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/kaslr_64.c index b5e5e02f8cde..522d11431433 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/kaslr_64.c @@ -16,13 +16,6 @@ #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)((unsigned long)(x))) -/* - * The pgtable.h and mm/ident_map.c includes make use of the SME related - * information which is not used in the compressed image support. Un-define - * the SME support to avoid any compile and link errors. - */ -#undef CONFIG_AMD_MEM_ENCRYPT - /* No PAGE_TABLE_ISOLATION support needed either: */ #undef CONFIG_PAGE_TABLE_ISOLATION @@ -85,13 +78,14 @@ static struct x86_mapping_info mapping_info; /* Locates and clears a region for a new top level page table. */ void initialize_identity_maps(void) { - unsigned long sev_me_mask = get_sev_encryption_mask(); + /* If running as an SEV guest, the encryption mask is required. */ + set_sev_encryption_mask(); /* Init mapping_info with run-time function/buffer pointers. */ mapping_info.alloc_pgt_page = alloc_pgt_page; mapping_info.context = &pgt_data; - mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask; - mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask; + mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask; + mapping_info.kernpg_flag = _KERNPG_TABLE; /* * It should be impossible for this not to already be true, diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index 54f5f6625a73..eaa843a52907 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -88,9 +88,7 @@ ENTRY(get_sev_encryption_bit) ENDPROC(get_sev_encryption_bit) .code64 -ENTRY(get_sev_encryption_mask) - xor %rax, %rax - +ENTRY(set_sev_encryption_mask) #ifdef CONFIG_AMD_MEM_ENCRYPT push %rbp push %rdx @@ -101,9 +99,7 @@ ENTRY(get_sev_encryption_mask) testl %eax, %eax jz .Lno_sev_mask - xor %rdx, %rdx - bts %rax, %rdx /* Create the encryption mask */ - mov %rdx, %rax /* ... and return it */ + bts %rax, sme_me_mask(%rip) /* Create the encryption mask */ .Lno_sev_mask: movq %rbp, %rsp /* Restore original stack pointer */ @@ -112,9 +108,16 @@ ENTRY(get_sev_encryption_mask) pop %rbp #endif + xor %rax, %rax ret -ENDPROC(get_sev_encryption_mask) +ENDPROC(set_sev_encryption_mask) .data enc_bit: .int 0xffffffff + +#ifdef CONFIG_AMD_MEM_ENCRYPT + .balign 8 +GLOBAL(sme_me_mask) + .quad 0 +#endif diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 252fee320816..8dd1d5ccae58 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -14,6 +14,7 @@ #include "misc.h" #include "error.h" +#include "pgtable.h" #include "../string.h" #include "../voffset.h" @@ -169,16 +170,6 @@ void __puthex(unsigned long value) } } -static bool l5_supported(void) -{ - /* Check if leaf 7 is supported. */ - if (native_cpuid_eax(0) < 7) - return 0; - - /* Check if la57 is supported. */ - return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)); -} - #if CONFIG_X86_NEED_RELOCS static |
