summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-07-15 19:36:01 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-07-15 19:36:01 -0700
commit98896d8795d72acf166f83b06c2706effa019d92 (patch)
tree11b0e41c0cee3d35a1f2b6fe16ef94272a564b9d
parent181a984b7d8d98e5997bcd8e2ebe6ade1b36978e (diff)
parent16df35946120fca2346c415fae429c821391eef8 (diff)
downloadlinux-98896d8795d72acf166f83b06c2706effa019d92.tar.gz
linux-98896d8795d72acf166f83b06c2706effa019d92.tar.bz2
linux-98896d8795d72acf166f83b06c2706effa019d92.zip
Merge tag 'x86_cc_for_v6.11_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 confidential computing updates from Borislav Petkov: "Unrelated x86/cc changes queued here to avoid ugly cross-merges and conflicts: - Carve out CPU hotplug function declarations into a separate header with the goal to be able to use the lockdep assertions in a more flexible manner - As a result, refactor cacheinfo code after carving out a function to return the cache ID associated with a given cache level - Cleanups Add support to be able to kexec TDX guests: - Expand ACPI MADT CPU offlining support - Add machinery to prepare CoCo guests memory before kexec-ing into a new kernel - Cleanup, readjust and massage related code" * tag 'x86_cc_for_v6.11_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits) ACPI: tables: Print MULTIPROC_WAKEUP when MADT is parsed x86/acpi: Add support for CPU offlining for ACPI MADT wakeup method x86/mm: Introduce kernel_ident_mapping_free() x86/smp: Add smp_ops.stop_this_cpu() callback x86/acpi: Do not attempt to bring up secondary CPUs in the kexec case x86/acpi: Rename fields in the acpi_madt_multiproc_wakeup structure x86/mm: Do not zap page table entries mapping unaccepted memory table during kdump x86/mm: Make e820__end_ram_pfn() cover E820_TYPE_ACPI ranges x86/tdx: Convert shared memory back to private on kexec x86/mm: Add callbacks to prepare encrypted memory for kexec x86/tdx: Account shared memory x86/mm: Return correct level from lookup_address() if pte is none x86/mm: Make x86_platform.guest.enc_status_change_*() return an error x86/kexec: Keep CR4.MCE set during kexec for TDX guest x86/relocate_kernel: Use named labels for less confusion cpu/hotplug, x86/acpi: Disable CPU offlining for ACPI MADT wakeup cpu/hotplug: Add support for declaring CPU offlining not supported x86/apic: Mark acpi_mp_wake_* variables as __ro_after_init x86/acpi: Extract ACPI MADT wakeup code into a separate file x86/kexec: Remove spurious unconditional JMP from from identity_mapped() ...
-rw-r--r--arch/x86/Kconfig7
-rw-r--r--arch/x86/coco/core.c1
-rw-r--r--arch/x86/coco/tdx/tdx.c121
-rw-r--r--arch/x86/hyperv/ivm.c22
-rw-r--r--arch/x86/include/asm/acpi.h7
-rw-r--r--arch/x86/include/asm/init.h3
-rw-r--r--arch/x86/include/asm/pgtable.h5
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/include/asm/set_memory.h3
-rw-r--r--arch/x86/include/asm/smp.h1
-rw-r--r--arch/x86/include/asm/x86_init.h14
-rw-r--r--arch/x86/kernel/acpi/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c86
-rw-r--r--arch/x86/kernel/acpi/madt_playdead.S28
-rw-r--r--arch/x86/kernel/acpi/madt_wakeup.c292
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c17
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c14
-rw-r--r--arch/x86/kernel/crash.c12
-rw-r--r--arch/x86/kernel/e820.c9
-rw-r--r--arch/x86/kernel/process.c7
-rw-r--r--arch/x86/kernel/reboot.c18
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S27
-rw-r--r--arch/x86/kernel/x86_init.c8
-rw-r--r--arch/x86/mm/ident_map.c73
-rw-r--r--arch/x86/mm/init_64.c16
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c8
-rw-r--r--arch/x86/mm/pat/set_memory.c75
-rw-r--r--drivers/acpi/tables.c14
-rw-r--r--include/acpi/actbl2.h19
-rw-r--r--include/linux/cacheinfo.h25
-rw-r--r--include/linux/cc_platform.h10
-rw-r--r--include/linux/cpu.h33
-rw-r--r--include/linux/cpuhplock.h49
-rw-r--r--kernel/cpu.c12
34 files changed, 812 insertions, 226 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d7122a1883e..125914536825 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1118,6 +1118,13 @@ config X86_LOCAL_APIC
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI
select IRQ_DOMAIN_HIERARCHY
+config ACPI_MADT_WAKEUP
+ def_bool y
+ depends on X86_64
+ depends on ACPI
+ depends on SMP
+ depends on X86_LOCAL_APIC
+
config X86_IO_APIC
def_bool y
depends on X86_LOCAL_APIC || X86_UP_IOAPIC
diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index b31ef2424d19..0f81f70aca82 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -29,7 +29,6 @@ static bool noinstr intel_cc_platform_has(enum cc_attr attr)
{
switch (attr) {
case CC_ATTR_GUEST_UNROLL_STRING_IO:
- case CC_ATTR_HOTPLUG_DISABLED:
case CC_ATTR_GUEST_MEM_ENCRYPT:
case CC_ATTR_MEM_ENCRYPT:
return true;
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index c1cb90369915..078e2bac2553 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -7,6 +7,7 @@
#include <linux/cpufeature.h>
#include <linux/export.h>
#include <linux/io.h>
+#include <linux/kexec.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
@@ -14,6 +15,7 @@
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/pgtable.h>
+#include <asm/set_memory.h>
/* MMIO direction */
#define EPT_READ 0
@@ -38,6 +40,8 @@
#define TDREPORT_SUBTYPE_0 0
+static atomic_long_t nr_shared;
+
/* Called from __tdx_hypercall() for unrecoverable failure */
noinstr void __noreturn __tdx_hypercall_failed(void)
{
@@ -798,28 +802,124 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
return true;
}
-static bool tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
- bool enc)
+static int tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
+ bool enc)
{
/*
* Only handle shared->private conversion here.
* See the comment in tdx_early_init().
*/
- if (enc)
- return tdx_enc_status_changed(vaddr, numpages, enc);
- return true;
+ if (enc && !tdx_enc_status_changed(vaddr, numpages, enc))
+ return -EIO;
+
+ return 0;
}
-static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
+static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
bool enc)
{
/*
* Only handle private->shared conversion here.
* See the comment in tdx_early_init().
*/
- if (!enc)
- return tdx_enc_status_changed(vaddr, numpages, enc);
- return true;
+ if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc))
+ return -EIO;
+
+ if (enc)
+ atomic_long_sub(numpages, &nr_shared);
+ else
+ atomic_long_add(numpages, &nr_shared);
+
+ return 0;
+}
+
+/* Stop new private<->shared conversions */
+static void tdx_kexec_begin(void)
+{
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
+ /*
+ * Crash kernel reaches here with interrupts disabled: can't wait for
+ * conversions to finish.
+ *
+ * If race happened, just report and proceed.
+ */
+ if (!set_memory_enc_stop_conversion())
+ pr_warn("Failed to stop shared<->private conversions\n");
+}
+
+/* Walk direct mapping and convert all shared memory back to private */
+static void tdx_kexec_finish(void)
+{
+ unsigned long addr, end;
+ long found = 0, shared;
+
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
+ lockdep_assert_irqs_disabled();
+
+ addr = PAGE_OFFSET;
+ end = PAGE_OFFSET + get_max_mapped();
+
+ while (addr < end) {
+ unsigned long size;
+ unsigned int level;
+ pte_t *pte;
+
+ pte = lookup_address(addr, &level);
+ size = page_level_size(level);
+
+ if (pte && pte_decrypted(*pte)) {
+ int pages = size / PAGE_SIZE;
+
+ /*
+ * Touching memory with shared bit set triggers implicit
+ * conversion to shared.
+ *
+ * Make sure nobody touches the shared range from
+ * now on.
+ */
+ set_pte(pte, __pte(0));
+
+ /*
+ * Memory encryption state persists across kexec.
+ * If tdx_enc_status_changed() fails in the first
+ * kernel, it leaves memory in an unknown state.
+ *
+ * If that memory remains shared, accessing it in the
+ * *next* kernel through a private mapping will result
+ * in an unrecoverable guest shutdown.
+ *
+ * The kdump kernel boot is not impacted as it uses
+ * a pre-reserved memory range that is always private.
+ * However, gathering crash information could lead to
+ * a crash if it accesses unconverted memory through
+ * a private mapping which is possible when accessing
+ * that memory through /proc/vmcore, for example.
+ *
+ * In all cases, print error info in order to leave
+ * enough bread crumbs for debugging.
+ */
+ if (!tdx_enc_status_changed(addr, pages, true)) {
+ pr_err("Failed to unshare range %#lx-%#lx\n",
+ addr, addr + size);
+ }
+
+ found += pages;
+ }
+
+ addr += size;
+ }
+
+ __flush_tlb_all();
+
+ shared = atomic_long_read(&nr_shared);
+ if (shared != found) {
+ pr_err("shared page accounting is off\n");
+ pr_err("nr_shared = %ld, nr_found = %ld\n", shared, found);
+ }
}
void __init tdx_early_init(void)
@@ -881,6 +981,9 @@ void __init tdx_early_init(void)
x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
+ x86_platform.guest.enc_kexec_begin = tdx_kexec_begin;
+ x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;
+
/*
* TDX intercepts the RDMSR to read the X2APIC ID in the parallel
* bringup low level code. That raises #VE which cannot be handled
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 768d73de0d09..b4a851d27c7c 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -523,9 +523,9 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
* transition is complete, hv_vtom_set_host_visibility() marks the pages
* as "present" again.
*/
-static bool hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc)
+static int hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc)
{
- return !set_memory_np(kbuffer, pagecount);
+ return set_memory_np(kbuffer, pagecount);
}
/*
@@ -536,20 +536,19 @@ static bool hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc
* with host. This function works as wrap of hv_mark_gpa_visibility()
* with memory base and size.
*/
-static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc)
+static int hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc)
{
enum hv_mem_host_visibility visibility = enc ?
VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE;
u64 *pfn_array;
phys_addr_t paddr;
+ int i, pfn, err;
void *vaddr;
int ret = 0;
- bool result = true;
- int i, pfn;
pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
if (!pfn_array) {
- result = false;
+ ret = -ENOMEM;
goto err_set_memory_p;
}
@@ -568,10 +567,8 @@ static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bo
if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) {
ret = hv_mark_gpa_visibility(pfn, pfn_array,
visibility);
- if (ret) {
- result = false;
+ if (ret)
goto err_free_pfn_array;
- }
pfn = 0;
}
}
@@ -586,10 +583,11 @@ err_set_memory_p:
* order to avoid leaving the memory range in a "broken" state. Setting
* the PRESENT bits shouldn't fail, but return an error if it does.
*/
- if (set_memory_p(kbuffer, pagecount))
- result = false;
+ err = set_memory_p(kbuffer, pagecount);
+ if (err && !ret)
+ ret = err;
- return result;
+ return ret;
}
static bool hv_vtom_tlb_flush_required(bool private)
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 5af926c050f0..21bc53f5ed0c 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -78,6 +78,13 @@ static inline bool acpi_skip_set_wakeup_address(void)
#define acpi_skip_set_wakeup_address acpi_skip_set_wakeup_address
+union acpi_subtable_headers;
+
+int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
+ const unsigned long end);
+
+void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);
+
/*
* Check if the CPU can handle C2 and deeper
*/
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index cc9ccf61b6bd..14d72727d7ee 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -6,6 +6,7 @@
struct x86_mapping_info {
void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
+ void (*free_pgt_page)(void *, void *); /* free buf for page table */
void *context; /* context for alloc_pgt_page */
unsigned long page_flag; /* page flag for PMD or PUD entry */
unsigned long offset; /* ident mapping offset */
@@ -16,4 +17,6 @@ struct x86_mapping_info {
int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
unsigned long pstart, unsigned long pend);
+void kernel_ident_mapping_free(struct x86_mapping_info *info, pgd_t *pgd);
+
#endif /* _ASM_X86_INIT_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 65b8e5bb902c..e39311a89bf4 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -140,6 +140,11 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
+static inline bool pte_decrypted(pte_t pte)
+{
+ return cc_mkdec(pte_val(pte)) == pte_val(pte);
+}
+
#define pmd_dirty pmd_dirty
static inline bool pmd_dirty(pmd_t pmd)
{
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b78644962626..2f321137736c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -549,6 +549,7 @@ enum pg_level {
PG_LEVEL_2M,
PG_LEVEL_1G,
PG_LEVEL_512G,
+ PG_LEVEL_256T,
PG_LEVEL_NUM
};
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 9aee31862b4a..4b2abce2e3e7 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -49,8 +49,11 @@ int set_memory_wb(unsigned long addr, int numpages);
int set_memory_np(unsigned long addr, int numpages);
int set_memory_p(unsigned long addr, int numpages);
int set_memory_4k(unsigned long addr, int numpages);
+
+bool set_memory_enc_stop_conversion(void);
int set_memory_encrypted(unsigned long addr, int numpages);
int set_memory_decrypted(unsigned long addr, int numpages);
+
int set_memory_np_noalias(unsigned long addr, int numpages);
int set_memory_nonglobal(unsigned long addr, int numpages);
int set_memory_global(unsigned long addr, int numpages);
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index a35936b512fe..ca073f40698f 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -35,6 +35,7 @@ struct smp_ops {
int (*cpu_disable)(void);
void (*cpu_die)(unsigned int cpu);
void (*play_dead)(void);
+ void (*stop_this_cpu)(void);
void (*send_call_func_ipi)(const struct cpumask *mask);
void (*send_call_func_single_ipi)(int cpu);
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 6149eabe200f..213cf5379a5a 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -149,12 +149,22 @@ struct x86_init_acpi {
* @enc_status_change_finish Notify HV after the encryption status of a range is changed
* @enc_tlb_flush_required Returns true if a TLB flush is needed before changing page encryption status
* @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status
+ * @enc_kexec_begin Begin the two-step process of converting shared memory back
+ * to private. It stops the new conversions from being started
+ * and waits in-flight conversions to finish, if possible.
+ * @enc_kexec_finish Finish the two-step process of converting shared memory to
+ * private. All memory is private after the call when
+ * the function returns.
+ * It is called on only one CPU while the others are shut down
+ * and with interrupts disabled.
*/
struct x86_guest {
- bool (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc);
- bool (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
+ int (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc);
+ int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
bool (*enc_tlb_flush_required)(bool enc);
bool (*enc_cache_flush_required)(void);
+ void (*enc_kexec_begin)(void);
+ void (*enc_kexec_finish)(void);
};
/**
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index fc17b3f136fe..842a5f449404 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o
+obj-$(CONFIG_ACPI_MADT_WAKEUP) += madt_wakeup.o madt_playdead.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4bf82dbd2a6b..9f4618dcd704 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -67,13 +67,6 @@ static bool has_lapic_cpus __initdata;
static bool acpi_support_online_capable;
#endif
-#ifdef CONFIG_X86_64
-/* Physical address of the Multiprocessor Wakeup Structure mailbox */
-static u64 acpi_mp_wake_mailbox_paddr;
-/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
-static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
-#endif
-
#ifdef CONFIG_X86_IO_APIC
/*
* Locks related to IOAPIC hotplug
@@ -341,60 +334,6 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e
return 0;
}
-
-#ifdef CONFIG_X86_64
-static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
-{
- /*
- * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
- *
- * Wakeup of secondary CPUs is fully serialized in the core code.
- * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
- */
- if (!acpi_mp_wake_mailbox) {
- acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
- sizeof(*acpi_mp_wake_mailbox),
- MEMREMAP_WB);
- }
-
- /*
- * Mailbox memory is shared between the firmware and OS. Firmware will
- * listen on mailbox command address, and once it receives the wakeup
- * command, the CPU associated with the given apicid will be booted.
- *
- * The value of 'apic_id' and 'wakeup_vector' must be visible to the
- * firmware before the wakeup command is visible. smp_store_release()
- * ensures ordering and visibility.
- */
- acpi_mp_wake_mailbox->apic_id = apicid;
- acpi_mp_wake_mailbox->wakeup_vector = start_ip;
- smp_store_release(&acpi_mp_wake_mailbox->command,
- ACPI_MP_WAKE_COMMAND_WAKEUP);
-
- /*
- * Wait for the CPU to wake up.
- *
- * The CPU being woken up is essentially in a spin loop waiting to be
- * woken up. It should not take long for it wake up and acknowledge by
- * zeroing out ->command.
- *
- * ACPI specification doesn't provide any guidance on how long kernel
- * has to wait for a wake up acknowledgement. It also doesn't provide
- * a way to cancel a wake up request if it takes too long.
- *
- * In TDX environment, the VMM has control over how long it takes to
- * wake up secondary. It can postpone scheduling secondary vCPU
- * indefinitely. Giving up on wake up request and reporting error opens
- * possible attack vector for VMM: it can wake up a secondary CPU when
- * kernel doesn't expect it. Wait until positive result of the wake up
- * request.
- */
- while (READ_ONCE(acpi_mp_wake_mailbox->command))
- cpu_relax();
-
- return 0;
-}
-#endif /* CONFIG_X86_64 */
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
@@ -1124,29 +1063,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
}
return 0;
}
-
-#ifdef CONFIG_X86_64
-static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
- const unsigned long end)
-{
- struct acpi_madt_multiproc_wakeup *mp_wake;
-
- if (!IS_ENABLED(CONFIG_SMP))
- return -ENODEV;
-
- mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
- if (BAD_MADT_ENTRY(mp_wake, end))
- return -EINVAL;
-
- acpi_table_print_madt_entry(&header->common);
-
- acpi_mp_wake_mailbox_paddr = mp_wake->base_address;
-
- apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
-
- return 0;
-}
-#endif /* CONFIG_X86_64 */
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
@@ -1343,7 +1259,7 @@ static void __init acpi_process_madt(void)
smp_found_config = 1;
}
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_ACPI_MADT_WAKEUP
/*
* Parse MADT MP Wake entry.
*/
diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S
new file mode 100644
index 000000000000..4e498d28cdc8
--- /dev/null
+++ b/arch/x86/kernel/acpi/madt_playdead.S
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/nospec-branch.h>
+#include <asm/page_types.h>
+#include <asm/processor-flags.h>
+
+ .text
+ .align PAGE_SIZE
+
+/*
+ * asm_acpi_mp_play_dead() - Hand over control of the CPU to the BIOS
+ *
+ * rdi: Address of the ACPI MADT MPWK ResetVector
+ * rsi: PGD of the identity mapping
+ */
+SYM_FUNC_START(asm_acpi_mp_play_dead)
+ /* Turn off global entries. Following CR3 write will flush them. */
+ movq %cr4, %rdx
+ andq $~(X86_CR4_PGE), %rdx
+ movq %rdx, %cr4
+
+ /* Switch to identity mapping */
+ movq %rsi, %cr3
+
+ /* Jump to reset vector */
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rdi
+SYM_FUNC_END(asm_acpi_mp_play_dead)
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
new file mode 100644
index 000000000000..6cfe762be28b
--- /dev/null
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/acpi.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/kexec.h>
+#include <linux/memblock.h>
+#include <linux/pgtable.h>
+#include <linux/sched/hotplug.h>
+#include <asm/apic.h>
+#include <asm/barrier.h>
+#include <asm/init.h>
+#include <asm/intel_pt.h>
+#include <asm/nmi.h>
+#include <asm/processor.h>
+#include <asm/reboot.h>
+
+/* Physical address of the Multiprocessor Wakeup Structure mailbox */
+static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
+
+/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
+static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox __ro_after_init;
+
+static u64 acpi_mp_pgd __ro_after_init;
+static u64 acpi_mp_reset_vector_paddr __ro_after_init;
+
+static void acpi_mp_stop_this_cpu(void)
+{
+ asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_play_dead(void)
+{
+ play_dead_common();
+ asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_cpu_die(unsigned int cpu)
+{
+ u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ unsigned long timeout;
+
+ /*
+ * Use TEST mailbox command to prove that BIOS got control over
+ * the CPU before declaring it dead.
+ *
+ * BIOS has to clear 'command' field of the mailbox.
+ */
+ acpi_mp_wake_mailbox->apic_id = apicid;
+ smp_store_release(&acpi_mp_wake_mailbox->command,
+ ACPI_MP_WAKE_COMMAND_TEST);
+
+ /* Don't wait longer than a second. */
+ timeout = USEC_PER_SEC;
+ while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout)
+ udelay(1);
+
+ if (!timeout)
+ pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
+}
+
+/* The argument is required to match type of x86_mapping_info::alloc_pgt_page */
+static void __init *alloc_pgt_page(void *dummy)
+{
+ return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+}
+
+static void __init free_pgt_page(void *pgt, void *dummy)
+{
+ return memblock_free(pgt, PAGE_SIZE);
+}
+
+/*
+ * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
+ * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
+ * to the identity mapping and the function has be present at the same spot in
+ * the virtual address space before and after switching page tables.
+ */
+static int __init init_transition_pgtable(pgd_t *pgd)
+{
+ pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
+ unsigned long vaddr, paddr;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ vaddr = (unsigned long)asm_acpi_mp_play_dead;
+ pgd += pgd_index(vaddr);
+ if (!pgd_present(*pgd)) {
+ p4d = (p4d_t *)alloc_pgt_page(NULL);
+ if (!p4d)
+ return -ENOMEM;
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ }
+ p4d = p4d_offset(pgd, vaddr);
+ if (!p4d_present(*p4d)) {
+ pud = (pud_t *)alloc_pgt_page(NULL);
+ if (!pud)
+ return -ENOMEM;
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+ }
+ pud = pud_offset(p4d, vaddr);
+ if (!pud_present(*pud)) {
+ pmd = (pmd_t *)alloc_pgt_page(NULL);
+ if (!pmd)
+ return -ENOMEM;
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (!pmd_present(*pmd)) {
+ pte = (pte_t *)alloc_pgt_page(NULL);
+ if (!pte)
+ return -ENOMEM;
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+
+ paddr = __pa(vaddr);
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
+
+ return 0;
+}
+
+static int __init acpi_mp_setup_reset(u64 reset_vector)
+{
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .free_pgt_page = free_pgt_page,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernpg_flag = _KERNPG_TABLE_NOENC,
+ };
+ pgd_t *pgd;
+
+ pgd = alloc_pgt_page(NULL);
+ if (!pgd)
+ return -ENOMEM;
+
+ for (int i = 0; i < nr_pfn_mapped; i++) {
+ unsigned long mstart, mend;
+
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+ }
+
+ if (kernel_ident_mapping_init(&info, pgd,
+ PAGE_ALIGN_DOWN(reset_vector),
+ PAGE_ALIGN(reset_vector + 1))) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+
+ if (init_transition_pgtable(pgd)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+
+ smp_ops.play_dead = acpi_mp_play_dead;
+ smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
+ smp_ops.cpu_die = acpi_mp_cpu_die;
+
+ acpi_mp_reset_vector_paddr = reset_vector;
+ acpi_mp_pgd = __pa(pgd);
+
+ return 0;
+}
+
+static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
+{
+ if (!acpi_mp_wake_mailbox_paddr) {
+ pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
+ *
+ * Wakeup of secondary CPUs is fully serialized in the core code.
+ * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
+ */
+ if (!acpi_mp_wake_mailbox) {
+ acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
+ sizeof(*acpi_mp_wake_mailbox),
+ MEMREMAP_WB);
+ }
+
+ /*