summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/entry/common.c2
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/include/asm/linkage.h7
-rw-r--r--arch/x86/include/asm/mmu_context.h3
-rw-r--r--arch/x86/include/asm/paravirt_types.h15
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h37
-rw-r--r--arch/x86/kernel/alternative.c9
-rw-r--r--arch/x86/kernel/callthunks.c1
-rw-r--r--arch/x86/kernel/cpu/amd.c28
-rw-r--r--arch/x86/kernel/cpu/bugs.c17
-rw-r--r--arch/x86/kernel/paravirt.c67
-rw-r--r--arch/x86/kernel/process.c7
-rw-r--r--arch/x86/kernel/setup.c8
-rw-r--r--arch/x86/kernel/shstk.c33
-rw-r--r--arch/x86/kvm/mmu/mmu.c21
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h15
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c152
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h5
-rw-r--r--arch/x86/kvm/svm/sev.c34
-rw-r--r--arch/x86/kvm/svm/svm.c43
-rw-r--r--arch/x86/kvm/svm/svm.h1
-rw-r--r--arch/x86/kvm/x86.c5
-rw-r--r--arch/x86/lib/memcpy_64.S2
-rw-r--r--arch/x86/lib/memmove_64.S2
-rw-r--r--arch/x86/lib/memset_64.S2
-rw-r--r--arch/x86/xen/efi.c2
-rw-r--r--arch/x86/xen/enlighten.c2
-rw-r--r--arch/x86/xen/enlighten_hvm.c2
-rw-r--r--arch/x86/xen/enlighten_pv.c40
-rw-r--r--arch/x86/xen/mmu_pv.c55
-rw-r--r--arch/x86/xen/multicalls.h4
32 files changed, 315 insertions, 311 deletions
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 6c2826417b33..93c60c0c9d4a 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -294,7 +294,7 @@ static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
inc_irq_stat(irq_hv_callback_count);
- xen_hvm_evtchn_do_upcall();
+ xen_evtchn_do_upcall();
set_irq_regs(old_regs);
}
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1a4def36d5bb..17715cb8731d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1419,7 +1419,6 @@ struct kvm_arch {
* the thread holds the MMU lock in write mode.
*/
spinlock_t tdp_mmu_pages_lock;
- struct workqueue_struct *tdp_mmu_zap_wq;
#endif /* CONFIG_X86_64 */
/*
@@ -1835,7 +1834,7 @@ void kvm_mmu_vendor_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
-int kvm_mmu_init_vm(struct kvm *kvm);
+void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 5ff49fd67732..571fe4d2d232 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -105,6 +105,13 @@
CFI_POST_PADDING \
SYM_FUNC_END(__cfi_##name)
+/* UML needs to be able to override memcpy() and friends for KASAN. */
+#ifdef CONFIG_UML
+# define SYM_FUNC_ALIAS_MEMFUNC SYM_FUNC_ALIAS_WEAK
+#else
+# define SYM_FUNC_ALIAS_MEMFUNC SYM_FUNC_ALIAS
+#endif
+
/* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */
#define SYM_TYPED_FUNC_START(name) \
SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 416901d406f8..8dac45a2c7fc 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -186,8 +186,7 @@ do { \
#else
#define deactivate_mm(tsk, mm) \
do { \
- if (!tsk->vfork_done) \
- shstk_free(tsk); \
+ shstk_free(tsk); \
load_gs_index(0); \
loadsegment(fs, 0); \
} while (0)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 4acbcddddc29..772d03487520 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -9,13 +9,6 @@ struct paravirt_patch_site {
u8 type; /* type of this instruction */
u8 len; /* length of original instruction */
};
-
-/* Lazy mode for batching updates / context switch */
-enum paravirt_lazy_mode {
- PARAVIRT_LAZY_NONE,
- PARAVIRT_LAZY_MMU,
- PARAVIRT_LAZY_CPU,
-};
#endif
#ifdef CONFIG_PARAVIRT
@@ -549,14 +542,6 @@ int paravirt_disable_iospace(void);
__PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
-enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
-void paravirt_start_context_switch(struct task_struct *prev);
-void paravirt_end_context_switch(struct task_struct *next);
-
-void paravirt_enter_lazy_mmu(void);
-void paravirt_leave_lazy_mmu(void);
-void paravirt_flush_lazy_mmu(void);
-
void _paravirt_nop(void);
void paravirt_BUG(void);
unsigned long paravirt_ret0(void);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 0086920cda06..a3669a7774ed 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -683,13 +683,11 @@ extern u16 get_llc_id(unsigned int cpu);
#ifdef CONFIG_CPU_SUP_AMD
extern u32 amd_get_nodes_per_socket(void);
extern u32 amd_get_highest_perf(void);
-extern bool cpu_has_ibpb_brtype_microcode(void);
extern void amd_clear_divider(void);
extern void amd_check_microcode(void);
#else
static inline u32 amd_get_nodes_per_socket(void) { return 0; }
static inline u32 amd_get_highest_perf(void) { return 0; }
-static inline bool cpu_has_ibpb_brtype_microcode(void) { return false; }
static inline void amd_clear_divider(void) { }
static inline void amd_check_microcode(void) { }
#endif
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 5fc35f889cd1..7048dfacc04b 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -36,6 +36,7 @@
extern struct shared_info *HYPERVISOR_shared_info;
extern struct start_info *xen_start_info;
+#include <asm/bug.h>
#include <asm/processor.h>
#define XEN_SIGNATURE "XenVMMXenVMM"
@@ -63,4 +64,40 @@ void __init xen_pvh_init(struct boot_params *boot_params);
void __init mem_map_via_hcall(struct boot_params *boot_params_p);
#endif
+/* Lazy mode for batching updates / context switch */
+enum xen_lazy_mode {
+ XEN_LAZY_NONE,
+ XEN_LAZY_MMU,
+ XEN_LAZY_CPU,
+};
+
+DECLARE_PER_CPU(enum xen_lazy_mode, xen_lazy_mode);
+DECLARE_PER_CPU(unsigned int, xen_lazy_nesting);
+
+static inline void enter_lazy(enum xen_lazy_mode mode)
+{
+ enum xen_lazy_mode old_mode = this_cpu_read(xen_lazy_mode);
+
+ if (mode == old_mode) {
+ this_cpu_inc(xen_lazy_nesting);
+ return;
+ }
+
+ BUG_ON(old_mode != XEN_LAZY_NONE);
+
+ this_cpu_write(xen_lazy_mode, mode);
+}
+
+static inline void leave_lazy(enum xen_lazy_mode mode)
+{
+ BUG_ON(this_cpu_read(xen_lazy_mode) != mode);
+
+ if (this_cpu_read(xen_lazy_nesting) == 0)
+ this_cpu_write(xen_lazy_mode, XEN_LAZY_NONE);
+ else
+ this_cpu_dec(xen_lazy_nesting);
+}
+
+enum xen_lazy_mode xen_get_lazy_mode(void);
+
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a5ead6a6d233..517ee01503be 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -720,13 +720,8 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
{
s32 *s;
- /*
- * Do not patch out the default return thunks if those needed are the
- * ones generated by the compiler.
- */
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK) &&
- (x86_return_thunk == __x86_return_thunk))
- return;
+ if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ static_call_force_reinit();
for (s = start; s < end; s++) {
void *dest = NULL, *addr = (void *)s + *s;
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index c06bfc086565..faa9f2299848 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -272,7 +272,6 @@ void __init callthunks_patch_builtin_calls(void)
pr_info("Setting up call depth tracking\n");
mutex_lock(&text_mutex);
callthunks_setup(&cs, &builtin_coretext);
- static_call_force_reinit();
thunks_initialized = true;
mutex_unlock(&text_mutex);
}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index dd8379d84445..03ef962a6992 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -766,6 +766,15 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_TOPOEXT))
smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
+
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) {
+ if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ else if (c->x86 >= 0x19 && !wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ setup_force_cpu_cap(X86_FEATURE_SBPB);
+ }
+ }
}
static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -1301,25 +1310,6 @@ void amd_check_microcode(void)
on_each_cpu(zenbleed_check_cpu, NULL, 1);
}
-bool cpu_has_ibpb_brtype_microcode(void)
-{
- switch (boot_cpu_data.x86) {
- /* Zen1/2 IBPB flushes branch type predictions too. */
- case 0x17:
- return boot_cpu_has(X86_FEATURE_AMD_IBPB);
- case 0x19:
- /* Poke the MSR bit on Zen3/4 to check its presence. */
- if (!wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
- setup_force_cpu_cap(X86_FEATURE_SBPB);
- return true;
- } else {
- return false;
- }
- default:
- return false;
- }
-}
-
/*
* Issue a DIV 0/1 insn to clear any division data from previous DIV
* operations.
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index f081d26616ac..10499bcd4e39 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -2404,27 +2404,16 @@ early_param("spec_rstack_overflow", srso_parse_cmdline);
static void __init srso_select_mitigation(void)
{
- bool has_microcode;
+ bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off())
goto pred_cmd;
- /*
- * The first check is for the kernel running as a guest in order
- * for guests to verify whether IBPB is a viable mitigation.
- */
- has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) || cpu_has_ibpb_brtype_microcode();
if (!has_microcode) {
pr_warn("IBPB-extending microcode not applied!\n");
pr_warn(SRSO_NOTICE);
} else {
/*
- * Enable the synthetic (even if in a real CPUID leaf)
- * flags for guests.
- */
- setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
-
- /*
* Zen1/2 with SMT off aren't vulnerable after the right
* IBPB microcode has been applied.
*/
@@ -2444,7 +2433,7 @@ static void __init srso_select_mitigation(void)
switch (srso_cmd) {
case SRSO_CMD_OFF:
- return;
+ goto pred_cmd;
case SRSO_CMD_MICROCODE:
if (has_microcode) {
@@ -2717,7 +2706,7 @@ static ssize_t srso_show_state(char *buf)
return sysfs_emit(buf, "%s%s\n",
srso_strings[srso_mitigation],
- (cpu_has_ibpb_brtype_microcode() ? "" : ", no microcode"));
+ boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) ? "" : ", no microcode");
}
static ssize_t gds_show_state(char *buf)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 975f98d5eee5..97f1436c1a20 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -143,66 +143,7 @@ int paravirt_disable_iospace(void)
return request_resource(&ioport_resource, &reserve_ioports);
}
-static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
-
-static inline void enter_lazy(enum paravirt_lazy_mode mode)
-{
- BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
-
- this_cpu_write(paravirt_lazy_mode, mode);
-}
-
-static void leave_lazy(enum paravirt_lazy_mode mode)
-{
- BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode);
-
- this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
-}
-
-void paravirt_enter_lazy_mmu(void)
-{
- enter_lazy(PARAVIRT_LAZY_MMU);
-}
-
-void paravirt_leave_lazy_mmu(void)
-{
- leave_lazy(PARAVIRT_LAZY_MMU);
-}
-
-void paravirt_flush_lazy_mmu(void)
-{
- preempt_disable();
-
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- arch_leave_lazy_mmu_mode();
- arch_enter_lazy_mmu_mode();
- }
-
- preempt_enable();
-}
-
#ifdef CONFIG_PARAVIRT_XXL
-void paravirt_start_context_switch(struct task_struct *prev)
-{
- BUG_ON(preemptible());
-
- if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
- arch_leave_lazy_mmu_mode();
- set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
- }
- enter_lazy(PARAVIRT_LAZY_CPU);
-}
-
-void paravirt_end_context_switch(struct task_struct *next)
-{
- BUG_ON(preemptible());
-
- leave_lazy(PARAVIRT_LAZY_CPU);
-
- if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
- arch_enter_lazy_mmu_mode();
-}
-
static noinstr void pv_native_write_cr2(unsigned long val)
{
native_write_cr2(val);
@@ -229,14 +170,6 @@ static noinstr void pv_native_safe_halt(void)
}
#endif
-enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
-{
- if (in_interrupt())
- return PARAVIRT_LAZY_NONE;
-
- return this_cpu_read(paravirt_lazy_mode);
-}
-
struct pv_info pv_info = {
.name = "bare hardware",
#ifdef CONFIG_PARAVIRT_XXL
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9f0909142a0a..b6f4e8399fca 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -257,13 +257,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
io_bitmap_share(p);
- /*
- * If copy_thread() if failing, don't leak the shadow stack possibly
- * allocated in shstk_alloc_thread_stack() above.
- */
- if (ret)
- shstk_free(p);
-
return ret;
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b9145a63da77..b098b1fa2470 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -358,15 +358,11 @@ static void __init add_early_ima_buffer(u64 phys_addr)
#if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE)
int __init ima_free_kexec_buffer(void)
{
- int rc;
-
if (!ima_kexec_buffer_size)
return -ENOENT;
- rc = memblock_phys_free(ima_kexec_buffer_phys,
- ima_kexec_buffer_size);
- if (rc)
- return rc;
+ memblock_free_late(ima_kexec_buffer_phys,
+ ima_kexec_buffer_size);
ima_kexec_buffer_phys = 0;
ima_kexec_buffer_size = 0;
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index fd689921a1db..59e15dd8d0f8 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -205,10 +205,21 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long cl
return 0;
/*
- * For CLONE_VM, except vfork, the child needs a separate shadow
+ * For CLONE_VFORK the child will share the parents shadow stack.
+ * Make sure to clear the internal tracking of the thread shadow
+ * stack so the freeing logic run for child knows to leave it alone.
+ */
+ if (clone_flags & CLONE_VFORK) {
+ shstk->base = 0;
+ shstk->size = 0;
+ return 0;
+ }
+
+ /*
+ * For !CLONE_VM the child will use a copy of the parents shadow
* stack.
*/
- if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM)
+ if (!(clone_flags & CLONE_VM))
return 0;
size = adjust_shstk_size(stack_size);
@@ -408,7 +419,25 @@ void shstk_free(struct task_struct *tsk)
if (!tsk->mm || tsk->mm != current->mm)
return;
+ /*
+ * If shstk->base is NULL, then this task is not managing its
+ * own shadow stack (CLONE_VFORK). So skip freeing it.
+ */
+ if (!shstk->base)
+ return;
+
+ /*
+ * shstk->base is NULL for CLONE_VFORK child tasks, and so is
+ * normal. But size = 0 on a shstk->base is not normal and
+ * indicated an attempt to free the thread shadow stack twice.
+ * Warn about it.
+ */
+ if (WARN_ON(!shstk->size))
+ return;
+
unmap_shadow_stack(shstk->base, shstk->size);
+
+ shstk->size = 0;
}
static int wrss_control(bool enable)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e1d011c67cc6..f7901cb4d2fa 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6167,20 +6167,15 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
}
-int kvm_mmu_init_vm(struct kvm *kvm)
+void kvm_mmu_init_vm(struct kvm *kvm)
{
- int r;
-
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
- if (tdp_mmu_enabled) {
- r = kvm_mmu_init_tdp_mmu(kvm);
- if (r < 0)
- return r;
- }
+ if (tdp_mmu_enabled)
+ kvm_mmu_init_tdp_mmu(kvm);
kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
@@ -6189,8 +6184,6 @@ int kvm_mmu_init_vm(struct kvm *kvm)
kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
-
- return 0;
}
static void mmu_free_vm_memory_caches(struct kvm *kvm)
@@ -6246,7 +6239,6 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
bool flush;
- int i;
if (WARN_ON_ONCE(gfn_end <= gfn_start))
return;
@@ -6257,11 +6249,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
- if (tdp_mmu_enabled) {
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
- flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
- gfn_end, true, flush);
- }
+ if (tdp_mmu_enabled)
+ flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush);
if (flush)
kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index b102014e2c60..decc1f153669 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -58,7 +58,12 @@ struct kvm_mmu_page {
bool tdp_mmu_page;
bool unsync;
- u8 mmu_valid_gen;
+ union {
+ u8 mmu_valid_gen;
+
+ /* Only accessed under slots_lock. */
+ bool tdp_mmu_scheduled_root_to_zap;
+ };
/*
* The shadow page can't be replaced by an equivalent huge page
@@ -100,13 +105,7 @@ struct kvm_mmu_page {
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
tdp_ptep_t ptep;
};
- union {
- DECLARE_BITMAP(unsync_child_bitmap, 512);
- struct {
- struct work_struct tdp_mmu_async_work;
- void *tdp_mmu_async_data;
- };
- };
+ DECLARE_BITMAP(unsync_child_bitmap, 512);
/*
* Tracks shadow pages that, if zapped, would allow KVM to create an NX
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 6c63f2d1675f..6cd4dd631a2f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -12,18 +12,10 @@
#include <trace/events/kvm.h>
/* Initializes the TDP MMU for the VM, if enabled. */
-int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
+void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
- struct workqueue_struct *wq;
-
- wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
- if (!wq)
- return -ENOMEM;
-
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
- kvm->arch.tdp_mmu_zap_wq = wq;
- return 1;
}
/* Arbitrarily returns true so that this may be used in if statements. */
@@ -46,20 +38,15 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
* ultimately frees all roots.
*/
kvm_tdp_mmu_invalidate_all_roots(kvm);
-
- /*
- * Destroying a workqueue also first flushes the workqueue, i.e. no
- * need to invoke kvm_tdp_mmu_zap_invalidated_roots().
- */
- destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
+ kvm_tdp_mmu_zap_invalidated_roots(kvm);
WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
/*
* Ensure that all the outstanding RCU callbacks to free shadow pages
- * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
- * can call kvm_tdp_mmu_put_root and create new callbacks.
+ * can run before the VM is torn down. Putting the last reference to
+ * zapped roots will create new callbacks.
*/
rcu_barrier();
}
@@ -86,46 +73,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
tdp_mmu_free_sp(sp);
}
-static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
- bool shared);
-
-static void tdp_mmu_zap_root_work(struct work_struct *work)
-{
- struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
- tdp_mmu_async_work);
- struct kvm *kvm = root->tdp_mmu_async_data;
-
- read_lock(&kvm->mmu_lock);
-
- /*
- * A TLB flush is not necessary as KVM performs a local TLB flush when
- * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
- * to a different pCPU. Note, the local TLB flush on reuse also
- * invalidates any paging-structure-cache entries, i.e. TLB entries for
- * intermediate paging structures, that may be zapped, as such entries
- * are associated with the ASID on both VMX and SVM.
- */
- tdp_mmu_zap_root(kvm, root, true);
-
- /*
- * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
- * avoiding an infinite loop. By design, the root is reachable while
- * it's being asynchronously zapped, thus a different task can put its
- * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
- * asynchronously zapped root is unavoidable.
- */
- kvm_tdp_mmu_put_root(kvm, root, true);
-
- read_unlock(&kvm->mmu_lock);
-}
-
-static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
-{
- root->tdp_mmu_async_data = kvm;
- INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
- queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
-}
-
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared)
{
@@ -211,8 +158,12 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
- __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false); \
+ _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _shared, false)) \
+ if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) { \
+ } else
/*
* Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
@@ -292,7 +243,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
* by a memslot update or by the destruction of the VM. Initialize the
* refcount to two; one reference for the vCPU, and one reference for
* the TDP MMU itself, which is held until the root is invalidated and
- * is ultimately put by tdp_mmu_zap_root_work().
+ * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
*/
refcount_set(&root->tdp_mmu_root_count, 2);
@@ -877,13 +828,12 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
* true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
* more SPTEs were zapped since the MMU lock was last acquired.
*/
-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
- bool can_yield, bool flush)
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
- flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, false)
+ flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
return flush;
}
@@ -891,7 +841,6 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *root;
- int i;
/*
* Zap all roots, including invalid roots, as all SPTEs must be dropped
@@ -905,10 +854,8 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
* is being destroyed or the userspace VMM has exited. In both cases,
* KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
*/
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- for_each_tdp_mmu_root_yield_safe(kvm, root, i)
- tdp_mmu_zap_root(kvm, root, false);
- }
+ for_each_tdp_mmu_root_yield_safe(kvm, root, false)
+ tdp_mmu_zap_root(kvm, root, false);
}
/*
@@ -917,18 +864,47 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
*/
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
- flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
+ struct kvm_mmu_page *root;
+
+ read_lock(&kvm->mmu_lock);
+
+ for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
+ if (!root->tdp_mmu_scheduled_root_to_zap)
+ continue;
+
+ root->tdp_mmu_scheduled_root_to_zap = false;
+ KVM_BUG_ON(!root->role.invalid, kvm);
+
+ /*
+ * A TLB flush is not necessary as KVM performs a local TLB
+ * flush when allocating a new root (see kvm_mmu_load()), and
+ * when migrating a vCPU to a different pCPU. Note, the local
+ * TLB flush on reuse also invalidates paging-structure-cache
+ * entries, i.e. TLB entries for intermediate paging structures,
+ * that may be zapped, as such entries are associated with the
+ * ASID on both VMX and SVM.
+ */
+ tdp_mmu_zap_root(kvm, root, true);
+
+ /*
+ * The referenced needs to be put *after* zapping the root, as
+ * the root must be reachable by mmu_notifiers while it's being
+ * zapped
+ */
+ kvm_tdp_mmu_put_root(kvm, root, true);
+ }
+
+ read_unlock(&kvm->mmu_lo