From 3ccfebedd8cf54e291c809c838d8ad5cc00f5688 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 29 Jan 2018 15:20:11 -0500 Subject: powerpc, membarrier: Skip memory barrier in switch_mm() Allow PowerPC to skip the full memory barrier in switch_mm(), and only issue the barrier when scheduling into a task belonging to a process that has registered to use expedited private. Threads targeting the same VM but which belong to different thread groups is a tricky case. It has a few consequences: It turns out that we cannot rely on get_nr_threads(p) to count the number of threads using a VM. We can use (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) instead to skip the synchronize_sched() for cases where the VM only has a single user, and that user only has a single thread. It also turns out that we cannot use for_each_thread() to set thread flags in all threads using a VM, as it only iterates on the thread group. Therefore, test the membarrier state variable directly rather than relying on thread flags. This means membarrier_register_private_expedited() needs to set the MEMBARRIER_STATE_PRIVATE_EXPEDITED flag, issue synchronize_sched(), and only then set MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY which allows private expedited membarrier commands to succeed. membarrier_arch_switch_mm() now tests for the MEMBARRIER_STATE_PRIVATE_EXPEDITED flag. Signed-off-by: Mathieu Desnoyers Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Alan Stern Cc: Alexander Viro Cc: Andrea Parri Cc: Andrew Hunter Cc: Andy Lutomirski Cc: Avi Kivity Cc: Benjamin Herrenschmidt Cc: Boqun Feng Cc: Dave Watson Cc: David Sehr Cc: Greg Hackmann Cc: Linus Torvalds Cc: Maged Michael Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul E. McKenney Cc: Paul Mackerras Cc: Russell King Cc: Will Deacon Cc: linux-api@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20180129202020.8515-3-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/membarrier.h | 26 ++++++++++++++++++++++++++ arch/powerpc/mm/mmu_context.c | 7 +++++++ 3 files changed, 34 insertions(+) create mode 100644 arch/powerpc/include/asm/membarrier.h (limited to 'arch') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2ed525a44734..a2380de50878 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -140,6 +140,7 @@ config PPC select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_PMEM_API if PPC64 + select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST diff --git a/arch/powerpc/include/asm/membarrier.h b/arch/powerpc/include/asm/membarrier.h new file mode 100644 index 000000000000..98ff4f1fcf2b --- /dev/null +++ b/arch/powerpc/include/asm/membarrier.h @@ -0,0 +1,26 @@ +#ifndef _ASM_POWERPC_MEMBARRIER_H +#define _ASM_POWERPC_MEMBARRIER_H + +static inline void membarrier_arch_switch_mm(struct mm_struct *prev, + struct mm_struct *next, + struct task_struct *tsk) +{ + /* + * Only need the full barrier when switching between processes. + * Barrier when switching from kernel to userspace is not + * required here, given that it is implied by mmdrop(). Barrier + * when switching from userspace to kernel is not needed after + * store to rq->curr. + */ + if (likely(!(atomic_read(&next->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED) || !prev)) + return; + + /* + * The membarrier system call requires a full memory barrier + * after storing to rq->curr, before going back to user-space. + */ + smp_mb(); +} + +#endif /* _ASM_POWERPC_MEMBARRIER_H */ diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index d60a62bf4fc7..0ab297c4cfad 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -12,6 +12,7 @@ #include #include +#include #include @@ -58,6 +59,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * * On the read side the barrier is in pte_xchg(), which orders * the store to the PTE vs the load of mm_cpumask. + * + * This full barrier is needed by membarrier when switching + * between processes after store to rq->curr, before user-space + * memory accesses. */ smp_mb(); @@ -80,6 +85,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, if (new_on_cpu) radix_kvm_prefetch_workaround(next); + else + membarrier_arch_switch_mm(prev, next, tsk); /* * The actual HW switching method differs between the various -- cgit v1.2.3 From 306e060435d7a3aef8f6f033e43b0f581638adce Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 29 Jan 2018 15:20:12 -0500 Subject: membarrier: Document scheduler barrier requirements Document the membarrier requirement on having a full memory barrier in __schedule() after coming from user-space, before storing to rq->curr. It is provided by smp_mb__after_spinlock() in __schedule(). Document that membarrier requires a full barrier on transition from kernel thread to userspace thread. We currently have an implicit barrier from atomic_dec_and_test() in mmdrop() that ensures this. The x86 switch_mm_irqs_off() full barrier is currently provided by many cpumask update operations as well as write_cr3(). Document that write_cr3() provides this barrier. Signed-off-by: Mathieu Desnoyers Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Andrea Parri Cc: Andrew Hunter Cc: Andy Lutomirski Cc: Avi Kivity Cc: Benjamin Herrenschmidt Cc: Boqun Feng Cc: Dave Watson Cc: David Sehr Cc: Greg Hackmann Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Maged Michael Cc: Michael Ellerman Cc: Paul E. McKenney Cc: Paul Mackerras Cc: Russell King Cc: Will Deacon Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/20180129202020.8515-4-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5bfe61a5e8e3..9fa7d2e0e15e 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -228,6 +228,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, #endif this_cpu_write(cpu_tlbstate.is_lazy, false); + /* + * The membarrier system call requires a full memory barrier + * before returning to user-space, after storing to rq->curr. + * Writing to CR3 provides that full memory barrier. + */ if (real_prev == next) { VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); -- cgit v1.2.3 From c5f58bd58f432be5d92df33c5458e0bcbee3aadf Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 29 Jan 2018 15:20:13 -0500 Subject: membarrier: Provide GLOBAL_EXPEDITED command Allow expedited membarrier to be used for data shared between processes through shared memory. Processes wishing to receive the membarriers register with MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED. Those which want to issue membarrier invoke MEMBARRIER_CMD_GLOBAL_EXPEDITED. This allows extremely simple kernel-level implementation: we have almost everything we need with the PRIVATE_EXPEDITED barrier code. All we need to do is to add a flag in the mm_struct that will be used to check whether we need to send the IPI to the current thread of each CPU. There is a slight downside to this approach compared to targeting specific shared memory users: when performing a membarrier operation, all registered "global" receivers will get the barrier, even if they don't share a memory mapping with the sender issuing MEMBARRIER_CMD_GLOBAL_EXPEDITED. This registration approach seems to fit the requirement of not disturbing processes that really deeply care about real-time: they simply should not register with MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED. In order to align the membarrier command names, the "MEMBARRIER_CMD_SHARED" command is renamed to "MEMBARRIER_CMD_GLOBAL", keeping an alias of MEMBARRIER_CMD_SHARED to MEMBARRIER_CMD_GLOBAL for UAPI header backward compatibility. Signed-off-by: Mathieu Desnoyers Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Andrea Parri Cc: Andrew Hunter Cc: Andy Lutomirski Cc: Avi Kivity Cc: Benjamin Herrenschmidt Cc: Boqun Feng Cc: Dave Watson Cc: David Sehr Cc: Greg Hackmann Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Maged Michael Cc: Michael Ellerman Cc: Paul E. McKenney Cc: Paul Mackerras Cc: Russell King Cc: Will Deacon Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/20180129202020.8515-5-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/membarrier.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/membarrier.h b/arch/powerpc/include/asm/membarrier.h index 98ff4f1fcf2b..6e20bb5c74ea 100644 --- a/arch/powerpc/include/asm/membarrier.h +++ b/arch/powerpc/include/asm/membarrier.h @@ -13,7 +13,8 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev, * store to rq->curr. */ if (likely(!(atomic_read(&next->membarrier_state) & - MEMBARRIER_STATE_PRIVATE_EXPEDITED) || !prev)) + (MEMBARRIER_STATE_PRIVATE_EXPEDITED | + MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev)) return; /* -- cgit v1.2.3 From ac1ab12a3e6e878274e7107c8c6f326694a1c1f3 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 29 Jan 2018 15:20:16 -0500 Subject: lockin/x86: Implement sync_core_before_usermode() Ensure that a core serializing instruction is issued before returning to user-mode. x86 implements return to user-space through sysexit, sysrel, and sysretq, which are not core serializing. Signed-off-by: Mathieu Desnoyers Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Andrea Parri Cc: Andrew Hunter Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Avi Kivity Cc: Benjamin Herrenschmidt Cc: Boqun Feng Cc: Dave Watson Cc: David Sehr Cc: Greg Hackmann Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Maged Michael Cc: Michael Ellerman Cc: Paul E. McKenney Cc: Paul Mackerras Cc: Russell King Cc: Will Deacon Cc: linux-api@vger.kernel.org Cc: linux-arch@vger.kernel.org Link: http://lkml.kernel.org/r/20180129202020.8515-8-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/sync_core.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 arch/x86/include/asm/sync_core.h (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 423e4b64e683..31030ade4690 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -61,6 +61,7 @@ config X86 select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX + select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_ZONE_DEVICE if X86_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h new file mode 100644 index 000000000000..c67caafd3381 --- /dev/null +++ b/arch/x86/include/asm/sync_core.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SYNC_CORE_H +#define _ASM_X86_SYNC_CORE_H + +#include +#include +#include + +/* + * Ensure that a core serializing instruction is issued before returning + * to user-mode. x86 implements return to user-space through sysexit, + * sysrel, and sysretq, which are not core serializing. + */ +static inline void sync_core_before_usermode(void) +{ + /* With PTI, we unconditionally serialize before running user code. */ + if (static_cpu_has(X86_FEATURE_PTI)) + return; + /* + * Return from interrupt and NMI is done through iret, which is core + * serializing. + */ + if (in_irq() || in_nmi()) + return; + sync_core(); +} + +#endif /* _ASM_X86_SYNC_CORE_H */ -- cgit v1.2.3 From 10bcc80e9dbced128e3b4aa86e4737e5486a45d0 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 29 Jan 2018 15:20:18 -0500 Subject: membarrier/x86: Provide core serializing command There are two places where core serialization is needed by membarrier: 1) When returning from the membarrier IPI, 2) After scheduler updates curr to a thread with a different mm, before going back to user-space, since the curr->mm is used by membarrier to check whether it needs to send an IPI to that CPU. x86-32 uses IRET as return from interrupt, and both IRET and SYSEXIT to go back to user-space. The IRET instruction is core serializing, but not SYSEXIT. x86-64 uses IRET as return from interrupt, which takes care of the IPI. However, it can return to user-space through either SYSRETL (compat code), SYSRETQ, or IRET. Given that SYSRET{L,Q} is not core serializing, we rely instead on write_cr3() performed by switch_mm() to provide core serialization after changing the current mm, and deal with the special case of kthread -> uthread (temporarily keeping current mm into active_mm) by adding a sync_core() in that specific case. Use the new sync_core_before_usermode() to guarantee this. Signed-off-by: Mathieu Desnoyers Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Andrea Parri Cc: Andrew Hunter Cc: Andy Lutomirski Cc: Avi Kivity Cc: Benjamin Herrenschmidt Cc: Boqun Feng Cc: Dave Watson Cc: David Sehr Cc: Greg Hackmann Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Maged Michael Cc: Michael Ellerman Cc: Paul E. McKenney Cc: Paul Mackerras Cc: Russell King Cc: Will Deacon Cc: linux-api@vger.kernel.org Cc: linux-arch@vger.kernel.org Link: http://lkml.kernel.org/r/20180129202020.8515-10-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/entry/entry_32.S | 5 +++++ arch/x86/entry/entry_64.S | 4 ++++ arch/x86/mm/tlb.c | 7 ++++--- 4 files changed, 14 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 31030ade4690..e095bdb9afdf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -54,6 +54,7 @@ config X86 select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 + select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2a35b1e0fb90..abee6d2b9311 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -566,6 +566,11 @@ restore_all: .Lrestore_nocheck: RESTORE_REGS 4 # skip orig_eax/error_code .Lirq_return: + /* + * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization + * when returning from IPI handler and when returning from + * scheduler to user-space. + */ INTERRUPT_RETURN .section .fixup, "ax" diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a83570495162..5816858c8820 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -804,6 +804,10 @@ GLOBAL(restore_regs_and_return_to_kernel) POP_EXTRA_REGS POP_C_REGS addq $8, %rsp /* skip regs->orig_ax */ + /* + * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization + * when returning from IPI handler. + */ INTERRUPT_RETURN ENTRY(native_iret) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 9fa7d2e0e15e..9b34121c8f05 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -229,9 +229,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.is_lazy, false); /* - * The membarrier system call requires a full memory barrier - * before returning to user-space, after storing to rq->curr. - * Writing to CR3 provides that full memory barrier. + * The membarrier system call requires a full memory barrier and + * core serialization before returning to user-space, after + * storing to rq->curr. Writing to CR3 provides that full + * memory barrier and core serializing instruction. */ if (real_prev == next) { VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != -- cgit v1.2.3 From f1e3a12b6543a73cfd47aa7da16d3d73d297795e Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 29 Jan 2018 15:20:19 -0500 Subject: membarrier/arm64: Provide core serializing command Signed-off-by: Mathieu Desnoyers Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Andrea Parri Cc: Andrew Hunter Cc: Andy Lutomirski Cc: Avi Kivity Cc: Benjamin Herrenschmidt Cc: Boqun Feng Cc: Dave Watson Cc: David Sehr Cc: Greg Hackmann Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Maged Michael Cc: Michael Ellerman Cc: Paul E. McKenney Cc: Paul Mackerras Cc: Russell King Cc: Will Deacon Cc: linux-api@vger.kernel.org Cc: linux-arch@vger.kernel.org Link: http://lkml.kernel.org/r/20180129202020.8515-11-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- arch/arm64/Kconfig | 1 + arch/arm64/kernel/entry.S | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c9a7e9e1414f..5b0c06d8dbbe 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -16,6 +16,7 @@ config ARM64 select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV + select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 6d14b8f29b5f..5edde1c2e93e 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -302,6 +302,10 @@ alternative_else_nop_endif ldp x28, x29, [sp, #16 * 14] ldr lr, [sp, #S_LR] add sp, sp, #S_FRAME_SIZE // restore sp + /* + * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on eret context synchronization + * when returning from IPI handler, and when returning to user-space. + */ eret // return to kernel .endm -- cgit v1.2.3