From 68cbd415dd4b9c5b9df69f0f091879e56bf5907a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 21 Jun 2024 11:15:58 +0200 Subject: task_work: s/task_work_cancel()/task_work_cancel_func()/ A proper task_work_cancel() API that actually cancels a callback and not *any* callback pointing to a given function is going to be needed for perf events event freeing. Do the appropriate rename to prepare for that. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240621091601.18227-2-frederic@kernel.org --- include/linux/task_work.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 795ef5a68429..23ab01ae185e 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -30,7 +30,7 @@ int task_work_add(struct task_struct *task, struct callback_head *twork, struct callback_head *task_work_cancel_match(struct task_struct *task, bool (*match)(struct callback_head *, void *data), void *data); -struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t); +struct callback_head *task_work_cancel_func(struct task_struct *, task_work_func_t); void task_work_run(void); static inline void exit_task_work(struct task_struct *task) -- cgit v1.2.3 From f409530e4db9dd11b88cb7703c97c8f326ff6566 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 21 Jun 2024 11:15:59 +0200 Subject: task_work: Introduce task_work_cancel() again Re-introduce task_work_cancel(), this time to cancel an actual callback and not *any* callback pointing to a given function. This is going to be needed for perf events event freeing. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240621091601.18227-3-frederic@kernel.org --- include/linux/task_work.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 23ab01ae185e..26b8a47f41fc 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -31,6 +31,7 @@ int task_work_add(struct task_struct *task, struct callback_head *twork, struct callback_head *task_work_cancel_match(struct task_struct *task, bool (*match)(struct callback_head *, void *data), void *data); struct callback_head *task_work_cancel_func(struct task_struct *, task_work_func_t); +bool task_work_cancel(struct task_struct *task, struct callback_head *cb); void task_work_run(void); static inline void exit_task_work(struct task_struct *task) -- cgit v1.2.3 From 3a5465418f5fd970e86a86c7f4075be262682840 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 21 Jun 2024 11:16:01 +0200 Subject: perf: Fix event leak upon exec and file release The perf pending task work is never waited upon the matching event release. In the case of a child event, released via free_event() directly, this can potentially result in a leaked event, such as in the following scenario that doesn't even require a weak IRQ work implementation to trigger: schedule() prepare_task_switch() =======> perf_event_overflow() event->pending_sigtrap = ... irq_work_queue(&event->pending_irq) <======= perf_event_task_sched_out() event_sched_out() event->pending_sigtrap = 0; atomic_long_inc_not_zero(&event->refcount) task_work_add(&event->pending_task) finish_lock_switch() =======> perf_pending_irq() //do nothing, rely on pending task work <======= begin_new_exec() perf_event_exit_task() perf_event_exit_event() // If is child event free_event() WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1) // event is leaked Similar scenarios can also happen with perf_event_remove_on_exec() or simply against concurrent perf_event_release(). Fix this with synchonizing against the possibly remaining pending task work while freeing the event, just like is done with remaining pending IRQ work. This means that the pending task callback neither need nor should hold a reference to the event, preventing it from ever beeing freed. Fixes: 517e6a301f34 ("perf: Fix perf_pending_task() UaF") Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240621091601.18227-5-frederic@kernel.org --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a5304ae8c654..393fb13733b0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -786,6 +786,7 @@ struct perf_event { struct irq_work pending_irq; struct callback_head pending_task; unsigned int pending_work; + struct rcuwait pending_work_wait; atomic_t event_limit; -- cgit v1.2.3 From 466e4d801cd438a1ab2c8a2cce1bef6b65c31bbb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 4 Jul 2024 19:03:36 +0200 Subject: task_work: Add TWA_NMI_CURRENT as an additional notify mode. Adding task_work from NMI context requires the following: - The kasan_record_aux_stack() is not NMU safe and must be avoided. - Using TWA_RESUME is NMI safe. If the NMI occurs while the CPU is in userland then it will continue in userland and not invoke the `work' callback. Add TWA_NMI_CURRENT as an additional notify mode. In this mode skip kasan and use irq_work in hardirq-mode to for needed interrupt. Set TIF_NOTIFY_RESUME within the irq_work callback due to k[ac]san instrumentation in test_and_set_bit() which does not look NMI safe in case of a report. Suggested-by: Peter Zijlstra Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240704170424.1466941-3-bigeasy@linutronix.de --- include/linux/task_work.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 26b8a47f41fc..cf5e7e891a77 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -18,6 +18,7 @@ enum task_work_notify_mode { TWA_RESUME, TWA_SIGNAL, TWA_SIGNAL_NO_IPI, + TWA_NMI_CURRENT, }; static inline bool task_work_pending(struct task_struct *task) -- cgit v1.2.3 From c5d93d23a26012a98b77f9e354ab9b3afd420059 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 4 Jul 2024 19:03:37 +0200 Subject: perf: Enqueue SIGTRAP always via task_work. A signal is delivered by raising irq_work() which works from any context including NMI. irq_work() can be delayed if the architecture does not provide an interrupt vector. In order not to lose a signal, the signal is injected via task_work during event_sched_out(). Instead going via irq_work, the signal could be added directly via task_work. The signal is sent to current and can be enqueued on its return path to userland. Queue signal via task_work and consider possible NMI context. Remove perf_event::pending_sigtrap and and use perf_event::pending_work instead. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marco Elver Tested-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/20240704170424.1466941-4-bigeasy@linutronix.de --- include/linux/perf_event.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 393fb13733b0..ea0d82418d85 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -781,7 +781,6 @@ struct perf_event { unsigned int pending_wakeup; unsigned int pending_kill; unsigned int pending_disable; - unsigned int pending_sigtrap; unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending_irq; struct callback_head pending_task; @@ -963,7 +962,7 @@ struct perf_event_context { struct rcu_head rcu_head; /* - * Sum (event->pending_sigtrap + event->pending_work) + * Sum (event->pending_work + event->pending_work) * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. -- cgit v1.2.3 From 0d40a6d83e3e6751f1107ba33587262d937c969f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 4 Jul 2024 19:03:39 +0200 Subject: perf: Move swevent_htable::recursion into task_struct. The swevent_htable::recursion counter is used to avoid creating an swevent while an event is processed to avoid recursion. The counter is per-CPU and preemption must be disabled to have a stable counter. perf_pending_task() disables preemption to access the counter and then signal. This is problematic on PREEMPT_RT because sending a signal uses a spinlock_t which must not be acquired in atomic on PREEMPT_RT because it becomes a sleeping lock. The atomic context can be avoided by moving the counter into the task_struct. There is a 4 byte hole between futex_state (usually always on) and the following perf pointer (perf_event_ctxp). After the recursion lost some weight it fits perfectly. Move swevent_htable::recursion into task_struct. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marco Elver Link: https://lore.kernel.org/r/20240704170424.1466941-6-bigeasy@linutronix.de --- include/linux/perf_event.h | 6 ------ include/linux/sched.h | 7 +++++++ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ea0d82418d85..99a7ea1d29ed 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -970,12 +970,6 @@ struct perf_event_context { local_t nr_pending; }; -/* - * Number of contexts where an event can trigger: - * task, softirq, hardirq, nmi. - */ -#define PERF_NR_CONTEXTS 4 - struct perf_cpu_pmu_context { struct perf_event_pmu_context epc; struct perf_event_pmu_context *task_epc; diff --git a/include/linux/sched.h b/include/linux/sched.h index 61591ac6eab6..afb1087f5831 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -734,6 +734,12 @@ enum perf_event_task_context { perf_nr_task_contexts, }; +/* + * Number of contexts where an event can trigger: + * task, softirq, hardirq, nmi. + */ +#define PERF_NR_CONTEXTS 4 + struct wake_q_node { struct wake_q_node *next; }; @@ -1256,6 +1262,7 @@ struct task_struct { unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS + u8 perf_recursion[PERF_NR_CONTEXTS]; struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; -- cgit v1.2.3 From 2b84def990d388bed4afe4f21ae383a01991046c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 4 Jul 2024 19:03:41 +0200 Subject: perf: Split __perf_pending_irq() out of perf_pending_irq() perf_pending_irq() invokes perf_event_wakeup() and __perf_pending_irq(). The former is in charge of waking any tasks which waits to be woken up while the latter disables perf-events. The irq_work perf_pending_irq(), while this an irq_work, the callback is invoked in thread context on PREEMPT_RT. This is needed because all the waking functions (wake_up_all(), kill_fasync()) acquire sleep locks which must not be used with disabled interrupts. Disabling events, as done by __perf_pending_irq(), expects a hardirq context and disabled interrupts. This requirement is not fulfilled on PREEMPT_RT. Split functionality based on perf_event::pending_disable into irq_work named `pending_disable_irq' and invoke it in hardirq context on PREEMPT_RT. Rename the split out callback to perf_pending_disable(). Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marco Elver Tested-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/20240704170424.1466941-8-bigeasy@linutronix.de --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 99a7ea1d29ed..65ece0d5b4b6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -783,6 +783,7 @@ struct perf_event { unsigned int pending_disable; unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending_irq; + struct irq_work pending_disable_irq; struct callback_head pending_task; unsigned int pending_work; struct rcuwait pending_work_wait; -- cgit v1.2.3