summaryrefslogtreecommitdiff
path: root/kernel/events
diff options
context:
space:
mode:
authorNamhyung Kim <namhyung@kernel.org>2026-01-12 08:51:57 -0800
committerSasha Levin <sashal@kernel.org>2026-03-04 07:19:44 -0500
commit88cc56ca28f40bb24c19c7e58b5a33de6f7a6b54 (patch)
tree1aaa1ff5cbb99fc0e826719ea93fc3bc322fec76 /kernel/events
parent7beae54111c34ca63357ef120e115889b915beb5 (diff)
downloadlinux-88cc56ca28f40bb24c19c7e58b5a33de6f7a6b54.tar.gz
linux-88cc56ca28f40bb24c19c7e58b5a33de6f7a6b54.tar.bz2
linux-88cc56ca28f40bb24c19c7e58b5a33de6f7a6b54.zip
perf/core: Fix slow perf_event_task_exit() with LBR callstacks
[ Upstream commit 4960626f956d63dce57f099016c2ecbe637a8229 ] I got a report that a task is stuck in perf_event_exit_task() waiting for global_ctx_data_rwsem. On large systems with lots threads, it'd have performance issues when it grabs the lock to iterate all threads in the system to allocate the context data. And it'd block task exit path which is problematic especially under memory pressure. perf_event_open perf_event_alloc attach_perf_ctx_data attach_global_ctx_data percpu_down_write (global_ctx_data_rwsem) for_each_process_thread alloc_task_ctx_data do_exit perf_event_exit_task percpu_down_read (global_ctx_data_rwsem) It should not hold the global_ctx_data_rwsem on the exit path. Let's skip allocation for exiting tasks and free the data carefully. Reported-by: Rosalie Fang <rosaliefang@google.com> Suggested-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://patch.msgid.link/20260112165157.1919624-1-namhyung@kernel.org Signed-off-by: Sasha Levin <sashal@kernel.org>
Diffstat (limited to 'kernel/events')
-rw-r--r--kernel/events/core.c20
1 files changed, 18 insertions, 2 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8cca80094624..69c56cad88a8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5280,9 +5280,20 @@ attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
return -ENOMEM;
for (;;) {
- if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
+ if (try_cmpxchg(&task->perf_ctx_data, &old, cd)) {
if (old)
perf_free_ctx_data_rcu(old);
+ /*
+ * Above try_cmpxchg() pairs with try_cmpxchg() from
+ * detach_task_ctx_data() such that
+ * if we race with perf_event_exit_task(), we must
+ * observe PF_EXITING.
+ */
+ if (task->flags & PF_EXITING) {
+ /* detach_task_ctx_data() may free it already */
+ if (try_cmpxchg(&task->perf_ctx_data, &cd, NULL))
+ perf_free_ctx_data_rcu(cd);
+ }
return 0;
}
@@ -5328,6 +5339,8 @@ again:
/* Allocate everything */
scoped_guard (rcu) {
for_each_process_thread(g, p) {
+ if (p->flags & PF_EXITING)
+ continue;
cd = rcu_dereference(p->perf_ctx_data);
if (cd && !cd->global) {
cd->global = 1;
@@ -14294,8 +14307,11 @@ void perf_event_exit_task(struct task_struct *task)
/*
* Detach the perf_ctx_data for the system-wide event.
+ *
+ * Done without holding global_ctx_data_rwsem; typically
+ * attach_global_ctx_data() will skip over this task, but otherwise
+ * attach_task_ctx_data() will observe PF_EXITING.
*/
- guard(percpu_read)(&global_ctx_data_rwsem);
detach_task_ctx_data(task);
}