From 76ba6acfcce871db13ad51c6dc8f56fec2e92853 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Thu, 20 Jun 2024 20:21:24 +0800 Subject: mm: optimize the redundant loop of mm_update_owner_next() When mm_update_owner_next() is racing with swapoff (try_to_unuse()) or /proc or ptrace or page migration (get_task_mm()), it is impossible to find an appropriate task_struct in the loop whose mm_struct is the same as the target mm_struct. If the above race condition is combined with the stress-ng-zombie and stress-ng-dup tests, such a long loop can easily cause a Hard Lockup in write_lock_irq() for tasklist_lock. Recognize this situation in advance and exit early. Link: https://lkml.kernel.org/r/20240620122123.3877432-1-alexjlzheng@tencent.com Signed-off-by: Jinliang Zheng Acked-by: Michal Hocko Cc: Christian Brauner Cc: Jens Axboe Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Oleg Nesterov Cc: Tycho Andersen Cc: Signed-off-by: Andrew Morton --- kernel/exit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index f95a2c1338a8..81fcee45d630 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -484,6 +484,8 @@ retry: * Search through everything else, we should not get here often. */ for_each_process(g) { + if (atomic_read(&mm->mm_users) <= 1) + break; if (g->flags & PF_KTHREAD) continue; for_each_thread(g, c) { -- cgit v1.2.3 From 2a22b773b15f5aa97c029acad79bda11ce5f2b4d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 Jun 2024 17:29:24 +0200 Subject: memcg: mm_update_next_owner: kill the "retry" logic Add the new helper, try_to_set_owner(), which tries to update mm->owner once we see c->mm == mm. This way mm_update_next_owner() doesn't need to restart the list_for_each_entry/for_each_process loops from the very beginning if it races with exit/exec, it can just continue. Unlike the current code, try_to_set_owner() re-checks tsk->mm == mm before it drops tasklist_lock, so it doesn't need get/put_task_struct(). Link: https://lkml.kernel.org/r/20240626152924.GA17933@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Michal Hocko Cc: Christian Brauner Cc: Eric W. Biederman Cc: Jens Axboe Cc: Jinliang Zheng Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Tycho Andersen Signed-off-by: Andrew Morton --- kernel/exit.c | 57 +++++++++++++++++++++++++++------------------------------ 1 file changed, 27 insertions(+), 30 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 81fcee45d630..877fae2cc705 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -439,6 +439,23 @@ static void coredump_task_exit(struct task_struct *tsk) } #ifdef CONFIG_MEMCG +/* drops tasklist_lock if succeeds */ +static bool try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) +{ + bool ret = false; + + task_lock(tsk); + if (likely(tsk->mm == mm)) { + /* tsk can't pass exit_mm/exec_mmap and exit */ + read_unlock(&tasklist_lock); + WRITE_ONCE(mm->owner, tsk); + lru_gen_migrate_mm(mm); + ret = true; + } + task_unlock(tsk); + return ret; +} + /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ @@ -446,7 +463,6 @@ void mm_update_next_owner(struct mm_struct *mm) { struct task_struct *c, *g, *p = current; -retry: /* * If the exiting or execing task is not the owner, it's * someone else's problem. @@ -468,16 +484,16 @@ retry: * Search in the children */ list_for_each_entry(c, &p->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; + if (c->mm == mm && try_to_set_owner(c, mm)) + goto ret; } /* * Search in the siblings */ list_for_each_entry(c, &p->real_parent->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; + if (c->mm == mm && try_to_set_owner(c, mm)) + goto ret; } /* @@ -489,9 +505,11 @@ retry: if (g->flags & PF_KTHREAD) continue; for_each_thread(g, c) { - if (c->mm == mm) - goto assign_new_owner; - if (c->mm) + struct mm_struct *c_mm = READ_ONCE(c->mm); + if (c_mm == mm) { + if (try_to_set_owner(c, mm)) + goto ret; + } else if (c_mm) break; } } @@ -502,30 +520,9 @@ retry: * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ WRITE_ONCE(mm->owner, NULL); + ret: return; -assign_new_owner: - BUG_ON(c == p); - get_task_struct(c); - /* - * The task_lock protects c->mm from changing. - * We always want mm->owner->mm == mm - */ - task_lock(c); - /* - * Delay read_unlock() till we have the task_lock() - * to ensure that c does not slip away underneath us - */ - read_unlock(&tasklist_lock); - if (c->mm != mm) { - task_unlock(c); - put_task_struct(c); - goto retry; - } - WRITE_ONCE(mm->owner, c); - lru_gen_migrate_mm(mm); - task_unlock(c); - put_task_struct(c); } #endif /* CONFIG_MEMCG */ -- cgit v1.2.3 From d73d00352145fb51d31771047aa939850d87fa50 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 Jun 2024 17:29:30 +0200 Subject: memcg: mm_update_next_owner: move for_each_thread() into try_to_set_owner() mm_update_next_owner() checks the children / real_parent->children to avoid the "everything else" loop in the likely case, but this won't work if a child/sibling has a zombie leader with ->mm == NULL. Move the for_each_thread() logic into try_to_set_owner(), if nothing else this makes the children/siblings/everything searches more consistent. Link: https://lkml.kernel.org/r/20240626152930.GA17936@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Michal Hocko Cc: Christian Brauner Cc: Eric W. Biederman Cc: Jens Axboe Cc: Jinliang Zheng Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Tycho Andersen Signed-off-by: Andrew Morton --- kernel/exit.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 877fae2cc705..a5dd736c6767 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -440,7 +440,7 @@ static void coredump_task_exit(struct task_struct *tsk) #ifdef CONFIG_MEMCG /* drops tasklist_lock if succeeds */ -static bool try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) +static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) { bool ret = false; @@ -456,12 +456,28 @@ static bool try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) return ret; } +static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(g, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm == mm) { + if (__try_to_set_owner(t, mm)) + return true; + } else if (t_mm) + break; + } + + return false; +} + /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ void mm_update_next_owner(struct mm_struct *mm) { - struct task_struct *c, *g, *p = current; + struct task_struct *g, *p = current; /* * If the exiting or execing task is not the owner, it's @@ -483,19 +499,17 @@ void mm_update_next_owner(struct mm_struct *mm) /* * Search in the children */ - list_for_each_entry(c, &p->children, sibling) { - if (c->mm == mm && try_to_set_owner(c, mm)) + list_for_each_entry(g, &p->children, sibling) { + if (try_to_set_owner(g, mm)) goto ret; } - /* * Search in the siblings */ - list_for_each_entry(c, &p->real_parent->children, sibling) { - if (c->mm == mm && try_to_set_owner(c, mm)) + list_for_each_entry(g, &p->real_parent->children, sibling) { + if (try_to_set_owner(g, mm)) goto ret; } - /* * Search through everything else, we should not get here often. */ @@ -504,14 +518,8 @@ void mm_update_next_owner(struct mm_struct *mm) break; if (g->flags & PF_KTHREAD) continue; - for_each_thread(g, c) { - struct mm_struct *c_mm = READ_ONCE(c->mm); - if (c_mm == mm) { - if (try_to_set_owner(c, mm)) - goto ret; - } else if (c_mm) - break; - } + if (try_to_set_owner(g, mm)) + goto ret; } read_unlock(&tasklist_lock); /* -- cgit v1.2.3