summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorShakeel Butt <shakeel.butt@linux.dev>2025-01-30 16:05:42 -0800
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2025-02-21 14:01:35 +0100
commitedd408444537ecdf89d719acf331cebc0c5b1351 (patch)
tree52cf6815cd0a1d13520771e18cc6f9485f9ea76a /kernel
parent05b9d614eb64bcd1c590b07698be72d43f51efaf (diff)
downloadlinux-edd408444537ecdf89d719acf331cebc0c5b1351.tar.gz
linux-edd408444537ecdf89d719acf331cebc0c5b1351.tar.bz2
linux-edd408444537ecdf89d719acf331cebc0c5b1351.zip
cgroup: fix race between fork and cgroup.kill
commit b69bb476dee99d564d65d418e9a20acca6f32c3f upstream. Tejun reported the following race between fork() and cgroup.kill at [1]. Tejun: I was looking at cgroup.kill implementation and wondering whether there could be a race window. So, __cgroup_kill() does the following: k1. Set CGRP_KILL. k2. Iterate tasks and deliver SIGKILL. k3. Clear CGRP_KILL. The copy_process() does the following: c1. Copy a bunch of stuff. c2. Grab siglock. c3. Check fatal_signal_pending(). c4. Commit to forking. c5. Release siglock. c6. Call cgroup_post_fork() which puts the task on the css_set and tests CGRP_KILL. The intention seems to be that either a forking task gets SIGKILL and terminates on c3 or it sees CGRP_KILL on c6 and kills the child. However, I don't see what guarantees that k3 can't happen before c6. ie. After a forking task passes c5, k2 can take place and then before the forking task reaches c6, k3 can happen. Then, nobody would send SIGKILL to the child. What am I missing? This is indeed a race. One way to fix this race is by taking cgroup_threadgroup_rwsem in write mode in __cgroup_kill() as the fork() side takes cgroup_threadgroup_rwsem in read mode from cgroup_can_fork() to cgroup_post_fork(). However that would be heavy handed as this adds one more potential stall scenario for cgroup.kill which is usually called under extreme situation like memory pressure. To fix this race, let's maintain a sequence number per cgroup which gets incremented on __cgroup_kill() call. On the fork() side, the cgroup_can_fork() will cache the sequence number locally and recheck it against the cgroup's sequence number at cgroup_post_fork() site. If the sequence numbers mismatch, it means __cgroup_kill() can been called and we should send SIGKILL to the newly created task. Reported-by: Tejun Heo <tj@kernel.org> Closes: https://lore.kernel.org/all/Z5QHE2Qn-QZ6M-KW@slm.duckdns.org/ [1] Fixes: 661ee6280931 ("cgroup: introduce cgroup.kill") Cc: stable@vger.kernel.org # v5.14+ Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev> Reviewed-by: Michal Koutný <mkoutny@suse.com> Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup/cgroup.c20
1 files changed, 12 insertions, 8 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e275eaf2de7f..216535e055e1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4013,7 +4013,7 @@ static void __cgroup_kill(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- set_bit(CGRP_KILL, &cgrp->flags);
+ cgrp->kill_seq++;
spin_unlock_irq(&css_set_lock);
css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
@@ -4029,10 +4029,6 @@ static void __cgroup_kill(struct cgroup *cgrp)
send_sig(SIGKILL, task, 0);
}
css_task_iter_end(&it);
-
- spin_lock_irq(&css_set_lock);
- clear_bit(CGRP_KILL, &cgrp->flags);
- spin_unlock_irq(&css_set_lock);
}
static void cgroup_kill(struct cgroup *cgrp)
@@ -6489,6 +6485,10 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
get_css_set(cset);
+ if (kargs->cgrp)
+ kargs->kill_seq = kargs->cgrp->kill_seq;
+ else
+ kargs->kill_seq = cset->dfl_cgrp->kill_seq;
spin_unlock_irq(&css_set_lock);
if (!(kargs->flags & CLONE_INTO_CGROUP)) {
@@ -6672,6 +6672,7 @@ void cgroup_post_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
+ unsigned int cgrp_kill_seq = 0;
unsigned long cgrp_flags = 0;
bool kill = false;
struct cgroup_subsys *ss;
@@ -6685,10 +6686,13 @@ void cgroup_post_fork(struct task_struct *child,
/* init tasks are special, only link regular threads */
if (likely(child->pid)) {
- if (kargs->cgrp)
+ if (kargs->cgrp) {
cgrp_flags = kargs->cgrp->flags;
- else
+ cgrp_kill_seq = kargs->cgrp->kill_seq;
+ } else {
cgrp_flags = cset->dfl_cgrp->flags;
+ cgrp_kill_seq = cset->dfl_cgrp->kill_seq;
+ }
WARN_ON_ONCE(!list_empty(&child->cg_list));
cset->nr_tasks++;
@@ -6723,7 +6727,7 @@ void cgroup_post_fork(struct task_struct *child,
* child down right after we finished preparing it for
* userspace.
*/
- kill = test_bit(CGRP_KILL, &cgrp_flags);
+ kill = kargs->kill_seq != cgrp_kill_seq;
}
spin_unlock_irq(&css_set_lock);