From 6ab428604f724cf217a47b7d3f3353aab815b40e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Oct 2022 10:45:44 -1000 Subject: cgroup: Implement DEBUG_CGROUP_REF It's really difficult to debug when cgroup or css refs leak. Let's add a debug option to force the refcnt functions to not be inlined so that they can be kprobed for debugging. Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d922773fa90b..f786c4c973a0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -248,6 +248,11 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); +#ifdef CONFIG_DEBUG_CGROUP_REF +#define CGROUP_REF_FN_ATTRS noinline +#include <linux/cgroup_refcnt.h> +#endif + /** * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID * @ssid: subsys ID of interest -- cgit v1.2.3 From 79a7f41f7f5ac69fd22eaf1fb3e230bea95f3399 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 31 Oct 2022 07:12:13 -1000 Subject: cgroup: cgroup refcnt functions should be exported when CONFIG_DEBUG_CGROUP_REF 6ab428604f72 ("cgroup: Implement DEBUG_CGROUP_REF") added a config option which forces cgroup refcnt functions to be not inlined so that they can be kprobed for debugging. However, it forgot to export them when the config is enabled, breaking modules which make use of css reference counting. Fix it by adding CGROUP_REF_EXPORT() macro to cgroup_refcnt.h which is defined to EXPORT_SYMBOL_GPL when CONFIG_DEBUG_CGROUP_REF is set. 
Signed-off-by: Tejun Heo Fixes: 6ab428604f72 ("cgroup: Implement DEBUG_CGROUP_REF") --- kernel/cgroup/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f786c4c973a0..f2743a476190 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -250,6 +250,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, #ifdef CONFIG_DEBUG_CGROUP_REF #define CGROUP_REF_FN_ATTRS noinline +#define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn); #include <linux/cgroup_refcnt.h> #endif -- cgit v1.2.3 From 18f9a4d47527772515ad6cbdac796422566e6440 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 12 Nov 2022 17:19:38 -0500 Subject: cgroup/cpuset: Skip spread flags update on v2 Cpuset v2 has no spread flags to set. So we can skip the spread flags update if cpuset v2 is being used. Also change the name to cpuset_update_task_spread_flags() to indicate that there are multiple spread flags. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b474289c15b8..2525905cdf48 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -550,11 +550,15 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Call with callback_lock or cpuset_rwsem held. + * Call with callback_lock or cpuset_rwsem held. The check can be skipped + * if on default hierarchy. 
*/ -static void cpuset_update_task_spread_flag(struct cpuset *cs, +static void cpuset_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk) { + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + return; + if (is_spread_page(cs)) task_set_spread_page(tsk); else @@ -2153,7 +2157,7 @@ static void update_tasks_flags(struct cpuset *cs) css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) - cpuset_update_task_spread_flag(cs, task); + cpuset_update_task_spread_flags(cs, task); css_task_iter_end(&it); } @@ -2530,7 +2534,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flag(cs, task); + cpuset_update_task_spread_flags(cs, task); } /* -- cgit v1.2.3 From 7fd4da9c1584be97ffbc40e600a19cb469fd4e78 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 12 Nov 2022 17:19:39 -0500 Subject: cgroup/cpuset: Optimize cpuset_attach() on v2 It was found that with the default hierarchy, enabling cpuset in the child cgroups can trigger a cpuset_attach() call in each of the child cgroups that have tasks with no change in effective cpus and mems. If there are many processes in those child cgroups, it will burn quite a lot of cpu cycles iterating all the tasks without doing useful work. Optimize this case by comparing the old and new cpusets and skipping the useless update if there is no change in effective cpus and mems. Also, mems_allowed is less likely to be changed than cpus_allowed, so skip changing mm if there is no change in effective_mems and CS_MEMORY_MIGRATE is not set. By inserting some instrumentation code and running a simple command in a container 200 times in a cgroup v2 system, it was found that all the cpuset_attach() calls are skipped (401 times in total) as there was no change in effective cpus and mems. 
Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2525905cdf48..b8361f55ef36 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2513,12 +2513,28 @@ static void cpuset_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; + bool cpus_updated, mems_updated; cgroup_taskset_first(tset, &css); cs = css_cs(css); lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ percpu_down_write(&cpuset_rwsem); + cpus_updated = !cpumask_equal(cs->effective_cpus, + oldcs->effective_cpus); + mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); + + /* + * In the default hierarchy, enabling cpuset in the child cgroups + * will trigger a number of cpuset_attach() calls with no change + * in effective cpus and mems. In that case, we can optimize out + * by skipping the task iteration and update. + */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !cpus_updated && !mems_updated) { + cpuset_attach_nodemask_to = cs->effective_mems; + goto out; + } guarantee_online_mems(cs, &cpuset_attach_nodemask_to); @@ -2539,9 +2555,14 @@ static void cpuset_attach(struct cgroup_taskset *tset) /* * Change mm for all threadgroup leaders. This is expensive and may - * sleep and should be moved outside migration path proper. + * sleep and should be moved outside migration path proper. Skip it + * if there is no change in effective_mems and CS_MEMORY_MIGRATE is + * not set. 
*/ cpuset_attach_nodemask_to = cs->effective_mems; + if (!is_memory_migrate(cs) && !mems_updated) + goto out; + cgroup_taskset_for_each_leader(leader, css, tset) { struct mm_struct *mm = get_task_mm(leader); @@ -2564,6 +2585,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) } } +out: cs->old_mems_allowed = cpuset_attach_nodemask_to; cs->attach_in_progress--; -- cgit v1.2.3 From 0a2cafe6c7c25597a026ab961c3182c8179c7959 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Thu, 17 Nov 2022 12:45:57 +0530 Subject: cgroup/cpuset: Improve cpuset_css_alloc() description Change the function argument in the description of cpuset_css_alloc() from 'struct cgroup' -> 'struct cgroup_subsys_state'. The change to the argument type was introduced by commit eb95419b023a ("cgroup: pass around cgroup_subsys_state instead of cgroup in subsystem methods"). Also, add more information to its description. Signed-off-by: Kamalesh Babulal Acked-by: Waiman Long Acked-by: Joel Savitz Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b8361f55ef36..589827ccda8b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3072,11 +3072,15 @@ static struct cftype dfl_files[] = { }; -/* - * cpuset_css_alloc - allocate a cpuset css - * cgrp: control group that the new cpuset will be part of +/** + * cpuset_css_alloc - Allocate a cpuset css + * @parent_css: Parent css of the control group that the new cpuset will be + * part of + * Return: cpuset css on success, -ENOMEM on failure. + * + * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return + * top cpuset css otherwise. 
*/ - static struct cgroup_subsys_state * cpuset_css_alloc(struct cgroup_subsys_state *parent_css) { -- cgit v1.2.3 From 674b745e22b3caae48ad20422795eefd3f832a7b Mon Sep 17 00:00:00 2001 From: Ran Tian Date: Wed, 23 Nov 2022 22:45:14 +0800 Subject: cgroup: remove rcu_read_lock()/rcu_read_unlock() in critical section of spin_lock_irq() spin_lock_irq() already disables preemption, so remove the rcu_read_lock()/rcu_read_unlock() pair. Signed-off-by: Ran Tian Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f2743a476190..3028f6bc7d11 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2866,14 +2866,12 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, * take an rcu_read_lock. */ spin_lock_irq(&css_set_lock); - rcu_read_lock(); task = leader; do { cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); return cgroup_migrate_execute(mgctx); -- cgit v1.2.3