author     Linus Torvalds <torvalds@linux-foundation.org>  2024-05-15 17:06:08 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2024-05-15 17:06:08 -0700
commit     de6fef50eaf40789b11841474726fd918a3a84a1 (patch)
tree       df6e1395240b9275e1f5d064edd6957cf3d8d510
parent     f4b0c4b508364fde023e4f7b9f23f7e38c663dfe (diff)
parent     21c38a3bd4ee3fb7337d013a638302fb5e5f9dc2 (diff)
Merge tag 'cgroup-for-6.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:

 - The locking around cpuset hotplug processing has always been a bit
   of a mess, which was worked around by making hotplug processing
   asynchronous. The asynchronicity isn't great and led to other
   issues.

   We tried to make the behavior synchronous a while ago but that led
   to lockdep splats. Waiman took another stab at cleaning it up and
   making it synchronous. The patch has been in -next for well over a
   month and there haven't been any complaints, so fingers crossed.

 - Tracepoints added to help understand rstat lock contention.

 - A bunch of minor changes - doc updates, code cleanups and selftests.

* tag 'cgroup-for-6.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (24 commits)
  cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
  selftests/cgroup: Drop define _GNU_SOURCE
  docs: cgroup-v1: Update page cache removal functions
  selftests/cgroup: fix uninitialized variables in test_zswap.c
  selftests/cgroup: cpu_hogger init: use {} instead of {NULL}
  selftests/cgroup: fix clang warnings: uninitialized fd variable
  selftests/cgroup: fix clang build failures for abs() calls
  cgroup/cpuset: Remove outdated comment in sched_partition_write()
  cgroup/cpuset: Fix incorrect top_cpuset flags
  cgroup/cpuset: Avoid clearing CS_SCHED_LOAD_BALANCE twice
  cgroup/cpuset: Statically initialize more members of top_cpuset
  cgroup: Avoid unnecessary looping in cgroup_no_v1()
  cgroup, legacy_freezer: update comment for freezer_css_offline()
  docs, cgroup: add entries for pids to cgroup-v2.rst
  cgroup: don't call cgroup1_pidlist_destroy_all() for v2
  cgroup_freezer: update comment for freezer_css_online()
  cgroup/rstat: desc member cgrp in cgroup_rstat_flush_release
  cgroup/rstat: add cgroup_rstat_lock helpers and tracepoints
  cgroup/pids: Remove superfluous zeroing
  docs: cgroup-v1: Fix description for css_online
  ...
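The rstat tracepoint work in this pull is built around a trylock-first locking helper: try the lock, record whether it was contended, then take it for real (see the kernel/cgroup/rstat.c hunks below). Here is a minimal userspace sketch of that pattern, not kernel code, using a pthread spinlock and a counter as stand-ins for the kernel lock and the contention tracepoint; all names are illustrative.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_spinlock_t lock;
static unsigned long contended_events;	/* stand-in for the *_contended tracepoint */
static unsigned long shared_counter;

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < 100000; i++) {
		/* Try first so contention can be recorded without blocking. */
		bool contended = (pthread_spin_trylock(&lock) != 0);

		if (contended) {
			__atomic_add_fetch(&contended_events, 1, __ATOMIC_RELAXED);
			pthread_spin_lock(&lock);	/* now actually wait for it */
		}
		/* the "locked" tracepoint analogue would fire here, carrying @contended */
		shared_counter++;
		pthread_spin_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
	pthread_create(&a, NULL, worker, NULL);
	pthread_create(&b, NULL, worker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("counter=%lu, lock contended %lu times\n",
	       shared_counter, contended_events);
	pthread_spin_destroy(&lock);
	return 0;
}

Because the trylock only fails while another thread (or, in the kernel case, another CPU) holds the lock, the recorded count approximates how often the slow path actually had to wait.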
-rw-r--r--  Documentation/admin-guide/cgroup-v1/cgroups.rst        2
-rw-r--r--  Documentation/admin-guide/cgroup-v1/memcg_test.rst     2
-rw-r--r--  Documentation/admin-guide/cgroup-v2.rst               31
-rw-r--r--  include/linux/cgroup.h                                  2
-rw-r--r--  include/linux/cpuset.h                                  3
-rw-r--r--  include/trace/events/cgroup.h                          92
-rw-r--r--  kernel/cgroup/cgroup-v1.c                               1
-rw-r--r--  kernel/cgroup/cgroup.c                                  3
-rw-r--r--  kernel/cgroup/cpuset.c                                156
-rw-r--r--  kernel/cgroup/legacy_freezer.c                          5
-rw-r--r--  kernel/cgroup/pids.c                                    2
-rw-r--r--  kernel/cgroup/rstat.c                                 118
-rw-r--r--  kernel/cpu.c                                           48
-rw-r--r--  kernel/power/process.c                                  2
-rw-r--r--  tools/testing/selftests/cgroup/Makefile                 2
-rw-r--r--  tools/testing/selftests/cgroup/cgroup_util.c           11
-rw-r--r--  tools/testing/selftests/cgroup/cgroup_util.h            4
-rw-r--r--  tools/testing/selftests/cgroup/test_core.c              9
-rw-r--r--  tools/testing/selftests/cgroup/test_cpu.c               8
-rw-r--r--  tools/testing/selftests/cgroup/test_cpuset.c            2
-rwxr-xr-x  tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh    46
-rw-r--r--  tools/testing/selftests/cgroup/test_freezer.c           2
-rw-r--r--  tools/testing/selftests/cgroup/test_hugetlb_memcg.c     4
-rw-r--r--  tools/testing/selftests/cgroup/test_kill.c              2
-rw-r--r--  tools/testing/selftests/cgroup/test_kmem.c              8
-rw-r--r--  tools/testing/selftests/cgroup/test_memcontrol.c        8
-rw-r--r--  tools/testing/selftests/cgroup/test_zswap.c             8
27 files changed, 358 insertions, 223 deletions
diff --git a/Documentation/admin-guide/cgroup-v1/cgroups.rst b/Documentation/admin-guide/cgroup-v1/cgroups.rst
index 9343148ee993..a3e2edb3d274 100644
--- a/Documentation/admin-guide/cgroup-v1/cgroups.rst
+++ b/Documentation/admin-guide/cgroup-v1/cgroups.rst
@@ -570,7 +570,7 @@ visible to cgroup_for_each_child/descendant_*() iterators. The
subsystem may choose to fail creation by returning -errno. This
callback can be used to implement reliable state sharing and
propagation along the hierarchy. See the comment on
-cgroup_for_each_descendant_pre() for details.
+cgroup_for_each_live_descendant_pre() for details.
``void css_offline(struct cgroup *cgrp);``
(cgroup_mutex held by caller)
diff --git a/Documentation/admin-guide/cgroup-v1/memcg_test.rst b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
index 1f128458ddea..9f8e27355cba 100644
--- a/Documentation/admin-guide/cgroup-v1/memcg_test.rst
+++ b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
@@ -102,7 +102,7 @@ Under below explanation, we assume CONFIG_SWAP=y.
The logic is very clear. (About migration, see below)
Note:
- __remove_from_page_cache() is called by remove_from_page_cache()
+ __filemap_remove_folio() is called by filemap_remove_folio()
and __remove_mapping().
6. Shmem(tmpfs) Page Cache
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index eaf9e66e472a..f554df7ac649 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1058,12 +1058,15 @@ cpufreq governor about the minimum desired frequency which should always be
provided by a CPU, as well as the maximum desired frequency, which should not
be exceeded by a CPU.
-WARNING: cgroup2 doesn't yet support control of realtime processes and
-the cpu controller can only be enabled when all RT processes are in
-the root cgroup. Be aware that system management software may already
-have placed RT processes into nonroot cgroups during the system boot
-process, and these processes may need to be moved to the root cgroup
-before the cpu controller can be enabled.
+WARNING: cgroup2 doesn't yet support control of realtime processes. For
+a kernel built with the CONFIG_RT_GROUP_SCHED option enabled for group
+scheduling of realtime processes, the cpu controller can only be enabled
+when all RT processes are in the root cgroup. This limitation does
+not apply if CONFIG_RT_GROUP_SCHED is disabled. Be aware that system
+management software may already have placed RT processes into nonroot
+cgroups during the system boot process, and these processes may need
+to be moved to the root cgroup before the cpu controller can be enabled
+with a CONFIG_RT_GROUP_SCHED enabled kernel.
CPU Interface Files
@@ -2190,11 +2193,25 @@ PID Interface Files
Hard limit of number of processes.
pids.current
- A read-only single value file which exists on all cgroups.
+ A read-only single value file which exists on non-root cgroups.
The number of processes currently in the cgroup and its
descendants.
+ pids.peak
+ A read-only single value file which exists on non-root cgroups.
+
+ The maximum value that the number of processes in the cgroup and its
+ descendants has ever reached.
+
+ pids.events
+ A read-only flat-keyed file which exists on non-root cgroups. The
+ following entries are defined. Unless specified otherwise, a value
+ change in this file generates a file modified event.
+
+ max
+ Number of times fork failed because limit was hit.
+
Organisational operations are not blocked by cgroup policies, so it is
possible to have pids.current > pids.max. This can be done by either
setting the limit to be smaller than pids.current, or attaching enough
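The pids.peak and pids.events files documented above are ordinary cgroup v2 interface files and can be read like any other. A small sketch follows, assuming a cgroup v2 hierarchy mounted at /sys/fs/cgroup and a child group named "test"; both the mount point and the group name are assumptions for illustration, not part of the patch.

#include <stdio.h>

/* Print one pids.* interface file of a cgroup v2 group. */
static void dump_pids_file(const char *cgrp, const char *name)
{
	char path[256], line[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/fs/cgroup/%s/%s", cgrp, name);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return;
	}
	while (fgets(line, sizeof(line), f))
		printf("%s: %s", name, line);
	fclose(f);
}

int main(void)
{
	dump_pids_file("test", "pids.current");
	dump_pids_file("test", "pids.peak");
	dump_pids_file("test", "pids.events");	/* flat-keyed: "max <count>" */
	return 0;
}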
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 34aaf0e87def..2150ca60394b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -690,7 +690,7 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
void cgroup_rstat_flush(struct cgroup *cgrp);
void cgroup_rstat_flush_hold(struct cgroup *cgrp);
-void cgroup_rstat_flush_release(void);
+void cgroup_rstat_flush_release(struct cgroup *cgrp);
/*
* Basic resource stats.
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 0ce6ff0d9c9a..de4cf0ee96f7 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -70,7 +70,6 @@ extern int cpuset_init(void);
extern void cpuset_init_smp(void);
extern void cpuset_force_rebuild(void);
extern void cpuset_update_active_cpus(void);
-extern void cpuset_wait_for_hotplug(void);
extern void inc_dl_tasks_cs(struct task_struct *task);
extern void dec_dl_tasks_cs(struct task_struct *task);
extern void cpuset_lock(void);
@@ -185,8 +184,6 @@ static inline void cpuset_update_active_cpus(void)
partition_sched_domains(1, NULL, NULL);
}
-static inline void cpuset_wait_for_hotplug(void) { }
-
static inline void inc_dl_tasks_cs(struct task_struct *task) { }
static inline void dec_dl_tasks_cs(struct task_struct *task) { }
static inline void cpuset_lock(void) { }
diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index dd7d7c9efecd..0b95865a90f3 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -204,6 +204,98 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
TP_ARGS(cgrp, path, val)
);
+DECLARE_EVENT_CLASS(cgroup_rstat,
+
+ TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+ TP_ARGS(cgrp, cpu, contended),
+
+ TP_STRUCT__entry(
+ __field( int, root )
+ __field( int, level )
+ __field( u64, id )
+ __field( int, cpu )
+ __field( bool, contended )
+ ),
+
+ TP_fast_assign(
+ __entry->root = cgrp->root->hierarchy_id;
+ __entry->id = cgroup_id(cgrp);
+ __entry->level = cgrp->level;
+ __entry->cpu = cpu;
+ __entry->contended = contended;
+ ),
+
+ TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
+ __entry->root, __entry->id, __entry->level,
+ __entry->cpu, __entry->contended)
+);
+
+/* Related to global: cgroup_rstat_lock */
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
+
+ TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+ TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,
+
+ TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+ TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
+
+ TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+ TP_ARGS(cgrp, cpu, contended)
+);
+
+/* Related to per CPU: cgroup_rstat_cpu_lock */
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
+
+ TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+ TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
+
+ TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+ TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
+
+ TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+ TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
+
+ TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+ TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
+
+ TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+ TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
+
+ TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+ TP_ARGS(cgrp, cpu, contended)
+);
+
#endif /* _TRACE_CGROUP_H */
/* This part must be outside protection */
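The events defined above live under the existing "cgroup" trace system, so once a kernel with this change is running they can be switched on per event from tracefs. A sketch of doing that from C, assuming tracefs is mounted at /sys/kernel/tracing (on some systems it is under /sys/kernel/debug/tracing instead):

#include <stdio.h>

/* Write "1" to the per-event enable file under tracefs. */
static int enable_event(const char *event)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/tracing/events/cgroup/%s/enable", event);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return -1;
	}
	fputs("1", f);
	fclose(f);
	return 0;
}

int main(void)
{
	/* Global rstat lock events. */
	enable_event("cgroup_rstat_lock_contended");
	enable_event("cgroup_rstat_locked");
	/* Per-CPU lock events; the *_fastpath variants are far noisier. */
	enable_event("cgroup_rstat_cpu_lock_contended");
	return 0;
}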
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 520a11cb12f4..b9dbf6bf2779 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1335,6 +1335,7 @@ static int __init cgroup_no_v1(char *str)
continue;
cgroup_no_v1_mask |= 1 << i;
+ break;
}
}
return 1;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a66c088c851c..e32b6972c478 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5368,7 +5368,8 @@ static void css_free_rwork_fn(struct work_struct *work)
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
- cgroup1_pidlist_destroy_all(cgrp);
+ if (!cgroup_on_dfl(cgrp))
+ cgroup1_pidlist_destroy_all(cgrp);
cancel_work_sync(&cgrp->release_agent_work);
bpf_cgrp_storage_free(cgrp);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4237c8748715..a10e4bd0c0c1 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -202,6 +202,14 @@ struct cpuset {
};
/*
+ * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
+ */
+struct cpuset_remove_tasks_struct {
+ struct work_struct work;
+ struct cpuset *cs;
+};
+
+/*
* Exclusive CPUs distributed out to sub-partitions of top_cpuset
*/
static cpumask_var_t subpartitions_cpus;
@@ -360,9 +368,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
}
static struct cpuset top_cpuset = {
- .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
- (1 << CS_MEM_EXCLUSIVE)),
+ .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
+ BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
+ .relax_domain_level = -1,
.remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
};
@@ -449,12 +458,6 @@ static DEFINE_SPINLOCK(callback_lock);
static struct workqueue_struct *cpuset_migrate_mm_wq;
-/*
- * CPU / memory hotplug is handled asynchronously.
- */
-static void cpuset_hotplug_workfn(struct work_struct *work);
-static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
-
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
static inline void check_insane_mems_config(nodemask_t *nodes)
@@ -540,22 +543,10 @@ static void guarantee_online_cpus(struct task_struct *tsk,
rcu_read_lock();
cs = task_cs(tsk);
- while (!cpumask_intersects(cs->effective_cpus, pmask)) {
+ while (!cpumask_intersects(cs->effective_cpus, pmask))
cs = parent_cs(cs);
- if (unlikely(!cs)) {
- /*
- * The top cpuset doesn't have any online cpu as a
- * consequence of a race between cpuset_hotplug_work
- * and cpu hotplug notifier. But we know the top
- * cpuset's effective_cpus is on its way to be
- * identical to cpu_online_mask.
- */
- goto out_unlock;
- }
- }
- cpumask_and(pmask, pmask, cs->effective_cpus);
-out_unlock:
+ cpumask_and(pmask, pmask, cs->effective_cpus);
rcu_read_unlock();
}
@@ -1217,7 +1208,7 @@ static void rebuild_sched_domains_locked(void)
/*
* If we have raced with CPU hotplug, return early to avoid
* passing doms with offlined cpu to partition_sched_domains().
- * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
+ * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
*
* With no CPUs in any subpartitions, top_cpuset's effective CPUs
* should be the same as the active CPUs, so checking only top_cpuset
@@ -1260,12 +1251,17 @@ static void rebuild_sched_domains_locked(void)
}
#endif /* CONFIG_SMP */
-void rebuild_sched_domains(void)
+static void rebuild_sched_domains_cpuslocked(void)
{
- cpus_read_lock();
mutex_lock(&cpuset_mutex);
rebuild_sched_domains_locked();
mutex_unlock(&cpuset_mutex);
+}
+
+void rebuild_sched_domains(void)
+{
+ cpus_read_lock();
+ rebuild_sched_domains_cpuslocked();
cpus_read_unlock();
}
@@ -2079,14 +2075,11 @@ write_error:
/*
* For partcmd_update without newmask, it is being called from
- * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
- * Update the load balance flag and scheduling domain if
- * cpus_read_trylock() is successful.
+ * cpuset_handle_hotplug(). Update the load balance flag and
+ * scheduling domain accordingly.
*/
- if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
+ if ((cmd == partcmd_update) && !newmask)
update_partition_sd_lb(cs, old_prs);
- cpus_read_unlock();
- }
notify_partition_change(cs, old_prs);
return 0;
@@ -3599,8 +3592,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
* proceeding, so that we don't end up keep removing tasks added
* after execution capability is restored.
*
- * cpuset_hotplug_work calls back into cgroup core via
- * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+ * cpuset_handle_hotplug may call back into cgroup core asynchronously
+ * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
* operation like this one can lead to a deadlock through kernfs
* active_ref protection. Let's break the protection. Losing the
* protection is okay as we check whether @cs is online after
@@ -3609,7 +3602,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
*/
css_get(&cs->css);
kernfs_break_active_protection(of->kn);
- flush_work(&cpuset_hotplug_work);
cpus_read_lock();
mutex_lock(&cpuset_mutex);
@@ -3782,9 +3774,6 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
buf = strstrip(buf);
- /*
- * Convert "root" to ENABLED, and convert "member" to DISABLED.
- */
if (!strcmp(buf, "root"))
val = PRS_ROOT;
else if (!strcmp(buf, "member"))
@@ -4060,11 +4049,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->effective_mems = parent->effective_mems;
cs->use_parent_ecpus = true;
parent->child_ecpus_count++;
- /*
- * Clear CS_SCHED_LOAD_BALANCE if parent is isolated
- */
- if (!is_sched_load_balance(parent))
- clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}
/*
@@ -4318,8 +4302,6 @@ int __init cpuset_init(void)
nodes_setall(top_cpuset.effective_mems);
fmeter_init(&top_cpuset.fmeter);
- set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
- top_cpuset.relax_domain_level = -1;
INIT_LIST_HEAD(&remote_children);
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
@@ -4354,6 +4336,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
}
}
+static void cpuset_migrate_tasks_workfn(struct work_struct *work)
+{
+ struct cpuset_remove_tasks_struct *s;
+
+ s = container_of(work, struct cpuset_remove_tasks_struct, work);
+ remove_tasks_in_empty_cpuset(s->cs);
+ css_put(&s->cs->css);
+ kfree(s);
+}
+
static void
hotplug_update_tasks_legacy(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
@@ -4383,12 +4375,21 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
/*
* Move tasks to the nearest ancestor with execution resources,
* This is full cgroup operation which will also call back into
- * cpuset. Should be done outside any lock.
+ * cpuset. Execute it asynchronously using workqueue.
*/
- if (is_empty) {
- mutex_unlock(&cpuset_mutex);
- remove_tasks_in_empty_cpuset(cs);
- mutex_lock(&cpuset_mutex);
+ if (is_empty && cs->css.cgroup->nr_populated_csets &&
+ css_tryget_online(&cs->css)) {
+ struct cpuset_remove_tasks_struct *s;
+
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (WARN_ON_ONCE(!s)) {
+ css_put(&cs->css);
+ return;
+ }
+
+ s->cs = cs;
+ INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
+ schedule_work(&s->work);
}
}
@@ -4421,30 +4422,6 @@ void cpuset_force_rebuild(void)
force_rebuild = true;
}
-/*
- * Attempt to acquire a cpus_read_lock while a hotplug operation may be in
- * progress.
- * Return: true if successful, false otherwise
- *
- * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
- * cpus_read_trylock() is used here to acquire the lock.
- */
-static bool cpuset_hotplug_cpus_read_trylock(void)
-{
- int retries = 0;
-
- while (!cpus_read_trylock()) {
- /*
- * CPU hotplug still in progress. Retry 5 times
- * with a 10ms wait before bailing out.
- */
- if (++retries > 5)
- return false;
- msleep(10);
- }
- return true;
-}
-
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
@@ -4493,13 +4470,11 @@ retry:
compute_partition_effective_cpumask(cs, &new_cpus);
if (remote && cpumask_empty(&new_cpus) &&
- partition_is_populated(cs, NULL) &&
- cpuset_hotplug_cpus_read_trylock()) {
+ partition_is_populated(cs, NULL)) {
remote_partition_disable(cs, tmp);
compute_effective_cpumask(&new_cpus, cs, parent);
remote = false;
cpuset_force_rebuild();
- cpus_read_unlock();
}
/*
@@ -4519,18 +4494,8 @@ retry:
else if (is_partition_valid(parent) && is_partition_invalid(cs))
partcmd = partcmd_update;
- /*
- * cpus_read_lock needs to be held before calling
- * update_parent_effective_cpumask(). To avoid circular lock
- * dependency between cpuset_mutex and cpus_read_lock,
- * cpus_read_trylock() is used here to acquire the lock.
- */
if (partcmd >= 0) {
- if (!cpuset_hotplug_cpus_read_trylock())
- goto update_tasks;
-
update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
- cpus_read_unlock();
if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
compute_partition_effective_cpumask(cs, &new_cpus);
cpuset_force_rebuild();
@@ -4558,8 +4523,7 @@ unlock:
}
/**
- * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
- * @work: unused
+ * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
*
* This function is called after either CPU or memory configuration has
* changed and updates cpuset accordingly. The top_cpuset is always
@@ -4573,8 +4537,10 @@ unlock:
*
* Note that CPU offlining during suspend is ignored. We don't modify
* cpusets across suspend/resume cycles at all.
+ *
+ * CPU / memory hotplug is handled synchronously.
*/
-static void cpuset_hotplug_workfn(struct work_struct *work)
+static void cpuset_handle_hotplug(void)
{
static cpumask_t new_cpus;
static nodemask_t new_mems;
@@ -4585,6 +4551,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
if (on_dfl && !alloc_cpumasks(NULL, &tmp))
ptmp = &tmp;
+ lockdep_assert_cpus_held();
mutex_lock(&cpuset_mutex);
/* fetch the available cpus/mems and find out which changed how */
@@ -4666,7 +4633,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* rebuild sched domains if cpus_allowed has changed */
if (cpus_updated || force_rebuild) {
force_rebuild = false;
- rebuild_sched_domains();
+ rebuild_sched_domains_cpuslocked();
}
free_cpumasks(NULL, ptmp);
@@ -4679,12 +4646,7 @@ void cpuset_update_active_cpus(void)
* inside cgroup synchronization. Bounce actual hotplug processing
* to a work item to avoid reverse locking order.
*/
- schedule_work(&cpuset_hotplug_work);
-}
-
-void cpuset_wait_for_hotplug(void)
-{
- flush_work(&cpuset_hotplug_work);
+ cpuset_handle_hotplug();
}
/*
@@ -4695,7 +4657,7 @@ void cpuset_wait_for_hotplug(void)
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- schedule_work(&cpuset_hotplug_work);
+ cpuset_handle_hotplug();
return NOTIFY_OK;
}
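The hotplug_update_tasks_legacy() hunk above replaces "drop cpuset_mutex, call back into cgroup core, retake the mutex" with "pin the cpuset via css_tryget_online(), hand the migration to a workqueue item, drop the pin when the work finishes". A rough userspace analogue of that pin-then-defer pattern, with purely illustrative names (this is not kernel API):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct object {
	atomic_int refcount;
	const char *name;
};

struct remove_tasks_request {	/* cpuset_remove_tasks_struct analogue */
	struct object *obj;
};

static void object_put(struct object *obj)
{
	/* Last reference dropped: the object may now go away. */
	if (atomic_fetch_sub(&obj->refcount, 1) == 1)
		printf("%s released\n", obj->name);
}

static void *remove_tasks_worker(void *arg)
{
	struct remove_tasks_request *req = arg;

	/* The heavy "call back into cgroup core" step runs here, with no lock held. */
	printf("migrating tasks out of %s\n", req->obj->name);
	object_put(req->obj);	/* drop the pin taken before scheduling */
	free(req);
	return NULL;
}

int main(void)
{
	static struct object cs = { .refcount = 1, .name = "empty_cpuset" };
	struct remove_tasks_request *req = malloc(sizeof(*req));
	pthread_t worker;

	if (!req)
		return 1;
	atomic_fetch_add(&cs.refcount, 1);	/* css_tryget_online() analogue */
	req->obj = &cs;
	pthread_create(&worker, NULL, remove_tasks_worker, req);	/* schedule_work() analogue */
	pthread_join(worker, NULL);
	object_put(&cs);	/* original reference */
	return 0;
}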
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 66d1708042a7..074653f964c1 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -106,8 +106,7 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
* @css: css being created
*
* We're committing to creation of @css. Mark it online and inherit
- * parent's freezing state while holding both parent's and our
- * freezer->lock.
+ * parent's freezing state while holding cpus read lock and freezer_mutex.
*/
static int freezer_css_online(struct cgroup_subsys_state *css)
{
@@ -133,7 +132,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
* freezer_css_offline - initiate destruction of a freezer css
* @css: css being destroyed
*
- * @css is going away. Mark it dead and decrement system_freezing_count if
+ * @css is going away. Mark it dead and decrement freezer_active if
* it was holding one.
*/
static void freezer_css_offline(struct cgroup_subsys_state *css)
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 7695e60bcb40..0e5ec7d59b4d 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -75,9 +75,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
if (!pids)
return ERR_PTR(-ENOMEM);
- atomic64_set(&pids->counter, 0);
atomic64_set(&pids->limit, PIDS_MAX);
- atomic64_set(&pids->events_limit, 0);
return &pids->css;
}
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 07e2284bb499..fb8b49437573 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -7,6 +7,8 @@
#include <linux/btf.h>
#include <linux/btf_ids.h>
+#include <trace/events/cgroup.h>
+
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
@@ -17,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}
+/*
+ * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @fast_path determines the
+ * tracepoints being added, allowing us to diagnose "flush" related
+ * operations without handling high-frequency fast-path "update" events.
+ */
+static __always_inline
+unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
+ struct cgroup *cgrp, const bool fast_path)
+{
+ unsigned long flags;
+ bool contended;
+
+ /*
+ * The _irqsave() is needed because cgroup_rstat_lock is
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+ * this lock with the _irq() suffix only disables interrupts on
+ * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+ * interrupts on both configurations. The _irqsave() ensures
+ * that interrupts are always disabled and later restored.
+ */
+ contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
+ if (contended) {
+ if (fast_path)
+ trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
+ else
+ trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
+
+ raw_spin_lock_irqsave(cpu_lock, flags);
+ }
+
+ if (fast_path)
+ trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
+ else
+ trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
+
+ return flags;
+}
+
+static __always_inline
+void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
+ struct cgroup *cgrp, unsigned long flags,
+ const bool fast_path)
+{
+ if (fast_path)
+ trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
+ else
+ trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
+
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
/**
* cgroup_rstat_updated - keep track of updated rstat_cpu
* @cgrp: target cgroup
@@ -42,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
return;
- raw_spin_lock_irqsave(cpu_lock, flags);
+ flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
/* put @cgrp and all ancestors on the corresponding updated lists */
while (true) {
@@ -70,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
cgrp = parent;
}
- raw_spin_unlock_irqrestore(cpu_lock, flags);
+ _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
}
/**
@@ -151,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
struct cgroup *head = NULL, *parent, *child;
unsigned long flags;
- /*
- * The _irqsave() is needed because cgroup_rstat_lock is
- * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
- * this lock with the _irq() suffix only disables interrupts on
- * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
- * interrupts on both configurations. The _irqsave() ensures
- * that interrupts are always disabled and later restored.
- */
- raw_spin_lock_irqsave(cpu_lock, flags);
+ flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
/* Return NULL if this subtree is not on-list */
if (!rstatc->updated_next)
@@ -196,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
if (child != root)
head = cgroup_rstat_push_children(head, child, cpu);
unlock_ret:
- raw_spin_unlock_irqrestore(cpu_lock, flags);
+ _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
return head;
}
@@ -222,6 +270,35 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
__bpf_hook_end();
+/*
+ * Helper functions for locking cgroup_rstat_lock.
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @cpu_in_loop indicates lock