summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-09-18 06:39:03 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2024-09-18 06:39:03 +0200
commit78567e2bc723b444228644d2e34ae5255d4ab8a0 (patch)
tree3f07fbf68ae127d9c97531a5a80f4eac74acef07
parent2f27fce67173bbb05d5a0ee03dae5c021202c912 (diff)
parentaf000ce85293b8e608f696f0c6c280bc3a75887f (diff)
downloadlinux-78567e2bc723b444228644d2e34ae5255d4ab8a0.tar.gz
linux-78567e2bc723b444228644d2e34ae5255d4ab8a0.tar.bz2
linux-78567e2bc723b444228644d2e34ae5255d4ab8a0.zip
Merge tag 'cgroup-for-6.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo: - cpuset isolation improvements - cpuset cgroup1 support is split into its own file behind the new config option CONFIG_CPUSET_V1. This makes it the second controller which makes cgroup1 support optional after memcg - Handling of unavailable v1 controller handling improved during cgroup1 mount operations - union_find applied to cpuset. It makes code simpler and more efficient - Reduce spurious events in pids.events - Cleanups and other misc changes - Contains a merge of cgroup/for-6.11-fixes to receive cpuset fixes that further changes build upon * tag 'cgroup-for-6.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (34 commits) cgroup: Do not report unavailable v1 controllers in /proc/cgroups cgroup: Disallow mounting v1 hierarchies without controller implementation cgroup/cpuset: Expose cpuset filesystem with cpuset v1 only cgroup/cpuset: Move cpu.h include to cpuset-internal.h cgroup/cpuset: add sefltest for cpuset v1 cgroup/cpuset: guard cpuset-v1 code under CONFIG_CPUSETS_V1 cgroup/cpuset: rename functions shared between v1 and v2 cgroup/cpuset: move v1 interfaces to cpuset-v1.c cgroup/cpuset: move validate_change_legacy to cpuset-v1.c cgroup/cpuset: move legacy hotplug update to cpuset-v1.c cgroup/cpuset: add callback_lock helper cgroup/cpuset: move memory_spread to cpuset-v1.c cgroup/cpuset: move relax_domain_level to cpuset-v1.c cgroup/cpuset: move memory_pressure to cpuset-v1.c cgroup/cpuset: move common code to cpuset-internal.h cgroup/cpuset: introduce cpuset-v1.c selftest/cgroup: Make test_cpuset_prs.sh deal with pre-isolated CPUs cgroup/cpuset: Account for boot time isolated CPUs cgroup/cpuset: remove use_parent_ecpus of cpuset cgroup/cpuset: remove fetch_xcpus ...
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst22
-rw-r--r--Documentation/core-api/index.rst1
-rw-r--r--Documentation/core-api/union_find.rst106
-rw-r--r--Documentation/translations/zh_CN/core-api/index.rst1
-rw-r--r--Documentation/translations/zh_CN/core-api/union_find.rst92
-rw-r--r--MAINTAINERS12
-rw-r--r--include/linux/cgroup-defs.h14
-rw-r--r--include/linux/cpuset.h10
-rw-r--r--include/linux/sched.h1
-rw-r--r--include/linux/union_find.h41
-rw-r--r--init/Kconfig13
-rw-r--r--kernel/cgroup/Makefile1
-rw-r--r--kernel/cgroup/cgroup-v1.c17
-rw-r--r--kernel/cgroup/cgroup.c68
-rw-r--r--kernel/cgroup/cpuset-internal.h305
-rw-r--r--kernel/cgroup/cpuset-v1.c562
-rw-r--r--kernel/cgroup/cpuset.c1155
-rw-r--r--kernel/cgroup/pids.c32
-rw-r--r--kernel/fork.c1
-rw-r--r--lib/Makefile2
-rw-r--r--lib/union_find.c49
-rwxr-xr-xtools/testing/selftests/cgroup/test_cpuset_prs.sh56
-rwxr-xr-xtools/testing/selftests/cgroup/test_cpuset_v1_base.sh77
23 files changed, 1569 insertions, 1069 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 95c18bc17083..c65756afd372 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -533,10 +533,12 @@ cgroup namespace on namespace creation.
Because the resource control interface files in a given directory
control the distribution of the parent's resources, the delegatee
shouldn't be allowed to write to them. For the first method, this is
-achieved by not granting access to these files. For the second, the
-kernel rejects writes to all files other than "cgroup.procs" and
-"cgroup.subtree_control" on a namespace root from inside the
-namespace.
+achieved by not granting access to these files. For the second, files
+outside the namespace should be hidden from the delegatee by the means
+of at least mount namespacing, and the kernel rejects writes to all
+files on a namespace root from inside the cgroup namespace, except for
+those files listed in "/sys/kernel/cgroup/delegate" (including
+"cgroup.procs", "cgroup.threads", "cgroup.subtree_control", etc.).
The end results are equivalent for both delegation types. Once
delegated, the user can build sub-hierarchy under the directory,
@@ -981,6 +983,14 @@ All cgroup core files are prefixed with "cgroup."
A dying cgroup can consume system resources not exceeding
limits, which were active at the moment of cgroup deletion.
+ nr_subsys_<cgroup_subsys>
+ Total number of live cgroup subsystems (e.g memory
+ cgroup) at and beneath the current cgroup.
+
+ nr_dying_subsys_<cgroup_subsys>
+ Total number of dying cgroup subsystems (e.g. memory
+ cgroup) at and beneath the current cgroup.
+
cgroup.freeze
A read-write single value file which exists on non-root cgroups.
Allowed values are "0" and "1". The default is "0".
@@ -2940,8 +2950,8 @@ Deprecated v1 Core Features
- "cgroup.clone_children" is removed.
-- /proc/cgroups is meaningless for v2. Use "cgroup.controllers" file
- at the root instead.
+- /proc/cgroups is meaningless for v2. Use "cgroup.controllers" or
+ "cgroup.stat" files at the root instead.
Issues with v1 and Rationales for v2
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index f147854700e4..e18a2ffe0787 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -49,6 +49,7 @@ Library functionality that is used throughout the kernel.
wrappers/atomic_t
wrappers/atomic_bitops
floating-point
+ union_find
Low level entry and exit
========================
diff --git a/Documentation/core-api/union_find.rst b/Documentation/core-api/union_find.rst
new file mode 100644
index 000000000000..6df8b94fdb5a
--- /dev/null
+++ b/Documentation/core-api/union_find.rst
@@ -0,0 +1,106 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================
+Union-Find in Linux
+====================
+
+
+:Date: June 21, 2024
+:Author: Xavier <xavier_qy@163.com>
+
+What is union-find, and what is it used for?
+------------------------------------------------
+
+Union-find is a data structure used to handle the merging and querying
+of disjoint sets. The primary operations supported by union-find are:
+
+ Initialization: Resetting each element as an individual set, with
+ each set's initial parent node pointing to itself.
+
+ Find: Determine which set a particular element belongs to, usually by
+ returning a “representative element” of that set. This operation
+ is used to check if two elements are in the same set.
+
+ Union: Merge two sets into one.
+
+As a data structure used to maintain sets (groups), union-find is commonly
+utilized to solve problems related to offline queries, dynamic connectivity,
+and graph theory. It is also a key component in Kruskal's algorithm for
+computing the minimum spanning tree, which is crucial in scenarios like
+network routing. Consequently, union-find is widely referenced. Additionally,
+union-find has applications in symbolic computation, register allocation,
+and more.
+
+Space Complexity: O(n), where n is the number of nodes.
+
+Time Complexity: Using path compression can reduce the time complexity of
+the find operation, and using union by rank can reduce the time complexity
+of the union operation. These optimizations reduce the average time
+complexity of each find and union operation to O(α(n)), where α(n) is the
+inverse Ackermann function. This can be roughly considered a constant time
+complexity for practical purposes.
+
+This document covers use of the Linux union-find implementation. For more
+information on the nature and implementation of union-find, see:
+
+ Wikipedia entry on union-find
+ https://en.wikipedia.org/wiki/Disjoint-set_data_structure
+
+Linux implementation of union-find
+-----------------------------------
+
+Linux's union-find implementation resides in the file "lib/union_find.c".
+To use it, "#include <linux/union_find.h>".
+
+The union-find data structure is defined as follows::
+
+ struct uf_node {
+ struct uf_node *parent;
+ unsigned int rank;
+ };
+
+In this structure, parent points to the parent node of the current node.
+The rank field represents the height of the current tree. During a union
+operation, the tree with the smaller rank is attached under the tree with the
+larger rank to maintain balance.
+
+Initializing union-find
+-----------------------
+
+You can complete the initialization using either static or initialization
+interface. Initialize the parent pointer to point to itself and set the rank
+to 0.
+Example::
+
+ struct uf_node my_node = UF_INIT_NODE(my_node);
+
+or
+
+ uf_node_init(&my_node);
+
+Find the Root Node of union-find
+--------------------------------
+
+This operation is mainly used to determine whether two nodes belong to the same
+set in the union-find. If they have the same root, they are in the same set.
+During the find operation, path compression is performed to improve the
+efficiency of subsequent find operations.
+Example::
+
+ int connected;
+ struct uf_node *root1 = uf_find(&node_1);
+ struct uf_node *root2 = uf_find(&node_2);
+ if (root1 == root2)
+ connected = 1;
+ else
+ connected = 0;
+
+Union Two Sets in union-find
+----------------------------
+
+To union two sets in the union-find, you first find their respective root nodes
+and then link the smaller node to the larger node based on the rank of the root
+nodes.
+Example::
+
+ uf_union(&node_1, &node_2);
diff --git a/Documentation/translations/zh_CN/core-api/index.rst b/Documentation/translations/zh_CN/core-api/index.rst
index 922cabf7b5dd..453a02cd1f44 100644
--- a/Documentation/translations/zh_CN/core-api/index.rst
+++ b/Documentation/translations/zh_CN/core-api/index.rst
@@ -49,6 +49,7 @@
generic-radix-tree
packing
this_cpu_ops
+ union_find
=======
diff --git a/Documentation/translations/zh_CN/core-api/union_find.rst b/Documentation/translations/zh_CN/core-api/union_find.rst
new file mode 100644
index 000000000000..bb93fa8c6533
--- /dev/null
+++ b/Documentation/translations/zh_CN/core-api/union_find.rst
@@ -0,0 +1,92 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/core-api/union_find.rst
+
+=============================
+Linux中的并查集(Union-Find)
+=============================
+
+
+:日期: 2024年6月21日
+:作者: Xavier <xavier_qy@163.com>
+
+何为并查集,它有什么用?
+------------------------
+
+并查集是一种数据结构,用于处理一些不交集的合并及查询问题。并查集支持的主要操作:
+ 初始化:将每个元素初始化为单独的集合,每个集合的初始父节点指向自身。
+
+ 查询:查询某个元素属于哪个集合,通常是返回集合中的一个“代表元素”。这个操作是为
+ 了判断两个元素是否在同一个集合之中。
+
+ 合并:将两个集合合并为一个。
+
+并查集作为一种用于维护集合(组)的数据结构,它通常用于解决一些离线查询、动态连通性和
+图论等相关问题,同时也是用于计算最小生成树的克鲁斯克尔算法中的关键,由于最小生成树在
+网络路由等场景下十分重要,并查集也得到了广泛的引用。此外,并查集在符号计算,寄存器分
+配等方面也有应用。
+
+空间复杂度: O(n),n为节点数。
+
+时间复杂度:使用路径压缩可以减少查找操作的时间复杂度,使用按秩合并可以减少合并操作的
+时间复杂度,使得并查集每个查询和合并操作的平均时间复杂度仅为O(α(n)),其中α(n)是反阿
+克曼函数,可以粗略地认为并查集的操作有常数的时间复杂度。
+
+本文档涵盖了对Linux并查集实现的使用方法。更多关于并查集的性质和实现的信息,参见:
+
+ 维基百科并查集词条
+ https://en.wikipedia.org/wiki/Disjoint-set_data_structure
+
+并查集的Linux实现
+------------------
+
+Linux的并查集实现在文件“lib/union_find.c”中。要使用它,需要
+“#include <linux/union_find.h>”。
+
+并查集的数据结构定义如下::
+
+ struct uf_node {
+ struct uf_node *parent;
+ unsigned int rank;
+ };
+
+其中parent为当前节点的父节点,rank为当前树的高度,在合并时将rank小的节点接到rank大
+的节点下面以增加平衡性。
+
+初始化并查集
+-------------
+
+可以采用静态或初始化接口完成初始化操作。初始化时,parent 指针指向自身,rank 设置
+为 0。
+示例::
+
+ struct uf_node my_node = UF_INIT_NODE(my_node);
+
+或
+
+ uf_node_init(&my_node);
+
+查找并查集的根节点
+------------------
+
+主要用于判断两个并查集是否属于一个集合,如果根相同,那么他们就是一个集合。在查找过程中
+会对路径进行压缩,提高后续查找效率。
+示例::
+
+ int connected;
+ struct uf_node *root1 = uf_find(&node_1);
+ struct uf_node *root2 = uf_find(&node_2);
+ if (root1 == root2)
+ connected = 1;
+ else
+ connected = 0;
+
+合并两个并查集
+--------------
+
+对于两个相交的并查集进行合并,会首先查找它们各自的根节点,然后根据根节点秩大小,将小的
+节点连接到大的节点下面。
+示例::
+
+ uf_union(&node_1, &node_2);
diff --git a/MAINTAINERS b/MAINTAINERS
index 42d2d950877c..0035cef79ba5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5736,9 +5736,12 @@ S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
F: Documentation/admin-guide/cgroup-v1/cpusets.rst
F: include/linux/cpuset.h
+F: kernel/cgroup/cpuset-internal.h
+F: kernel/cgroup/cpuset-v1.c
F: kernel/cgroup/cpuset.c
F: tools/testing/selftests/cgroup/test_cpuset.c
F: tools/testing/selftests/cgroup/test_cpuset_prs.sh
+F: tools/testing/selftests/cgroup/test_cpuset_v1_base.sh
CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG)
M: Johannes Weiner <hannes@cmpxchg.org>
@@ -23606,6 +23609,15 @@ F: drivers/cdrom/cdrom.c
F: include/linux/cdrom.h
F: include/uapi/linux/cdrom.h
+UNION-FIND
+M: Xavier <xavier_qy@163.com>
+L: linux-kernel@vger.kernel.org
+S: Maintained
+F: Documentation/core-api/union_find.rst
+F: Documentation/translations/zh_CN/core-api/union_find.rst
+F: include/linux/union_find.h
+F: lib/union_find.c
+
UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER
R: Alim Akhtar <alim.akhtar@samsung.com>
R: Avri Altman <avri.altman@wdc.com>
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ae04035b6cbe..eb0f6f349496 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -210,6 +210,14 @@ struct cgroup_subsys_state {
* fields of the containing structure.
*/
struct cgroup_subsys_state *parent;
+
+ /*
+ * Keep track of total numbers of visible descendant CSSes.
+ * The total number of dying CSSes is tracked in
+ * css->cgroup->nr_dying_subsys[ssid].
+ * Protected by cgroup_mutex.
+ */
+ int nr_descendants;
};
/*
@@ -470,6 +478,12 @@ struct cgroup {
/* Private pointers for each registered subsystem */
struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
+ /*
+ * Keep track of total number of dying CSSes at and below this cgroup.
+ * Protected by cgroup_mutex.
+ */
+ int nr_dying_subsys[CGROUP_SUBSYS_COUNT];
+
struct cgroup_root *root;
/*
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index de4cf0ee96f7..835e7b793f6a 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -99,6 +99,7 @@ static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
const struct task_struct *tsk2);
+#ifdef CONFIG_CPUSETS_V1
#define cpuset_memory_pressure_bump() \
do { \
if (cpuset_memory_pressure_enabled) \
@@ -106,6 +107,9 @@ extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
} while (0)
extern int cpuset_memory_pressure_enabled;
extern void __cpuset_memory_pressure_bump(void);
+#else
+static inline void cpuset_memory_pressure_bump(void) { }
+#endif
extern void cpuset_task_status_allowed(struct seq_file *m,
struct task_struct *task);
@@ -113,7 +117,6 @@ extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk);
extern int cpuset_mem_spread_node(void);
-extern int cpuset_slab_spread_node(void);
static inline int cpuset_do_page_mem_spread(void)
{
@@ -246,11 +249,6 @@ static inline int cpuset_mem_spread_node(void)
return 0;
}
-static inline int cpuset_slab_spread_node(void)
-{
- return 0;
-}
-
static inline int cpuset_do_page_mem_spread(void)
{
return 0;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f8d150343d42..3773c1c8f099 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1243,7 +1243,6 @@ struct task_struct {
/* Sequence number to catch updates: */
seqcount_spinlock_t mems_allowed_seq;
int cpuset_mem_spread_rotor;
- int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock: */
diff --git a/include/linux/union_find.h b/include/linux/union_find.h
new file mode 100644
index 000000000000..cfd49263c138
--- /dev/null
+++ b/include/linux/union_find.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_UNION_FIND_H
+#define __LINUX_UNION_FIND_H
+/**
+ * union_find.h - union-find data structure implementation
+ *
+ * This header provides functions and structures to implement the union-find
+ * data structure. The union-find data structure is used to manage disjoint
+ * sets and supports efficient union and find operations.
+ *
+ * See Documentation/core-api/union_find.rst for documentation and samples.
+ */
+
+struct uf_node {
+ struct uf_node *parent;
+ unsigned int rank;
+};
+
+/* This macro is used for static initialization of a union-find node. */
+#define UF_INIT_NODE(node) {.parent = &node, .rank = 0}
+
+/**
+ * uf_node_init - Initialize a union-find node
+ * @node: pointer to the union-find node to be initialized
+ *
+ * This function sets the parent of the node to itself and
+ * initializes its rank to 0.
+ */
+static inline void uf_node_init(struct uf_node *node)
+{
+ node->parent = node;
+ node->rank = 0;
+}
+
+/* find the root of a node */
+struct uf_node *uf_find(struct uf_node *node);
+
+/* Merge two intersecting nodes */
+void uf_union(struct uf_node *node1, struct uf_node *node2);
+
+#endif /* __LINUX_UNION_FIND_H */
diff --git a/init/Kconfig b/init/Kconfig
index 3b6ca7cce03b..8afb993b09cc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1143,6 +1143,19 @@ config CPUSETS
Say N if unsure.
+config CPUSETS_V1
+ bool "Legacy cgroup v1 cpusets controller"
+ depends on CPUSETS
+ default n
+ help
+ Legacy cgroup v1 cpusets controller which has been deprecated by
+ cgroup v2 implementation. The v1 is there for legacy applications
+ which haven't migrated to the new cgroup v2 interface yet. If you
+ do not have any such application then you are completely fine leaving
+ this option disabled.
+
+ Say N if unsure.
+
config PROC_PID_CPUSET
bool "Include legacy /proc/<pid>/cpuset file"
depends on CPUSETS
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 12f8457ad1f9..a5c9359d516f 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -5,5 +5,6 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
obj-$(CONFIG_CGROUP_RDMA) += rdma.o
obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o
obj-$(CONFIG_CGROUP_MISC) += misc.o
obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index b9dbf6bf2779..e28d5f0d20ed 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -46,6 +46,12 @@ bool cgroup1_ssid_disabled(int ssid)
return cgroup_no_v1_mask & (1 << ssid);
}
+static bool cgroup1_subsys_absent(struct cgroup_subsys *ss)
+{
+ /* Check also dfl_cftypes for file-less controllers, i.e. perf_event */
+ return ss->legacy_cftypes == NULL && ss->dfl_cftypes;
+}
+
/**
* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
* @from: attach to all cgroups of a given task
@@ -675,11 +681,14 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
* cgroup_mutex contention.
*/
- for_each_subsys(ss, i)
+ for_each_subsys(ss, i) {
+ if (cgroup1_subsys_absent(ss))
+ continue;
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->legacy_name, ss->root->hierarchy_id,
atomic_read(&ss->root->nr_cgrps),
cgroup_ssid_enabled(i));
+ }
return 0;
}
@@ -932,7 +941,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
if (ret != -ENOPARAM)
return ret;
for_each_subsys(ss, i) {
- if (strcmp(param->key, ss->legacy_name))
+ if (strcmp(param->key, ss->legacy_name) ||
+ cgroup1_subsys_absent(ss))
continue;
if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i))
return invalfc(fc, "Disabled controller '%s'",
@@ -1024,7 +1034,8 @@ static int check_cgroupfs_options(struct fs_context *fc)
mask = ~((u16)1 << cpuset_cgrp_id);
#endif
for_each_subsys(ss, i)
- if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
+ if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) &&
+ !cgroup1_subsys_absent(ss))
enabled |= 1 << i;
ctx->subsys_mask &= enabled;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c8e4b62b436a..90e50d6d3cf3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2331,7 +2331,7 @@ static struct file_system_type cgroup2_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
static const struct fs_context_operations cpuset_fs_context_ops = {
.get_tree = cgroup1_get_tree,
.free = cgroup_fs_context_free,
@@ -3669,12 +3669,40 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
static int cgroup_stat_show(struct seq_file *seq, void *v)
{
struct cgroup *cgroup = seq_css(seq)->cgroup;
+ struct cgroup_subsys_state *css;
+ int dying_cnt[CGROUP_SUBSYS_COUNT];
+ int ssid;
seq_printf(seq, "nr_descendants %d\n",
cgroup->nr_descendants);
+
+ /*
+ * Show the number of live and dying csses associated with each of
+ * non-inhibited cgroup subsystems that is bound to cgroup v2.
+ *
+ * Without proper lock protection, racing is possible. So the
+ * numbers may not be consistent when that happens.
+ */
+ rcu_read_lock();
+ for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+ dying_cnt[ssid] = -1;
+ if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) ||
+ (cgroup_subsys[ssid]->root != &cgrp_dfl_root))
+ continue;
+ css = rcu_dereference_raw(cgroup->subsys[ssid]);
+ dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid];
+ seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name,
+ css ? (css->nr_descendants + 1) : 0);
+ }
+
seq_printf(seq, "nr_dying_descendants %d\n",
cgroup->nr_dying_descendants);
-
+ for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+ if (dying_cnt[ssid] >= 0)
+ seq_printf(seq, "nr_dying_subsys_%s %d\n",
+ cgroup_subsys[ssid]->name, dying_cnt[ssid]);
+ }
+ rcu_read_unlock();
return 0;
}
@@ -4096,7 +4124,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
* If namespaces are delegation boundaries, disallow writes to
* files in an non-init namespace root from inside the namespace
* except for the files explicitly marked delegatable -
- * cgroup.procs and cgroup.subtree_control.
+ * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control.
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
@@ -5424,6 +5452,8 @@ static void css_release_work_fn(struct work_struct *work)
list_del_rcu(&css->sibling);
if (ss) {
+ struct cgroup *parent_cgrp;
+
/* css release path */
if (!list_empty(&css->rstat_css_node)) {
cgroup_rstat_flush(cgrp);
@@ -5433,6 +5463,21 @@ static void css_release_work_fn(struct work_struct *work)
cgroup_idr_replace(&ss->css_idr, NULL, css->id);
if (ss->css_released)
ss->css_released(css);
+
+ cgrp->nr_dying_subsys[ss->id]--;
+ /*
+ * When a css is released and ready to be freed, its
+ * nr_descendants must be zero. However, the corresponding
+ * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem
+ * is activated and deactivated multiple times with one or
+ * more of its previous activation leaving behind dying csses.
+ */
+ WARN_ON_ONCE(css->nr_descendants);
+ parent_cgrp = cgroup_parent(cgrp);
+ while (parent_cgrp) {
+ parent_cgrp->nr_dying_subsys[ss->id]--;
+ parent_cgrp = cgroup_parent(parent_cgrp);
+ }
} else {
struct cgroup *tcgrp;
@@ -5517,8 +5562,11 @@ static int online_css(struct cgroup_subsys_state *css)
rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
atomic_inc(&css->online_cnt);
- if (css->parent)
+ if (css->parent) {
atomic_inc(&css->parent->online_cnt);
+ while ((css = css->parent))
+ css->nr_descendants++;
+ }
}
return ret;
}
@@ -5540,6 +5588,16 @@ static void offline_css(struct cgroup_subsys_state *css)
RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
wake_up_all(&css->cgroup->offline_waitq);
+
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ /*
+ * Parent css and cgroup cannot be freed until after the freeing
+ * of child css, see css_free_rwork_fn().
+ */
+ while ((css = css->parent)) {
+ css->nr_descendants--;
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ }
}
/**
@@ -6178,7 +6236,7 @@ int __init cgroup_init(void)
WARN_ON(register_filesystem(&cgroup_fs_type));
WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
WARN_ON(register_filesystem(&cpuset_fs_type));
#endif
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
new file mode 100644
index 000000000000..976a8bc3ff60
--- /dev/null
+++ b/kernel/cgroup/cpuset-internal.h
@@ -0,0 +1,305 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __CPUSET_INTERNAL_H
+#define __CPUSET_INTERNAL_H
+
+#include <linux/cgroup.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/spinlock.h>
+#include <linux/union_find.h>
+
+/* See "Frequency meter" comments, below. */
+
+struct fmeter {
+ int cnt; /* unprocessed events count */
+ int val; /* most recent output value */
+ time64_t time; /* clock (secs) when val computed */
+ spinlock_t lock; /* guards read or write of above */
+};
+
+/*
+ * Invalid partition error code
+ */
+enum prs_errcode {
+ PERR_NONE = 0,
+ PERR_INVCPUS,
+ PERR_INVPARENT,
+ PERR_NOTPART,
+ PERR_NOTEXCL,
+ PERR_NOCPUS,
+ PERR_HOTPLUG,
+ PERR_CPUSEMPTY,
+ PERR_HKEEPING,
+ PERR_ACCESS,
+};
+
+/* bits in struct cpuset flags field */
+typedef enum {
+ CS_ONLINE,
+ CS_CPU_EXCLUSIVE,
+ CS_MEM_EXCLUSIVE,
+ CS_MEM_HARDWALL,
+ CS_MEMORY_MIGRATE,
+ CS_SCHED_LOAD_BALANCE,
+ CS_SPREAD_PAGE,
+ CS_SPREAD_SLAB,
+} cpuset_flagbits_t;
+
+/* The various types of files and directories in a cpuset file system */
+
+typedef enum {
+ FILE_MEMORY_MIGRATE,
+ FILE_CPULIST,
+ FILE_MEMLIST,
+ FILE_EFFECTIVE_CPULIST,
+ FILE_EFFECTIVE_MEMLIST,
+ FILE_SUBPARTS_CPULIST,
+ FILE_EXCLUSIVE_CPULIST,
+ FILE_EFFECTIVE_XCPULIST,
+ FILE_ISOLATED_CPULIST,
+ FILE_CPU_EXCLUSIVE,
+ FILE_MEM_EXCLUSIVE,
+ FILE_MEM_HARDWALL,
+ FILE_SCHED_LOAD_BALANCE,
+ FILE_PARTITION_ROOT,
+ FILE_SCHED_RELAX_DOMAIN_LEVEL,
+ FILE_MEMORY_PRESSURE_ENABLED,
+ FILE_MEMORY_PRESSURE,
+ FILE_SPREAD_PAGE,
+ FILE_SPREAD_SLAB,
+} cpuset_filetype_t;
+
+struct cpuset {
+ struct cgroup_subsys_state css;
+
+ unsigned long flags; /* "unsigned long" so bitops work */
+
+ /*
+ * On default hierarchy:
+ *
+ * The user-configured masks can only be changed by writing to
+ * cpuset.cpus and cpuset.mems, and won't be limited by the
+ * parent masks.
+ *
+ * The effective masks is the real masks that apply to the tasks
+ * in the cpuset. They may be changed if the configured masks are
+ * changed or hotplug happens.
+ *
+ * effective_mask == configured_mask & parent's effective_mask,
+ * and if it ends up empty, it will inherit the parent's mask.
+ *
+ *
+ * On legacy hierarchy:
+ *
+ * The user-configured masks are always the same with effective masks.
+ */
+
+ /* user-configured CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t cpus_allowed;
+ nodemask_t mems_allowed;
+
+ /* effective CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t effective_cpus;
+ nodemask_t effective_mems;
+
+ /*
+ * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
+ *
+ * The effective_cpus of a valid partition root comes solely from its
+ * effective_xcpus and some of the effective_xcpus may be distributed
+ * to sub-partitions below & hence excluded from its effective_cpus.
+ * For a valid partition root, its effective_cpus have no relationship
+ * with cpus_allowed unless its exclusive_cpus isn't set.
+ *
+ * This value will only be set if either exclusive_cpus is set or
+ * when this cpuset becomes a local partition root.
+ */
+ cpumask_var_t effective_xcpus;
+
+ /*
+ * Exclusive CPUs as requested by the user (default hierarchy only)
+ *
+ * Its value is independent of cpus_allowed and designates the set of
+ * CPUs that can be granted to the current cpuset or its children when
+ * it becomes a valid partition root. The effective set of exclusive
<