summaryrefslogtreecommitdiff
path: root/kernel/cgroup
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-10-10 11:12:25 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-10-10 11:12:25 -0700
commitadf4bfc4a9ab86be0b72fa9cadc9e7ab6ad15dfe (patch)
treebfdc5c20da3406cb75aa320c89460cc1b7aa6919 /kernel/cgroup
parent8adc0486f3c85e3c1e40c1ce6884317a17c380d3 (diff)
parent8619e94d3be376bb5e8f20988c0e6e3309d2b09a (diff)
downloadlinux-adf4bfc4a9ab86be0b72fa9cadc9e7ab6ad15dfe.tar.gz
linux-adf4bfc4a9ab86be0b72fa9cadc9e7ab6ad15dfe.tar.bz2
linux-adf4bfc4a9ab86be0b72fa9cadc9e7ab6ad15dfe.zip
Merge tag 'cgroup-for-6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo: - cpuset now support isolated cpus.partition type, which will enable dynamic CPU isolation - pids.peak added to remember the max number of pids used - holes in cgroup namespace plugged - internal cleanups * tag 'cgroup-for-6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (25 commits) cgroup: use strscpy() is more robust and safer iocost_monitor: reorder BlkgIterator cgroup: simplify code in cgroup_apply_control cgroup: Make cgroup_get_from_id() prettier cgroup/cpuset: remove unreachable code cgroup: Remove CFTYPE_PRESSURE cgroup: Improve cftype add/rm error handling kselftest/cgroup: Add cpuset v2 partition root state test cgroup/cpuset: Update description of cpuset.cpus.partition in cgroup-v2.rst cgroup/cpuset: Make partition invalid if cpumask change violates exclusivity rule cgroup/cpuset: Relocate a code block in validate_change() cgroup/cpuset: Show invalid partition reason string cgroup/cpuset: Add a new isolated cpus.partition type cgroup/cpuset: Relax constraints to partition & cpus changes cgroup/cpuset: Allow no-task partition to have empty cpuset.cpus.effective cgroup/cpuset: Miscellaneous cleanups & add helper functions cgroup/cpuset: Enable update_tasks_cpumask() on top_cpuset cgroup: add pids.peak interface for pids controller cgroup: Remove data-race around cgrp_dfl_visible cgroup: Fix build failure when CONFIG_SHRINKER_DEBUG ...
Diffstat (limited to 'kernel/cgroup')
-rw-r--r--kernel/cgroup/cgroup-internal.h2
-rw-r--r--kernel/cgroup/cgroup-v1.c6
-rw-r--r--kernel/cgroup/cgroup.c145
-rw-r--r--kernel/cgroup/cpuset.c815
-rw-r--r--kernel/cgroup/pids.c37
5 files changed, 663 insertions, 342 deletions
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 36b740cb3d59..2c7ecca226be 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -250,6 +250,8 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
+void cgroup_attach_lock(bool lock_threadgroup);
+void cgroup_attach_unlock(bool lock_threadgroup);
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
bool *locked)
__acquires(&cgroup_threadgroup_rwsem);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index ff6a8099eb2a..52bb5a74a23b 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -59,8 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
mutex_lock(&cgroup_mutex);
- cpus_read_lock();
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_lock(true);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -72,8 +71,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (retval)
break;
}
- percpu_up_write(&cgroup_threadgroup_rwsem);
- cpus_read_unlock();
+ cgroup_attach_unlock(true);
mutex_unlock(&cgroup_mutex);
return retval;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b6e3110b3ea7..764bdd5fd8d1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -217,6 +217,7 @@ struct cgroup_namespace init_cgroup_ns = {
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
+static struct cftype cgroup_psi_files[];
/* cgroup optional features */
enum cgroup_opt_features {
@@ -1689,12 +1690,16 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
css->flags &= ~CSS_VISIBLE;
if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- cgroup_addrm_files(css, cgrp, cfts, false);
+ if (cgroup_on_dfl(cgrp)) {
+ cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, false);
+ if (cgroup_psi_enabled())
+ cgroup_addrm_files(css, cgrp,
+ cgroup_psi_files, false);
+ } else {
+ cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, false);
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node)
cgroup_addrm_files(css, cgrp, cfts, false);
@@ -1717,14 +1722,22 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
return 0;
if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
- if (ret < 0)
- return ret;
+ if (cgroup_on_dfl(cgrp)) {
+ ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ cgroup_base_files, true);
+ if (ret < 0)
+ return ret;
+
+ if (cgroup_psi_enabled()) {
+ ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ cgroup_psi_files, true);
+ if (ret < 0)
+ return ret;
+ }
+ } else {
+ cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, true);
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node) {
ret = cgroup_addrm_files(css, cgrp, cfts, true);
@@ -2050,7 +2063,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
}
root_cgrp->kn = kernfs_root_to_node(root->kf_root);
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
- root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
+ root_cgrp->ancestors[0] = root_cgrp;
ret = css_populate_dir(&root_cgrp->self);
if (ret)
@@ -2173,7 +2186,7 @@ static int cgroup_get_tree(struct fs_context *fc)
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
- cgrp_dfl_visible = true;
+ WRITE_ONCE(cgrp_dfl_visible, true);
cgroup_get_live(&cgrp_dfl_root.cgrp);
ctx->root = &cgrp_dfl_root;
@@ -2361,7 +2374,7 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
} else {
/* if no hierarchy exists, everyone is in "/" */
- ret = strlcpy(buf, "/", buflen);
+ ret = strscpy(buf, "/", buflen);
}
spin_unlock_irq(&css_set_lock);
@@ -2393,7 +2406,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path);
* write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
* CPU hotplug is disabled on entry.
*/
-static void cgroup_attach_lock(bool lock_threadgroup)
+void cgroup_attach_lock(bool lock_threadgroup)
{
cpus_read_lock();
if (lock_threadgroup)
@@ -2404,7 +2417,7 @@ static void cgroup_attach_lock(bool lock_threadgroup)
* cgroup_attach_unlock - Undo cgroup_attach_lock()
* @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
*/
-static void cgroup_attach_unlock(bool lock_threadgroup)
+void cgroup_attach_unlock(bool lock_threadgroup)
{
if (lock_threadgroup)
percpu_up_write(&cgroup_threadgroup_rwsem);
@@ -3292,11 +3305,7 @@ static int cgroup_apply_control(struct cgroup *cgrp)
* making the following cgroup_update_dfl_csses() properly update
* css associations of all tasks in the subtree.
*/
- ret = cgroup_update_dfl_csses(cgrp);
- if (ret)
- return ret;
-
- return 0;
+ return cgroup_update_dfl_csses(cgrp);
}
/**
@@ -4132,8 +4141,6 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
restart:
for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
- if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- continue;
if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -4198,21 +4205,25 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
cft->ss = NULL;
/* revert flags set by cgroup core while adding @cfts */
- cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |
+ __CFTYPE_ADDED);
}
}
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
+ int ret = 0;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
struct kernfs_ops *kf_ops;
WARN_ON(cft->ss || cft->kf_ops);
- if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- continue;
+ if (cft->flags & __CFTYPE_ADDED) {
+ ret = -EBUSY;
+ break;
+ }
if (cft->seq_start)
kf_ops = &cgroup_kf_ops;
@@ -4226,26 +4237,26 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
if (!kf_ops) {
- cgroup_exit_cftypes(cfts);
- return -ENOMEM;
+ ret = -ENOMEM;
+ break;
}
kf_ops->atomic_write_len = cft->max_write_len;
}
cft->kf_ops = kf_ops;
cft->ss = ss;
+ cft->flags |= __CFTYPE_ADDED;
}
- return 0;
+ if (ret)
+ cgroup_exit_cftypes(cfts);
+ return ret;
}
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
lockdep_assert_held(&cgroup_mutex);
- if (!cfts || !cfts[0].ss)
- return -ENOENT;
-
list_del(&cfts->node);
cgroup_apply_cftypes(cfts, false);
cgroup_exit_cftypes(cfts);
@@ -4267,6 +4278,12 @@ int cgroup_rm_cftypes(struct cftype *cfts)
{
int ret;
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
+
+ if (!(cfts[0].flags & __CFTYPE_ADDED))
+ return -ENOENT;
+
mutex_lock(&cgroup_mutex);
ret = cgroup_rm_cftypes_locked(cfts);
mutex_unlock(&cgroup_mutex);
@@ -5151,10 +5168,13 @@ static struct cftype cgroup_base_files[] = {
.name = "cpu.stat",
.seq_show = cpu_stat_show,
},
+ { } /* terminate */
+};
+
+static struct cftype cgroup_psi_files[] = {
#ifdef CONFIG_PSI
{
.name = "io.pressure",
- .flags = CFTYPE_PRESSURE,
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5162,7 +5182,6 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "memory.pressure",
- .flags = CFTYPE_PRESSURE,
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5170,7 +5189,6 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cpu.pressure",
- .flags = CFTYPE_PRESSURE,
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5452,8 +5470,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
- GFP_KERNEL);
+ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
if (!cgrp)
return ERR_PTR(-ENOMEM);
@@ -5505,7 +5522,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
+ cgrp->ancestors[tcgrp->level] = tcgrp;
if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
@@ -5938,6 +5955,7 @@ int __init cgroup_init(void)
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
cgroup_rstat_boot();
@@ -6058,19 +6076,22 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
/*
* cgroup_get_from_id : get the cgroup associated with cgroup id
* @id: cgroup id
- * On success return the cgrp, on failure return NULL
+ * On success return the cgrp or ERR_PTR on failure
+ * Only cgroups within current task's cgroup NS are valid.
*/
struct cgroup *cgroup_get_from_id(u64 id)
{
struct kernfs_node *kn;
- struct cgroup *cgrp = NULL;
+ struct cgroup *cgrp, *root_cgrp;
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
- goto out;
+ return ERR_PTR(-ENOENT);
- if (kernfs_type(kn) != KERNFS_DIR)
- goto put;
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ kernfs_put(kn);
+ return ERR_PTR(-ENOENT);
+ }
rcu_read_lock();
@@ -6079,9 +6100,19 @@ struct cgroup *cgroup_get_from_id(u64 id)
cgrp = NULL;
rcu_read_unlock();
-put:
kernfs_put(kn);
-out:
+
+ if (!cgrp)
+ return ERR_PTR(-ENOENT);
+
+ spin_lock_irq(&css_set_lock);
+ root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+ if (!cgroup_is_descendant(cgrp, root_cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-ENOENT);
+ }
+
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_id);
@@ -6111,7 +6142,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct cgroup *cgrp;
int ssid, count = 0;
- if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
+ if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
continue;
seq_printf(m, "%d:", root->hierarchy_id);
@@ -6653,8 +6684,12 @@ struct cgroup *cgroup_get_from_path(const char *path)
{
struct kernfs_node *kn;
struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ struct cgroup *root_cgrp;
- kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
+ spin_lock_irq(&css_set_lock);
+ root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
+ kn = kernfs_walk_and_get(root_cgrp->kn, path);
+ spin_unlock_irq(&css_set_lock);
if (!kn)
goto out;
@@ -6812,9 +6847,6 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
continue;
- if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- continue;
-
if (prefix)
ret += snprintf(buf + ret, size - ret, "%s.", prefix);
@@ -6834,8 +6866,11 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
int ssid;
ssize_t ret = 0;
- ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
- NULL);
+ ret = show_delegatable_files(cgroup_base_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
+ if (cgroup_psi_enabled())
+ ret += show_delegatable_files(cgroup_psi_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
for_each_subsys(ss, ssid)
ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 1f3a55297f39..b474289c15b8 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -33,6 +33,7 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
+#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
@@ -85,6 +86,30 @@ struct fmeter {
spinlock_t lock; /* guards read or write of above */
};
+/*
+ * Invalid partition error code
+ */
+enum prs_errcode {
+ PERR_NONE = 0,
+ PERR_INVCPUS,
+ PERR_INVPARENT,
+ PERR_NOTPART,
+ PERR_NOTEXCL,
+ PERR_NOCPUS,
+ PERR_HOTPLUG,
+ PERR_CPUSEMPTY,
+};
+
+static const char * const perr_strings[] = {
+ [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus",
+ [PERR_INVPARENT] = "Parent is an invalid partition root",
+ [PERR_NOTPART] = "Parent is not a partition root",
+ [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
+ [PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
+ [PERR_HOTPLUG] = "No cpu available due to hotplug",
+ [PERR_CPUSEMPTY] = "cpuset.cpus is empty",
+};
+
struct cpuset {
struct cgroup_subsys_state css;
@@ -168,6 +193,9 @@ struct cpuset {
int use_parent_ecpus;
int child_ecpus_count;
+ /* Invalid partition error code, not lock protected */
+ enum prs_errcode prs_err;
+
/* Handle for cpuset.cpus.partition */
struct cgroup_file partition_file;
};
@@ -175,20 +203,22 @@ struct cpuset {
/*
* Partition root states:
*
- * 0 - not a partition root
- *
+ * 0 - member (not a partition root)
* 1 - partition root
- *
+ * 2 - partition root without load balancing (isolated)
* -1 - invalid partition root
- * None of the cpus in cpus_allowed can be put into the parent's
- * subparts_cpus. In this case, the cpuset is not a real partition
- * root anymore. However, the CPU_EXCLUSIVE bit will still be set
- * and the cpuset can be restored back to a partition root if the
- * parent cpuset can give more CPUs back to this child cpuset.
+ * -2 - invalid isolated partition root
*/
-#define PRS_DISABLED 0
-#define PRS_ENABLED 1
-#define PRS_ERROR -1
+#define PRS_MEMBER 0
+#define PRS_ROOT 1
+#define PRS_ISOLATED 2
+#define PRS_INVALID_ROOT -1
+#define PRS_INVALID_ISOLATED -2
+
+static inline bool is_prs_invalid(int prs_state)
+{
+ return prs_state < 0;
+}
/*
* Temporary cpumasks for working with partitions that are passed among
@@ -268,25 +298,43 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
-static inline int is_partition_root(const struct cpuset *cs)
+static inline int is_partition_valid(const struct cpuset *cs)
{
return cs->partition_root_state > 0;
}
+static inline int is_partition_invalid(const struct cpuset *cs)
+{
+ return cs->partition_root_state < 0;
+}
+
+/*
+ * Callers should hold callback_lock to modify partition_root_state.
+ */
+static inline void make_partition_invalid(struct cpuset *cs)
+{
+ if (is_partition_valid(cs))
+ cs->partition_root_state = -cs->partition_root_state;
+}
+
/*
* Send notification event of whenever partition_root_state changes.
*/
-static inline void notify_partition_change(struct cpuset *cs,
- int old_prs, int new_prs)
+static inline void notify_partition_change(struct cpuset *cs, int old_prs)
{
- if (old_prs != new_prs)
- cgroup_file_notify(&cs->partition_file);
+ if (old_prs == cs->partition_root_state)
+ return;
+ cgroup_file_notify(&cs->partition_file);
+
+ /* Reset prs_err if not invalid */
+ if (is_partition_valid(cs))
+ WRITE_ONCE(cs->prs_err, PERR_NONE);
}
static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
(1 << CS_MEM_EXCLUSIVE)),
- .partition_root_state = PRS_ENABLED,
+ .partition_root_state = PRS_ROOT,
};
/**
@@ -404,6 +452,41 @@ static inline bool is_in_v2_mode(void)
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
+/**
+ * partition_is_populated - check if partition has tasks
+ * @cs: partition root to be checked
+ * @excluded_child: a child cpuset to be excluded in task checking
+ * Return: true if there are tasks, false otherwise
+ *
+ * It is assumed that @cs is a valid partition root. @excluded_child should
+ * be non-NULL when this cpuset is going to become a partition itself.
+ */
+static inline bool partition_is_populated(struct cpuset *cs,
+ struct cpuset *excluded_child)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *child;
+
+ if (cs->css.cgroup->nr_populated_csets)
+ return true;
+ if (!excluded_child && !cs->nr_subparts_cpus)
+ return cgroup_is_populated(cs->css.cgroup);
+
+ rcu_read_lock();
+ cpuset_for_each_child(child, css, cs) {
+ if (child == excluded_child)
+ continue;
+ if (is_partition_valid(child))
+ continue;
+ if (cgroup_is_populated(child->css.cgroup)) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+ return false;
+}
+
/*
* Return in pmask the portion of a task's cpusets's cpus_allowed that
* are online and are capable of running the task. If none are found,
@@ -659,22 +742,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
par = parent_cs(cur);
/*
- * If either I or some sibling (!= me) is exclusive, we can't
- * overlap
- */
- ret = -EINVAL;
- cpuset_for_each_child(c, css, par) {
- if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
- c != cur &&
- cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- goto out;
- if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
- c != cur &&
- nodes_intersects(trial->mems_allowed, c->mems_allowed))
- goto out;
- }
-
- /*
* Cpusets with tasks - existing or newly being attached - can't
* be changed to have empty cpus_allowed or mems_allowed.
*/
@@ -698,6 +765,22 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
trial->cpus_allowed))
goto out;
+ /*
+ * If either I or some sibling (!= me) is exclusive, we can't
+ * overlap
+ */
+ ret = -EINVAL;
+ cpuset_for_each_child(c, css, par) {
+ if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
+ c != cur &&
+ cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
+ goto out;
+ if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
+ c != cur &&
+ nodes_intersects(trial->mems_allowed, c->mems_allowed))
+ goto out;
+ }
+
ret = 0;
out:
rcu_read_unlock();
@@ -875,7 +958,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
csa[csn++] = cp;
/* skip @cp's subtree if not a partition root */
- if (!is_partition_root(cp))
+ if (!is_partition_valid(cp))
pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
@@ -1081,7 +1164,7 @@ static void rebuild_sched_domains_locked(void)
if (top_cpuset.nr_subparts_cpus) {
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
- if (!is_partition_root(cs)) {
+ if (!is_partition_valid(cs)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
@@ -1127,10 +1210,18 @@ static void update_tasks_cpumask(struct cpuset *cs)
{
struct css_task_iter it;
struct task_struct *task;
+ bool top_cs = cs == &top_cpuset;
css_task_iter_start(&cs->css, 0, &it);
- while ((task = css_task_iter_next(&it)))
+ while ((task = css_task_iter_next(&it))) {
+ /*
+ * Percpu kthreads in top_cpuset are ignored
+ */
+ if (top_cs && (task->flags & PF_KTHREAD) &&
+ kthread_is_per_cpu(task))
+ continue;
set_cpus_allowed_ptr(task, cs->effective_cpus);
+ }
css_task_iter_end(&it);
}
@@ -1165,15 +1256,18 @@ enum subparts_cmd {
partcmd_enable, /* Enable partition root */
partcmd_disable, /* Disable partition root */
partcmd_update, /* Update parent's subparts_cpus */
+ partcmd_invalidate, /* Make partition invalid */
};
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+ int turning_on);
/**
* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
* @cpuset: The cpuset that requests change in partition root state
* @cmd: Partition root state change command
* @newmask: Optional new cpumask for partcmd_update
* @tmp: Temporary addmask and delmask
- * Return: 0, 1 or an error code
+ * Return: 0 or a partition root state error code
*
* For partcmd_enable, the cpuset is being transformed from a non-partition
* root to a partition root. The cpus_allowed mask of the given cpuset will
@@ -1184,38 +1278,36 @@ enum subparts_cmd {
* For partcmd_disable, the cpuset is being transformed from a partition
* root back to a non-partition root. Any CPUs in cpus_allowed that are in
* parent's subparts_cpus will be taken away from that cpumask and put back
- * into parent's effective_cpus. 0 should always be returned.
- *
- * For partcmd_update, if the optional newmask is specified, the cpu
- * list is to be changed from cpus_allowed to newmask. Otherwise,
- * cpus_allowed is assumed to remain the same. The cpuset should either
- * be a partition root or an invalid partition root. The partition root
- * state may change if newmask is NULL and none of the requested CPUs can
- * be granted by the parent. The function will return 1 if changes to
- * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
- * Error code should only be returned when newmask is non-NULL.
+ * into parent's effective_cpus. 0 will always be returned.
*
- * The partcmd_enable and partcmd_disable commands are used by
- * update_prstate(). The partcmd_update command is used by
- * update_cpumasks_hier() with newmask NULL and update_cpumask() with
- * newmask set.
+ * For partcmd_update, if the optional newmask is specified, the cpu list is
+ * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is
+ * assumed to remain the same. The cpuset should either be a valid or invalid
+ * partition root. The partition root state may change from valid to invalid
+ * or vice versa. An error code will only be returned if transitioning from
+ * invalid to valid violates the exclusivity rule.
*
- * The checking is more strict when enabling partition root than the
- * other two commands.
+ * For partcmd_invalidate, the current partition will be made invalid.
*
- * Because of the implicit cpu exclusive nature of a partition root,
- * cpumask changes that violates the cpu exclusivity rule will not be
- * permitted when checked by validate_change().
+ * The partcmd_enable and partcmd_disable commands are used by
+ * update_prstate(). An error code may be returned and the caller will check
+ * for error.
+ *
+ * The partcmd_update command is used by update_cpumasks_hier() with newmask
+ * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
+ * by update_cpumask() with NULL newmask. In both cases, the callers won't
+ * check for error and so partition_root_state and prs_error will be updated
+ * directly.
*/
-static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
struct cpumask *newmask,
struct tmpmasks *tmp)
{
- struct cpuset *parent = parent_cs(cpuset);
+ struct cpuset *parent = parent_cs(cs);
int adding; /* Moving cpus from effective_cpus to subparts_cpus */
int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
int old_prs, new_prs;
- bool part_error = false; /* Partition error? */
+ int part_error = PERR_NONE; /* Partition error? */
percpu_rwsem_assert_held(&cpuset_rwsem);
@@ -1224,126 +1316,165 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
* The new cpumask, if present, or the current cpus_allowed must
* not be empty.
*/
- if (!is_partition_root(parent) ||
- (newmask && cpumask_empty(newmask)) ||
- (!newmask && cpumask_empty(cpuset->cpus_allowed)))
- return -EINVAL;
-
- /*
- * Enabling/disabling partition root is not allowed if there are
- * online children.
- */
- if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
- return -EBUSY;
-
- /*
- * Enabling partition root is not allowed if not all the CPUs
- * can be granted from parent's effective_cpus or at least one
- * CPU will be left after that.
- */
- if ((cmd == partcmd_enable) &&
- (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
- cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
- return -EINVAL;
+ if (!is_partition_valid(parent)) {
+ return is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART;
+ }
+ if ((newmask && cpumask_empty(newmask)) ||
+ (!newmask && cpumask_empty(cs->cpus_allowed)))
+ return PERR_CPUSEMPTY;
/*
- * A cpumask update cannot make parent's effective_cpus become empty.
+ * new_prs will only be changed for the partcmd_update and
+ * partcmd_invalidate commands.
*/
adding = deleting = false;
- old_prs = new_prs = cpuset->partition_root_state;
+ old_prs = new_prs = cs->partition_root_state;
if (cmd == partcmd_enable) {
- cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
+ /*
+ * Enabling partition root is not allowed if cpus_allowed
+ * doesn't overlap parent's cpus_allowed.
+ */
+ if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed))
+ return PERR_INVCPUS;
+
+ /*
+ * A parent can be left with no CPU as long as there is no
+ * task directly associated with the parent partition.
+ */
+ if (!cpumask_intersects(cs->cpus_allowed, parent->effective_cpus) &&
+ partition_is_populated(parent, cs))
+ return PERR_NOCPUS;
+
+ cpumask_copy(tmp->addmask, cs->cpus_allowed);
adding = true;
} else if (cmd == partcmd_disable) {
- deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+ /*
+ * Need to remove cpus from parent's subparts_cpus for valid
+ * partition root.
+ */
+ deleting = !is_prs_invalid(old_prs) &&
+ cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
+ } else if (cmd == partcmd_invalidate) {
+ if (is_prs_invalid(old_prs))
+ return 0;
+
+ /*
+ * Make the current partition invalid. It is assumed that
+ * invalidation is caused by violating cpu exclusivity rule.
+ */
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
parent->subparts_cpus);
+ if (old_prs > 0) {
+ new_prs = -old_prs;
+ part_error = PERR_NOTEXCL;
+ }
} else if (newmask) {
/*
* partcmd_update with newmask:
*
+ * Compute add/delete mask to/from subparts_cpus
+ *
* delmask = cpus_allowed & ~newmask & parent->subparts_cpus
- * addmask = newmask & parent->effective_cpus
+ * addmask = newmask & parent->cpus_allowed
* & ~parent->subparts_cpus
*/
- cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
+ cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask);
deleting = cpumask_and(tmp->delmask, tmp->delmask,
parent->subparts_cpus);
- cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
+ cpumask_and(tmp->addmask, newmask, parent->cpus_allowed);
adding = cpumask_andnot(tmp->addmask, tmp->addmask,
parent->subparts_cpus);
/*
- * Return error if the new effective_cpus could become empty.
+ * Make partition invalid if parent's effective_cpus could
+ * become empty and there are tasks in the parent.
*/
if (adding &&
- cpumask_equal(parent->effective_cpus, tmp->addmask)) {
- if (!deleting)
- return -EINVAL;
- /*
- * As some of the CPUs in subparts_cpus might have
- * been offlined, we need to compute the real delmask
- * to confirm that.
- */
- if (!cpumask_and(tmp->addmask, tmp->delmask,
- cpu_active_mask))
- return -EINVAL;
- cpumask_copy(tmp->addmask, parent->effective_cpus);
+ cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+ !cpumask_intersects(tmp->delmask, cpu_active_mask) &&
+ partition_is_populated(parent, cs)) {
+ part_error = PERR_NOCPUS;
+ adding = false;
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
}
} else {
/*
* partcmd_update w/o newmask:
*
- * addmask = cpus_allowed & parent->effective_cpus
+ * delmask = cpus_allowed & parent->subparts_cpus
+ * addmask = cpus_allowed & parent->cpus_allowed
+ * & ~parent->subparts_cpus
*
- * Note that parent's subparts_cpus may have been
- * pre-shrunk in case there is a change in the cpu list.
- * So no deletion is needed.
+ * This gets invoked either due to a hotplug event or from
+ * update_cpumasks_hier(). This can cause the state of a
+ * partition root to transition from valid to invalid or vice
+ * versa. So we still need to compute the addmask and delmask.
+
+ * A partition error happens when:
+ * 1) Cpuset is valid partition, but parent does not distribute
+ * out any CPUs.
+ * 2) Parent has tasks and all its effective CPUs will have
+ * to be distributed out.
*/
- adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
- parent->effective_cpus);
- part_error = cpumask_equal(tmp->addmask,
- parent->effective_cpus);
+ cpumask_and(tmp->addmask, cs->cpus_allowed,
+ parent->cpus_allowed);
+ adding = cpumask_andnot(tmp->addmask, tmp->addmask,
+ parent->subparts_cpus);
+
+ if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) ||
+ (adding &&
+ cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+ partition_is_populated(parent, cs))) {
+ part_error = PERR_NOCPUS;
+ adding = false;
+ }
+
+ if (part_error && is_partition_valid(cs) &&
+ parent->nr_subparts_cpus)
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
}
+ if (part_error)
+ WRITE_ONCE(cs->prs_err, part_error);
if (cmd == partcmd_update) {
- int prev_prs = cpuset->partition_root_state;
-
/*
- * Check for possible transition between PRS_ENABLED
- * and PRS_ERROR.
+ * Check for possible transition between valid and invalid
+ * partition root.
*/
- switch (cpuset->partition_root_state) {
- case PRS_ENABLED:
+ switch (cs->partition_root_state) {
+ case PRS_ROOT:
+ case PRS_ISOLATED:
if (part_error)
- new_prs = PRS_ERROR;
+ new_prs = -old_prs;
break;
- case PRS_ERROR:
+ case PRS_INVALID_ROOT:
+ case PRS_INVALID_ISOLATED:
if (!part_error)
- new_prs = PRS_ENABLED;
+ new_prs = -old_prs;
break;
}
- /*
- * Set part_error if previously in invalid state.
- */
- part_error = (prev_prs == PRS_ERROR);
- }
-
- if (!part_error && (new_prs == PRS_ERROR))
- return 0; /* Nothing need to be done */
-
- if (new_prs == PRS_ERROR) {
- /*
- * Remove all its cpus from parent's subparts_cpus.
- */