diff options
| -rw-r--r-- | Documentation/cgroups/cgroups.txt | 14 | ||||
| -rw-r--r-- | Documentation/cgroups/unified-hierarchy.txt | 35 | ||||
| -rw-r--r-- | block/blk-cgroup.c | 13 | ||||
| -rw-r--r-- | block/blk-throttle.c | 6 | ||||
| -rw-r--r-- | include/linux/cgroup.h | 165 | ||||
| -rw-r--r-- | kernel/cgroup.c | 453 | ||||
| -rw-r--r-- | kernel/cgroup_freezer.c | 2 | ||||
| -rw-r--r-- | kernel/cpuset.c | 500 | ||||
| -rw-r--r-- | kernel/sched/core.c | 2 | ||||
| -rw-r--r-- | kernel/sched/cpuacct.c | 2 | ||||
| -rw-r--r-- | mm/hugetlb_cgroup.c | 5 | ||||
| -rw-r--r-- | mm/memcontrol.c | 37 | ||||
| -rw-r--r-- | net/core/netclassid_cgroup.c | 2 | ||||
| -rw-r--r-- | net/core/netprio_cgroup.c | 2 | ||||
| -rw-r--r-- | net/ipv4/tcp_memcontrol.c | 2 | ||||
| -rw-r--r-- | security/device_cgroup.c | 2 |
16 files changed, 806 insertions, 436 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 821de56d1580..10c949b293e4 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -599,6 +599,20 @@ fork. If this method returns 0 (success) then this should remain valid while the caller holds cgroup_mutex and it is ensured that either attach() or cancel_attach() will be called in future. +void css_reset(struct cgroup_subsys_state *css) +(cgroup_mutex held by caller) + +An optional operation which should restore @css's configuration to the +initial state. This is currently only used on the unified hierarchy +when a subsystem is disabled on a cgroup through +"cgroup.subtree_control" but should remain enabled because other +subsystems depend on it. cgroup core makes such a css invisible by +removing the associated interface files and invokes this callback so +that the hidden subsystem can return to the initial neutral state. +This prevents unexpected resource control from a hidden css and +ensures that the configuration is in the initial state when it is made +visible again later. + void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) (cgroup_mutex held by caller) diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index 324b182e6000..4f4563277864 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt @@ -94,12 +94,35 @@ change soon. mount -t cgroup -o __DEVEL__sane_behavior cgroup $MOUNT_POINT -All controllers which are not bound to other hierarchies are -automatically bound to unified hierarchy and show up at the root of -it. Controllers which are enabled only in the root of unified -hierarchy can be bound to other hierarchies at any time. This allows -mixing unified hierarchy with the traditional multiple hierarchies in -a fully backward compatible way. +All controllers which support the unified hierarchy and are not bound +to other hierarchies are automatically bound to unified hierarchy and +show up at the root of it. Controllers which are enabled only in the +root of unified hierarchy can be bound to other hierarchies. This +allows mixing unified hierarchy with the traditional multiple +hierarchies in a fully backward compatible way. + +For development purposes, the following boot parameter makes all +controllers to appear on the unified hierarchy whether supported or +not. + + cgroup__DEVEL__legacy_files_on_dfl + +A controller can be moved across hierarchies only after the controller +is no longer referenced in its current hierarchy. Because per-cgroup +controller states are destroyed asynchronously and controllers may +have lingering references, a controller may not show up immediately on +the unified hierarchy after the final umount of the previous +hierarchy. Similarly, a controller should be fully disabled to be +moved out of the unified hierarchy and it may take some time for the +disabled controller to become available for other hierarchies; +furthermore, due to dependencies among controllers, other controllers +may need to be disabled too. + +While useful for development and manual configurations, dynamically +moving controllers between the unified and other hierarchies is +strongly discouraged for production use. It is recommended to decide +the hierarchies and controller associations before starting using the +controllers. 2-2. cgroup.subtree_control diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 28d227c5ca77..e17da947f6bd 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -928,7 +928,15 @@ struct cgroup_subsys blkio_cgrp_subsys = { .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, - .base_cftypes = blkcg_files, + .legacy_cftypes = blkcg_files, +#ifdef CONFIG_MEMCG + /* + * This ensures that, if available, memcg is automatically enabled + * together on the default hierarchy so that the owner cgroup can + * be retrieved from writeback pages. + */ + .depends_on = 1 << memory_cgrp_id, +#endif }; EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); @@ -1120,7 +1128,8 @@ int blkcg_policy_register(struct blkcg_policy *pol) /* everything is in place, add intf files for the new policy */ if (pol->cftypes) - WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes)); + WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys, + pol->cftypes)); ret = 0; out_unlock: mutex_unlock(&blkcg_pol_mutex); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 3fdb21a390c1..9273d0969ebd 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -412,13 +412,13 @@ static void throtl_pd_init(struct blkcg_gq *blkg) int rw; /* - * If sane_hierarchy is enabled, we switch to properly hierarchical + * If on the default hierarchy, we switch to properly hierarchical * behavior where limits on a given throtl_grp are applied to the * whole subtree rather than just the group itself. e.g. If 16M * read_bps limit is set on the root group, the whole system can't * exceed 16M for the device. * - * If sane_hierarchy is not enabled, the broken flat hierarchy + * If not on the default hierarchy, the broken flat hierarchy * behavior is retained where all throtl_grps are treated as if * they're all separate root groups right below throtl_data. * Limits of a group don't interact with limits of other groups @@ -426,7 +426,7 @@ static void throtl_pd_init(struct blkcg_gq *blkg) */ parent_sq = &td->service_queue; - if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent) + if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent) parent_sq = &blkg_to_tg(blkg->parent)->service_queue; throtl_service_queue_init(&tg->service_queue, parent_sq); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8a111dd42d7a..b5223c570eba 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -203,7 +203,15 @@ struct cgroup { struct kernfs_node *kn; /* cgroup kernfs entry */ struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ - /* the bitmask of subsystems enabled on the child cgroups */ + /* + * The bitmask of subsystems enabled on the child cgroups. + * ->subtree_control is the one configured through + * "cgroup.subtree_control" while ->child_subsys_mask is the + * effective one which may have more subsystems enabled. + * Controller knobs are made available iff it's enabled in + * ->subtree_control. + */ + unsigned int subtree_control; unsigned int child_subsys_mask; /* Private pointers for each registered subsystem */ @@ -248,73 +256,9 @@ struct cgroup { /* cgroup_root->flags */ enum { - /* - * Unfortunately, cgroup core and various controllers are riddled - * with idiosyncrasies and pointless options. The following flag, - * when set, will force sane behavior - some options are forced on, - * others are disallowed, and some controllers will change their - * hierarchical or other behaviors. - * - * The set of behaviors affected by this flag are still being - * determined and developed and the mount option for this flag is - * prefixed with __DEVEL__. The prefix will be dropped once we - * reach the point where all behaviors are compatible with the - * planned unified hierarchy, which will automatically turn on this - * flag. - * - * The followings are the behaviors currently affected this flag. - * - * - Mount options "noprefix", "xattr", "clone_children", - * "release_agent" and "name" are disallowed. - * - * - When mounting an existing superblock, mount options should - * match. - * - * - Remount is disallowed. - * - * - rename(2) is disallowed. - * - * - "tasks" is removed. Everything should be at process - * granularity. Use "cgroup.procs" instead. - * - * - "cgroup.procs" is not sorted. pids will be unique unless they - * got recycled inbetween reads. - * - * - "release_agent" and "notify_on_release" are removed. - * Replacement notification mechanism will be implemented. - * - * - "cgroup.clone_children" is removed. - * - * - "cgroup.subtree_populated" is available. Its value is 0 if - * the cgroup and its descendants contain no task; otherwise, 1. - * The file also generates kernfs notification which can be - * monitored through poll and [di]notify when the value of the - * file changes. - * - * - If mount is requested with sane_behavior but without any - * subsystem, the default unified hierarchy is mounted. - * - * - cpuset: tasks will be kept in empty cpusets when hotplug happens - * and take masks of ancestors with non-empty cpus/mems, instead of - * being moved to an ancestor. - * - * - cpuset: a task can be moved into an empty cpuset, and again it - * takes masks of ancestors. - * - * - memcg: use_hierarchy is on by default and the cgroup file for - * the flag is not created. - * - * - blkcg: blk-throttle becomes properly hierarchical. - * - * - debug: disallowed on the default hierarchy. - */ - CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), - + CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */ CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ - - /* mount options live below bit 16 */ - CGRP_ROOT_OPTION_MASK = (1 << 16) - 1, }; /* @@ -440,9 +384,11 @@ struct css_set { enum { CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ - CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */ CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ - CFTYPE_ONLY_ON_DFL = (1 << 4), /* only on default hierarchy */ + + /* internal flags, do not use outside cgroup core proper */ + __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ + __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ }; #define MAX_CFTYPE_NAME 64 @@ -526,20 +472,64 @@ struct cftype { extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; +/** + * cgroup_on_dfl - test whether a cgroup is on the default hierarchy + * @cgrp: the cgroup of interest + * + * The default hierarchy is the v2 interface of cgroup and this function + * can be used to test whether a cgroup is on the default hierarchy for + * cases where a subsystem should behave differnetly depending on the + * interface version. + * + * The set of behaviors which change on the default hierarchy are still + * being determined and the mount option is prefixed with __DEVEL__. + * + * List of changed behaviors: + * + * - Mount options "noprefix", "xattr", "clone_children", "release_agent" + * and "name" are disallowed. + * + * - When mounting an existing superblock, mount options should match. + * + * - Remount is disallowed. + * + * - rename(2) is disallowed. + * + * - "tasks" is removed. Everything should be at process granularity. Use + * "cgroup.procs" instead. + * + * - "cgroup.procs" is not sorted. pids will be unique unless they got + * recycled inbetween reads. + * + * - "release_agent" and "notify_on_release" are removed. Replacement + * notification mechanism will be implemented. + * + * - "cgroup.clone_children" is removed. + * + * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup + * and its descendants contain no task; otherwise, 1. The file also + * generates kernfs notification which can be monitored through poll and + * [di]notify when the value of the file changes. + * + * - cpuset: tasks will be kept in empty cpusets when hotplug happens and + * take masks of ancestors with non-empty cpus/mems, instead of being + * moved to an ancestor. + * + * - cpuset: a task can be moved into an empty cpuset, and again it takes + * masks of ancestors. + * + * - memcg: use_hierarchy is on by default and the cgroup file for the flag + * is not created. + * + * - blkcg: blk-throttle becomes properly hierarchical. + * + * - debug: disallowed on the default hierarchy. + */ static inline bool cgroup_on_dfl(const struct cgroup *cgrp) { return cgrp->root == &cgrp_dfl_root; } -/* - * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This - * function can be called as long as @cgrp is accessible. - */ -static inline bool cgroup_sane_behavior(const struct cgroup *cgrp) -{ - return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR; -} - /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_has_tasks(struct cgroup *cgrp) { @@ -602,7 +592,8 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); -int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); @@ -634,6 +625,7 @@ struct cgroup_subsys { int (*css_online)(struct cgroup_subsys_state *css); void (*css_offline)(struct cgroup_subsys_state *css); void (*css_free)(struct cgroup_subsys_state *css); + void (*css_reset)(struct cgroup_subsys_state *css); int (*can_attach)(struct cgroup_subsys_state *css, struct cgroup_taskset *tset); @@ -682,8 +674,21 @@ struct cgroup_subsys { */ struct list_head cfts; - /* base cftypes, automatically registered with subsys itself */ - struct cftype *base_cftypes; + /* + * Base cftypes which are automatically registered. The two can + * point to the same array. + */ + struct cftype *dfl_cftypes; /* for the default hierarchy */ + struct cftype *legacy_cftypes; /* for the legacy hierarchies */ + + /* + * A subsystem may depend on other subsystems. When such subsystem + * is enabled on a cgroup, the depended-upon subsystems are enabled + * together if available. Subsystems enabled due to dependency are + * not visible to userland until explicitly enabled. The following + * specifies the mask of subsystems that this one depends on. + */ + unsigned int depends_on; }; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index aad41f06901b..7dc8788cfd52 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -149,12 +149,14 @@ struct cgroup_root cgrp_dfl_root; */ static bool cgrp_dfl_root_visible; +/* + * Set by the boot param of the same name and makes subsystems with NULL + * ->dfl_files to use ->legacy_files on the default hierarchy. + */ +static bool cgroup_legacy_files_on_dfl; + /* some controllers are not supported in the default hierarchy */ -static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 -#ifdef CONFIG_CGROUP_DEBUG - | (1 << debug_cgrp_id) -#endif - ; +static unsigned int cgrp_dfl_root_inhibit_ss_mask; /* The list of hierarchy roots */ @@ -180,13 +182,15 @@ static u64 css_serial_nr_next = 1; */ static int need_forkexit_callback __read_mostly; -static struct cftype cgroup_base_files[]; +static struct cftype cgroup_dfl_base_files[]; +static struct cftype cgroup_legacy_base_files[]; static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, + bool visible); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], @@ -1037,6 +1041,58 @@ static void cgroup_put(struct cgroup *cgrp) } /** + * cgroup_refresh_child_subsys_mask - update child_subsys_mask + * @cgrp: the target cgroup + * + * On the default hierarchy, a subsystem may request other subsystems to be + * enabled together through its ->depends_on mask. In such cases, more + * subsystems than specified in "cgroup.subtree_control" may be enabled. + * + * This function determines which subsystems need to be enabled given the + * current @cgrp->subtree_control and records it in + * @cgrp->child_subsys_mask. The resulting mask is always a superset of + * @cgrp->subtree_control and follows the usual hierarchy rules. + */ +static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + unsigned int cur_ss_mask = cgrp->subtree_control; + struct cgroup_subsys *ss; + int ssid; + + lockdep_assert_held(&cgroup_mutex); + + if (!cgroup_on_dfl(cgrp)) { + cgrp->child_subsys_mask = cur_ss_mask; + return; + } + + while (true) { + unsigned int new_ss_mask = cur_ss_mask; + + for_each_subsys(ss, ssid) + if (cur_ss_mask & (1 << ssid)) + new_ss_mask |= ss->depends_on; + + /* + * Mask out subsystems which aren't available. This can + * happen only if some depended-upon subsystems were bound + * to non-default hierarchies. + */ + if (parent) + new_ss_mask &= parent->child_subsys_mask; + else + new_ss_mask &= cgrp->root->subsys_mask; + + if (new_ss_mask == cur_ss_mask) + break; + cur_ss_mask = new_ss_mask; + } + + cgrp->child_subsys_mask = cur_ss_mask; +} + +/** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @@ -1208,12 +1264,15 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) up_write(&css_set_rwsem); src_root->subsys_mask &= ~(1 << ssid); - src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + src_root->cgrp.subtree_control &= ~(1 << ssid); + cgroup_refresh_child_subsys_mask(&src_root->cgrp); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; - if (dst_root != &cgrp_dfl_root) - dst_root->cgrp.child_subsys_mask |= 1 << ssid; + if (dst_root != &cgrp_dfl_root) { + dst_root->cgrp.subtree_control |= 1 << ssid; + cgroup_refresh_child_subsys_mask(&dst_root->cgrp); + } if (ss->bind) ss->bind(css); @@ -1233,8 +1292,6 @@ static int cgroup_show_options(struct seq_file *seq, for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); - if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) - seq_puts(seq, ",sane_behavior"); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) @@ -1268,6 +1325,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) bool all_ss = false, one_ss = false; unsigned int mask = -1U; struct cgroup_subsys *ss; + int nr_opts = 0; int i; #ifdef CONFIG_CPUSETS @@ -1277,6 +1335,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) memset(opts, 0, sizeof(*opts)); while ((token = strsep(&o, ",")) != NULL) { + nr_opts++; + if (!*token) return -EINVAL; if (!strcmp(token, "none")) { @@ -1361,37 +1421,33 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - /* Consistency checks */ - if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - - if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || - opts->cpuset_clone_children || opts->release_agent || - opts->name) { - pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); + if (nr_opts != 1) { + pr_err("sane_behavior: no other mount options allowed\n"); return -EINVAL; } - } else { - /* - * If the 'all' option was specified select all the - * subsystems, otherwise if 'none', 'name=' and a subsystem - * name options were not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (!ss->disabled) - opts->subsys_mask |= (1 << i); - - /* - * We either have to specify by name or by subsystems. (So - * all empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; + return 0; } /* + * If the 'all' option was specified select all the subsystems, + * otherwise if 'none', 'name=' and a subsystem name options were + * not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (!ss->disabled) + opts->subsys_mask |= (1 << i); + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; + + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. @@ -1399,7 +1455,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) return -EINVAL; - /* Can't specify "none" and some subsystems */ if (opts->subsys_mask && opts->none) return -EINVAL; @@ -1414,8 +1469,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) struct cgroup_sb_opts opts; unsigned int added_mask, removed_mask; - if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("sane_behavior: remount is not allowed\n"); + if (root == &cgrp_dfl_root) { + pr_err("remount is not allowed\n"); return -EINVAL; } @@ -1434,11 +1489,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) removed_mask = root->subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ - if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || + if ((opts.flags ^ root->flags) || (opts.name && strcmp(opts.name, root->name))) { pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", - root->flags & CGRP_ROOT_OPTION_MASK, root->name); + opts.flags, opts.name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; } @@ -1563,6 +1617,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; + struct cftype *base_files; struct css_set *cset; int i, ret; @@ -1600,7 +1655,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) } root_cgrp->kn = root->kf_root->kn; - ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); + if (root == &cgrp_dfl_root) + base_files = cgroup_dfl_base_files; + else + base_files = cgroup_legacy_base_files; + + ret = cgroup_addrm_files(root_cgrp, base_files, true); if (ret) goto destroy_root; @@ -1672,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; /* look for a matching existing root */ - if (!opts.subsys_mask && !opts.none && !opts.name) { + if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { cgrp_dfl_root_visible = true; root = &cgrp_dfl_root; cgroup_get(&root->cgrp); @@ -1730,15 +1790,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } - if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { - if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("sane_behavior: new mount options should match the existing superblock\n"); - ret = -EINVAL; - goto out_unlock; - } else { - pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - } - } + if (root->flags ^ opts.flags) + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); /* * We want to reuse @root whose lifetime is governed by its @@ -2457,9 +2510,7 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v) static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { - struct cgroup *cgrp = seq_css(seq)->cgroup; - - seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); + seq_puts(seq, "0\n"); return 0; } @@ -2496,7 +2547,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); + cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control); return 0; } @@ -2505,7 +2556,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); + cgroup_print_ss_mask(seq, cgrp->subtree_control); return 0; } @@ -2611,6 +2662,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, loff_t off) { unsigned int enable = 0, disable = 0; + unsigned int css_enable, css_disable, old_ctrl, new_ctrl; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -2650,11 +2702,26 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { - if (cgrp->child_subsys_mask & (1 << ssid)) { + if (cgrp->subtree_control & (1 << ssid)) { enable &= ~(1 << ssid); continue; } + /* unavailable or not enabled on the parent? */ + if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || + (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) { + ret = -ENOENT; + goto out_unlock; + } + + /* + * @ss is already enabled through dependency and + * we'll just make it visible. Skip draining. + */ + if (cgrp->child_subsys_mask & (1 << ssid)) + continue; + /* * Because css offlining is asynchronous, userland * might try to re-enable the same controller while @@ -2677,23 +2744,15 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return restart_syscall(); } - - /* unavailable or not enabled on the parent? */ - if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || - (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { - ret = -ENOENT; - goto out_unlock; - } } else if (disable & (1 << ssid)) { - if (!(cgrp->child_subsys_mask & (1 << ssid))) { + if (!(cgrp->subtree_control & (1 << ssid))) { disable &= ~(1 << ssid); continue; } /* a child has it enabled? */ cgroup_for_each_live_child(child, cgrp) { - if (child->child_subsys_mask & (1 << ssid)) { + if (child->subtree_control & (1 << ssid)) { ret = -EBUSY; goto out_unlock; } @@ -2707,7 +2766,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } /* - * Except for the root, child_subsys_mask must be zero for a cgroup + * Except for the root, subtree_control must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { @@ -2716,36 +2775,75 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } /* - * Create csses for enables and update child_subsys_mask. This - * changes cgroup_e_css() results which in turn makes the - * subsequent cgroup_update_dfl_csses() associate all tasks in the - * subtree to the updated csses. + * Update subsys masks and calculate what needs to be done. More + * subsystems than specified may need to be enabled or disabled + * depending on subsystem dependencies. + */ + cgrp->subtree_control |= enable; + cgrp->subtree_control &= ~disable; + + old_ctrl = cgrp->child_subsys_mask; + cgroup_refresh_child_subsys_mask(cgrp); + new_ctrl = cgrp->child_subsys_mask; + + css_enable = ~old_ctrl & new_ctrl; + css_disable = old_ctrl & ~new_ctrl; + enable |= css_enable; + disable |= css_disable; + + /* + * Create new csses or make the existing ones visible. A css is + * created invisible if it's being implicitly enabled through + * dependency. An invisible css is made visible when the userland + * explicitly enables it. */ for_each_subsys(ss, ssid) { if (!(enable & (1 << ssid))) continue; cgroup_for_each_live_child(child, cgrp) { - ret = create_css(child, ss); + if (css_enable & (1 << ssid)) + ret = create_css(child, ss, + cgrp->subtree_control & (1 << ssid)); + else + ret = cgroup_populate_dir(child, 1 << ssid); if (ret) goto err_undo_css; } } - cgrp->child_subsys_mask |= enable; - cgrp->child_subsys_mask &= ~disable; - + /* + * At this point, cgroup_e_css() results reflect the new csses + * making the following cgroup_update_dfl_csses() properly update + * css associations of all tasks in the subtree. + */ ret = cgroup_update_dfl_csses(cgrp); if (ret) goto err_undo_css; - /* all tasks are now migrated away from the old csses, kill them */ + /* + * All tasks are migrated out of disabled csses. Kill or hide + * them. A css is hidden when the userland requests it to be + * disabled while other subsystems are still depending on it. The + * css must not actively control resources and be in the vanilla + * state if it's made visible again later. Controllers which may + * be depended upon should provide ->css_reset() for this purpose. + */ for_each_subsys(ss, ssid) { if (!(disable & (1 << ssid))) continue; - cgroup_for_each_live_child(child, cgrp) - kill_css(cgroup_css(child, ss)); + cgroup_for_each_live_child(child, cgrp) { + struct cgroup_subsys_state *css = cgroup_css(child, ss); + + if (css_disable & (1 << ssid)) { + kill_css(css); + } else { + cgroup_clear_dir(child, 1 << ssid); + if (ss->css_reset) + ss->css_reset(css); + } + } } kernfs_activate(cgrp->kn); @@ -2755,8 +2853,9 @@ out_unlock: return ret ?: nbytes; err_undo_css: - cgrp->child_subsys_mask &= ~enable; - cgrp->child_subsys_mask |= disable; + cgrp->subtree_control &= ~enable; + cgrp->subtree_control |= disable; + cgroup_refresh_child_subsys_mask(cgrp); for_each_subsys(ss, ssid) { if (!(enable & (1 << ssid))) @@ -2764,8 +2863,14 @@ err_undo_css: cgroup_for_each_live_child(child, cgrp) { struct cgroup_subsys_state *css = cgroup_css(child, ss); - if (css) + + if (!css) + continue; + + if (css_enable & (1 << ssid)) kill_css(css); + else + cgroup_clear_dir(child, 1 << ssid); } } goto out_unlock; @@ -2878,9 +2983,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, /* * Th |
