diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/cgroup/cgroup.c | 30 | ||||
| -rw-r--r-- | kernel/cgroup/cpuset.c | 1306 |
2 files changed, 1001 insertions, 335 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1fb7f562289d..c09531bace38 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -207,6 +207,8 @@ static u16 have_exit_callback __read_mostly; static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; +static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); + /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { .ns.count = REFCOUNT_INIT(2), @@ -1350,7 +1352,9 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_root_count--; } - cgroup_favor_dynmods(root, false); + if (!have_favordynmods) + cgroup_favor_dynmods(root, false); + cgroup_exit_root_id(root); cgroup_unlock(); @@ -1719,20 +1723,22 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (!css->ss) { if (cgroup_on_dfl(cgrp)) { - ret = cgroup_addrm_files(&cgrp->self, cgrp, + ret = cgroup_addrm_files(css, cgrp, cgroup_base_files, true); if (ret < 0) return ret; if (cgroup_psi_enabled()) { - ret = cgroup_addrm_files(&cgrp->self, cgrp, + ret = cgroup_addrm_files(css, cgrp, cgroup_psi_files, true); if (ret < 0) return ret; } } else { - cgroup_addrm_files(css, cgrp, - cgroup1_base_files, true); + ret = cgroup_addrm_files(css, cgrp, + cgroup1_base_files, true); + if (ret < 0) + return ret; } } else { list_for_each_entry(cfts, &css->ss->cfts, node) { @@ -2243,9 +2249,9 @@ static int cgroup_init_fs_context(struct fs_context *fc) fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; -#ifdef CONFIG_CGROUP_FAVOR_DYNMODS - ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; -#endif + if (have_favordynmods) + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; + return 0; } @@ -6121,7 +6127,7 @@ int __init cgroup_init(void) if (cgroup1_ssid_disabled(ssid)) pr_info("Disabling %s control group subsystem in v1 mounts\n", - ss->name); + ss->legacy_name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; @@ -6764,6 +6770,12 @@ static int __init enable_cgroup_debug(char *str) } __setup("cgroup_debug", enable_cgroup_debug); +static int __init cgroup_favordynmods_setup(char *str) +{ + return (kstrtobool(str, &have_favordynmods) == 0); +} +__setup("cgroup_favordynmods=", cgroup_favordynmods_setup); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 58ec88efa4f8..615daaf87f1f 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -75,16 +75,18 @@ enum prs_errcode { PERR_NOCPUS, PERR_HOTPLUG, PERR_CPUSEMPTY, + PERR_HKEEPING, }; static const char * const perr_strings[] = { - [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus", + [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", [PERR_INVPARENT] = "Parent is an invalid partition root", [PERR_NOTPART] = "Parent is not a partition root", [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", [PERR_HOTPLUG] = "No cpu available due to hotplug", [PERR_CPUSEMPTY] = "cpuset.cpus is empty", + [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", }; struct cpuset { @@ -121,14 +123,23 @@ struct cpuset { nodemask_t effective_mems; /* - * CPUs allocated to child sub-partitions (default hierarchy only) - * - CPUs granted by the parent = effective_cpus U subparts_cpus - * - effective_cpus and subparts_cpus are mutually exclusive. + * Exclusive CPUs dedicated to current cgroup (default hierarchy only) * - * effective_cpus contains only onlined CPUs, but subparts_cpus - * may have offlined ones. + * This exclusive CPUs must be a subset of cpus_allowed. A parent + * cgroup can only grant exclusive CPUs to one of its children. + * + * When the cgroup becomes a valid partition root, effective_xcpus + * defaults to cpus_allowed if not set. The effective_cpus of a valid + * partition root comes solely from its effective_xcpus and some of the + * effective_xcpus may be distributed to sub-partitions below & hence + * excluded from its effective_cpus. + */ + cpumask_var_t effective_xcpus; + + /* + * Exclusive CPUs as requested by the user (default hierarchy only) */ - cpumask_var_t subparts_cpus; + cpumask_var_t exclusive_cpus; /* * This is old Memory Nodes tasks took on. @@ -156,8 +167,8 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* number of CPUs in subparts_cpus */ - int nr_subparts_cpus; + /* number of valid sub-partitions */ + int nr_subparts; /* partition root state */ int partition_root_state; @@ -183,9 +194,20 @@ struct cpuset { /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; + + /* Remote partition silbling list anchored at remote_children */ + struct list_head remote_sibling; }; /* + * Exclusive CPUs distributed out to sub-partitions of top_cpuset + */ +static cpumask_var_t subpartitions_cpus; + +/* List of remote partition root children */ +static struct list_head remote_children; + +/* * Partition root states: * * 0 - member (not a partition root) @@ -312,7 +334,7 @@ static inline int is_partition_invalid(const struct cpuset *cs) */ static inline void make_partition_invalid(struct cpuset *cs) { - if (is_partition_valid(cs)) + if (cs->partition_root_state > 0) cs->partition_root_state = -cs->partition_root_state; } @@ -334,6 +356,7 @@ static struct cpuset top_cpuset = { .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), .partition_root_state = PRS_ROOT, + .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), }; /** @@ -469,7 +492,7 @@ static inline bool partition_is_populated(struct cpuset *cs, if (cs->css.cgroup->nr_populated_csets) return true; - if (!excluded_child && !cs->nr_subparts_cpus) + if (!excluded_child && !cs->nr_subparts) return cgroup_is_populated(cs->css.cgroup); rcu_read_lock(); @@ -596,16 +619,18 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) */ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { - cpumask_var_t *pmask1, *pmask2, *pmask3; + cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4; if (cs) { pmask1 = &cs->cpus_allowed; pmask2 = &cs->effective_cpus; - pmask3 = &cs->subparts_cpus; + pmask3 = &cs->effective_xcpus; + pmask4 = &cs->exclusive_cpus; } else { pmask1 = &tmp->new_cpus; pmask2 = &tmp->addmask; pmask3 = &tmp->delmask; + pmask4 = NULL; } if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) @@ -617,8 +642,14 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) goto free_two; + if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL)) + goto free_three; + + return 0; +free_three: + free_cpumask_var(*pmask3); free_two: free_cpumask_var(*pmask2); free_one: @@ -636,7 +667,8 @@ static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) if (cs) { free_cpumask_var(cs->cpus_allowed); free_cpumask_var(cs->effective_cpus); - free_cpumask_var(cs->subparts_cpus); + free_cpumask_var(cs->effective_xcpus); + free_cpumask_var(cs->exclusive_cpus); } if (tmp) { free_cpumask_var(tmp->new_cpus); @@ -664,6 +696,8 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); cpumask_copy(trial->effective_cpus, cs->effective_cpus); + cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); + cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); return trial; } @@ -677,6 +711,28 @@ static inline void free_cpuset(struct cpuset *cs) kfree(cs); } +static inline struct cpumask *fetch_xcpus(struct cpuset *cs) +{ + return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus : + cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed + : cs->effective_xcpus; +} + +/* + * cpusets_are_exclusive() - check if two cpusets are exclusive + * + * Return true if exclusive, false if not + */ +static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) +{ + struct cpumask *xcpus1 = fetch_xcpus(cs1); + struct cpumask *xcpus2 = fetch_xcpus(cs2); + + if (cpumask_intersects(xcpus1, xcpus2)) + return false; + return true; +} + /* * validate_change_legacy() - Validate conditions specific to legacy (v1) * behavior. @@ -776,9 +832,10 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) ret = -EINVAL; cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && - c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) - goto out; + c != cur) { + if (!cpusets_are_exclusive(trial, c)) + goto out; + } if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && nodes_intersects(trial->mems_allowed, c->mems_allowed)) @@ -908,7 +965,7 @@ static int generate_sched_domains(cpumask_var_t **domains, csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance && !top_cpuset.nr_subparts_cpus) { + if (root_load_balance && !top_cpuset.nr_subparts) { ndoms = 1; doms = alloc_sched_domains(ndoms); if (!doms) @@ -1159,7 +1216,7 @@ static void rebuild_sched_domains_locked(void) * should be the same as the active CPUs, so checking only top_cpuset * is enough to detect racing CPU offlines. */ - if (!top_cpuset.nr_subparts_cpus && + if (cpumask_empty(subpartitions_cpus) && !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) return; @@ -1168,7 +1225,7 @@ static void rebuild_sched_domains_locked(void) * root should be only a subset of the active CPUs. Since a CPU in any * partition root could be offlined, all must be checked. */ - if (top_cpuset.nr_subparts_cpus) { + if (top_cpuset.nr_subparts) { rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (!is_partition_valid(cs)) { @@ -1232,7 +1289,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) */ if (kthread_is_per_cpu(task)) continue; - cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus); + cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); } else { cpumask_and(new_cpus, possible_mask, cs->effective_cpus); } @@ -1247,32 +1304,22 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) * @cs: the cpuset the need to recompute the new effective_cpus mask * @parent: the parent cpuset * - * If the parent has subpartition CPUs, include them in the list of - * allowable CPUs in computing the new effective_cpus mask. Since offlined - * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask - * to mask those out. + * The result is valid only if the given cpuset isn't a partition root. */ static void compute_effective_cpumask(struct cpumask *new_cpus, struct cpuset *cs, struct cpuset *parent) { - if (parent->nr_subparts_cpus && is_partition_valid(cs)) { - cpumask_or(new_cpus, parent->effective_cpus, - parent->subparts_cpus); - cpumask_and(new_cpus, new_cpus, cs->cpus_allowed); - cpumask_and(new_cpus, new_cpus, cpu_active_mask); - } else { - cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); - } + cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); } /* - * Commands for update_parent_subparts_cpumask + * Commands for update_parent_effective_cpumask */ -enum subparts_cmd { - partcmd_enable, /* Enable partition root */ - partcmd_disable, /* Disable partition root */ - partcmd_update, /* Update parent's subparts_cpus */ - partcmd_invalidate, /* Make partition invalid */ +enum partition_cmd { + partcmd_enable, /* Enable partition root */ + partcmd_disable, /* Disable partition root */ + partcmd_update, /* Update parent's effective_cpus */ + partcmd_invalidate, /* Make partition invalid */ }; static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -1304,13 +1351,23 @@ static int update_partition_exclusive(struct cpuset *cs, int new_prs) * * Changing load balance flag will automatically call * rebuild_sched_domains_locked(). + * This function is for cgroup v2 only. */ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) { int new_prs = cs->partition_root_state; - bool new_lb = (new_prs != PRS_ISOLATED); bool rebuild_domains = (new_prs > 0) || (old_prs > 0); + bool new_lb; + /* + * If cs is not a valid partition root, the load balance state + * will follow its parent. + */ + if (new_prs > 0) { + new_lb = (new_prs != PRS_ISOLATED); + } else { + new_lb = is_sched_load_balance(parent_cs(cs)); + } if (new_lb != !!is_sched_load_balance(cs)) { rebuild_domains = true; if (new_lb) @@ -1323,8 +1380,296 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) rebuild_sched_domains_locked(); } +/* + * tasks_nocpu_error - Return true if tasks will have no effective_cpus + */ +static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs, + struct cpumask *xcpus) +{ + /* + * A populated partition (cs or parent) can't have empty effective_cpus + */ + return (cpumask_subset(parent->effective_cpus, xcpus) && + partition_is_populated(parent, cs)) || + (!cpumask_intersects(xcpus, cpu_active_mask) && + partition_is_populated(cs, NULL)); +} + +static void reset_partition_data(struct cpuset *cs) +{ + struct cpuset *parent = parent_cs(cs); + + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + return; + + lockdep_assert_held(&callback_lock); + + cs->nr_subparts = 0; + if (cpumask_empty(cs->exclusive_cpus)) { + cpumask_clear(cs->effective_xcpus); + if (is_cpu_exclusive(cs)) + clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); + } + if (!cpumask_and(cs->effective_cpus, + parent->effective_cpus, cs->cpus_allowed)) { + cs->use_parent_ecpus = true; + parent->child_ecpus_count++; + cpumask_copy(cs->effective_cpus, parent->effective_cpus); + } +} + +/* + * compute_effective_exclusive_cpumask - compute effective exclusive CPUs + * @cs: cpuset + * @xcpus: effective exclusive CPUs value to be set + * Return: true if xcpus is not empty, false otherwise. + * + * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set), + * it must be a subset of cpus_allowed and parent's effective_xcpus. + */ +static bool compute_effective_exclusive_cpumask(struct cpuset *cs, + struct cpumask *xcpus) +{ + struct cpuset *parent = parent_cs(cs); + + if (!xcpus) + xcpus = cs->effective_xcpus; + + if (!cpumask_empty(cs->exclusive_cpus)) + cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed); + else + cpumask_copy(xcpus, cs->cpus_allowed); + + return cpumask_and(xcpus, xcpus, parent->effective_xcpus); +} + +static inline bool is_remote_partition(struct cpuset *cs) +{ + return !list_empty(&cs->remote_sibling); +} + +static inline bool is_local_partition(struct cpuset *cs) +{ + return is_partition_valid(cs) && !is_remote_partition(cs); +} + +/* + * remote_partition_enable - Enable current cpuset as a remote partition root + * @cs: the cpuset to update + * @tmp: temparary masks + * Return: 1 if successful, 0 if error + * + * Enable the current cpuset to become a remote partition root taking CPUs + * directly from the top cpuset. cpuset_mutex must be held by the caller. + */ +static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp) +{ + /* + * The user must have sysadmin privilege. + */ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + /* + * The requested exclusive_cpus must not be allocated to other + * partitions and it can't use up all the root's effective_cpus. + * + * Note that if there is any local partition root above it or + * remote partition root underneath it, its exclusive_cpus must + * have overlapped with subpartitions_cpus. + */ + compute_effective_exclusive_cpumask(cs, tmp->new_cpus); + if (cpumask_empty(tmp->new_cpus) || + cpumask_intersects(tmp->new_cpus, subpartitions_cpus) || + cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) + return 0; + + spin_lock_irq(&callback_lock); + cpumask_andnot(top_cpuset.effective_cpus, + top_cpuset.effective_cpus, tmp->new_cpus); + cpumask_or(subpartitions_cpus, + subpartitions_cpus, tmp->new_cpus); + + if (cs->use_parent_ecpus) { + struct cpuset *parent = parent_cs(cs); + + cs->use_parent_ecpus = false; + parent->child_ecpus_count--; + } + list_add(&cs->remote_sibling, &remote_children); + spin_unlock_irq(&callback_lock); + + /* + * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + */ + update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); + + return 1; +} + +/* + * remote_partition_disable - Remove current cpuset from remote partition list + * @cs: the cpuset to update + * @tmp: temparary masks + * + * The effective_cpus is also updated. + * + * cpuset_mutex must be held by the caller. + */ +static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) +{ + compute_effective_exclusive_cpumask(cs, tmp->new_cpus); + WARN_ON_ONCE(!is_remote_partition(cs)); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); + + spin_lock_irq(&callback_lock); + cpumask_andnot(subpartitions_cpus, + subpartitions_cpus, tmp->new_cpus); + cpumask_and(tmp->new_cpus, + tmp->new_cpus, cpu_active_mask); + cpumask_or(top_cpuset.effective_cpus, + top_cpuset.effective_cpus, tmp->new_cpus); + list_del_init(&cs->remote_sibling); + cs->partition_root_state = -cs->partition_root_state; + if (!cs->prs_err) + cs->prs_err = PERR_INVCPUS; + reset_partition_data(cs); + spin_unlock_irq(&callback_lock); + + /* + * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + */ + update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); +} + +/* + * remote_cpus_update - cpus_exclusive change of remote partition + * @cs: the cpuset to be updated + * @newmask: the new effective_xcpus mask + * @tmp: temparary masks + * + * top_cpuset and subpartitions_cpus will be updated or partition can be + * invalidated. + */ +static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, + struct tmpmasks *tmp) +{ + bool adding, deleting; + + if (WARN_ON_ONCE(!is_remote_partition(cs))) + return; + + WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); + + if (cpumask_empty(newmask)) + goto invalidate; + + adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus); + deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask); + + /* + * Additions of remote CPUs is only allowed if those CPUs are + * not allocated to other partitions and there are effective_cpus + * left in the top cpuset. + */ + if (adding && (!capable(CAP_SYS_ADMIN) || + cpumask_intersects(tmp->addmask, subpartitions_cpus) || + cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))) + goto invalidate; + + spin_lock_irq(&callback_lock); + if (adding) { + cpumask_or(subpartitions_cpus, + subpartitions_cpus, tmp->addmask); + cpumask_andnot(top_cpuset.effective_cpus, + top_cpuset.effective_cpus, tmp->addmask); + } + if (deleting) { + cpumask_andnot(subpartitions_cpus, + subpartitions_cpus, tmp->delmask); + cpumask_and(tmp->delmask, + tmp->delmask, cpu_active_mask); + cpumask_or(top_cpuset.effective_cpus, + top_cpuset.effective_cpus, tmp->delmask); + } + spin_unlock_irq(&callback_lock); + + /* + * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + */ + update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); + return; + +invalidate: + remote_partition_disable(cs, tmp); +} + +/* + * remote_partition_check - check if a child remote partition needs update + * @cs: the cpuset to be updated + * @newmask: the new effective_xcpus mask + * @delmask: temporary mask for deletion (not in tmp) + * @tmp: temparary masks + * + * This should be called before the given cs has updated its cpus_allowed + * and/or effective_xcpus. + */ +static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, + struct cpumask *delmask, struct tmpmasks *tmp) +{ + struct cpuset *child, *next; + int disable_cnt = 0; + + /* + * Compute the effective exclusive CPUs that will be deleted. + */ + if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) || + !cpumask_intersects(delmask, subpartitions_cpus)) + return; /* No deletion of exclusive CPUs in partitions */ + + /* + * Searching the remote children list to look for those that will + * be impacted by the deletion of exclusive CPUs. + * + * Since a cpuset must be removed from the remote children list + * before it can go offline and holding cpuset_mutex will prevent + * any change in cpuset status. RCU read lock isn't needed. + */ + lockdep_assert_held(&cpuset_mutex); + list_for_each_entry_safe(child, next, &remote_children, remote_sibling) + if (cpumask_intersects(child->effective_cpus, delmask)) { + remote_partition_disable(child, tmp); + disable_cnt++; + } + if (disable_cnt) + rebuild_sched_domains_locked(); +} + +/* + * prstate_housekeeping_conflict - check for partition & housekeeping conflicts + * @prstate: partition root state to be checked + * @new_cpus: cpu mask + * Return: true if there is conflict, false otherwise + * + * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in + * an isolated partition. + */ +static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) +{ + const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN); + bool all_in_hk = cpumask_subset(new_cpus, hk_domain); + + if (!all_in_hk && (prstate != PRS_ISOLATED)) + return true; + + return false; +} + /** - * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset + * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state * @cmd: Partition root state change command * @newmask: Optional new cpumask for partcmd_update @@ -1332,21 +1677,20 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) * Return: 0 or a partition root state error code * * For partcmd_enable, the cpuset is being transformed from a non-partition - * root to a partition root. The cpus_allowed mask of the given cpuset will - * be put into parent's subparts_cpus and taken away from parent's + * root to a partition root. The effective_xcpus (cpus_allowed if effective_xcpus + * not set) mask of the given cpuset will be taken away from parent's * effective_cpus. The function will return 0 if all the CPUs listed in - * cpus_allowed can be granted or an error code will be returned. + * effective_xcpus can be granted or an error code will be returned. * * For partcmd_disable, the cpuset is being transformed from a partition - * root back to a non-partition root. Any CPUs in cpus_allowed that are in - * parent's subparts_cpus will be taken away from that cpumask and put back - * into parent's effective_cpus. 0 will always be returned. + * root back to a non-partition root. Any CPUs in effective_xcpus will be + * given back to parent's effective_cpus. 0 will always be returned. * * For partcmd_update, if the optional newmask is specified, the cpu list is - * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is + * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is * assumed to remain the same. The cpuset should either be a valid or invalid * partition root. The partition root state may change from valid to invalid - * or vice versa. An error code will only be returned if transitioning from + * or vice versa. An error code will be returned if transitioning from * invalid to valid violates the exclusivity rule. * * For partcmd_invalidate, the current partition will be made invalid. @@ -1361,19 +1705,48 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) * check for error and so partition_root_state and prs_error will be updated * directly. */ -static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, - struct cpumask *newmask, - struct tmpmasks *tmp) +static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, + struct cpumask *newmask, + struct tmpmasks *tmp) { struct cpuset *parent = parent_cs(cs); - int adding; /* Moving cpus from effective_cpus to subparts_cpus */ - int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ + int adding; /* Adding cpus to parent's effective_cpus */ + int deleting; /* Deleting cpus from parent's effective_cpus */ int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? */ + int subparts_delta = 0; + struct cpumask *xcpus; /* cs effective_xcpus */ + bool nocpu; lockdep_assert_held(&cpuset_mutex); /* + * new_prs will only be changed for the partcmd_update and + * partcmd_invalidate commands. + */ + adding = deleting = false; + old_prs = new_prs = cs->partition_root_state; + xcpus = !cpumask_empty(cs->exclusive_cpus) + ? cs->effective_xcpus : cs->cpus_allowed; + + if (cmd == partcmd_invalidate) { + if (is_prs_invalid(old_prs)) + return 0; + + /* + * Make the current partition invalid. + */ + if (is_partition_valid(parent)) + adding = cpumask_and(tmp->addmask, + xcpus, parent->effective_xcpus); + if (old_prs > 0) { + new_prs = -old_prs; + subparts_delta--; + } + goto write_error; + } + + /* * The parent must be a partition root. * The new cpumask, if present, or the current cpus_allowed must * not be empty. @@ -1385,124 +1758,138 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, if (!newmask && cpumask_empty(cs->cpus_allowed)) return PERR_CPUSEMPTY; - /* - * new_prs will only be changed for the partcmd_update and - * partcmd_invalidate commands. - */ - adding = deleting = false; - old_prs = new_prs = cs->partition_root_state; + nocpu = tasks_nocpu_error(parent, cs, xcpus); + if (cmd == partcmd_enable) { /* - * Enabling partition root is not allowed if cpus_allowed - * doesn't overlap parent's cpus_allowed. + * Enabling partition root is not allowed if its + * effective_xcpus is empty or doesn't overlap with + * parent's effective_xcpus. */ - if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed)) + if (cpumask_empty(xcpus) || + !cpumask_intersects(xcpus, parent->effective_xcpus)) return PERR_INVCPUS; + if (prstate_housekeeping_conflict(new_prs, xcpus)) + return PERR_HKEEPING; + /* * A parent can be left with no CPU as long as there is no * task directly associated with the parent partition. */ - if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) && - partition_is_populated(parent, cs)) + if (nocpu) return PERR_NOCPUS; - cpumask_copy(tmp->addmask, cs->cpus_allowed); - adding = true; + cpumask_copy(tmp->delmask, xcpus); + deleting = true; + subparts_delta++; } else if (cmd == partcmd_disable) { /* - * Need to remove cpus from parent's subparts_cpus for valid - * partition root. + * May need to add cpus to parent's effective_cpus for + * valid partition root. */ - deleting = !is_prs_invalid(old_prs) && - cpumask_and(tmp->delmask, cs->cpus_allowed, - parent->subparts_cpus); - } else if (cmd == partcmd_invalidate) { - if (is_prs_invalid(old_prs)) - return 0; - + adding = !is_prs_invalid(old_prs) && + cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); + if (adding) + subparts_delta--; + } else if (newmask) { /* - * Make the current partition invalid. It is assumed that - * invalidation is caused by violating cpu exclusivity rule. + * Empty cpumask is not allowed */ - deleting = cpumask_and(tmp->delmask, cs->cpus_allowed, - parent->subparts_cpus); - if (old_prs > 0) { - new_prs = -old_prs; - part_error = PERR_NOTEXCL; + if (cpumask_empty(newmask)) { + part_error = PERR_CPUSEMPTY; + goto write_error; } - } else if (newmask) { + /* * partcmd_update with newmask: * - * Compute add/delete mask to/from subparts_cpus + * Compute add/delete mask to/from effective_cpus + * + * For valid partition: + * addmask = exclusive_cpus & ~newmask + * & parent->effective_xcpus + * delmask = newmask & ~exclusive_cpus + * & parent->effective_xcpus * - * delmask = cpus_allowed & ~newmask & parent->subparts_cpus - * addmask = newmask & parent->cpus_allowed - * & ~parent->subparts_cpus + * For invalid partition: + * delmask = newmask & parent->effective_xcpus */ - cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask); - deleting = cpumask_and(tmp->delmask, tmp->delmask, - parent->subparts_cpus); + if (is_prs_invalid(old_prs)) { + adding = false; + deleting = cpumask_and(tmp->delmask, + newmask, parent->effective_xcpus); + } else { + cpumask_andnot(tmp->addmask, xcpus, newmask); + adding = cpumask_and(tmp->addmask, tmp->addmask, + parent->effective_xcpus); - cpumask_and(tmp->addmask, newmask, parent->cpus_allowed); - adding = cpumask_andnot(tmp->addmask, tmp->addmask, - parent->subparts_cpus); - /* - * Empty cpumask is not allowed - */ - if (cpumask_empty(newmask)) { - part_error = PERR_CPUSEMPTY; + cpumask_andnot(tmp->delmask, newmask, xcpus); + deleting = cpumask_and(tmp->delmask, tmp->delmask, + parent->effective_xcpus); + } /* * Make partition invalid if parent's effective_cpus could * become empty and there are tasks in the parent. */ - } else if (adding && - cpumask_subset(parent->effective_cpus, tmp->addmask) && - !cpumask_intersects(tmp->delmask, cpu_active_mask) && - partition_is_populated(parent, cs)) { + if (nocpu && (!adding || + !cpumask_intersects(tmp->addmask, cpu_active_mask))) { part_error = PERR_NOCPUS; - adding = false; - deleting = cpumask_and(tmp->delmask, cs->cpus_allowed, - parent->subparts_cpus); + deleting = false; + adding = cpumask_and(tmp->addmask, + xcpus, parent->effective_xcpus); } } else { /* - * partcmd_update w/o newmask: + * partcmd_update w/o newmask + * + * delmask = effective_xcpus & parent->effective_cpus + * + * This can be called from: + * 1) update_cpumasks_hier() + * 2) cpuset_hotplug_update_tasks() * - * delmask = cpus_allowed & parent->subparts_cpus - * addmask = cpus_allowed & parent->cpus_allowed - * & ~parent->subparts_cpus + * Check to see if it can be transitioned from valid to + * invalid partition or vice versa. * - * This gets invoked either due to a hotplug event or from - * update_cpumasks_hier(). This can cause the state of a - * partition root to transition from valid to invalid or vice - * versa. So we still need to compute the addmask and delmask. - - * A partition error happens when: - * 1) Cpuset is valid partition, but parent does not distribute - * out any CPUs. - * 2) Parent has tasks and all its effective CPUs will have - * to be distributed out. + * A partition error happens when parent has tasks and all + * its effective CPUs will have to be distributed out. */ - cpumask_and(tmp->addmask, cs->cpus_allowed, - parent->cpus_allowed); - adding = cpumask_andnot(tmp->addmask, tmp->addmask, - parent->subparts_cpus); - - if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) || - (adding && - cpumask_subset(parent->effective_cpus, tmp->addmask) && - partition_is_populated(parent, cs))) { + WARN_ON_ONCE(!is_partition_valid(parent)); + if (nocpu) { part_error = PERR_NOCPUS; - adding = false; - } + if (is_partition_valid(cs)) + adding = cpumask_and(tmp->addmask, + xcpus, parent->effective_xcpus); + } else if (is_partition_invalid(cs) && + cpumask_subset(xcpus, parent->effective_xcpus)) { + struct cgroup_subsys_state *css; + struct cpuset *child; + bool exclusive = true; - if (part_error && is_partition_valid(cs) && - parent->nr_subparts_cpus) - deleting = cpumask_and(tmp->delmask, cs->cpus_allowed, - parent->subparts_cpus); + /* + * Convert invalid partition to valid has to + * pass the cpu exclusivity test. + */ + rcu_read_lock(); + cpuset_for_each_child(child, css, parent) { + if (child == cs) + continue; + if (!cpusets_are_exclusive(cs, child)) { + exclusive = false; + break; + } + } + rcu_read_unlock(); + if (exclusive) + deleting = cpumask_and(tmp->delmask, + xcpus, parent->effective_cpus); + else + part_error = PERR_NOTEXCL; + } } + +write_error: if (part_error) WRITE_ONCE(cs->prs_err, part_error); @@ -1514,13 +1901,17 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, switch (cs->partition_root_state) { case PRS_ROOT: case PRS_ISOLATED: - if (part_error) + if (part_error) { new_prs = -old_prs; + subparts_delta--; + } break; case PRS_INVALID_ROOT: case PRS_INVALID_ISOLATED: - if (!part_error) + if (!part_error) { new_prs = -old_prs; + subparts_delta++; + } break; } } @@ -1530,9 +1921,11 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, /* * Transitioning between invalid to valid or vice versa may require - * changing CS_CPU_EXCLUSIVE. + * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update, + * validate_change() has already been successfully called and + * CPU lists in cs haven't been updated yet. So defer it to later. */ - if (old_prs != new_prs) { + if ((old_prs != new_prs) && (cmd != partcmd_update)) { int err = update_partition_exclusive(cs, new_prs); if (err) @@ -1540,39 +1933,52 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, } /* - |
