diff options
author | Tejun Heo <tj@kernel.org> | 2016-12-27 14:49:05 -0500 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2016-12-27 14:49:05 -0500 |
commit | 201af4c0fab02876ef0311e7f7b4083aa138930c (patch) | |
tree | d79bfd5f25c5fd111a79993bc10fc129281db225 /kernel/cgroup/cgroup.c | |
parent | 5f617ebbdf10abd49312a89e3b894b927c7367f5 (diff) | |
download | linux-201af4c0fab02876ef0311e7f7b4083aa138930c.tar.gz linux-201af4c0fab02876ef0311e7f7b4083aa138930c.tar.bz2 linux-201af4c0fab02876ef0311e7f7b4083aa138930c.zip |
cgroup: move cgroup files under kernel/cgroup/
They're growing to be too many and planned to get split further. Move
them under their own directory.
kernel/cgroup.c -> kernel/cgroup/cgroup.c
kernel/cgroup_freezer.c -> kernel/cgroup/freezer.c
kernel/cgroup_pids.c -> kernel/cgroup/pids.c
kernel/cpuset.c -> kernel/cgroup/cpuset.c
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Acked-by: Zefan Li <lizefan@huawei.com>
Diffstat (limited to 'kernel/cgroup/cgroup.c')
-rw-r--r-- | kernel/cgroup/cgroup.c | 6705 |
1 files changed, 6705 insertions, 0 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c new file mode 100644 index 000000000000..1a815f275849 --- /dev/null +++ b/kernel/cgroup/cgroup.c @@ -0,0 +1,6705 @@ +/* + * Generic process-grouping system. + * + * Based originally on the cpuset system, extracted by Paul Menage + * Copyright (C) 2006 Google, Inc + * + * Notifications support + * Copyright (C) 2009 Nokia Corporation + * Author: Kirill A. Shutemov + * + * Copyright notices from the original cpuset code: + * -------------------------------------------------- + * Copyright (C) 2003 BULL SA. + * Copyright (C) 2004-2006 Silicon Graphics, Inc. + * + * Portions derived from Patrick Mochel's sysfs code. + * sysfs is Copyright (c) 2001-3 Patrick Mochel + * + * 2003-10-10 Written by Simon Derr. + * 2003-10-22 Updates by Stephen Hemminger. + * 2004 May-July Rework by Paul Jackson. + * --------------------------------------------------- + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cgroup.h> +#include <linux/cred.h> +#include <linux/ctype.h> +#include <linux/errno.h> +#include <linux/init_task.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/magic.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/mount.h> +#include <linux/pagemap.h> +#include <linux/proc_fs.h> +#include <linux/rcupdate.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/percpu-rwsem.h> +#include <linux/string.h> +#include <linux/sort.h> +#include <linux/kmod.h> +#include <linux/delayacct.h> +#include <linux/cgroupstats.h> +#include <linux/hashtable.h> +#include <linux/pid_namespace.h> +#include <linux/idr.h> +#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ +#include <linux/kthread.h> +#include <linux/delay.h> +#include <linux/atomic.h> +#include <linux/cpuset.h> +#include <linux/proc_ns.h> +#include <linux/nsproxy.h> +#include <linux/file.h> +#include <net/sock.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/cgroup.h> + +/* + * pidlists linger the following amount before being destroyed. The goal + * is avoiding frequent destruction in the middle of consecutive read calls + * Expiring in the middle is a performance problem not a correctness one. + * 1 sec should be enough. + */ +#define CGROUP_PIDLIST_DESTROY_DELAY HZ + +#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ + MAX_CFTYPE_NAME + 2) + +/* + * cgroup_mutex is the master lock. Any modification to cgroup or its + * hierarchy must be performed while holding it. + * + * css_set_lock protects task->cgroups pointer, the list of css_set + * objects, and the chain of tasks off each css_set. + * + * These locks are exported if CONFIG_PROVE_RCU so that accessors in + * cgroup.h can use them for lockdep annotations. + */ +#ifdef CONFIG_PROVE_RCU +DEFINE_MUTEX(cgroup_mutex); +DEFINE_SPINLOCK(css_set_lock); +EXPORT_SYMBOL_GPL(cgroup_mutex); +EXPORT_SYMBOL_GPL(css_set_lock); +#else +static DEFINE_MUTEX(cgroup_mutex); +static DEFINE_SPINLOCK(css_set_lock); +#endif + +/* + * Protects cgroup_idr and css_idr so that IDs can be released without + * grabbing cgroup_mutex. + */ +static DEFINE_SPINLOCK(cgroup_idr_lock); + +/* + * Protects cgroup_file->kn for !self csses. It synchronizes notifications + * against file removal/re-creation across css hiding. + */ +static DEFINE_SPINLOCK(cgroup_file_kn_lock); + +/* + * Protects cgroup_subsys->release_agent_path. Modifying it also requires + * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. + */ +static DEFINE_SPINLOCK(release_agent_path_lock); + +struct percpu_rw_semaphore cgroup_threadgroup_rwsem; + +#define cgroup_assert_mutex_or_rcu_locked() \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ + !lockdep_is_held(&cgroup_mutex), \ + "cgroup_mutex or RCU read lock required"); + +/* + * cgroup destruction makes heavy use of work items and there can be a lot + * of concurrent destructions. Use a separate workqueue so that cgroup + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. + */ +static struct workqueue_struct *cgroup_destroy_wq; + +/* + * pidlist destructions need to be flushed on cgroup destruction. Use a + * separate workqueue as flush domain. + */ +static struct workqueue_struct *cgroup_pidlist_destroy_wq; + +/* generate an array of cgroup subsystem pointers */ +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, +static struct cgroup_subsys *cgroup_subsys[] = { +#include <linux/cgroup_subsys.h> +}; +#undef SUBSYS + +/* array of cgroup subsystem names */ +#define SUBSYS(_x) [_x ## _cgrp_id] = #_x, +static const char *cgroup_subsys_name[] = { +#include <linux/cgroup_subsys.h> +}; +#undef SUBSYS + +/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ +#define SUBSYS(_x) \ + DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ + DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ + EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ + EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); +#include <linux/cgroup_subsys.h> +#undef SUBSYS + +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, +static struct static_key_true *cgroup_subsys_enabled_key[] = { +#include <linux/cgroup_subsys.h> +}; +#undef SUBSYS + +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, +static struct static_key_true *cgroup_subsys_on_dfl_key[] = { +#include <linux/cgroup_subsys.h> +}; +#undef SUBSYS + +/* + * The default hierarchy, reserved for the subsystems that are otherwise + * unattached - it never has more than a single cgroup, and all tasks are + * part of that cgroup. + */ +struct cgroup_root cgrp_dfl_root; +EXPORT_SYMBOL_GPL(cgrp_dfl_root); + +/* + * The default hierarchy always exists but is hidden until mounted for the + * first time. This is for backward compatibility. + */ +static bool cgrp_dfl_visible; + +/* Controllers blocked by the commandline in v1 */ +static u16 cgroup_no_v1_mask; + +/* some controllers are not supported in the default hierarchy */ +static u16 cgrp_dfl_inhibit_ss_mask; + +/* some controllers are implicitly enabled on the default hierarchy */ +static unsigned long cgrp_dfl_implicit_ss_mask; + +/* The list of hierarchy roots */ + +static LIST_HEAD(cgroup_roots); +static int cgroup_root_count; + +/* hierarchy ID allocation and mapping, protected by cgroup_mutex */ +static DEFINE_IDR(cgroup_hierarchy_idr); + +/* + * Assign a monotonically increasing serial number to csses. It guarantees + * cgroups with bigger numbers are newer than those with smaller numbers. + * Also, as csses are always appended to the parent's ->children list, it + * guarantees that sibling csses are always sorted in the ascending serial + * number order on the list. Protected by cgroup_mutex. + */ +static u64 css_serial_nr_next = 1; + +/* + * These bitmask flags indicate whether tasks in the fork and exit paths have + * fork/exit handlers to call. This avoids us having to do extra work in the + * fork/exit path to check which subsystems have fork/exit callbacks. + */ +static u16 have_fork_callback __read_mostly; +static u16 have_exit_callback __read_mostly; +static u16 have_free_callback __read_mostly; + +/* cgroup namespace for init task */ +struct cgroup_namespace init_cgroup_ns = { + .count = { .counter = 2, }, + .user_ns = &init_user_ns, + .ns.ops = &cgroupns_operations, + .ns.inum = PROC_CGROUP_INIT_INO, + .root_cset = &init_css_set, +}; + +/* Ditto for the can_fork callback. */ +static u16 have_canfork_callback __read_mostly; + +static struct file_system_type cgroup2_fs_type; +static struct cftype cgroup_dfl_base_files[]; +static struct cftype cgroup_legacy_base_files[]; + +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); +static int cgroup_apply_control(struct cgroup *cgrp); +static void cgroup_finalize_control(struct cgroup *cgrp, int ret); +static void css_task_iter_advance(struct css_task_iter *it); +static int cgroup_destroy_locked(struct cgroup *cgrp); +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, + struct cgroup_subsys *ss); +static void css_release(struct percpu_ref *ref); +static void kill_css(struct cgroup_subsys_state *css); +static int cgroup_addrm_files(struct cgroup_subsys_state *css, + struct cgroup *cgrp, struct cftype cfts[], + bool is_add); + +/** + * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID + * @ssid: subsys ID of interest + * + * cgroup_subsys_enabled() can only be used with literal subsys names which + * is fine for individual subsystems but unsuitable for cgroup core. This + * is slower static_key_enabled() based test indexed by @ssid. + */ +static bool cgroup_ssid_enabled(int ssid) +{ + if (CGROUP_SUBSYS_COUNT == 0) + return false; + + return static_key_enabled(cgroup_subsys_enabled_key[ssid]); +} + +static bool cgroup_ssid_no_v1(int ssid) +{ + return cgroup_no_v1_mask & (1 << ssid); +} + +/** + * cgroup_on_dfl - test whether a cgroup is on the default hierarchy + * @cgrp: the cgroup of interest + * + * The default hierarchy is the v2 interface of cgroup and this function + * can be used to test whether a cgroup is on the default hierarchy for + * cases where a subsystem should behave differnetly depending on the + * interface version. + * + * The set of behaviors which change on the default hierarchy are still + * being determined and the mount option is prefixed with __DEVEL__. + * + * List of changed behaviors: + * + * - Mount options "noprefix", "xattr", "clone_children", "release_agent" + * and "name" are disallowed. + * + * - When mounting an existing superblock, mount options should match. + * + * - Remount is disallowed. + * + * - rename(2) is disallowed. + * + * - "tasks" is removed. Everything should be at process granularity. Use + * "cgroup.procs" instead. + * + * - "cgroup.procs" is not sorted. pids will be unique unless they got + * recycled inbetween reads. + * + * - "release_agent" and "notify_on_release" are removed. Replacement + * notification mechanism will be implemented. + * + * - "cgroup.clone_children" is removed. + * + * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup + * and its descendants contain no task; otherwise, 1. The file also + * generates kernfs notification which can be monitored through poll and + * [di]notify when the value of the file changes. + * + * - cpuset: tasks will be kept in empty cpusets when hotplug happens and + * take masks of ancestors with non-empty cpus/mems, instead of being + * moved to an ancestor. + * + * - cpuset: a task can be moved into an empty cpuset, and again it takes + * masks of ancestors. + * + * - memcg: use_hierarchy is on by default and the cgroup file for the flag + * is not created. + * + * - blkcg: blk-throttle becomes properly hierarchical. + * + * - debug: disallowed on the default hierarchy. + */ +static bool cgroup_on_dfl(const struct cgroup *cgrp) +{ + return cgrp->root == &cgrp_dfl_root; +} + +/* IDR wrappers which synchronize using cgroup_idr_lock */ +static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, + gfp_t gfp_mask) +{ + int ret; + + idr_preload(gfp_mask); + spin_lock_bh(&cgroup_idr_lock); + ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); + spin_unlock_bh(&cgroup_idr_lock); + idr_preload_end(); + return ret; +} + +static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) +{ + void *ret; + + spin_lock_bh(&cgroup_idr_lock); + ret = idr_replace(idr, ptr, id); + spin_unlock_bh(&cgroup_idr_lock); + return ret; +} + +static void cgroup_idr_remove(struct idr *idr, int id) +{ + spin_lock_bh(&cgroup_idr_lock); + idr_remove(idr, id); + spin_unlock_bh(&cgroup_idr_lock); +} + +static struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *parent_css = cgrp->self.parent; + + if (parent_css) + return container_of(parent_css, struct cgroup, self); + return NULL; +} + +/* subsystems visibly enabled on a cgroup */ +static u16 cgroup_control(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + u16 root_ss_mask = cgrp->root->subsys_mask; + + if (parent) + return parent->subtree_control; + + if (cgroup_on_dfl(cgrp)) + root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | + cgrp_dfl_implicit_ss_mask); + return root_ss_mask; +} + +/* subsystems enabled on a cgroup */ +static u16 cgroup_ss_mask(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + + if (parent) + return parent->subtree_ss_mask; + + return cgrp->root->subsys_mask; +} + +/** + * cgroup_css - obtain a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns @cgrp->self) + * + * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This + * function must be called either under cgroup_mutex or rcu_read_lock() and + * the caller is responsible for pinning the returned css if it wants to + * keep accessing it outside the said locks. This function may return + * %NULL if @cgrp doesn't have @subsys_id enabled. + */ +static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + if (ss) + return rcu_dereference_check(cgrp->subsys[ss->id], + lockdep_is_held(&cgroup_mutex)); + else + return &cgrp->self; +} + +/** + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns @cgrp->self) + * + * Similar to cgroup_css() but returns the effective css, which is defined + * as the matching css of the nearest ancestor including self which has @ss + * enabled. If @ss is associated with the hierarchy @cgrp is on, this + * function is guaranteed to return non-NULL css. + */ +static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + lockdep_assert_held(&cgroup_mutex); + + if (!ss) + return &cgrp->self; + + /* + * This function is used while updating css associations and thus + * can't test the csses directly. Test ss_mask. + */ + while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { + cgrp = cgroup_parent(cgrp); + if (!cgrp) + return NULL; + } + + return cgroup_css(cgrp, ss); +} + +/** + * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get the effective css of @cgrp for @ss. The effective css is + * defined as the matching css of the nearest ancestor including self which + * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, + * the root css is returned, so this function always returns a valid css. + * The returned css must be put using css_put(). + */ +struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + do { + css = cgroup_css(cgrp, ss); + + if (css && css_tryget_online(css)) + goto out_unlock; + cgrp = cgroup_parent(cgrp); + } while (cgrp); + + css = init_css_set.subsys[ss->id]; + css_get(css); +out_unlock: + rcu_read_unlock(); + return css; +} + +/* convenient tests for these bits */ +static inline bool cgroup_is_dead(const struct cgroup *cgrp) +{ + return !(cgrp->self.flags & CSS_ONLINE); +} + +static void cgroup_get(struct cgroup *cgrp) +{ + WARN_ON_ONCE(cgroup_is_dead(cgrp)); + css_get(&cgrp->self); +} + +static bool cgroup_tryget(struct cgroup *cgrp) +{ + return css_tryget(&cgrp->self); +} + +struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) +{ + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = of_cft(of); + + /* + * This is open and unprotected implementation of cgroup_css(). + * seq_css() is only called from a kernfs file operation which has + * an active reference on the file. Because all the subsystem + * files are drained before a css is disassociated with a cgroup, + * the matching css from the cgroup's subsys table is guaranteed to + * be and stay valid until the enclosing operation is complete. + */ + if (cft->ss) + return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); + else + return &cgrp->self; +} +EXPORT_SYMBOL_GPL(of_css); + +static int notify_on_release(const struct cgroup *cgrp) +{ + return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); +} + +/** + * for_each_css - iterate all css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of + * + * Should be called under cgroup_[tree_]mutex. + */ +#define for_each_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = rcu_dereference_check( \ + (cgrp)->subsys[(ssid)], \ + lockdep_is_held(&cgroup_mutex)))) { } \ + else + +/** + * for_each_e_css - iterate all effective css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of + * + * Should be called under cgroup_[tree_]mutex. + */ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ + ; \ + else + +/** + * for_each_subsys - iterate all enabled cgroup subsystems + * @ss: the iteration cursor + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + */ +#define for_each_subsys(ss, ssid) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) + +/** + * do_each_subsys_mask - filter for_each_subsys with a bitmask + * @ss: the iteration cursor + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + * @ss_mask: the bitmask + * + * The block will only run for cases where the ssid-th bit (1 << ssid) of + * @ss_mask is set. + */ +#define do_each_subsys_mask(ss, ssid, ss_mask) do { \ + unsigned long __ss_mask = (ss_mask); \ + if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \ + (ssid) = 0; \ + break; \ + } \ + for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ + (ss) = cgroup_subsys[ssid]; \ + { + +#define while_each_subsys_mask() \ + } \ + } \ +} while (false) + +/* iterate across the hierarchies */ +#define for_each_root(root) \ + list_for_each_entry((root), &cgroup_roots, root_list) + +/* iterate over child cgrps, lock should be held throughout iteration */ +#define cgroup_for_each_live_child(child, cgrp) \ + list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + cgroup_is_dead(child); })) \ + ; \ + else + +/* walk live descendants in preorder */ +#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ + css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + +/* walk live descendants in postorder */ +#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ + css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + +static void cgroup_release_agent(struct work_struct *work); +static void check_for_release(struct cgroup *cgrp); + +/* + * A cgroup can be associated with multiple css_sets as different tasks may + * belong to different cgroups on different hierarchies. In the other + * direction, a css_set is naturally associated with multiple cgroups. + * This M:N relationship is represented by the following link structure + * which exists for each association and allows traversing the associations + * from both sides. + */ +struct cgrp_cset_link { + /* the cgroup and css_set this link associates */ + struct cgroup *cgrp; + struct css_set *cset; + + /* list of cgrp_cset_links anchored at cgrp->cset_links */ + struct list_head cset_link; + + /* list of cgrp_cset_links anchored at css_set->cgrp_links */ + struct list_head cgrp_link; +}; + +/* + * The default css_set - used by init and its children prior to any + * hierarchies being mounted. It contains a pointer to the root state + * for each subsystem. Also used to anchor the list of css_sets. Not + * reference-counted, to improve performance when child cgroups + * haven't been created. + */ +struct css_set init_css_set = { + .refcount = ATOMIC_INIT(1), + .tasks = LIST_HEAD_INIT(init_css_set.tasks), + .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), + .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), + .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), + .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), +}; + +static int css_set_count = 1; /* 1 for init_css_set */ + +/** + * css_set_populated - does a css_set contain any tasks? + * @cset: target css_set + */ +static bool css_set_populated(struct css_set *cset) +{ + lockdep_assert_held(&css_set_lock); + + return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); +} + +/** + * cgroup_update_populated - updated populated count of a cgroup + * @cgrp: the target cgroup + * @populated: inc or dec populated count + * + * One of the css_sets associated with @cgrp is either getting its first + * task or losing the last. Update @cgrp->populated_cnt accordingly. The + * count is propagated towards root so that a given cgroup's populated_cnt + * is zero iff the cgroup and all its descendants don't contain any tasks. + * + * @cgrp's interface file "cgroup.populated" is zero if + * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt + * changes from or to zero, userland is notified that the content of the + * interface file has changed. This can be used to detect when @cgrp and + * its descendants become populated or empty. + */ +static void cgroup_update_populated(struct cgroup *cgrp, bool populated) +{ + lockdep_assert_held(&css_set_lock); + + do { + bool trigger; + + if (populated) + trigger = !cgrp->populated_cnt++; + else + trigger = !--cgrp->populated_cnt; + + if (!trigger) + break; + + check_for_release(cgrp); + cgroup_file_notify(&cgrp->events_file); + + cgrp = cgroup_parent(cgrp); + } while (cgrp); +} + +/** + * css_set_update_populated - update populated state of a css_set + * @cset: target css_set + * @populated: whether @cset is populated or depopulated + * + * @cset is either getting the first task or losing the last. Update the + * ->populated_cnt of all associated cgroups accordingly. + */ +static void css_set_update_populated(struct css_set *cset, bool populated) +{ + struct cgrp_cset_link *link; + + lockdep_assert_held(&css_set_lock); + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) + cgroup_update_populated(link->cgrp, populated); +} + +/** + * css_set_move_task - move a task from one css_set to another + * @task: task being moved + * @from_cset: css_set @task currently belongs to (may be NULL) + * @to_cset: new css_set @task is being moved to (may be NULL) + * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks + * + * Move @task from @from_cset to @to_cset. If @task didn't belong to any + * css_set, @from_cset can be NULL. If @task is being disassociated + * instead of moved, @to_cset can be NULL. + * + * This function automatically handles populated_cnt updates and + * css_task_iter adjustments but the caller is responsible for managing + * @from_cset and @to_cset's reference counts. + */ +static void css_set_move_task(struct task_struct *task, + struct css_set *from_cset, struct css_set *to_cset, + bool use_mg_tasks) +{ + lockdep_assert_held(&css_set_lock); + + if (to_cset && !css_set_populated(to_cset)) + css_set_update_populated(to_cset, true); + + if (from_cset) { + struct css_task_iter *it, *pos; + + WARN_ON_ONCE(list_empty(&task->cg_list)); + + /* + * @task is leaving, advance task iterators which are + * pointing to it so that they can resume at the next + * position. Advancing an iterator might remove it from + * the list, use safe walk. See css_task_iter_advance*() + * for details. + */ + list_for_each_entry_safe(it, pos, &from_cset->task_iters, + iters_node) + if (it->task_pos == &task->cg_list) + css_task_iter_advance(it); + + list_del_init(&task->cg_list); + if (!css_set_populated(from_cset)) + css_set_update_populated(from_cset, false); + } else { + WARN_ON_ONCE(!list_empty(&task->cg_list)); + } + + if (to_cset) { + /* + * We are synchronized through cgroup_threadgroup_rwsem + * against PF_EXITING setting such that we can't race + * against cgroup_exit() changing the css_set to + * init_css_set and dropping the old one. + */ + WARN_ON_ONCE(task->flags & PF_EXITING); + + rcu_assign_pointer(task->cgroups, to_cset); + list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : + &to_cset->tasks); + } +} + +/* + * hash table for cgroup groups. This improves the performance to find + * an existing css_set. This hash doesn't (currently) take into + * account cgroups in empty hierarchies. + */ +#define CSS_SET_HASH_BITS 7 +static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); + +static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) +{ + unsigned long key = 0UL; + struct cgroup_subsys *ss; + int i; + + for_each_subsys(ss, i) + key += (unsigned long)css[i]; + key = (key >> 16) ^ key; + + return key; +} + +static void put_css_set_locked(struct css_set *cset) +{ + struct cgrp_cset_link *link, *tmp_link; + struct cgroup_subsys *ss; + int ssid; + + lockdep_assert_held(&css_set_lock); + + if (!atomic_dec_and_test(&cset->refcount)) + return; + + /* This css_set is dead. unlink it and release cgroup and css refs */ + for_each_subsys(ss, ssid) { + list_del(&cset->e_cset_node[ssid]); + css_put(cset->subsys[ssid]); + } + hash_del(&cset->hlist); + css_set_count--; + + list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { + list_del(&link->cset_link); + list_del(&link->cgrp_link); + if (cgroup_parent(link->cgrp)) + cgroup_put(link->cgrp); + kfree(link); + } + + kfree_rcu(cset, rcu_head); +} + +static void put_css_set(struct css_set *cset) +{ + unsigned long flags; + + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cset->refcount, -1, 1)) + return; + + spin_lock_irqsave(&css_set_lock, flags); + put_css_set_locked(cset); + spin_unlock_irqrestore(&css_set_lock, flags); +} + +/* + * refcounted get/put for css_set objects + */ +static inline void get_css_set(struct css_set *cset) +{ + atomic_inc(&cset->refcount); +} + +/** + * compare_css_sets - helper function for find_existing_css_set(). + * @cset: candidate css_set being tested + * @old_cset: existing css_set for a task + * @new_cgrp: cgroup that's being entered by the task + * @template: desired set of css pointers in css_set (pre-calculated) + * + * Returns true if "cset" matches "old_cset" except for the hierarchy + * which "new_cgrp" belongs to, for which it should match "new_cgrp". + */ +static bool compare_css_sets(struct css_set *cset, + struct css_set *old_cset, + struct cgroup *new_cgrp, + struct cgroup_subsys_state *template[]) +{ + struct list_head *l1, *l2; + + /* + * On the default hierarchy, there can be csets which are + * associated with the same set of cgroups but different csses. + * Let's first ensure that csses match. + */ + if (memcmp(template, cset->subsys, sizeof(cset->subsys))) + return false; + + /* + * Compare cgroup pointers in order to distinguish between + * different cgroups in hierarchies. As different cgroups may + * share the same effective css, this comparison is always + * necessary. + */ + l1 = &cset->cgrp_links; + l2 = &old_cset->cgrp_links; + while (1) { + struct cgrp_cset_link *link1, *link2; + struct cgroup *cgrp1, *cgrp2; + + l1 = l1->next; + l2 = l2->next; + /* See if we reached the end - both lists are equal length. */ + if (l1 == &cset->cgrp_links) { + BUG_ON(l2 != &old_cset->cgrp_links); + break; + } else { + BUG_ON(l2 == &old_cset->cgrp_links); + } + /* Locate the cgroups associated with these links. */ + link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); + link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); + cgrp1 = link1->cgrp; + cgrp2 = link2->cgrp; + /* Hierarchies should be linked in the same order. */ + BUG_ON(cgrp1->root != cgrp2->root); + + /* + * If this hierarchy is the hierarchy of the cgroup + * that's changing, then we need to check that this + * css_set points to the new cgroup; if it's any other + * hierarchy, then this css_set should point to the + * same cgroup as the old css_set. + */ + if (cgrp1->root == new_cgrp->root) { + if (cgrp1 != new_cgrp) + return false; + } else { + if (cgrp1 != cgrp2) + return false; + } + } + return true; +} + +/** + * find_existing_css_set - init css array and find the matching css_set + * @old_cset: the css_set that we're using before the cgroup transition + * @cgrp: the cgroup that we're moving into + * @template: out param for the new set of csses, should be clear on entry + */ +static struct css_set *find_existing_css_set(struct css_set *old_cset, + struct cgroup *cgrp, + struct cgroup_subsys_state *template[]) +{ + struct cgroup_root *root = cgrp->root; + struct cgroup_subsys *ss; + struct css_set *cset; + unsigned long key; + int i; + + /* + * Build the set of subsystem state objects that we want to see in the + * new css_set. while subsystems can change globally, the entries here + * won't change, so no need for locking. + */ + for_each_subsys(ss, i) { + if (root->subsys_mask & (1UL << i)) { + /* + * @ss is in this hierarchy, so we want the + * effective css from @cgrp. + */ + template[i] = cgroup_e_css(cgrp, ss); + } else { + /* + * @ss is not in this hierarchy, so we don't want + * to change the css. + */ + template[i] = old_cset->subsys[i]; + } + } + + key = css_set_hash(template); + hash_for_each_possible(css_set_table, cset, hlist, key) { + if (!compare_css_sets(cset, old_cset, cgrp, template)) + continue; + + /* This css_set matches what we need */ + return cset; + } + + /* No existing cgroup group matched */ + return NULL; +} + +static void free_cgrp_cset_links(struct list_head *links_to_free) +{ + struct cgrp_cset_link *link, *tmp_link; + + list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { + list_del(&link->cset_link); + kfree(link); + } +} + +/** + * allocate_cgrp_cset_links - allocate cgrp_cset_links + * @count: the number of links to allocate + * @tmp_links: list_head the allocated links are put on + * + * Allocate @count cgrp_cset_link structures and chain them on @tmp_links + * through ->cset_link. Returns 0 on success or -errno. + */ +static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) +{ + struct cgrp_cset_link *link; + int i; + + INIT_LIST_HEAD(tmp_links); + + for (i = 0; i < count; i++) { + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) { + free_cgrp_cset_links(tmp_links); + return -ENOMEM; + } + list_add(&link->cset_link, tmp_links); + } + return 0; +} + +/** + * link_css_set - a helper function to link a css_set to a cgroup + * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() + * @cset: the css_set to be linked + * @cgrp: the destination cgroup + */ +static void link_css_set(struct list_head *tmp_links, struct css_set *cset, + struct cgroup *cgrp) +{ + struct cgrp_cset_link *link; + + BUG_ON(list_empty(tmp_links)); + + if (cgroup_on_dfl(cgrp)) + cset->dfl_cgrp = cgrp; + + link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); + link->cset = cset; + link->cgrp = cgrp; + + /* + * Always add links to the tail of the lists so that the lists are + * in choronological order. + */ + list_move_tail(&link->cset_link, &cgrp->cset_links); + list_add_tail(&link->cgrp_link, &cset->cgrp_links); + + if (cgroup_parent(cgrp)) < |