path: root/kernel/cgroup/cgroup.c
author	Tejun Heo <tj@kernel.org>	2016-12-27 14:49:05 -0500
committer	Tejun Heo <tj@kernel.org>	2016-12-27 14:49:05 -0500
commit	201af4c0fab02876ef0311e7f7b4083aa138930c (patch)
tree	d79bfd5f25c5fd111a79993bc10fc129281db225 /kernel/cgroup/cgroup.c
parent	5f617ebbdf10abd49312a89e3b894b927c7367f5 (diff)
cgroup: move cgroup files under kernel/cgroup/
They're growing to be too many and planned to get split further. Move them under their own directory.

 kernel/cgroup.c         -> kernel/cgroup/cgroup.c
 kernel/cgroup_freezer.c -> kernel/cgroup/freezer.c
 kernel/cgroup_pids.c    -> kernel/cgroup/pids.c
 kernel/cpuset.c         -> kernel/cgroup/cpuset.c

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Zefan Li <lizefan@huawei.com>
Diffstat (limited to 'kernel/cgroup/cgroup.c')
-rw-r--r--	kernel/cgroup/cgroup.c	6705
1 file changed, 6705 insertions(+), 0 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
new file mode 100644
index 000000000000..1a815f275849
--- /dev/null
+++ b/kernel/cgroup/cgroup.c
@@ -0,0 +1,6705 @@
+/*
+ * Generic process-grouping system.
+ *
+ * Based originally on the cpuset system, extracted by Paul Menage
+ * Copyright (C) 2006 Google, Inc
+ *
+ * Notifications support
+ * Copyright (C) 2009 Nokia Corporation
+ * Author: Kirill A. Shutemov
+ *
+ * Copyright notices from the original cpuset code:
+ * --------------------------------------------------
+ * Copyright (C) 2003 BULL SA.
+ * Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ *
+ * Portions derived from Patrick Mochel's sysfs code.
+ * sysfs is Copyright (c) 2001-3 Patrick Mochel
+ *
+ * 2003-10-10 Written by Simon Derr.
+ * 2003-10-22 Updates by Stephen Hemminger.
+ * 2004 May-July Rework by Paul Jackson.
+ * ---------------------------------------------------
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cgroup.h>
+#include <linux/cred.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/magic.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/string.h>
+#include <linux/sort.h>
+#include <linux/kmod.h>
+#include <linux/delayacct.h>
+#include <linux/cgroupstats.h>
+#include <linux/hashtable.h>
+#include <linux/pid_namespace.h>
+#include <linux/idr.h>
+#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/atomic.h>
+#include <linux/cpuset.h>
+#include <linux/proc_ns.h>
+#include <linux/nsproxy.h>
+#include <linux/file.h>
+#include <net/sock.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/cgroup.h>
+
+/*
+ * pidlists linger the following amount before being destroyed. The goal
+ * is avoiding frequent destruction in the middle of consecutive read
+ * calls. Expiring in the middle is a performance problem, not a
+ * correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY HZ
+
+#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
+ MAX_CFTYPE_NAME + 2)
+
+/*
+ * cgroup_mutex is the master lock. Any modification to cgroup or its
+ * hierarchy must be performed while holding it.
+ *
+ * css_set_lock protects task->cgroups pointer, the list of css_set
+ * objects, and the chain of tasks off each css_set.
+ *
+ * These locks are exported if CONFIG_PROVE_RCU so that accessors in
+ * cgroup.h can use them for lockdep annotations.
+ */
+#ifdef CONFIG_PROVE_RCU
+DEFINE_MUTEX(cgroup_mutex);
+DEFINE_SPINLOCK(css_set_lock);
+EXPORT_SYMBOL_GPL(cgroup_mutex);
+EXPORT_SYMBOL_GPL(css_set_lock);
+#else
+static DEFINE_MUTEX(cgroup_mutex);
+static DEFINE_SPINLOCK(css_set_lock);
+#endif
+
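+/*
+ * Rough sketch of the typical nesting (illustrative only, not a real
+ * caller): cgroup_mutex is taken outermost for hierarchy modifications
+ * and css_set_lock innermost while walking css_sets or task->cgroups.
+ *
+ *    mutex_lock(&cgroup_mutex);
+ *    spin_lock_irq(&css_set_lock);
+ *    ... walk or update css_sets / task->cgroups ...
+ *    spin_unlock_irq(&css_set_lock);
+ *    mutex_unlock(&cgroup_mutex);
+ */
+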
+/*
+ * Protects cgroup_idr and css_idr so that IDs can be released without
+ * grabbing cgroup_mutex.
+ */
+static DEFINE_SPINLOCK(cgroup_idr_lock);
+
+/*
+ * Protects cgroup_file->kn for !self csses. It synchronizes notifications
+ * against file removal/re-creation across css hiding.
+ */
+static DEFINE_SPINLOCK(cgroup_file_kn_lock);
+
+/*
+ * Protects cgroup_root->release_agent_path. Modifying it also requires
+ * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
+
+struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
+#define cgroup_assert_mutex_or_rcu_locked() \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
+ !lockdep_is_held(&cgroup_mutex), \
+ "cgroup_mutex or RCU read lock required");
+
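+/*
+ * Example (hypothetical helper, for illustration): a function which only
+ * dereferences RCU-protected cgroup pointers can document its locking
+ * requirement with the assertion above.
+ *
+ *    static struct cgroup *peek_dfl_cgroup(struct css_set *cset)
+ *    {
+ *        cgroup_assert_mutex_or_rcu_locked();
+ *        return cset->dfl_cgrp;
+ *    }
+ */
+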
+/*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
+
+/*
+ * pidlist destructions need to be flushed on cgroup destruction. Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
+/* generate an array of cgroup subsystem pointers */
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
+static struct cgroup_subsys *cgroup_subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+/* array of cgroup subsystem names */
+#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
+static const char *cgroup_subsys_name[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
+#define SUBSYS(_x) \
+ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
+ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
+ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
+ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
+#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
+static struct static_key_true *cgroup_subsys_enabled_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
+static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
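+/*
+ * Controllers test these keys through the cgroup_subsys_enabled() and
+ * cgroup_subsys_on_dfl() macros from linux/cgroup.h, e.g. (illustrative):
+ *
+ *    if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ *        ... follow default-hierarchy (v2) semantics ...
+ */
+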
+/*
+ * The default hierarchy, reserved for the subsystems that are otherwise
+ * unattached - it never has more than a single cgroup, and all tasks are
+ * part of that cgroup.
+ */
+struct cgroup_root cgrp_dfl_root;
+EXPORT_SYMBOL_GPL(cgrp_dfl_root);
+
+/*
+ * The default hierarchy always exists but is hidden until mounted for the
+ * first time. This is for backward compatibility.
+ */
+static bool cgrp_dfl_visible;
+
+/* Controllers blocked by the commandline in v1 */
+static u16 cgroup_no_v1_mask;
+
+/* some controllers are not supported in the default hierarchy */
+static u16 cgrp_dfl_inhibit_ss_mask;
+
+/* some controllers are implicitly enabled on the default hierarchy */
+static unsigned long cgrp_dfl_implicit_ss_mask;
+
+/* The list of hierarchy roots */
+
+static LIST_HEAD(cgroup_roots);
+static int cgroup_root_count;
+
+/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
+static DEFINE_IDR(cgroup_hierarchy_idr);
+
+/*
+ * Assign a monotonically increasing serial number to csses. It guarantees
+ * cgroups with bigger numbers are newer than those with smaller numbers.
+ * Also, as csses are always appended to the parent's ->children list, it
+ * guarantees that sibling csses are always sorted in the ascending serial
+ * number order on the list. Protected by cgroup_mutex.
+ */
+static u64 css_serial_nr_next = 1;
+
+/*
+ * These bitmask flags indicate whether tasks in the fork and exit paths have
+ * fork/exit handlers to call. This avoids us having to do extra work in the
+ * fork/exit path to check which subsystems have fork/exit callbacks.
+ */
+static u16 have_fork_callback __read_mostly;
+static u16 have_exit_callback __read_mostly;
+static u16 have_free_callback __read_mostly;
+
+/* cgroup namespace for init task */
+struct cgroup_namespace init_cgroup_ns = {
+ .count = { .counter = 2, },
+ .user_ns = &init_user_ns,
+ .ns.ops = &cgroupns_operations,
+ .ns.inum = PROC_CGROUP_INIT_INO,
+ .root_cset = &init_css_set,
+};
+
+/* Ditto for the can_fork callback. */
+static u16 have_canfork_callback __read_mostly;
+
+static struct file_system_type cgroup2_fs_type;
+static struct cftype cgroup_dfl_base_files[];
+static struct cftype cgroup_legacy_base_files[];
+
+static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+static int cgroup_apply_control(struct cgroup *cgrp);
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
+static void css_task_iter_advance(struct css_task_iter *it);
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
+ struct cgroup_subsys *ss);
+static void css_release(struct percpu_ref *ref);
+static void kill_css(struct cgroup_subsys_state *css);
+static int cgroup_addrm_files(struct cgroup_subsys_state *css,
+ struct cgroup *cgrp, struct cftype cfts[],
+ bool is_add);
+
+/**
+ * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
+ * @ssid: subsys ID of interest
+ *
+ * cgroup_subsys_enabled() can only be used with literal subsys names,
+ * which is fine for individual subsystems but unsuitable for cgroup
+ * core. This is a slower static_key_enabled() based test indexed by
+ * @ssid.
+ */
+static bool cgroup_ssid_enabled(int ssid)
+{
+ if (CGROUP_SUBSYS_COUNT == 0)
+ return false;
+
+ return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
+}
+
+static bool cgroup_ssid_no_v1(int ssid)
+{
+ return cgroup_no_v1_mask & (1 << ssid);
+}
+
+/**
+ * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
+ * @cgrp: the cgroup of interest
+ *
+ * The default hierarchy is the v2 interface of cgroup and this function
+ * can be used to test whether a cgroup is on the default hierarchy for
+ * cases where a subsystem should behave differently depending on the
+ * interface version.
+ *
+ * The set of behaviors which change on the default hierarchy are still
+ * being determined and the mount option is prefixed with __DEVEL__.
+ *
+ * List of changed behaviors:
+ *
+ * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
+ * and "name" are disallowed.
+ *
+ * - When mounting an existing superblock, mount options should match.
+ *
+ * - Remount is disallowed.
+ *
+ * - rename(2) is disallowed.
+ *
+ * - "tasks" is removed. Everything should be at process granularity. Use
+ * "cgroup.procs" instead.
+ *
+ * - "cgroup.procs" is not sorted. pids will be unique unless they got
+ * recycled in between reads.
+ *
+ * - "release_agent" and "notify_on_release" are removed. Replacement
+ * notification mechanism will be implemented.
+ *
+ * - "cgroup.clone_children" is removed.
+ *
+ * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
+ * and its descendants contain no task; otherwise, 1. The file also
+ * generates kernfs notification which can be monitored through poll and
+ * [di]notify when the value of the file changes.
+ *
+ * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
+ * take masks of ancestors with non-empty cpus/mems, instead of being
+ * moved to an ancestor.
+ *
+ * - cpuset: a task can be moved into an empty cpuset, and again it takes
+ * masks of ancestors.
+ *
+ * - memcg: use_hierarchy is on by default and the cgroup file for the flag
+ * is not created.
+ *
+ * - blkcg: blk-throttle becomes properly hierarchical.
+ *
+ * - debug: disallowed on the default hierarchy.
+ */
+static bool cgroup_on_dfl(const struct cgroup *cgrp)
+{
+ return cgrp->root == &cgrp_dfl_root;
+}
+
+/* IDR wrappers which synchronize using cgroup_idr_lock */
+static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
+ gfp_t gfp_mask)
+{
+ int ret;
+
+ idr_preload(gfp_mask);
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
+ spin_unlock_bh(&cgroup_idr_lock);
+ idr_preload_end();
+ return ret;
+}
+
+static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
+{
+ void *ret;
+
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_replace(idr, ptr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+ return ret;
+}
+
+static void cgroup_idr_remove(struct idr *idr, int id)
+{
+ spin_lock_bh(&cgroup_idr_lock);
+ idr_remove(idr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+}
+
+static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+
+ if (parent_css)
+ return container_of(parent_css, struct cgroup, self);
+ return NULL;
+}
+
+/* subsystems visibly enabled on a cgroup */
+static u16 cgroup_control(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ u16 root_ss_mask = cgrp->root->subsys_mask;
+
+ if (parent)
+ return parent->subtree_control;
+
+ if (cgroup_on_dfl(cgrp))
+ root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
+ cgrp_dfl_implicit_ss_mask);
+ return root_ss_mask;
+}
+
+/* subsystems enabled on a cgroup */
+static u16 cgroup_ss_mask(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+
+ if (parent)
+ return parent->subtree_ss_mask;
+
+ return cgrp->root->subsys_mask;
+}
+
+/**
+ * cgroup_css - obtain a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
+ * function must be called either under cgroup_mutex or rcu_read_lock() and
+ * the caller is responsible for pinning the returned css if it wants to
+ * keep accessing it outside the said locks. This function may return
+ * %NULL if @cgrp doesn't have @ss enabled.
+ */
+static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ if (ss)
+ return rcu_dereference_check(cgrp->subsys[ss->id],
+ lockdep_is_held(&cgroup_mutex));
+ else
+ return &cgrp->self;
+}
+
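+/*
+ * Typical pinning pattern for the above (illustrative sketch): look the
+ * css up under rcu_read_lock() and take a reference before using it
+ * outside the RCU read section.
+ *
+ *    rcu_read_lock();
+ *    css = cgroup_css(cgrp, ss);
+ *    if (css && !css_tryget_online(css))
+ *        css = NULL;
+ *    rcu_read_unlock();
+ */
+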
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Similar to cgroup_css() but returns the effective css, which is defined
+ * as the matching css of the nearest ancestor including self which has @ss
+ * enabled. If @ss is associated with the hierarchy @cgrp is on, this
+ * function is guaranteed to return non-NULL css.
+ */
+static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!ss)
+ return &cgrp->self;
+
+ /*
+ * This function is used while updating css associations and thus
+ * can't test the csses directly. Test ss_mask.
+ */
+ while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
+ cgrp = cgroup_parent(cgrp);
+ if (!cgrp)
+ return NULL;
+ }
+
+ return cgroup_css(cgrp, ss);
+}
+
+/**
+ * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss. The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ * The returned css must be put using css_put().
+ */
+struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+
+ do {
+ css = cgroup_css(cgrp, ss);
+
+ if (css && css_tryget_online(css))
+ goto out_unlock;
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+
+ css = init_css_set.subsys[ss->id];
+ css_get(css);
+out_unlock:
+ rcu_read_unlock();
+ return css;
+}
+
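+/*
+ * Example pairing (hypothetical caller): every successful call must be
+ * balanced with css_put() once the css is no longer needed.
+ *
+ *    css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
+ *    ... use css ...
+ *    css_put(css);
+ */
+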
+/* convenient tests for these bits */
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
+{
+ return !(cgrp->self.flags & CSS_ONLINE);
+}
+
+static void cgroup_get(struct cgroup *cgrp)
+{
+ WARN_ON_ONCE(cgroup_is_dead(cgrp));
+ css_get(&cgrp->self);
+}
+
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+ return css_tryget(&cgrp->self);
+}
+
+struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
+{
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = of_cft(of);
+
+ /*
+ * This is an open and unprotected implementation of cgroup_css().
+ * seq_css() is only called from a kernfs file operation which has
+ * an active reference on the file. Because all the subsystem
+ * files are drained before a css is disassociated from a cgroup,
+ * the matching css from the cgroup's subsys table is guaranteed to
+ * be and stay valid until the enclosing operation is complete.
+ */
+ if (cft->ss)
+ return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
+ else
+ return &cgrp->self;
+}
+EXPORT_SYMBOL_GPL(of_css);
+
+static int notify_on_release(const struct cgroup *cgrp)
+{
+ return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+}
+
+/**
+ * for_each_css - iterate all css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
+ */
+#define for_each_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = rcu_dereference_check( \
+ (cgrp)->subsys[(ssid)], \
+ lockdep_is_held(&cgroup_mutex)))) { } \
+ else
+
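+/*
+ * Example (illustrative): iterating and killing every css attached to a
+ * cgroup, as done during cgroup destruction.
+ *
+ *    for_each_css(css, ssid, cgrp)
+ *        kill_css(css);
+ */
+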
+/**
+ * for_each_e_css - iterate all effective css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
+ */
+#define for_each_e_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
+ ; \
+ else
+
+/**
+ * for_each_subsys - iterate all enabled cgroup subsystems
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ */
+#define for_each_subsys(ss, ssid) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
+ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+
+/**
+ * do_each_subsys_mask - filter for_each_subsys with a bitmask
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * @ss_mask: the bitmask
+ *
+ * The block will only run for cases where the ssid-th bit (1 << ssid) of
+ * @ss_mask is set.
+ */
+#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
+ unsigned long __ss_mask = (ss_mask); \
+ if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \
+ (ssid) = 0; \
+ break; \
+ } \
+ for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \
+ (ss) = cgroup_subsys[ssid]; \
+ {
+
+#define while_each_subsys_mask() \
+ } \
+ } \
+} while (false)
+
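+/*
+ * Example (illustrative): the two macros bracket the loop body and must
+ * always be used as a pair.
+ *
+ *    do_each_subsys_mask(ss, ssid, cgrp->subtree_control) {
+ *        pr_debug("%s enabled\n", ss->name);
+ *    } while_each_subsys_mask();
+ */
+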
+/* iterate across the hierarchies */
+#define for_each_root(root) \
+ list_for_each_entry((root), &cgroup_roots, root_list)
+
+/* iterate over child cgrps, lock should be held throughout iteration */
+#define cgroup_for_each_live_child(child, cgrp) \
+ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ cgroup_is_dead(child); })) \
+ ; \
+ else
+
+/* walk live descendants in preorder */
+#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
+ css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ (dsct) = (d_css)->cgroup; \
+ cgroup_is_dead(dsct); })) \
+ ; \
+ else
+
+/* walk live descendants in postorder */
+#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \
+ css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ (dsct) = (d_css)->cgroup; \
+ cgroup_is_dead(dsct); })) \
+ ; \
+ else
+
+static void cgroup_release_agent(struct work_struct *work);
+static void check_for_release(struct cgroup *cgrp);
+
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies. In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+ /* the cgroup and css_set this link associates */
+ struct cgroup *cgrp;
+ struct css_set *cset;
+
+ /* list of cgrp_cset_links anchored at cgrp->cset_links */
+ struct list_head cset_link;
+
+ /* list of cgrp_cset_links anchored at css_set->cgrp_links */
+ struct list_head cgrp_link;
+};
+
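+/*
+ * Both directions are walked with plain list iteration under
+ * css_set_lock, e.g. (illustrative):
+ *
+ *    list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ *        ... link->cset is one css_set with tasks in this cgroup ...
+ *
+ *    list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
+ *        ... link->cgrp is this css_set's cgroup on one hierarchy ...
+ */
+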
+/*
+ * The default css_set - used by init and its children prior to any
+ * hierarchies being mounted. It contains a pointer to the root state
+ * for each subsystem. Also used to anchor the list of css_sets. Not
+ * reference-counted, to improve performance when child cgroups
+ * haven't been created.
+ */
+struct css_set init_css_set = {
+ .refcount = ATOMIC_INIT(1),
+ .tasks = LIST_HEAD_INIT(init_css_set.tasks),
+ .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
+ .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
+ .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+ .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+};
+
+static int css_set_count = 1; /* 1 for init_css_set */
+
+/**
+ * css_set_populated - does a css_set contain any tasks?
+ * @cset: target css_set
+ */
+static bool css_set_populated(struct css_set *cset)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
+}
+
+/**
+ * cgroup_update_populated - update the populated count of a cgroup
+ * @cgrp: the target cgroup
+ * @populated: inc or dec populated count
+ *
+ * One of the css_sets associated with @cgrp is either getting its first
+ * task or losing the last. Update @cgrp->populated_cnt accordingly. The
+ * count is propagated towards root so that a given cgroup's populated_cnt
+ * is zero iff the cgroup and all its descendants don't contain any tasks.
+ *
+ * @cgrp's interface file "cgroup.populated" is zero if
+ * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
+ * changes from or to zero, userland is notified that the content of the
+ * interface file has changed. This can be used to detect when @cgrp and
+ * its descendants become populated or empty.
+ */
+static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ do {
+ bool trigger;
+
+ if (populated)
+ trigger = !cgrp->populated_cnt++;
+ else
+ trigger = !--cgrp->populated_cnt;
+
+ if (!trigger)
+ break;
+
+ check_for_release(cgrp);
+ cgroup_file_notify(&cgrp->events_file);
+
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+}
+
+/**
+ * css_set_update_populated - update populated state of a css_set
+ * @cset: target css_set
+ * @populated: whether @cset is populated or depopulated
+ *
+ * @cset is either getting the first task or losing the last. Update the
+ * ->populated_cnt of all associated cgroups accordingly.
+ */
+static void css_set_update_populated(struct css_set *cset, bool populated)
+{
+ struct cgrp_cset_link *link;
+
+ lockdep_assert_held(&css_set_lock);
+
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
+ cgroup_update_populated(link->cgrp, populated);
+}
+
+/**
+ * css_set_move_task - move a task from one css_set to another
+ * @task: task being moved
+ * @from_cset: css_set @task currently belongs to (may be NULL)
+ * @to_cset: new css_set @task is being moved to (may be NULL)
+ * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
+ *
+ * Move @task from @from_cset to @to_cset. If @task didn't belong to any
+ * css_set, @from_cset can be NULL. If @task is being disassociated
+ * instead of moved, @to_cset can be NULL.
+ *
+ * This function automatically handles populated_cnt updates and
+ * css_task_iter adjustments but the caller is responsible for managing
+ * @from_cset and @to_cset's reference counts.
+ */
+static void css_set_move_task(struct task_struct *task,
+ struct css_set *from_cset, struct css_set *to_cset,
+ bool use_mg_tasks)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ if (to_cset && !css_set_populated(to_cset))
+ css_set_update_populated(to_cset, true);
+
+ if (from_cset) {
+ struct css_task_iter *it, *pos;
+
+ WARN_ON_ONCE(list_empty(&task->cg_list));
+
+ /*
+ * @task is leaving, advance task iterators which are
+ * pointing to it so that they can resume at the next
+ * position. Advancing an iterator might remove it from
+ * the list, use safe walk. See css_task_iter_advance*()
+ * for details.
+ */
+ list_for_each_entry_safe(it, pos, &from_cset->task_iters,
+ iters_node)
+ if (it->task_pos == &task->cg_list)
+ css_task_iter_advance(it);
+
+ list_del_init(&task->cg_list);
+ if (!css_set_populated(from_cset))
+ css_set_update_populated(from_cset, false);
+ } else {
+ WARN_ON_ONCE(!list_empty(&task->cg_list));
+ }
+
+ if (to_cset) {
+ /*
+ * We are synchronized through cgroup_threadgroup_rwsem
+ * against PF_EXITING setting such that we can't race
+ * against cgroup_exit() changing the css_set to
+ * init_css_set and dropping the old one.
+ */
+ WARN_ON_ONCE(task->flags & PF_EXITING);
+
+ rcu_assign_pointer(task->cgroups, to_cset);
+ list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
+ &to_cset->tasks);
+ }
+}
+
+/*
+ * hash table for css_sets. This improves the performance of finding
+ * an existing css_set. This hash doesn't (currently) take into
+ * account cgroups in empty hierarchies.
+ */
+#define CSS_SET_HASH_BITS 7
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
+
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
+{
+ unsigned long key = 0UL;
+ struct cgroup_subsys *ss;
+ int i;
+
+ for_each_subsys(ss, i)
+ key += (unsigned long)css[i];
+ key = (key >> 16) ^ key;
+
+ return key;
+}
+
+static void put_css_set_locked(struct css_set *cset)
+{
+ struct cgrp_cset_link *link, *tmp_link;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ lockdep_assert_held(&css_set_lock);
+
+ if (!atomic_dec_and_test(&cset->refcount))
+ return;
+
+ /* This css_set is dead. unlink it and release cgroup and css refs */
+ for_each_subsys(ss, ssid) {
+ list_del(&cset->e_cset_node[ssid]);
+ css_put(cset->subsys[ssid]);
+ }
+ hash_del(&cset->hlist);
+ css_set_count--;
+
+ list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+ if (cgroup_parent(link->cgrp))
+ cgroup_put(link->cgrp);
+ kfree(link);
+ }
+
+ kfree_rcu(cset, rcu_head);
+}
+
+static void put_css_set(struct css_set *cset)
+{
+ unsigned long flags;
+
+ /*
+ * Ensure that the refcount doesn't hit zero while any readers
+ * can see it. Similar to atomic_dec_and_lock(), but for a
+ * spinlock.
+ */
+ if (atomic_add_unless(&cset->refcount, -1, 1))
+ return;
+
+ spin_lock_irqsave(&css_set_lock, flags);
+ put_css_set_locked(cset);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+}
+
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cset)
+{
+ atomic_inc(&cset->refcount);
+}
+
+/**
+ * compare_css_sets - helper function for find_existing_css_set().
+ * @cset: candidate css_set being tested
+ * @old_cset: existing css_set for a task
+ * @new_cgrp: cgroup that's being entered by the task
+ * @template: desired set of css pointers in css_set (pre-calculated)
+ *
+ * Returns true if "cset" matches "old_cset" except for the hierarchy
+ * which "new_cgrp" belongs to, for which it should match "new_cgrp".
+ */
+static bool compare_css_sets(struct css_set *cset,
+ struct css_set *old_cset,
+ struct cgroup *new_cgrp,
+ struct cgroup_subsys_state *template[])
+{
+ struct list_head *l1, *l2;
+
+ /*
+ * On the default hierarchy, there can be csets which are
+ * associated with the same set of cgroups but different csses.
+ * Let's first ensure that csses match.
+ */
+ if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
+ return false;
+
+ /*
+ * Compare cgroup pointers in order to distinguish between
+ * different cgroups in hierarchies. As different cgroups may
+ * share the same effective css, this comparison is always
+ * necessary.
+ */
+ l1 = &cset->cgrp_links;
+ l2 = &old_cset->cgrp_links;
+ while (1) {
+ struct cgrp_cset_link *link1, *link2;
+ struct cgroup *cgrp1, *cgrp2;
+
+ l1 = l1->next;
+ l2 = l2->next;
+ /* See if we reached the end - both lists are equal length. */
+ if (l1 == &cset->cgrp_links) {
+ BUG_ON(l2 != &old_cset->cgrp_links);
+ break;
+ } else {
+ BUG_ON(l2 == &old_cset->cgrp_links);
+ }
+ /* Locate the cgroups associated with these links. */
+ link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
+ link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
+ cgrp1 = link1->cgrp;
+ cgrp2 = link2->cgrp;
+ /* Hierarchies should be linked in the same order. */
+ BUG_ON(cgrp1->root != cgrp2->root);
+
+ /*
+ * If this hierarchy is the hierarchy of the cgroup
+ * that's changing, then we need to check that this
+ * css_set points to the new cgroup; if it's any other
+ * hierarchy, then this css_set should point to the
+ * same cgroup as the old css_set.
+ */
+ if (cgrp1->root == new_cgrp->root) {
+ if (cgrp1 != new_cgrp)
+ return false;
+ } else {
+ if (cgrp1 != cgrp2)
+ return false;
+ }
+ }
+ return true;
+}
+
+/**
+ * find_existing_css_set - init css array and find the matching css_set
+ * @old_cset: the css_set that we're using before the cgroup transition
+ * @cgrp: the cgroup that we're moving into
+ * @template: out param for the new set of csses, should be clear on entry
+ */
+static struct css_set *find_existing_css_set(struct css_set *old_cset,
+ struct cgroup *cgrp,
+ struct cgroup_subsys_state *template[])
+{
+ struct cgroup_root *root = cgrp->root;
+ struct cgroup_subsys *ss;
+ struct css_set *cset;
+ unsigned long key;
+ int i;
+
+ /*
+ * Build the set of subsystem state objects that we want to see in the
+ * new css_set. While subsystems can change globally, the entries here
+ * won't change, so no need for locking.
+ */
+ for_each_subsys(ss, i) {
+ if (root->subsys_mask & (1UL << i)) {
+ /*
+ * @ss is in this hierarchy, so we want the
+ * effective css from @cgrp.
+ */
+ template[i] = cgroup_e_css(cgrp, ss);
+ } else {
+ /*
+ * @ss is not in this hierarchy, so we don't want
+ * to change the css.
+ */
+ template[i] = old_cset->subsys[i];
+ }
+ }
+
+ key = css_set_hash(template);
+ hash_for_each_possible(css_set_table, cset, hlist, key) {
+ if (!compare_css_sets(cset, old_cset, cgrp, template))
+ continue;
+
+ /* This css_set matches what we need */
+ return cset;
+ }
+
+ /* No existing cgroup group matched */
+ return NULL;
+}
+
+static void free_cgrp_cset_links(struct list_head *links_to_free)
+{
+ struct cgrp_cset_link *link, *tmp_link;
+
+ list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
+ list_del(&link->cset_link);
+ kfree(link);
+ }
+}
+
+/**
+ * allocate_cgrp_cset_links - allocate cgrp_cset_links
+ * @count: the number of links to allocate
+ * @tmp_links: list_head the allocated links are put on
+ *
+ * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
+ * through ->cset_link. Returns 0 on success or -errno.
+ */
+static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
+{
+ struct cgrp_cset_link *link;
+ int i;
+
+ INIT_LIST_HEAD(tmp_links);
+
+ for (i = 0; i < count; i++) {
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link) {
+ free_cgrp_cset_links(tmp_links);
+ return -ENOMEM;
+ }
+ list_add(&link->cset_link, tmp_links);
+ }
+ return 0;
+}
+
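+/*
+ * Typical usage pattern (illustrative), mirroring how css_sets are
+ * instantiated: preallocate one link per hierarchy root, consume them
+ * under css_set_lock via link_css_set() and free any leftovers.
+ *
+ *    LIST_HEAD(tmp_links);
+ *
+ *    if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links))
+ *        return NULL;
+ *    ...
+ *    link_css_set(&tmp_links, cset, cgrp);
+ *    ...
+ *    free_cgrp_cset_links(&tmp_links);
+ */
+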
+/**
+ * link_css_set - a helper function to link a css_set to a cgroup
+ * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
+ * @cset: the css_set to be linked
+ * @cgrp: the destination cgroup
+ */
+static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
+ struct cgroup *cgrp)
+{
+ struct cgrp_cset_link *link;
+
+ BUG_ON(list_empty(tmp_links));
+
+ if (cgroup_on_dfl(cgrp))
+ cset->dfl_cgrp = cgrp;
+
+ link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
+ link->cset = cset;
+ link->cgrp = cgrp;
+
+ /*
+ * Always add links to the tail of the lists so that the lists are
+ * in chronological order.
+ */
+ list_move_tail(&link->cset_link, &cgrp->cset_links);
+ list_add_tail(&link->cgrp_link, &cset->cgrp_links);
+
+ if (cgroup_parent(cgrp))
+ cgroup_get(cgrp);
+}