cgroup: move cgroup files under kernel/cgroup/

They're growing to be too many and planned to get split further. Move them under their own directory. kernel/cgroup.c -> kernel/cgroup/cgroup.c kernel/cgroup_freezer.c -> kernel/cgroup/freezer.c kernel/cgroup_pids.c -> kernel/cgroup/pids.c kernel/cpuset.c -> kernel/cgroup/cpuset.c Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Acked-by: Zefan Li <lizefan@huawei.com>
author: Tejun Heo <tj@kernel.org> 2016-12-27 14:49:05 -0500
committer: Tejun Heo <tj@kernel.org> 2016-12-27 14:49:05 -0500
commit: 201af4c0fab02876ef0311e7f7b4083aa138930c (patch)
tree: d79bfd5f25c5fd111a79993bc10fc129281db225 /kernel/cgroup/cgroup.c
parent: 5f617ebbdf10abd49312a89e3b894b927c7367f5 (diff)
download: linux-201af4c0fab02876ef0311e7f7b4083aa138930c.tar.gz
linux-201af4c0fab02876ef0311e7f7b4083aa138930c.tar.bz2
linux-201af4c0fab02876ef0311e7f7b4083aa138930c.zip
1 files changed, 6705 insertions, 0 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
new file mode 100644
index 000000000000..1a815f275849
--- /dev/null
+++ b/kernel/cgroup/cgroup.c
@@ -0,0 +1,6705 @@
+/*
+ *  Generic process-grouping system.
+ *
+ *  Based originally on the cpuset system, extracted by Paul Menage
+ *  Copyright (C) 2006 Google, Inc
+ *
+ *  Notifications support
+ *  Copyright (C) 2009 Nokia Corporation
+ *  Author: Kirill A. Shutemov
+ *
+ *  Copyright notices from the original cpuset code:
+ *  --------------------------------------------------
+ *  Copyright (C) 2003 BULL SA.
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ *
+ *  Portions derived from Patrick Mochel's sysfs code.
+ *  sysfs is Copyright (c) 2001-3 Patrick Mochel
+ *
+ *  2003-10-10 Written by Simon Derr.
+ *  2003-10-22 Updates by Stephen Hemminger.
+ *  2004 May-July Rework by Paul Jackson.
+ *  ---------------------------------------------------
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cgroup.h>
+#include <linux/cred.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/magic.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/string.h>
+#include <linux/sort.h>
+#include <linux/kmod.h>
+#include <linux/delayacct.h>
+#include <linux/cgroupstats.h>
+#include <linux/hashtable.h>
+#include <linux/pid_namespace.h>
+#include <linux/idr.h>
+#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/atomic.h>
+#include <linux/cpuset.h>
+#include <linux/proc_ns.h>
+#include <linux/nsproxy.h>
+#include <linux/file.h>
+#include <net/sock.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/cgroup.h>
+
+/*
+ * pidlists linger the following amount before being destroyed.  The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY	HZ
+
+#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
+					 MAX_CFTYPE_NAME + 2)
+
+/*
+ * cgroup_mutex is the master lock.  Any modification to cgroup or its
+ * hierarchy must be performed while holding it.
+ *
+ * css_set_lock protects task->cgroups pointer, the list of css_set
+ * objects, and the chain of tasks off each css_set.
+ *
+ * These locks are exported if CONFIG_PROVE_RCU so that accessors in
+ * cgroup.h can use them for lockdep annotations.
+ */
+#ifdef CONFIG_PROVE_RCU
+DEFINE_MUTEX(cgroup_mutex);
+DEFINE_SPINLOCK(css_set_lock);
+EXPORT_SYMBOL_GPL(cgroup_mutex);
+EXPORT_SYMBOL_GPL(css_set_lock);
+#else
+static DEFINE_MUTEX(cgroup_mutex);
+static DEFINE_SPINLOCK(css_set_lock);
+#endif
+
+/*
+ * Protects cgroup_idr and css_idr so that IDs can be released without
+ * grabbing cgroup_mutex.
+ */
+static DEFINE_SPINLOCK(cgroup_idr_lock);
+
+/*
+ * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
+ * against file removal/re-creation across css hiding.
+ */
+static DEFINE_SPINLOCK(cgroup_file_kn_lock);
+
+/*
+ * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
+ * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
+
+struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
+#define cgroup_assert_mutex_or_rcu_locked()				\
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
+			   !lockdep_is_held(&cgroup_mutex),		\
+			   "cgroup_mutex or RCU read lock required");
+
+/*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions.  Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
+
+/*
+ * pidlist destructions need to be flushed on cgroup destruction.  Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
+/* generate an array of cgroup subsystem pointers */
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
+static struct cgroup_subsys *cgroup_subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+/* array of cgroup subsystem names */
+#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
+static const char *cgroup_subsys_name[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
+#define SUBSYS(_x)								\
+	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);			\
+	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);			\
+	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);			\
+	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
+#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
+static struct static_key_true *cgroup_subsys_enabled_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
+static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+/*
+ * The default hierarchy, reserved for the subsystems that are otherwise
+ * unattached - it never has more than a single cgroup, and all tasks are
+ * part of that cgroup.
+ */
+struct cgroup_root cgrp_dfl_root;
+EXPORT_SYMBOL_GPL(cgrp_dfl_root);
+
+/*
+ * The default hierarchy always exists but is hidden until mounted for the
+ * first time.  This is for backward compatibility.
+ */
+static bool cgrp_dfl_visible;
+
+/* Controllers blocked by the commandline in v1 */
+static u16 cgroup_no_v1_mask;
+
+/* some controllers are not supported in the default hierarchy */
+static u16 cgrp_dfl_inhibit_ss_mask;
+
+/* some controllers are implicitly enabled on the default hierarchy */
+static unsigned long cgrp_dfl_implicit_ss_mask;
+
+/* The list of hierarchy roots */
+
+static LIST_HEAD(cgroup_roots);
+static int cgroup_root_count;
+
+/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
+static DEFINE_IDR(cgroup_hierarchy_idr);
+
+/*
+ * Assign a monotonically increasing serial number to csses.  It guarantees
+ * cgroups with bigger numbers are newer than those with smaller numbers.
+ * Also, as csses are always appended to the parent's ->children list, it
+ * guarantees that sibling csses are always sorted in the ascending serial
+ * number order on the list.  Protected by cgroup_mutex.
+ */
+static u64 css_serial_nr_next = 1;
+
+/*
+ * These bitmask flags indicate whether tasks in the fork and exit paths have
+ * fork/exit handlers to call. This avoids us having to do extra work in the
+ * fork/exit path to check which subsystems have fork/exit callbacks.
+ */
+static u16 have_fork_callback __read_mostly;
+static u16 have_exit_callback __read_mostly;
+static u16 have_free_callback __read_mostly;
+
+/* cgroup namespace for init task */
+struct cgroup_namespace init_cgroup_ns = {
+	.count		= { .counter = 2, },
+	.user_ns	= &init_user_ns,
+	.ns.ops		= &cgroupns_operations,
+	.ns.inum	= PROC_CGROUP_INIT_INO,
+	.root_cset	= &init_css_set,
+};
+
+/* Ditto for the can_fork callback. */
+static u16 have_canfork_callback __read_mostly;
+
+static struct file_system_type cgroup2_fs_type;
+static struct cftype cgroup_dfl_base_files[];
+static struct cftype cgroup_legacy_base_files[];
+
+static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+static int cgroup_apply_control(struct cgroup *cgrp);
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
+static void css_task_iter_advance(struct css_task_iter *it);
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
+					      struct cgroup_subsys *ss);
+static void css_release(struct percpu_ref *ref);
+static void kill_css(struct cgroup_subsys_state *css);
+static int cgroup_addrm_files(struct cgroup_subsys_state *css,
+			      struct cgroup *cgrp, struct cftype cfts[],
+			      bool is_add);
+
+/**
+ * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
+ * @ssid: subsys ID of interest
+ *
+ * cgroup_subsys_enabled() can only be used with literal subsys names which
+ * is fine for individual subsystems but unsuitable for cgroup core.  This
+ * is slower static_key_enabled() based test indexed by @ssid.
+ */
+static bool cgroup_ssid_enabled(int ssid)
+{
+	if (CGROUP_SUBSYS_COUNT == 0)
+		return false;
+
+	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
+}
+
+static bool cgroup_ssid_no_v1(int ssid)
+{
+	return cgroup_no_v1_mask & (1 << ssid);
+}
+
+/**
+ * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
+ * @cgrp: the cgroup of interest
+ *
+ * The default hierarchy is the v2 interface of cgroup and this function
+ * can be used to test whether a cgroup is on the default hierarchy for
+ * cases where a subsystem should behave differnetly depending on the
+ * interface version.
+ *
+ * The set of behaviors which change on the default hierarchy are still
+ * being determined and the mount option is prefixed with __DEVEL__.
+ *
+ * List of changed behaviors:
+ *
+ * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
+ *   and "name" are disallowed.
+ *
+ * - When mounting an existing superblock, mount options should match.
+ *
+ * - Remount is disallowed.
+ *
+ * - rename(2) is disallowed.
+ *
+ * - "tasks" is removed.  Everything should be at process granularity.  Use
+ *   "cgroup.procs" instead.
+ *
+ * - "cgroup.procs" is not sorted.  pids will be unique unless they got
+ *   recycled inbetween reads.
+ *
+ * - "release_agent" and "notify_on_release" are removed.  Replacement
+ *   notification mechanism will be implemented.
+ *
+ * - "cgroup.clone_children" is removed.
+ *
+ * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
+ *   and its descendants contain no task; otherwise, 1.  The file also
+ *   generates kernfs notification which can be monitored through poll and
+ *   [di]notify when the value of the file changes.
+ *
+ * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
+ *   take masks of ancestors with non-empty cpus/mems, instead of being
+ *   moved to an ancestor.
+ *
+ * - cpuset: a task can be moved into an empty cpuset, and again it takes
+ *   masks of ancestors.
+ *
+ * - memcg: use_hierarchy is on by default and the cgroup file for the flag
+ *   is not created.
+ *
+ * - blkcg: blk-throttle becomes properly hierarchical.
+ *
+ * - debug: disallowed on the default hierarchy.
+ */
+static bool cgroup_on_dfl(const struct cgroup *cgrp)
+{
+	return cgrp->root == &cgrp_dfl_root;
+}
+
+/* IDR wrappers which synchronize using cgroup_idr_lock */
+static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
+			    gfp_t gfp_mask)
+{
+	int ret;
+
+	idr_preload(gfp_mask);
+	spin_lock_bh(&cgroup_idr_lock);
+	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
+	spin_unlock_bh(&cgroup_idr_lock);
+	idr_preload_end();
+	return ret;
+}
+
+static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
+{
+	void *ret;
+
+	spin_lock_bh(&cgroup_idr_lock);
+	ret = idr_replace(idr, ptr, id);
+	spin_unlock_bh(&cgroup_idr_lock);
+	return ret;
+}
+
+static void cgroup_idr_remove(struct idr *idr, int id)
+{
+	spin_lock_bh(&cgroup_idr_lock);
+	idr_remove(idr, id);
+	spin_unlock_bh(&cgroup_idr_lock);
+}
+
+static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+	struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+
+	if (parent_css)
+		return container_of(parent_css, struct cgroup, self);
+	return NULL;
+}
+
+/* subsystems visibly enabled on a cgroup */
+static u16 cgroup_control(struct cgroup *cgrp)
+{
+	struct cgroup *parent = cgroup_parent(cgrp);
+	u16 root_ss_mask = cgrp->root->subsys_mask;
+
+	if (parent)
+		return parent->subtree_control;
+
+	if (cgroup_on_dfl(cgrp))
+		root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
+				  cgrp_dfl_implicit_ss_mask);
+	return root_ss_mask;
+}
+
+/* subsystems enabled on a cgroup */
+static u16 cgroup_ss_mask(struct cgroup *cgrp)
+{
+	struct cgroup *parent = cgroup_parent(cgrp);
+
+	if (parent)
+		return parent->subtree_ss_mask;
+
+	return cgrp->root->subsys_mask;
+}
+
+/**
+ * cgroup_css - obtain a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
+ * function must be called either under cgroup_mutex or rcu_read_lock() and
+ * the caller is responsible for pinning the returned css if it wants to
+ * keep accessing it outside the said locks.  This function may return
+ * %NULL if @cgrp doesn't have @subsys_id enabled.
+ */
+static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+					      struct cgroup_subsys *ss)
+{
+	if (ss)
+		return rcu_dereference_check(cgrp->subsys[ss->id],
+					lockdep_is_held(&cgroup_mutex));
+	else
+		return &cgrp->self;
+}
+
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Similar to cgroup_css() but returns the effective css, which is defined
+ * as the matching css of the nearest ancestor including self which has @ss
+ * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
+ * function is guaranteed to return non-NULL css.
+ */
+static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+						struct cgroup_subsys *ss)
+{
+	lockdep_assert_held(&cgroup_mutex);
+
+	if (!ss)
+		return &cgrp->self;
+
+	/*
+	 * This function is used while updating css associations and thus
+	 * can't test the csses directly.  Test ss_mask.
+	 */
+	while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
+		cgrp = cgroup_parent(cgrp);
+		if (!cgrp)
+			return NULL;
+	}
+
+	return cgroup_css(cgrp, ss);
+}
+
+/**
+ * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss.  The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ * The returned css must be put using css_put().
+ */
+struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
+					     struct cgroup_subsys *ss)
+{
+	struct cgroup_subsys_state *css;
+
+	rcu_read_lock();
+
+	do {
+		css = cgroup_css(cgrp, ss);
+
+		if (css && css_tryget_online(css))
+			goto out_unlock;
+		cgrp = cgroup_parent(cgrp);
+	} while (cgrp);
+
+	css = init_css_set.subsys[ss->id];
+	css_get(css);
+out_unlock:
+	rcu_read_unlock();
+	return css;
+}
+
+/* convenient tests for these bits */
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
+{
+	return !(cgrp->self.flags & CSS_ONLINE);
+}
+
+static void cgroup_get(struct cgroup *cgrp)
+{
+	WARN_ON_ONCE(cgroup_is_dead(cgrp));
+	css_get(&cgrp->self);
+}
+
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+	return css_tryget(&cgrp->self);
+}
+
+struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
+{
+	struct cgroup *cgrp = of->kn->parent->priv;
+	struct cftype *cft = of_cft(of);
+
+	/*
+	 * This is open and unprotected implementation of cgroup_css().
+	 * seq_css() is only called from a kernfs file operation which has
+	 * an active reference on the file.  Because all the subsystem
+	 * files are drained before a css is disassociated with a cgroup,
+	 * the matching css from the cgroup's subsys table is guaranteed to
+	 * be and stay valid until the enclosing operation is complete.
+	 */
+	if (cft->ss)
+		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
+	else
+		return &cgrp->self;
+}
+EXPORT_SYMBOL_GPL(of_css);
+
+static int notify_on_release(const struct cgroup *cgrp)
+{
+	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+}
+
+/**
+ * for_each_css - iterate all css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
+ */
+#define for_each_css(css, ssid, cgrp)					\
+	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
+		if (!((css) = rcu_dereference_check(			\
+				(cgrp)->subsys[(ssid)],			\
+				lockdep_is_held(&cgroup_mutex)))) { }	\
+		else
+
+/**
+ * for_each_e_css - iterate all effective css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
+ */
+#define for_each_e_css(css, ssid, cgrp)					\
+	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
+		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
+			;						\
+		else
+
+/**
+ * for_each_subsys - iterate all enabled cgroup subsystems
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ */
+#define for_each_subsys(ss, ssid)					\
+	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
+	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+
+/**
+ * do_each_subsys_mask - filter for_each_subsys with a bitmask
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * @ss_mask: the bitmask
+ *
+ * The block will only run for cases where the ssid-th bit (1 << ssid) of
+ * @ss_mask is set.
+ */
+#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
+	unsigned long __ss_mask = (ss_mask);				\
+	if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */	\
+		(ssid) = 0;						\
+		break;							\
+	}								\
+	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
+		(ss) = cgroup_subsys[ssid];				\
+		{
+
+#define while_each_subsys_mask()					\
+		}							\
+	}								\
+} while (false)
+
+/* iterate across the hierarchies */
+#define for_each_root(root)						\
+	list_for_each_entry((root), &cgroup_roots, root_list)
+
+/* iterate over child cgrps, lock should be held throughout iteration */
+#define cgroup_for_each_live_child(child, cgrp)				\
+	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
+		if (({ lockdep_assert_held(&cgroup_mutex);		\
+		       cgroup_is_dead(child); }))			\
+			;						\
+		else
+
+/* walk live descendants in preorder */
+#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
+	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
+		if (({ lockdep_assert_held(&cgroup_mutex);		\
+		       (dsct) = (d_css)->cgroup;			\
+		       cgroup_is_dead(dsct); }))			\
+			;						\
+		else
+
+/* walk live descendants in postorder */
+#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)		\
+	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))	\
+		if (({ lockdep_assert_held(&cgroup_mutex);		\
+		       (dsct) = (d_css)->cgroup;			\
+		       cgroup_is_dead(dsct); }))			\
+			;						\
+		else
+
+static void cgroup_release_agent(struct work_struct *work);
+static void check_for_release(struct cgroup *cgrp);
+
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies.  In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+	/* the cgroup and css_set this link associates */
+	struct cgroup		*cgrp;
+	struct css_set		*cset;
+
+	/* list of cgrp_cset_links anchored at cgrp->cset_links */
+	struct list_head	cset_link;
+
+	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
+	struct list_head	cgrp_link;
+};
+
+/*
+ * The default css_set - used by init and its children prior to any
+ * hierarchies being mounted. It contains a pointer to the root state
+ * for each subsystem. Also used to anchor the list of css_sets. Not
+ * reference-counted, to improve performance when child cgroups
+ * haven't been created.
+ */
+struct css_set init_css_set = {
+	.refcount		= ATOMIC_INIT(1),
+	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
+	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
+	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
+	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
+	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
+	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
+};
+
+static int css_set_count	= 1;	/* 1 for init_css_set */
+
+/**
+ * css_set_populated - does a css_set contain any tasks?
+ * @cset: target css_set
+ */
+static bool css_set_populated(struct css_set *cset)
+{
+	lockdep_assert_held(&css_set_lock);
+
+	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
+}
+
+/**
+ * cgroup_update_populated - updated populated count of a cgroup
+ * @cgrp: the target cgroup
+ * @populated: inc or dec populated count
+ *
+ * One of the css_sets associated with @cgrp is either getting its first
+ * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
+ * count is propagated towards root so that a given cgroup's populated_cnt
+ * is zero iff the cgroup and all its descendants don't contain any tasks.
+ *
+ * @cgrp's interface file "cgroup.populated" is zero if
+ * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
+ * changes from or to zero, userland is notified that the content of the
+ * interface file has changed.  This can be used to detect when @cgrp and
+ * its descendants become populated or empty.
+ */
+static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
+{
+	lockdep_assert_held(&css_set_lock);
+
+	do {
+		bool trigger;
+
+		if (populated)
+			trigger = !cgrp->populated_cnt++;
+		else
+			trigger = !--cgrp->populated_cnt;
+
+		if (!trigger)
+			break;
+
+		check_for_release(cgrp);
+		cgroup_file_notify(&cgrp->events_file);
+
+		cgrp = cgroup_parent(cgrp);
+	} while (cgrp);
+}
+
+/**
+ * css_set_update_populated - update populated state of a css_set
+ * @cset: target css_set
+ * @populated: whether @cset is populated or depopulated
+ *
+ * @cset is either getting the first task or losing the last.  Update the
+ * ->populated_cnt of all associated cgroups accordingly.
+ */
+static void css_set_update_populated(struct css_set *cset, bool populated)
+{
+	struct cgrp_cset_link *link;
+
+	lockdep_assert_held(&css_set_lock);
+
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
+		cgroup_update_populated(link->cgrp, populated);
+}
+
+/**
+ * css_set_move_task - move a task from one css_set to another
+ * @task: task being moved
+ * @from_cset: css_set @task currently belongs to (may be NULL)
+ * @to_cset: new css_set @task is being moved to (may be NULL)
+ * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
+ *
+ * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
+ * css_set, @from_cset can be NULL.  If @task is being disassociated
+ * instead of moved, @to_cset can be NULL.
+ *
+ * This function automatically handles populated_cnt updates and
+ * css_task_iter adjustments but the caller is responsible for managing
+ * @from_cset and @to_cset's reference counts.
+ */
+static void css_set_move_task(struct task_struct *task,
+			      struct css_set *from_cset, struct css_set *to_cset,
+			      bool use_mg_tasks)
+{
+	lockdep_assert_held(&css_set_lock);
+
+	if (to_cset && !css_set_populated(to_cset))
+		css_set_update_populated(to_cset, true);
+
+	if (from_cset) {
+		struct css_task_iter *it, *pos;
+
+		WARN_ON_ONCE(list_empty(&task->cg_list));
+
+		/*
+		 * @task is leaving, advance task iterators which are
+		 * pointing to it so that they can resume at the next
+		 * position.  Advancing an iterator might remove it from
+		 * the list, use safe walk.  See css_task_iter_advance*()
+		 * for details.
+		 */
+		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
+					 iters_node)
+			if (it->task_pos == &task->cg_list)
+				css_task_iter_advance(it);
+
+		list_del_init(&task->cg_list);
+		if (!css_set_populated(from_cset))
+			css_set_update_populated(from_cset, false);
+	} else {
+		WARN_ON_ONCE(!list_empty(&task->cg_list));
+	}
+
+	if (to_cset) {
+		/*
+		 * We are synchronized through cgroup_threadgroup_rwsem
+		 * against PF_EXITING setting such that we can't race
+		 * against cgroup_exit() changing the css_set to
+		 * init_css_set and dropping the old one.
+		 */
+		WARN_ON_ONCE(task->flags & PF_EXITING);
+
+		rcu_assign_pointer(task->cgroups, to_cset);
+		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
+							     &to_cset->tasks);
+	}
+}
+
+/*
+ * hash table for cgroup groups. This improves the performance to find
+ * an existing css_set. This hash doesn't (currently) take into
+ * account cgroups in empty hierarchies.
+ */
+#define CSS_SET_HASH_BITS	7
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
+
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
+{
+	unsigned long key = 0UL;
+	struct cgroup_subsys *ss;
+	int i;
+
+	for_each_subsys(ss, i)
+		key += (unsigned long)css[i];
+	key = (key >> 16) ^ key;
+
+	return key;
+}
+
+static void put_css_set_locked(struct css_set *cset)
+{
+	struct cgrp_cset_link *link, *tmp_link;
+	struct cgroup_subsys *ss;
+	int ssid;
+
+	lockdep_assert_held(&css_set_lock);
+
+	if (!atomic_dec_and_test(&cset->refcount))
+		return;
+
+	/* This css_set is dead. unlink it and release cgroup and css refs */
+	for_each_subsys(ss, ssid) {
+		list_del(&cset->e_cset_node[ssid]);
+		css_put(cset->subsys[ssid]);
+	}
+	hash_del(&cset->hlist);
+	css_set_count--;
+
+	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
+		list_del(&link->cset_link);
+		list_del(&link->cgrp_link);
+		if (cgroup_parent(link->cgrp))
+			cgroup_put(link->cgrp);
+		kfree(link);
+	}
+
+	kfree_rcu(cset, rcu_head);
+}
+
+static void put_css_set(struct css_set *cset)
+{
+	unsigned long flags;
+
+	/*
+	 * Ensure that the refcount doesn't hit zero while any readers
+	 * can see it. Similar to atomic_dec_and_lock(), but for an
+	 * rwlock
+	 */
+	if (atomic_add_unless(&cset->refcount, -1, 1))
+		return;
+
+	spin_lock_irqsave(&css_set_lock, flags);
+	put_css_set_locked(cset);
+	spin_unlock_irqrestore(&css_set_lock, flags);
+}
+
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cset)
+{
+	atomic_inc(&cset->refcount);
+}
+
+/**
+ * compare_css_sets - helper function for find_existing_css_set().
+ * @cset: candidate css_set being tested
+ * @old_cset: existing css_set for a task
+ * @new_cgrp: cgroup that's being entered by the task
+ * @template: desired set of css pointers in css_set (pre-calculated)
+ *
+ * Returns true if "cset" matches "old_cset" except for the hierarchy
+ * which "new_cgrp" belongs to, for which it should match "new_cgrp".
+ */
+static bool compare_css_sets(struct css_set *cset,
+			     struct css_set *old_cset,
+			     struct cgroup *new_cgrp,
+			     struct cgroup_subsys_state *template[])
+{
+	struct list_head *l1, *l2;
+
+	/*
+	 * On the default hierarchy, there can be csets which are
+	 * associated with the same set of cgroups but different csses.
+	 * Let's first ensure that csses match.
+	 */
+	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
+		return false;
+
+	/*
+	 * Compare cgroup pointers in order to distinguish between
+	 * different cgroups in hierarchies.  As different cgroups may
+	 * share the same effective css, this comparison is always
+	 * necessary.
+	 */
+	l1 = &cset->cgrp_links;
+	l2 = &old_cset->cgrp_links;
+	while (1) {
+		struct cgrp_cset_link *link1, *link2;
+		struct cgroup *cgrp1, *cgrp2;
+
+		l1 = l1->next;
+		l2 = l2->next;
+		/* See if we reached the end - both lists are equal length. */
+		if (l1 == &cset->cgrp_links) {
+			BUG_ON(l2 != &old_cset->cgrp_links);
+			break;
+		} else {
+			BUG_ON(l2 == &old_cset->cgrp_links);
+		}
+		/* Locate the cgroups associated with these links. */
+		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
+		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
+		cgrp1 = link1->cgrp;
+		cgrp2 = link2->cgrp;
+		/* Hierarchies should be linked in the same order. */
+		BUG_ON(cgrp1->root != cgrp2->root);
+
+		/*
+		 * If this hierarchy is the hierarchy of the cgroup
+		 * that's changing, then we need to check that this
+		 * css_set points to the new cgroup; if it's any other
+		 * hierarchy, then this css_set should point to the
+		 * same cgroup as the old css_set.
+		 */
+		if (cgrp1->root == new_cgrp->root) {
+			if (cgrp1 != new_cgrp)
+				return false;
+		} else {
+			if (cgrp1 != cgrp2)
+				return false;
+		}
+	}
+	return true;
+}
+
+/**
+ * find_existing_css_set - init css array and find the matching css_set
+ * @old_cset: the css_set that we're using before the cgroup transition
+ * @cgrp: the cgroup that we're moving into
+ * @template: out param for the new set of csses, should be clear on entry
+ */
+static struct css_set *find_existing_css_set(struct css_set *old_cset,
+					struct cgroup *cgrp,
+					struct cgroup_subsys_state *template[])
+{
+	struct cgroup_root *root = cgrp->root;
+	struct cgroup_subsys *ss;
+	struct css_set *cset;
+	unsigned long key;
+	int i;
+
+	/*
+	 * Build the set of subsystem state objects that we want to see in the
+	 * new css_set. while subsystems can change globally, the entries here
+	 * won't change, so no need for locking.
+	 */
+	for_each_subsys(ss, i) {
+		if (root->subsys_mask & (1UL << i)) {
+			/*
+			 * @ss is in this hierarchy, so we want the
+			 * effective css from @cgrp.
+			 */
+			template[i] = cgroup_e_css(cgrp, ss);
+		} else {
+			/*
+			 * @ss is not in this hierarchy, so we don't want
+			 * to change the css.
+			 */
+			template[i] = old_cset->subsys[i];
+		}
+	}
+
+	key = css_set_hash(template);
+	hash_for_each_possible(css_set_table, cset, hlist, key) {
+		if (!compare_css_sets(cset, old_cset, cgrp, template))
+			continue;
+
+		/* This css_set matches what we need */
+		return cset;
+	}
+
+	/* No existing cgroup group matched */
+	return NULL;
+}
+
+static void free_cgrp_cset_links(struct list_head *links_to_free)
+{
+	struct cgrp_cset_link *link, *tmp_link;
+
+	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
+		list_del(&link->cset_link);
+		kfree(link);
+	}
+}
+
+/**
+ * allocate_cgrp_cset_links - allocate cgrp_cset_links
+ * @count: the number of links to allocate
+ * @tmp_links: list_head the allocated links are put on
+ *
+ * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
+ * through ->cset_link.  Returns 0 on success or -errno.
+ */
+static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
+{
+	struct cgrp_cset_link *link;
+	int i;
+
+	INIT_LIST_HEAD(tmp_links);
+
+	for (i = 0; i < count; i++) {
+		link = kzalloc(sizeof(*link), GFP_KERNEL);
+		if (!link) {
+			free_cgrp_cset_links(tmp_links);
+			return -ENOMEM;
+		}
+		list_add(&link->cset_link, tmp_links);
+	}
+	return 0;
+}
+
+/**
+ * link_css_set - a helper function to link a css_set to a cgroup
+ * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
+ * @cset: the css_set to be linked
+ * @cgrp: the destination cgroup
+ */
+static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
+			 struct cgroup *cgrp)
+{
+	struct cgrp_cset_link *link;
+
+	BUG_ON(list_empty(tmp_links));
+
+	if (cgroup_on_dfl(cgrp))
+		cset->dfl_cgrp = cgrp;
+
+	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
+	link->cset = cset;
+	link->cgrp = cgrp;
+
+	/*
+	 * Always add links to the tail of the lists so that the lists are
+	 * in choronological order.
+	 */
+	list_move_tail(&link->cset_link, &cgrp->cset_links);
+	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
+
+	if (cgroup_parent(cgrp))
author	Tejun Heo <tj@kernel.org>	2016-12-27 14:49:05 -0500
committer	Tejun Heo <tj@kernel.org>	2016-12-27 14:49:05 -0500
commit	201af4c0fab02876ef0311e7f7b4083aa138930c (patch)
tree	d79bfd5f25c5fd111a79993bc10fc129281db225 /kernel/cgroup/cgroup.c
parent	5f617ebbdf10abd49312a89e3b894b927c7367f5 (diff)
download	linux-201af4c0fab02876ef0311e7f7b4083aa138930c.tar.gz linux-201af4c0fab02876ef0311e7f7b4083aa138930c.tar.bz2 linux-201af4c0fab02876ef0311e7f7b4083aa138930c.zip