summaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-12-10 18:34:42 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2014-12-10 18:34:42 -0800
commitb6da0076bab5a12afb19312ffee41c95490af2a0 (patch)
tree52a5675b9c2ff95d88b981d5b9a3822f6073c112 /mm/memcontrol.c
parentcbfe0de303a55ed96d8831c2d5f56f8131cd6612 (diff)
parenta53b831549141aa060a8b54b76e3a42870d74cc0 (diff)
downloadlinux-b6da0076bab5a12afb19312ffee41c95490af2a0.tar.gz
linux-b6da0076bab5a12afb19312ffee41c95490af2a0.tar.bz2
linux-b6da0076bab5a12afb19312ffee41c95490af2a0.zip
Merge branch 'akpm' (patchbomb from Andrew)
Merge first patchbomb from Andrew Morton: - a few minor cifs fixes - dma-debug upadtes - ocfs2 - slab - about half of MM - procfs - kernel/exit.c - panic.c tweaks - printk upates - lib/ updates - checkpatch updates - fs/binfmt updates - the drivers/rtc tree - nilfs - kmod fixes - more kernel/exit.c - various other misc tweaks and fixes * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (190 commits) exit: pidns: fix/update the comments in zap_pid_ns_processes() exit: pidns: alloc_pid() leaks pid_namespace if child_reaper is exiting exit: exit_notify: re-use "dead" list to autoreap current exit: reparent: call forget_original_parent() under tasklist_lock exit: reparent: avoid find_new_reaper() if no children exit: reparent: introduce find_alive_thread() exit: reparent: introduce find_child_reaper() exit: reparent: document the ->has_child_subreaper checks exit: reparent: s/while_each_thread/for_each_thread/ in find_new_reaper() exit: reparent: fix the cross-namespace PR_SET_CHILD_SUBREAPER reparenting exit: reparent: fix the dead-parent PR_SET_CHILD_SUBREAPER reparenting exit: proc: don't try to flush /proc/tgid/task/tgid exit: release_task: fix the comment about group leader accounting exit: wait: drop tasklist_lock before psig->c* accounting exit: wait: don't use zombie->real_parent exit: wait: cleanup the ptrace_reparented() checks usermodehelper: kill the kmod_thread_locker logic usermodehelper: don't use CLONE_VFORK for ____call_usermodehelper() fs/hfs/catalog.c: fix comparison bug in hfs_cat_keycmp nilfs2: fix the nilfs_iget() vs. nilfs_new_inode() races ...
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c1706
1 files changed, 505 insertions, 1201 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ee48428cf8e3..85df503ec023 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,7 +25,7 @@
* GNU General Public License for more details.
*/
-#include <linux/res_counter.h>
+#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
@@ -51,7 +51,7 @@
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
-#include <linux/page_cgroup.h>
+#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
@@ -143,14 +143,8 @@ struct mem_cgroup_stat_cpu {
unsigned long targets[MEM_CGROUP_NTARGETS];
};
-struct mem_cgroup_reclaim_iter {
- /*
- * last scanned hierarchy member. Valid only if last_dead_count
- * matches memcg->dead_count of the hierarchy root group.
- */
- struct mem_cgroup *last_visited;
- int last_dead_count;
-
+struct reclaim_iter {
+ struct mem_cgroup *position;
/* scan generation, increased every round-trip */
unsigned int generation;
};
@@ -162,10 +156,10 @@ struct mem_cgroup_per_zone {
struct lruvec lruvec;
unsigned long lru_size[NR_LRU_LISTS];
- struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+ struct reclaim_iter iter[DEF_PRIORITY + 1];
struct rb_node tree_node; /* RB tree node */
- unsigned long long usage_in_excess;/* Set to the value by which */
+ unsigned long usage_in_excess;/* Set to the value by which */
/* the soft limit is exceeded*/
bool on_tree;
struct mem_cgroup *memcg; /* Back pointer, we cannot */
@@ -198,7 +192,7 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly;
struct mem_cgroup_threshold {
struct eventfd_ctx *eventfd;
- u64 threshold;
+ unsigned long threshold;
};
/* For threshold */
@@ -284,10 +278,13 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
*/
struct mem_cgroup {
struct cgroup_subsys_state css;
- /*
- * the counter to account for memory usage
- */
- struct res_counter res;
+
+ /* Accounted resources */
+ struct page_counter memory;
+ struct page_counter memsw;
+ struct page_counter kmem;
+
+ unsigned long soft_limit;
/* vmpressure notifications */
struct vmpressure vmpressure;
@@ -296,15 +293,6 @@ struct mem_cgroup {
int initialized;
/*
- * the counter to account for mem+swap usage.
- */
- struct res_counter memsw;
-
- /*
- * the counter to account for kernel memory usage.
- */
- struct res_counter kmem;
- /*
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
@@ -352,7 +340,6 @@ struct mem_cgroup {
struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
- atomic_t dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct cg_proto tcp_mem;
#endif
@@ -382,7 +369,6 @@ struct mem_cgroup {
/* internal only representation about the status of kmem accounting. */
enum {
KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
- KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};
#ifdef CONFIG_MEMCG_KMEM
@@ -396,22 +382,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}
-static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
-{
- /*
- * Our caller must use css_get() first, because memcg_uncharge_kmem()
- * will call css_put() if it sees the memcg is dead.
- */
- smp_wmb();
- if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
- set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
-}
-
-static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
-{
- return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
- &memcg->kmem_account_flags);
-}
#endif
/* Stuffs for move charges at task migration. */
@@ -650,7 +620,7 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg)
* This check can't live in kmem destruction function,
* since the charges will outlive the cgroup
*/
- WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
+ WARN_ON(page_counter_read(&memcg->kmem));
}
#else
static void disarm_kmem_keys(struct mem_cgroup *memcg)
@@ -664,8 +634,6 @@ static void disarm_static_keys(struct mem_cgroup *memcg)
disarm_kmem_keys(memcg);
}
-static void drain_all_stock_async(struct mem_cgroup *memcg);
-
static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
{
@@ -706,7 +674,7 @@ soft_limit_tree_from_page(struct page *page)
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz,
- unsigned long long new_usage_in_excess)
+ unsigned long new_usage_in_excess)
{
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
@@ -755,10 +723,21 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
spin_unlock_irqrestore(&mctz->lock, flags);
}
+static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
+{
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
+ unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit);
+ unsigned long excess = 0;
+
+ if (nr_pages > soft_limit)
+ excess = nr_pages - soft_limit;
+
+ return excess;
+}
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
- unsigned long long excess;
+ unsigned long excess;
struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;
@@ -769,7 +748,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
*/
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
mz = mem_cgroup_page_zoneinfo(memcg, page);
- excess = res_counter_soft_limit_excess(&memcg->res);
+ excess = soft_limit_excess(memcg);
/*
* We have to update the tree if mz is on RB-tree or
* mem is over its softlimit.
@@ -825,7 +804,7 @@ retry:
* position in the tree.
*/
__mem_cgroup_remove_exceeded(mz, mctz);
- if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+ if (!soft_limit_excess(mz->memcg) ||
!css_tryget_online(&mz->memcg->css))
goto retry;
done:
@@ -1062,122 +1041,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
return memcg;
}
-/*
- * Returns a next (in a pre-order walk) alive memcg (with elevated css
- * ref. count) or NULL if the whole root's subtree has been visited.
- *
- * helper function to be used by mem_cgroup_iter
- */
-static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
- struct mem_cgroup *last_visited)
-{
- struct cgroup_subsys_state *prev_css, *next_css;
-
- prev_css = last_visited ? &last_visited->css : NULL;
-skip_node:
- next_css = css_next_descendant_pre(prev_css, &root->css);
-
- /*
- * Even if we found a group we have to make sure it is
- * alive. css && !memcg means that the groups should be
- * skipped and we should continue the tree walk.
- * last_visited css is safe to use because it is
- * protected by css_get and the tree walk is rcu safe.
- *
- * We do not take a reference on the root of the tree walk
- * because we might race with the root removal when it would
- * be the only node in the iterated hierarchy and mem_cgroup_iter
- * would end up in an endless loop because it expects that at
- * least one valid node will be returned. Root cannot disappear
- * because caller of the iterator should hold it already so
- * skipping css reference should be safe.
- */
- if (next_css) {
- struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);
-
- if (next_css == &root->css)
- return memcg;
-
- if (css_tryget_online(next_css)) {
- /*
- * Make sure the memcg is initialized:
- * mem_cgroup_css_online() orders the the
- * initialization against setting the flag.
- */
- if (smp_load_acquire(&memcg->initialized))
- return memcg;
- css_put(next_css);
- }
-
- prev_css = next_css;
- goto skip_node;
- }
-
- return NULL;
-}
-
-static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
-{
- /*
- * When a group in the hierarchy below root is destroyed, the
- * hierarchy iterator can no longer be trusted since it might
- * have pointed to the destroyed group. Invalidate it.
- */
- atomic_inc(&root->dead_count);
-}
-
-static struct mem_cgroup *
-mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
- struct mem_cgroup *root,
- int *sequence)
-{
- struct mem_cgroup *position = NULL;
- /*
- * A cgroup destruction happens in two stages: offlining and
- * release. They are separated by a RCU grace period.
- *
- * If the iterator is valid, we may still race with an
- * offlining. The RCU lock ensures the object won't be
- * released, tryget will fail if we lost the race.
- */
- *sequence = atomic_read(&root->dead_count);
- if (iter->last_dead_count == *sequence) {
- smp_rmb();
- position = iter->last_visited;
-
- /*
- * We cannot take a reference to root because we might race
- * with root removal and returning NULL would end up in
- * an endless loop on the iterator user level when root
- * would be returned all the time.
- */
- if (position && position != root &&
- !css_tryget_online(&position->css))
- position = NULL;
- }
- return position;
-}
-
-static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
- struct mem_cgroup *last_visited,
- struct mem_cgroup *new_position,
- struct mem_cgroup *root,
- int sequence)
-{
- /* root reference counting symmetric to mem_cgroup_iter_load */
- if (last_visited && last_visited != root)
- css_put(&last_visited->css);
- /*
- * We store the sequence count from the time @last_visited was
- * loaded successfully instead of rereading it here so that we
- * don't lose destruction events in between. We could have
- * raced with the destruction of @new_position after all.
- */
- iter->last_visited = new_position;
- smp_wmb();
- iter->last_dead_count = sequence;
-}
-
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
@@ -1199,8 +1062,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
{
+ struct reclaim_iter *uninitialized_var(iter);
+ struct cgroup_subsys_state *css = NULL;
struct mem_cgroup *memcg = NULL;
- struct mem_cgroup *last_visited = NULL;
+ struct mem_cgroup *pos = NULL;
if (mem_cgroup_disabled())
return NULL;
@@ -1209,50 +1074,101 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
root = root_mem_cgroup;
if (prev && !reclaim)
- last_visited = prev;
+ pos = prev;
if (!root->use_hierarchy && root != root_mem_cgroup) {
if (prev)
- goto out_css_put;
+ goto out;
return root;
}
rcu_read_lock();
- while (!memcg) {
- struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
- int uninitialized_var(seq);
-
- if (reclaim) {
- struct mem_cgroup_per_zone *mz;
-
- mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
- iter = &mz->reclaim_iter[reclaim->priority];
- if (prev && reclaim->generation != iter->generation) {
- iter->last_visited = NULL;
- goto out_unlock;
- }
- last_visited = mem_cgroup_iter_load(iter, root, &seq);
+ if (reclaim) {
+ struct mem_cgroup_per_zone *mz;
+
+ mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+ iter = &mz->iter[reclaim->priority];
+
+ if (prev && reclaim->generation != iter->generation)
+ goto out_unlock;
+
+ do {
+ pos = ACCESS_ONCE(iter->position);
+ /*
+ * A racing update may change the position and
+ * put the last reference, hence css_tryget(),
+ * or retry to see the updated position.
+ */
+ } while (pos && !css_tryget(&pos->css));
+ }
+
+ if (pos)
+ css = &pos->css;
+
+ for (;;) {
+ css = css_next_descendant_pre(css, &root->css);
+ if (!css) {
+ /*
+ * Reclaimers share the hierarchy walk, and a
+ * new one might jump in right at the end of
+ * the hierarchy - make sure they see at least
+ * one group and restart from the beginning.
+ */
+ if (!prev)
+ continue;
+ break;
}
- memcg = __mem_cgroup_iter_next(root, last_visited);
+ /*
+ * Verify the css and acquire a reference. The root
+ * is provided by the caller, so we know it's alive
+ * and kicking, and don't take an extra reference.
+ */
+ memcg = mem_cgroup_from_css(css);
+
+ if (css == &root->css)
+ break;
- if (reclaim) {
- mem_cgroup_iter_update(iter, last_visited, memcg, root,
- seq);
+ if (css_tryget(css)) {
+ /*
+ * Make sure the memcg is initialized:
+ * mem_cgroup_css_online() orders the the
+ * initialization against setting the flag.
+ */
+ if (smp_load_acquire(&memcg->initialized))
+ break;
- if (!memcg)
- iter->generation++;
- else if (!prev && memcg)
- reclaim->generation = iter->generation;
+ css_put(css);
}
- if (prev && !memcg)
- goto out_unlock;
+ memcg = NULL;
+ }
+
+ if (reclaim) {
+ if (cmpxchg(&iter->position, pos, memcg) == pos) {
+ if (memcg)
+ css_get(&memcg->css);
+ if (pos)
+ css_put(&pos->css);
+ }
+
+ /*
+ * pairs with css_tryget when dereferencing iter->position
+ * above.
+ */
+ if (pos)
+ css_put(&pos->css);
+
+ if (!memcg)
+ iter->generation++;
+ else if (!prev)
+ reclaim->generation = iter->generation;
}
+
out_unlock:
rcu_read_unlock();
-out_css_put:
+out:
if (prev && prev != root)
css_put(&prev->css);
@@ -1346,15 +1262,18 @@ out:
}
/**
- * mem_cgroup_page_lruvec - return lruvec for adding an lru page
+ * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
* @page: the page
* @zone: zone of the page
+ *
+ * This function is only safe when following the LRU page isolation
+ * and putback protocol: the LRU lock must be held, and the page must
+ * either be PageLRU() or the caller must have isolated/allocated it.
*/
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
struct mem_cgroup_per_zone *mz;
struct mem_cgroup *memcg;
- struct page_cgroup *pc;
struct lruvec *lruvec;
if (mem_cgroup_disabled()) {
@@ -1362,20 +1281,13 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
goto out;
}
- pc = lookup_page_cgroup(page);
- memcg = pc->mem_cgroup;
-
+ memcg = page->mem_cgroup;
/*
- * Surreptitiously switch any uncharged offlist page to root:
- * an uncharged page off lru does nothing to secure
- * its former mem_cgroup from sudden removal.
- *
- * Our caller holds lru_lock, and PageCgroupUsed is updated
- * under page_cgroup lock: between them, they make all uses
- * of pc->mem_cgroup safe.
+ * Swapcache readahead pages are added to the LRU - and
+ * possibly migrated - before they are charged.
*/
- if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
- pc->mem_cgroup = memcg = root_mem_cgroup;
+ if (!memcg)
+ memcg = root_mem_cgroup;
mz = mem_cgroup_page_zoneinfo(memcg, page);
lruvec = &mz->lruvec;
@@ -1414,41 +1326,24 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
VM_BUG_ON((long)(*lru_size) < 0);
}
-/*
- * Checks whether given mem is same or in the root_mem_cgroup's
- * hierarchy subtree
- */
-bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
- struct mem_cgroup *memcg)
+bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
{
- if (root_memcg == memcg)
+ if (root == memcg)
return true;
- if (!root_memcg->use_hierarchy || !memcg)
+ if (!root->use_hierarchy)
return false;
- return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
-}
-
-static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
- struct mem_cgroup *memcg)
-{
- bool ret;
-
- rcu_read_lock();
- ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
- rcu_read_unlock();
- return ret;
+ return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
}
-bool task_in_mem_cgroup(struct task_struct *task,
- const struct mem_cgroup *memcg)
+bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
{
- struct mem_cgroup *curr = NULL;
+ struct mem_cgroup *task_memcg;
struct task_struct *p;
bool ret;
p = find_lock_task_mm(task);
if (p) {
- curr = get_mem_cgroup_from_mm(p->mm);
+ task_memcg = get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
} else {
/*
@@ -1457,19 +1352,12 @@ bool task_in_mem_cgroup(struct task_struct *task,
* killed to prevent needlessly killing additional tasks.
*/
rcu_read_lock();
- curr = mem_cgroup_from_task(task);
- if (curr)
- css_get(&curr->css);
+ task_memcg = mem_cgroup_from_task(task);
+ css_get(&task_memcg->css);
rcu_read_unlock();
}
- /*
- * We should check use_hierarchy of "memcg" not "curr". Because checking
- * use_hierarchy of "curr" here make this function true if hierarchy is
- * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
- * hierarchy(even if use_hierarchy is disabled in "memcg").
- */
- ret = mem_cgroup_same_or_subtree(memcg, curr);
- css_put(&curr->css);
+ ret = mem_cgroup_is_descendant(task_memcg, memcg);
+ css_put(&task_memcg->css);
return ret;
}
@@ -1492,7 +1380,7 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
return inactive * inactive_ratio < active;
}
-#define mem_cgroup_from_res_counter(counter, member) \
+#define mem_cgroup_from_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
/**
@@ -1504,12 +1392,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
*/
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
- unsigned long long margin;
+ unsigned long margin = 0;
+ unsigned long count;
+ unsigned long limit;
- margin = res_counter_margin(&memcg->res);
- if (do_swap_account)
- margin = min(margin, res_counter_margin(&memcg->memsw));
- return margin >> PAGE_SHIFT;
+ count = page_counter_read(&memcg->memory);
+ limit = ACCESS_ONCE(memcg->memory.limit);
+ if (count < limit)
+ margin = limit - count;
+
+ if (do_swap_account) {
+ count = page_counter_read(&memcg->memsw);
+ limit = ACCESS_ONCE(memcg->memsw.limit);
+ if (count <= limit)
+ margin = min(margin, limit - count);
+ }
+
+ return margin;
}
int mem_cgroup_swappiness(struct mem_cgroup *memcg)
@@ -1522,37 +1421,6 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
}
/*
- * memcg->moving_account is used for checking possibility that some thread is
- * calling move_account(). When a thread on CPU-A starts moving pages under
- * a memcg, other threads should check memcg->moving_account under
- * rcu_read_lock(), like this:
- *
- * CPU-A CPU-B
- * rcu_read_lock()
- * memcg->moving_account+1 if (memcg->mocing_account)
- * take heavy locks.
- * synchronize_rcu() update something.
- * rcu_read_unlock()
- * start move here.
- */
-
-static void mem_cgroup_start_move(struct mem_cgroup *memcg)
-{
- atomic_inc(&memcg->moving_account);
- synchronize_rcu();
-}
-
-static void mem_cgroup_end_move(struct mem_cgroup *memcg)
-{
- /*
- * Now, mem_cgroup_clear_mc() may call this function with NULL.
- * We check NULL in callee rather than caller.
- */
- if (memcg)
- atomic_dec(&memcg->moving_account);
-}
-
-/*
* A routine for checking "mem" is under move_account() or not.
*
* Checking a cgroup is mc.from or mc.to or under hierarchy of
@@ -1574,8 +1442,8 @@ static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
if (!from)
goto unlock;
- ret = mem_cgroup_same_or_subtree(memcg, from)
- || mem_cgroup_same_or_subtree(memcg, to);
+ ret = mem_cgroup_is_descendant(from, memcg) ||
+ mem_cgroup_is_descendant(to, memcg);
unlock:
spin_unlock(&mc.lock);
return ret;
@@ -1597,23 +1465,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
return false;
}
-/*
- * Take this lock when
- * - a code tries to modify page's memcg while it's USED.
- * - a code tries to modify page state accounting in a memcg.
- */
-static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
- unsigned long *flags)
-{
- spin_lock_irqsave(&memcg->move_lock, *flags);
-}
-
-static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
- unsigned long *flags)
-{
- spin_unlock_irqrestore(&memcg->move_lock, *flags);
-}
-
#define K(x) ((x) << (PAGE_SHIFT-10))
/**
* mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
@@ -1644,18 +1495,15 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
rcu_read_unlock();
- pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
- res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
- res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
- res_counter_read_u64(&memcg->res, RES_FAILCNT));
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
- res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
- res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
- res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
- res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
- res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
- res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
+ pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->memory)),
+ K((u64)memcg->memory.limit), memcg->memory.failcnt);
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->memsw)),
+ K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->kmem)),
+ K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
for_each_mem_cgroup_tree(iter, memcg) {
pr_info("Memory cgroup stats for ");
@@ -1695,28 +1543,17 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
-static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
- u64 limit;
-
- limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ unsigned long limit;
- /*
- * Do not consider swap space if we cannot swap due to swappiness
- */
+ limit = memcg->memory.limit;
if (mem_cgroup_swappiness(memcg)) {
- u64 memsw;
+ unsigned long memsw_limit;
- limit += total_swap_pages << PAGE_SHIFT;
- memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
-
- /*
- * If memsw is finite and limits the amount of swap space
- * available to this memcg, return that limit.
- */
- limit = min(limit, memsw);
+ memsw_limit = memcg->memsw.limit;
+ limit = min(limit + total_swap_pages, memsw_limit);
}
-
return limit;
}
@@ -1740,7 +1577,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
}
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
- totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
+ totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
struct task_struct *task;
@@ -1880,52 +1717,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
memcg->last_scanned_node = node;
return node;
}
-
-/*
- * Check all nodes whether it contains reclaimable pages or not.
- * For quick scan, we make use of scan_nodes. This will allow us to skip
- * unused nodes. But scan_nodes is lazily updated and may not cotain
- * enough new information. We need to do double check.
- */
-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
- int nid;
-
- /*
- * quick check...making use of scan_node.
- * We can skip unused nodes.
- */
- if (!nodes_empty(memcg->scan_nodes)) {
- for (nid = first_node(memcg->scan_nodes);
- nid < MAX_NUMNODES;
- nid = next_node(nid, memcg->scan_nodes)) {
-
- if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
- return true;
- }
- }
- /*
- * Check rest of nodes.
- */
- for_each_node_state(nid, N_MEMORY) {
- if (node_isset(nid, memcg->scan_nodes))
- continue;
- if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
- return true;
- }
- return false;
-}
-
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
return 0;
}
-
-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
- return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
-}
#endif
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
@@ -1943,7 +1739,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
.priority = 0,
};
- excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+ excess = soft_limit_excess(root_memcg);
while (1) {
victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
@@ -1969,12 +1765,10 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
}
continue;
}
- if (!mem_cgroup_reclaimable(victim, false))
- continue;
total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
zone, &nr_scanned);
*total_scanned += nr_scanned;
- if (!res_counter_soft_limit_excess(&root_memcg->res))
+ if (!soft_limit_excess(root_memcg))
break;
}
mem_cgroup_iter_break(root_memcg, victim);
@@ -2081,12 +1875,8 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
oom_wait_info = container_of(wait, struct oom_wait_info, wait);
oom_wait_memcg = oom_wait_info->memcg;
- /*
- * Both of oom_wait_info->memcg and wake_memcg are stable under us.
- * Then we can use css_is_ancestor without taking care of RCU.
- */
- if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
- && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
+ if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
+ !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
return 0;
return autoremove_wake_function(wait, mode, sync, arg);
}
@@ -2228,26 +2018,23 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
unsigned long *flags)
{
struct mem_cgroup *memcg;
- struct page_cgroup *pc;
rcu_read_lock();
if (mem_cgroup_disabled())
return NULL;
-
- pc = lookup_page_cgroup(page);
again:
- memcg = pc->mem_cgroup;
- if (unlikely(!memcg || !PageCgroupUsed(pc)))
+ memcg = page->mem_cgroup;
+ if (unlikely(!memcg))
return NULL;
*locked = false;
if (atomic_read(&memcg->moving_account) <= 0)
return memcg;
- move_lock_mem_cgroup(memcg, flags);
- if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
- move_unlock_mem_cgroup(memcg, flags);
+ spin_lock_irqsave(&memcg->move_lock, *flags);
+ if (memcg != page->mem_cgroup) {
+ spin_unlock_irqrestore(&memcg->move_lock, *flags);
goto again;
}
*locked = true;
@@ -2261,11 +2048,11 @@ again:
* @locked: value received from mem_cgroup_begin_page_stat()
* @flags: value received from mem_cgroup_begin_page_stat()
*/
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
- unsigned long flags)
+void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked,
+ unsigned long *flags)
{
- if (memcg && locked)
- move_unlock_mem_cgroup(memcg, &flags);
+ if (memcg && *locked)
+ spin_unlock_irqrestore(&memcg->move_lock, *flags);
rcu_read_unlock();
}
@@ -2316,33 +2103,32 @@ static DEFINE_MUTEX(percpu_charge_mutex);
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
- bool ret = true;
+ bool ret = false;
if (nr_pages > CHARGE_BATCH)
- return false;
+ return ret;
stock = &get_cpu_var(memcg_stock);
- if (memcg == stock->cached && stock->nr_pages >= nr_pages)
+ if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
stock->nr_pages -= nr_pages;
- else /* need to call res_counter_charge */
- ret = false;
+ ret = true;
+ }
put_cpu_var(memcg_stock);
return ret;
}
/*
- * Returns stocks cached in percpu to res_counter and reset cached information.
+ * Returns stocks cached in percpu and reset cached information.
*/
static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
if (stock->nr_pages) {
- unsigned long bytes = stock->nr_pages * PAGE_SIZE;
-
- res_counter_uncharge(&old->res, bytes);
+ page_counter_uncharge(&old->memory, stock->nr_pages);
if (do_swap_account)
- res_counter_uncharge(&old->memsw, bytes);
+ page_counter_uncharge(&old->memsw, stock->nr_pages);
+ css_put_many(&old->css, stock->nr_pages);
stock->nr_pages = 0;
}
stock->cached = NULL;
@@ -2371,7 +2157,7 @@ static void __init memcg_stock_init(void)
}
/*
- * Cache charges(val) which is from res_counter, to local per_cpu area.
+ * Cache charges(val) to local per_cpu area.
* This will be consumed by consume_stock() function, later.
*/
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
@@ -2388,13 +2174,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
/*
* Drains all per-CPU charge caches for given root_memcg resp. subtree
- * of the hierarchy under it. sync flag says whether we should block
- * until the work is done.
+ * of the hierarchy under it.
*/
-static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
+static void drain_all_stock(struct mem_cgroup *root_memcg)
{
int cpu, curcpu;
+ /* If someone's already draining, avoid adding running more workers. */
+ if (!mutex_trylock(&percpu_charge_mutex))
+ return;
/* Notify other cpus that system-wide "drain" is running */
get_online_cpus();
curcpu = get_cpu();
@@ -2405,7 +2193,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
memcg = stock->cached;
if (!memcg || !stock->nr_pages)
continue;
- if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
+ if (!mem_cgroup_is_descendant(memcg, root_memcg))
continue;
if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
if (cpu == curcpu)
@@ -2415,42 +2203,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
}
}
put_cpu();
-
- if (!sync)
- goto out;
-
- for_each_online_cpu(cpu) {
- struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
- if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
- flush_work(&stock->work);
- }
-out:
put_online_cpus();
-}
-
-/*
- * Tries to drain stocked charges in other cpus. This function is asynchronous
- * and just put a work per cpu for draining localy on each cpu. Caller can
- * expects some charges will be back to res_counter later but cannot wait for
- * it.
- */
-static void drain_all_stock_async(struct mem_cgroup *root_memcg)
-{
- /*
- * If someone calls draining, avoid adding more kworker runs.
- */
- if (!mutex_trylock(&percpu_charge_mutex))
- return;
- drain_all_stock(root_memcg, false);
- mutex_unlock(&percpu_charge_mutex);
-}
-
-/* This is a synchronous drain interface. */
-static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
-{
- /* called when force_empty is called */
- mutex_lock(&percpu_charge_mutex);
- drain_all_stock(root_memcg, true);
mutex_unlock(&percpu_charge_mutex);
}