diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-10 18:34:42 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-10 18:34:42 -0800 |
| commit | b6da0076bab5a12afb19312ffee41c95490af2a0 (patch) | |
| tree | 52a5675b9c2ff95d88b981d5b9a3822f6073c112 /mm/memcontrol.c | |
| parent | cbfe0de303a55ed96d8831c2d5f56f8131cd6612 (diff) | |
| parent | a53b831549141aa060a8b54b76e3a42870d74cc0 (diff) | |
| download | linux-b6da0076bab5a12afb19312ffee41c95490af2a0.tar.gz linux-b6da0076bab5a12afb19312ffee41c95490af2a0.tar.bz2 linux-b6da0076bab5a12afb19312ffee41c95490af2a0.zip | |
Merge branch 'akpm' (patchbomb from Andrew)
Merge first patchbomb from Andrew Morton:
- a few minor cifs fixes
- dma-debug updates
- ocfs2
- slab
- about half of MM
- procfs
- kernel/exit.c
- panic.c tweaks
- printk updates
- lib/ updates
- checkpatch updates
- fs/binfmt updates
- the drivers/rtc tree
- nilfs
- kmod fixes
- more kernel/exit.c
- various other misc tweaks and fixes
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (190 commits)
exit: pidns: fix/update the comments in zap_pid_ns_processes()
exit: pidns: alloc_pid() leaks pid_namespace if child_reaper is exiting
exit: exit_notify: re-use "dead" list to autoreap current
exit: reparent: call forget_original_parent() under tasklist_lock
exit: reparent: avoid find_new_reaper() if no children
exit: reparent: introduce find_alive_thread()
exit: reparent: introduce find_child_reaper()
exit: reparent: document the ->has_child_subreaper checks
exit: reparent: s/while_each_thread/for_each_thread/ in find_new_reaper()
exit: reparent: fix the cross-namespace PR_SET_CHILD_SUBREAPER reparenting
exit: reparent: fix the dead-parent PR_SET_CHILD_SUBREAPER reparenting
exit: proc: don't try to flush /proc/tgid/task/tgid
exit: release_task: fix the comment about group leader accounting
exit: wait: drop tasklist_lock before psig->c* accounting
exit: wait: don't use zombie->real_parent
exit: wait: cleanup the ptrace_reparented() checks
usermodehelper: kill the kmod_thread_locker logic
usermodehelper: don't use CLONE_VFORK for ____call_usermodehelper()
fs/hfs/catalog.c: fix comparison bug in hfs_cat_keycmp
nilfs2: fix the nilfs_iget() vs. nilfs_new_inode() races
...
Diffstat (limited to 'mm/memcontrol.c')
| -rw-r--r-- | mm/memcontrol.c | 1706 |
1 files changed, 505 insertions, 1201 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ee48428cf8e3..85df503ec023 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -25,7 +25,7 @@ * GNU General Public License for more details. */ -#include <linux/res_counter.h> +#include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/cgroup.h> #include <linux/mm.h> @@ -51,7 +51,7 @@ #include <linux/seq_file.h> #include <linux/vmpressure.h> #include <linux/mm_inline.h> -#include <linux/page_cgroup.h> +#include <linux/swap_cgroup.h> #include <linux/cpu.h> #include <linux/oom.h> #include <linux/lockdep.h> @@ -143,14 +143,8 @@ struct mem_cgroup_stat_cpu { unsigned long targets[MEM_CGROUP_NTARGETS]; }; -struct mem_cgroup_reclaim_iter { - /* - * last scanned hierarchy member. Valid only if last_dead_count - * matches memcg->dead_count of the hierarchy root group. - */ - struct mem_cgroup *last_visited; - int last_dead_count; - +struct reclaim_iter { + struct mem_cgroup *position; /* scan generation, increased every round-trip */ unsigned int generation; }; @@ -162,10 +156,10 @@ struct mem_cgroup_per_zone { struct lruvec lruvec; unsigned long lru_size[NR_LRU_LISTS]; - struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; + struct reclaim_iter iter[DEF_PRIORITY + 1]; struct rb_node tree_node; /* RB tree node */ - unsigned long long usage_in_excess;/* Set to the value by which */ + unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; struct mem_cgroup *memcg; /* Back pointer, we cannot */ @@ -198,7 +192,7 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly; struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; - u64 threshold; + unsigned long threshold; }; /* For threshold */ @@ -284,10 +278,13 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); */ struct mem_cgroup { struct cgroup_subsys_state css; - /* - * the counter to account for memory usage - */ - struct res_counter res; + + /* Accounted resources */ + 
struct page_counter memory; + struct page_counter memsw; + struct page_counter kmem; + + unsigned long soft_limit; /* vmpressure notifications */ struct vmpressure vmpressure; @@ -296,15 +293,6 @@ struct mem_cgroup { int initialized; /* - * the counter to account for mem+swap usage. - */ - struct res_counter memsw; - - /* - * the counter to account for kernel memory usage. - */ - struct res_counter kmem; - /* * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; @@ -352,7 +340,6 @@ struct mem_cgroup { struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; - atomic_t dead_count; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct cg_proto tcp_mem; #endif @@ -382,7 +369,6 @@ struct mem_cgroup { /* internal only representation about the status of kmem accounting. */ enum { KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ - KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ }; #ifdef CONFIG_MEMCG_KMEM @@ -396,22 +382,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg) return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); } -static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) -{ - /* - * Our caller must use css_get() first, because memcg_uncharge_kmem() - * will call css_put() if it sees the memcg is dead. - */ - smp_wmb(); - if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) - set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); -} - -static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) -{ - return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, - &memcg->kmem_account_flags); -} #endif /* Stuffs for move charges at task migration. 
*/ @@ -650,7 +620,7 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg) * This check can't live in kmem destruction function, * since the charges will outlive the cgroup */ - WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); + WARN_ON(page_counter_read(&memcg->kmem)); } #else static void disarm_kmem_keys(struct mem_cgroup *memcg) @@ -664,8 +634,6 @@ static void disarm_static_keys(struct mem_cgroup *memcg) disarm_kmem_keys(memcg); } -static void drain_all_stock_async(struct mem_cgroup *memcg); - static struct mem_cgroup_per_zone * mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) { @@ -706,7 +674,7 @@ soft_limit_tree_from_page(struct page *page) static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz, - unsigned long long new_usage_in_excess) + unsigned long new_usage_in_excess) { struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; @@ -755,10 +723,21 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, spin_unlock_irqrestore(&mctz->lock, flags); } +static unsigned long soft_limit_excess(struct mem_cgroup *memcg) +{ + unsigned long nr_pages = page_counter_read(&memcg->memory); + unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); + unsigned long excess = 0; + + if (nr_pages > soft_limit) + excess = nr_pages - soft_limit; + + return excess; +} static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) { - unsigned long long excess; + unsigned long excess; struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; @@ -769,7 +748,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { mz = mem_cgroup_page_zoneinfo(memcg, page); - excess = res_counter_soft_limit_excess(&memcg->res); + excess = soft_limit_excess(memcg); /* * We have to update the tree if mz is on RB-tree or * mem is over its softlimit. 
@@ -825,7 +804,7 @@ retry: * position in the tree. */ __mem_cgroup_remove_exceeded(mz, mctz); - if (!res_counter_soft_limit_excess(&mz->memcg->res) || + if (!soft_limit_excess(mz->memcg) || !css_tryget_online(&mz->memcg->css)) goto retry; done: @@ -1062,122 +1041,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } -/* - * Returns a next (in a pre-order walk) alive memcg (with elevated css - * ref. count) or NULL if the whole root's subtree has been visited. - * - * helper function to be used by mem_cgroup_iter - */ -static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, - struct mem_cgroup *last_visited) -{ - struct cgroup_subsys_state *prev_css, *next_css; - - prev_css = last_visited ? &last_visited->css : NULL; -skip_node: - next_css = css_next_descendant_pre(prev_css, &root->css); - - /* - * Even if we found a group we have to make sure it is - * alive. css && !memcg means that the groups should be - * skipped and we should continue the tree walk. - * last_visited css is safe to use because it is - * protected by css_get and the tree walk is rcu safe. - * - * We do not take a reference on the root of the tree walk - * because we might race with the root removal when it would - * be the only node in the iterated hierarchy and mem_cgroup_iter - * would end up in an endless loop because it expects that at - * least one valid node will be returned. Root cannot disappear - * because caller of the iterator should hold it already so - * skipping css reference should be safe. - */ - if (next_css) { - struct mem_cgroup *memcg = mem_cgroup_from_css(next_css); - - if (next_css == &root->css) - return memcg; - - if (css_tryget_online(next_css)) { - /* - * Make sure the memcg is initialized: - * mem_cgroup_css_online() orders the the - * initialization against setting the flag. 
- */ - if (smp_load_acquire(&memcg->initialized)) - return memcg; - css_put(next_css); - } - - prev_css = next_css; - goto skip_node; - } - - return NULL; -} - -static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) -{ - /* - * When a group in the hierarchy below root is destroyed, the - * hierarchy iterator can no longer be trusted since it might - * have pointed to the destroyed group. Invalidate it. - */ - atomic_inc(&root->dead_count); -} - -static struct mem_cgroup * -mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, - struct mem_cgroup *root, - int *sequence) -{ - struct mem_cgroup *position = NULL; - /* - * A cgroup destruction happens in two stages: offlining and - * release. They are separated by a RCU grace period. - * - * If the iterator is valid, we may still race with an - * offlining. The RCU lock ensures the object won't be - * released, tryget will fail if we lost the race. - */ - *sequence = atomic_read(&root->dead_count); - if (iter->last_dead_count == *sequence) { - smp_rmb(); - position = iter->last_visited; - - /* - * We cannot take a reference to root because we might race - * with root removal and returning NULL would end up in - * an endless loop on the iterator user level when root - * would be returned all the time. - */ - if (position && position != root && - !css_tryget_online(&position->css)) - position = NULL; - } - return position; -} - -static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, - struct mem_cgroup *last_visited, - struct mem_cgroup *new_position, - struct mem_cgroup *root, - int sequence) -{ - /* root reference counting symmetric to mem_cgroup_iter_load */ - if (last_visited && last_visited != root) - css_put(&last_visited->css); - /* - * We store the sequence count from the time @last_visited was - * loaded successfully instead of rereading it here so that we - * don't lose destruction events in between. We could have - * raced with the destruction of @new_position after all. 
- */ - iter->last_visited = new_position; - smp_wmb(); - iter->last_dead_count = sequence; -} - /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -1199,8 +1062,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { + struct reclaim_iter *uninitialized_var(iter); + struct cgroup_subsys_state *css = NULL; struct mem_cgroup *memcg = NULL; - struct mem_cgroup *last_visited = NULL; + struct mem_cgroup *pos = NULL; if (mem_cgroup_disabled()) return NULL; @@ -1209,50 +1074,101 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, root = root_mem_cgroup; if (prev && !reclaim) - last_visited = prev; + pos = prev; if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) - goto out_css_put; + goto out; return root; } rcu_read_lock(); - while (!memcg) { - struct mem_cgroup_reclaim_iter *uninitialized_var(iter); - int uninitialized_var(seq); - - if (reclaim) { - struct mem_cgroup_per_zone *mz; - - mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); - iter = &mz->reclaim_iter[reclaim->priority]; - if (prev && reclaim->generation != iter->generation) { - iter->last_visited = NULL; - goto out_unlock; - } - last_visited = mem_cgroup_iter_load(iter, root, &seq); + if (reclaim) { + struct mem_cgroup_per_zone *mz; + + mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); + iter = &mz->iter[reclaim->priority]; + + if (prev && reclaim->generation != iter->generation) + goto out_unlock; + + do { + pos = ACCESS_ONCE(iter->position); + /* + * A racing update may change the position and + * put the last reference, hence css_tryget(), + * or retry to see the updated position. 
+ */ + } while (pos && !css_tryget(&pos->css)); + } + + if (pos) + css = &pos->css; + + for (;;) { + css = css_next_descendant_pre(css, &root->css); + if (!css) { + /* + * Reclaimers share the hierarchy walk, and a + * new one might jump in right at the end of + * the hierarchy - make sure they see at least + * one group and restart from the beginning. + */ + if (!prev) + continue; + break; } - memcg = __mem_cgroup_iter_next(root, last_visited); + /* + * Verify the css and acquire a reference. The root + * is provided by the caller, so we know it's alive + * and kicking, and don't take an extra reference. + */ + memcg = mem_cgroup_from_css(css); + + if (css == &root->css) + break; - if (reclaim) { - mem_cgroup_iter_update(iter, last_visited, memcg, root, - seq); + if (css_tryget(css)) { + /* + * Make sure the memcg is initialized: + * mem_cgroup_css_online() orders the the + * initialization against setting the flag. + */ + if (smp_load_acquire(&memcg->initialized)) + break; - if (!memcg) - iter->generation++; - else if (!prev && memcg) - reclaim->generation = iter->generation; + css_put(css); } - if (prev && !memcg) - goto out_unlock; + memcg = NULL; + } + + if (reclaim) { + if (cmpxchg(&iter->position, pos, memcg) == pos) { + if (memcg) + css_get(&memcg->css); + if (pos) + css_put(&pos->css); + } + + /* + * pairs with css_tryget when dereferencing iter->position + * above. 
+ */ + if (pos) + css_put(&pos->css); + + if (!memcg) + iter->generation++; + else if (!prev) + reclaim->generation = iter->generation; } + out_unlock: rcu_read_unlock(); -out_css_put: +out: if (prev && prev != root) css_put(&prev->css); @@ -1346,15 +1262,18 @@ out: } /** - * mem_cgroup_page_lruvec - return lruvec for adding an lru page + * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page * @zone: zone of the page + * + * This function is only safe when following the LRU page isolation + * and putback protocol: the LRU lock must be held, and the page must + * either be PageLRU() or the caller must have isolated/allocated it. */ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) { struct mem_cgroup_per_zone *mz; struct mem_cgroup *memcg; - struct page_cgroup *pc; struct lruvec *lruvec; if (mem_cgroup_disabled()) { @@ -1362,20 +1281,13 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) goto out; } - pc = lookup_page_cgroup(page); - memcg = pc->mem_cgroup; - + memcg = page->mem_cgroup; /* - * Surreptitiously switch any uncharged offlist page to root: - * an uncharged page off lru does nothing to secure - * its former mem_cgroup from sudden removal. - * - * Our caller holds lru_lock, and PageCgroupUsed is updated - * under page_cgroup lock: between them, they make all uses - * of pc->mem_cgroup safe. + * Swapcache readahead pages are added to the LRU - and + * possibly migrated - before they are charged. 
*/ - if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) - pc->mem_cgroup = memcg = root_mem_cgroup; + if (!memcg) + memcg = root_mem_cgroup; mz = mem_cgroup_page_zoneinfo(memcg, page); lruvec = &mz->lruvec; @@ -1414,41 +1326,24 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, VM_BUG_ON((long)(*lru_size) < 0); } -/* - * Checks whether given mem is same or in the root_mem_cgroup's - * hierarchy subtree - */ -bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, - struct mem_cgroup *memcg) +bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) { - if (root_memcg == memcg) + if (root == memcg) return true; - if (!root_memcg->use_hierarchy || !memcg) + if (!root->use_hierarchy) return false; - return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); -} - -static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, - struct mem_cgroup *memcg) -{ - bool ret; - - rcu_read_lock(); - ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); - rcu_read_unlock(); - return ret; + return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); } -bool task_in_mem_cgroup(struct task_struct *task, - const struct mem_cgroup *memcg) +bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) { - struct mem_cgroup *curr = NULL; + struct mem_cgroup *task_memcg; struct task_struct *p; bool ret; p = find_lock_task_mm(task); if (p) { - curr = get_mem_cgroup_from_mm(p->mm); + task_memcg = get_mem_cgroup_from_mm(p->mm); task_unlock(p); } else { /* @@ -1457,19 +1352,12 @@ bool task_in_mem_cgroup(struct task_struct *task, * killed to prevent needlessly killing additional tasks. */ rcu_read_lock(); - curr = mem_cgroup_from_task(task); - if (curr) - css_get(&curr->css); + task_memcg = mem_cgroup_from_task(task); + css_get(&task_memcg->css); rcu_read_unlock(); } - /* - * We should check use_hierarchy of "memcg" not "curr". 
Because checking - * use_hierarchy of "curr" here make this function true if hierarchy is - * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* - * hierarchy(even if use_hierarchy is disabled in "memcg"). - */ - ret = mem_cgroup_same_or_subtree(memcg, curr); - css_put(&curr->css); + ret = mem_cgroup_is_descendant(task_memcg, memcg); + css_put(&task_memcg->css); return ret; } @@ -1492,7 +1380,7 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return inactive * inactive_ratio < active; } -#define mem_cgroup_from_res_counter(counter, member) \ +#define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) /** @@ -1504,12 +1392,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) */ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) { - unsigned long long margin; + unsigned long margin = 0; + unsigned long count; + unsigned long limit; - margin = res_counter_margin(&memcg->res); - if (do_swap_account) - margin = min(margin, res_counter_margin(&memcg->memsw)); - return margin >> PAGE_SHIFT; + count = page_counter_read(&memcg->memory); + limit = ACCESS_ONCE(memcg->memory.limit); + if (count < limit) + margin = limit - count; + + if (do_swap_account) { + count = page_counter_read(&memcg->memsw); + limit = ACCESS_ONCE(memcg->memsw.limit); + if (count <= limit) + margin = min(margin, limit - count); + } + + return margin; } int mem_cgroup_swappiness(struct mem_cgroup *memcg) @@ -1522,37 +1421,6 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) } /* - * memcg->moving_account is used for checking possibility that some thread is - * calling move_account(). When a thread on CPU-A starts moving pages under - * a memcg, other threads should check memcg->moving_account under - * rcu_read_lock(), like this: - * - * CPU-A CPU-B - * rcu_read_lock() - * memcg->moving_account+1 if (memcg->mocing_account) - * take heavy locks. - * synchronize_rcu() update something. 
- * rcu_read_unlock() - * start move here. - */ - -static void mem_cgroup_start_move(struct mem_cgroup *memcg) -{ - atomic_inc(&memcg->moving_account); - synchronize_rcu(); -} - -static void mem_cgroup_end_move(struct mem_cgroup *memcg) -{ - /* - * Now, mem_cgroup_clear_mc() may call this function with NULL. - * We check NULL in callee rather than caller. - */ - if (memcg) - atomic_dec(&memcg->moving_account); -} - -/* * A routine for checking "mem" is under move_account() or not. * * Checking a cgroup is mc.from or mc.to or under hierarchy of @@ -1574,8 +1442,8 @@ static bool mem_cgroup_under_move(struct mem_cgroup *memcg) if (!from) goto unlock; - ret = mem_cgroup_same_or_subtree(memcg, from) - || mem_cgroup_same_or_subtree(memcg, to); + ret = mem_cgroup_is_descendant(from, memcg) || + mem_cgroup_is_descendant(to, memcg); unlock: spin_unlock(&mc.lock); return ret; @@ -1597,23 +1465,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } -/* - * Take this lock when - * - a code tries to modify page's memcg while it's USED. - * - a code tries to modify page state accounting in a memcg. - */ -static void move_lock_mem_cgroup(struct mem_cgroup *memcg, - unsigned long *flags) -{ - spin_lock_irqsave(&memcg->move_lock, *flags); -} - -static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, - unsigned long *flags) -{ - spin_unlock_irqrestore(&memcg->move_lock, *flags); -} - #define K(x) ((x) << (PAGE_SHIFT-10)) /** * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 
@@ -1644,18 +1495,15 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) rcu_read_unlock(); - pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", - res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, - res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, - res_counter_read_u64(&memcg->res, RES_FAILCNT)); - pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", - res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, - res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, - res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); - pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", - res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, - res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, - res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); + pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->memory)), + K((u64)memcg->memory.limit), memcg->memory.failcnt); + pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->memsw)), + K((u64)memcg->memsw.limit), memcg->memsw.failcnt); + pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->kmem)), + K((u64)memcg->kmem.limit), memcg->kmem.failcnt); for_each_mem_cgroup_tree(iter, memcg) { pr_info("Memory cgroup stats for "); @@ -1695,28 +1543,17 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) /* * Return the memory (and swap, if configured) limit for a memcg. 
*/ -static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) +static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) { - u64 limit; - - limit = res_counter_read_u64(&memcg->res, RES_LIMIT); + unsigned long limit; - /* - * Do not consider swap space if we cannot swap due to swappiness - */ + limit = memcg->memory.limit; if (mem_cgroup_swappiness(memcg)) { - u64 memsw; + unsigned long memsw_limit; - limit += total_swap_pages << PAGE_SHIFT; - memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - - /* - * If memsw is finite and limits the amount of swap space - * available to this memcg, return that limit. - */ - limit = min(limit, memsw); + memsw_limit = memcg->memsw.limit; + limit = min(limit + total_swap_pages, memsw_limit); } - return limit; } @@ -1740,7 +1577,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, } check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); - totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; + totalpages = mem_cgroup_get_limit(memcg) ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; struct task_struct *task; @@ -1880,52 +1717,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) memcg->last_scanned_node = node; return node; } - -/* - * Check all nodes whether it contains reclaimable pages or not. - * For quick scan, we make use of scan_nodes. This will allow us to skip - * unused nodes. But scan_nodes is lazily updated and may not cotain - * enough new information. We need to do double check. - */ -static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) -{ - int nid; - - /* - * quick check...making use of scan_node. - * We can skip unused nodes. - */ - if (!nodes_empty(memcg->scan_nodes)) { - for (nid = first_node(memcg->scan_nodes); - nid < MAX_NUMNODES; - nid = next_node(nid, memcg->scan_nodes)) { - - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) - return true; - } - } - /* - * Check rest of nodes. 
- */ - for_each_node_state(nid, N_MEMORY) { - if (node_isset(nid, memcg->scan_nodes)) - continue; - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) - return true; - } - return false; -} - #else int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) { return 0; } - -static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) -{ - return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); -} #endif static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, @@ -1943,7 +1739,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, .priority = 0, }; - excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; + excess = soft_limit_excess(root_memcg); while (1) { victim = mem_cgroup_iter(root_memcg, victim, &reclaim); @@ -1969,12 +1765,10 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, } continue; } - if (!mem_cgroup_reclaimable(victim, false)) - continue; total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, zone, &nr_scanned); *total_scanned += nr_scanned; - if (!res_counter_soft_limit_excess(&root_memcg->res)) + if (!soft_limit_excess(root_memcg)) break; } mem_cgroup_iter_break(root_memcg, victim); @@ -2081,12 +1875,8 @@ static int memcg_oom_wake_function(wait_queue_t *wait, oom_wait_info = container_of(wait, struct oom_wait_info, wait); oom_wait_memcg = oom_wait_info->memcg; - /* - * Both of oom_wait_info->memcg and wake_memcg are stable under us. - * Then we can use css_is_ancestor without taking care of RCU. 
- */ - if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) - && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) + if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && + !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) return 0; return autoremove_wake_function(wait, mode, sync, arg); } @@ -2228,26 +2018,23 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, unsigned long *flags) { struct mem_cgroup *memcg; - struct page_cgroup *pc; rcu_read_lock(); if (mem_cgroup_disabled()) return NULL; - - pc = lookup_page_cgroup(page); again: - memcg = pc->mem_cgroup; - if (unlikely(!memcg || !PageCgroupUsed(pc))) + memcg = page->mem_cgroup; + if (unlikely(!memcg)) return NULL; *locked = false; if (atomic_read(&memcg->moving_account) <= 0) return memcg; - move_lock_mem_cgroup(memcg, flags); - if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { - move_unlock_mem_cgroup(memcg, flags); + spin_lock_irqsave(&memcg->move_lock, *flags); + if (memcg != page->mem_cgroup) { + spin_unlock_irqrestore(&memcg->move_lock, *flags); goto again; } *locked = true; @@ -2261,11 +2048,11 @@ again: * @locked: value received from mem_cgroup_begin_page_stat() * @flags: value received from mem_cgroup_begin_page_stat() */ -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, - unsigned long flags) +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, + unsigned long *flags) { - if (memcg && locked) - move_unlock_mem_cgroup(memcg, &flags); + if (memcg && *locked) + spin_unlock_irqrestore(&memcg->move_lock, *flags); rcu_read_unlock(); } @@ -2316,33 +2103,32 @@ static DEFINE_MUTEX(percpu_charge_mutex); static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; - bool ret = true; + bool ret = false; if (nr_pages > CHARGE_BATCH) - return false; + return ret; stock = &get_cpu_var(memcg_stock); - if (memcg == stock->cached && stock->nr_pages >= nr_pages) + if (memcg == stock->cached 
&& stock->nr_pages >= nr_pages) { stock->nr_pages -= nr_pages; - else /* need to call res_counter_charge */ - ret = false; + ret = true; + } put_cpu_var(memcg_stock); return ret; } /* - * Returns stocks cached in percpu to res_counter and reset cached information. + * Returns stocks cached in percpu and reset cached information. */ static void drain_stock(struct memcg_stock_pcp *stock) { struct mem_cgroup *old = stock->cached; if (stock->nr_pages) { - unsigned long bytes = stock->nr_pages * PAGE_SIZE; - - res_counter_uncharge(&old->res, bytes); + page_counter_uncharge(&old->memory, stock->nr_pages); if (do_swap_account) - res_counter_uncharge(&old->memsw, bytes); + page_counter_uncharge(&old->memsw, stock->nr_pages); + css_put_many(&old->css, stock->nr_pages); stock->nr_pages = 0; } stock->cached = NULL; @@ -2371,7 +2157,7 @@ static void __init memcg_stock_init(void) } /* - * Cache charges(val) which is from res_counter, to local per_cpu area. + * Cache charges(val) to local per_cpu area. * This will be consumed by consume_stock() function, later. */ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) @@ -2388,13 +2174,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) /* * Drains all per-CPU charge caches for given root_memcg resp. subtree - * of the hierarchy under it. sync flag says whether we should block - * until the work is done. + * of the hierarchy under it. */ -static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) +static void drain_all_stock(struct mem_cgroup *root_memcg) { int cpu, curcpu; + /* If someone's already draining, avoid adding running more workers. 
*/ + if (!mutex_trylock(&percpu_charge_mutex)) + return; /* Notify other cpus that system-wide "drain" is running */ get_online_cpus(); curcpu = get_cpu(); @@ -2405,7 +2193,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) memcg = stock->cached; if (!memcg || !stock->nr_pages) continue; - if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) + if (!mem_cgroup_is_descendant(memcg, root_memcg)) continue; if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { if (cpu == curcpu) @@ -2415,42 +2203,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) } } put_cpu(); - - if (!sync) - goto out; - - for_each_online_cpu(cpu) { - struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) - flush_work(&stock->work); - } -out: put_online_cpus(); -} - -/* - * Tries to drain stocked charges in other cpus. This function is asynchronous - * and just put a work per cpu for draining localy on each cpu. Caller can - * expects some charges will be back to res_counter later but cannot wait for - * it. - */ -static void drain_all_stock_async(struct mem_cgroup *root_memcg) -{ - /* - * If someone calls draining, avoid adding more kworker runs. - */ - if (!mutex_trylock(&percpu_charge_mutex)) - return; - drain_all_stock(root_memcg, false); - mutex_unlock(&percpu_charge_mutex); -} - -/* This is a synchronous drain interface. */ -static void drain_all_stock_sync(struct mem_cgroup *root_memcg) -{ - /* called when force_empty is called */ - mutex_lock(&percpu_charge_mutex); - drain_all_stock(root_memcg, true); mutex_unlock(&percpu_charge_mutex); } |
