diff options
| -rw-r--r-- | Documentation/cgroups/memory.txt | 4 | ||||
| -rw-r--r-- | include/linux/memcontrol.h | 5 | ||||
| -rw-r--r-- | include/linux/page_counter.h | 51 | ||||
| -rw-r--r-- | include/net/sock.h | 26 | ||||
| -rw-r--r-- | init/Kconfig | 5 | ||||
| -rw-r--r-- | mm/Makefile | 1 | ||||
| -rw-r--r-- | mm/memcontrol.c | 633 | ||||
| -rw-r--r-- | mm/page_counter.c | 207 | ||||
| -rw-r--r-- | net/ipv4/tcp_memcontrol.c | 87 |
9 files changed, 615 insertions, 404 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 02ab997a1ed2..f624727ab404 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -52,9 +52,9 @@ Brief summary of control files. tasks # attach a task(thread) and show list of threads cgroup.procs # show list of processes cgroup.event_control # an interface for event_fd() - memory.usage_in_bytes # show current res_counter usage for memory + memory.usage_in_bytes # show current usage for memory (See 5.5 for details) - memory.memsw.usage_in_bytes # show current res_counter usage for memory+Swap + memory.memsw.usage_in_bytes # show current usage for memory+Swap (See 5.5 for details) memory.limit_in_bytes # set/show limit of memory usage memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6b75640ef5ab..ea007615e8f9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -447,9 +447,8 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) /* * __GFP_NOFAIL allocations will move on even if charging is not * possible. Therefore we don't even try, and have this allocation - * unaccounted. We could in theory charge it with - * res_counter_charge_nofail, but we hope those allocations are rare, - * and won't be worth the trouble. + * unaccounted. We could in theory charge it forcibly, but we hope + * those allocations are rare, and won't be worth the trouble. */ if (gfp & __GFP_NOFAIL) return true; diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h new file mode 100644 index 000000000000..7cce3be99ff3 --- /dev/null +++ b/include/linux/page_counter.h @@ -0,0 +1,51 @@ +#ifndef _LINUX_PAGE_COUNTER_H +#define _LINUX_PAGE_COUNTER_H + +#include <linux/atomic.h> +#include <linux/kernel.h> +#include <asm/page.h> + +struct page_counter { + atomic_long_t count; + unsigned long limit; + struct page_counter *parent; + + /* legacy */ + unsigned long watermark; + unsigned long failcnt; +}; + +#if BITS_PER_LONG == 32 +#define PAGE_COUNTER_MAX LONG_MAX +#else +#define PAGE_COUNTER_MAX (LONG_MAX / PAGE_SIZE) +#endif + +static inline void page_counter_init(struct page_counter *counter, + struct page_counter *parent) +{ + atomic_long_set(&counter->count, 0); + counter->limit = PAGE_COUNTER_MAX; + counter->parent = parent; +} + +static inline unsigned long page_counter_read(struct page_counter *counter) +{ + return atomic_long_read(&counter->count); +} + +int page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); +void page_counter_charge(struct page_counter *counter, unsigned long nr_pages); +int page_counter_try_charge(struct page_counter *counter, + unsigned long nr_pages, + struct page_counter **fail); +int page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); +int page_counter_limit(struct page_counter *counter, unsigned long limit); +int page_counter_memparse(const char *buf, unsigned long *nr_pages); + +static inline void page_counter_reset_watermark(struct page_counter *counter) +{ + counter->watermark = page_counter_read(counter); +} + +#endif /* _LINUX_PAGE_COUNTER_H */ diff --git a/include/net/sock.h b/include/net/sock.h index e6f235ebf6c9..7ff44e062a38 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -54,8 +54,8 @@ #include <linux/security.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/page_counter.h> #include <linux/memcontrol.h> -#include <linux/res_counter.h> #include <linux/static_key.h> #include <linux/aio.h> #include <linux/sched.h> @@ -1062,7 +1062,7 @@ enum cg_proto_flags { }; struct cg_proto { - struct res_counter memory_allocated; /* Current allocated memory. */ + struct page_counter memory_allocated; /* Current allocated memory. */ struct percpu_counter sockets_allocated; /* Current number of sockets. */ int memory_pressure; long sysctl_mem[3]; @@ -1214,34 +1214,26 @@ static inline void memcg_memory_allocated_add(struct cg_proto *prot, unsigned long amt, int *parent_status) { - struct res_counter *fail; - int ret; + page_counter_charge(&prot->memory_allocated, amt); - ret = res_counter_charge_nofail(&prot->memory_allocated, - amt << PAGE_SHIFT, &fail); - if (ret < 0) + if (page_counter_read(&prot->memory_allocated) > + prot->memory_allocated.limit) *parent_status = OVER_LIMIT; } static inline void memcg_memory_allocated_sub(struct cg_proto *prot, unsigned long amt) { - res_counter_uncharge(&prot->memory_allocated, amt << PAGE_SHIFT); -} - -static inline u64 memcg_memory_allocated_read(struct cg_proto *prot) -{ - u64 ret; - ret = res_counter_read_u64(&prot->memory_allocated, RES_USAGE); - return ret >> PAGE_SHIFT; + page_counter_uncharge(&prot->memory_allocated, amt); } static inline long sk_memory_allocated(const struct sock *sk) { struct proto *prot = sk->sk_prot; + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - return memcg_memory_allocated_read(sk->sk_cgrp); + return page_counter_read(&sk->sk_cgrp->memory_allocated); return atomic_long_read(prot->memory_allocated); } @@ -1255,7 +1247,7 @@ sk_memory_allocated_add(struct sock *sk, int amt, int *parent_status) memcg_memory_allocated_add(sk->sk_cgrp, amt, parent_status); /* update the root cgroup regardless */ atomic_long_add_return(amt, prot->memory_allocated); - return memcg_memory_allocated_read(sk->sk_cgrp); + return page_counter_read(&sk->sk_cgrp->memory_allocated); } return atomic_long_add_return(amt, prot->memory_allocated); diff --git a/init/Kconfig b/init/Kconfig index 903505e66d1d..fd9e88791ba4 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -978,9 +978,12 @@ config RESOURCE_COUNTERS This option enables controller independent resource accounting infrastructure that works with cgroups. +config PAGE_COUNTER + bool + config MEMCG bool "Memory Resource Controller for Control Groups" - depends on RESOURCE_COUNTERS + select PAGE_COUNTER select EVENTFD help Provides a memory resource controller that manages both anonymous diff --git a/mm/Makefile b/mm/Makefile index 8405eb0023a9..6d9f40e922f7 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -55,6 +55,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o +obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d6ac0e33e150..4129ad74e93b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -25,7 +25,7 @@ * GNU General Public License for more details. */ -#include <linux/res_counter.h> +#include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/cgroup.h> #include <linux/mm.h> @@ -165,7 +165,7 @@ struct mem_cgroup_per_zone { struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; struct rb_node tree_node; /* RB tree node */ - unsigned long long usage_in_excess;/* Set to the value by which */ + unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; struct mem_cgroup *memcg; /* Back pointer, we cannot */ @@ -198,7 +198,7 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly; struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; - u64 threshold; + unsigned long threshold; }; /* For threshold */ @@ -284,10 +284,13 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); */ struct mem_cgroup { struct cgroup_subsys_state css; - /* - * the counter to account for memory usage - */ - struct res_counter res; + + /* Accounted resources */ + struct page_counter memory; + struct page_counter memsw; + struct page_counter kmem; + + unsigned long soft_limit; /* vmpressure notifications */ struct vmpressure vmpressure; @@ -296,15 +299,6 @@ struct mem_cgroup { int initialized; /* - * the counter to account for mem+swap usage. - */ - struct res_counter memsw; - - /* - * the counter to account for kernel memory usage. - */ - struct res_counter kmem; - /* * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; @@ -650,7 +644,7 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg) * This check can't live in kmem destruction function, * since the charges will outlive the cgroup */ - WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); + WARN_ON(page_counter_read(&memcg->kmem)); } #else static void disarm_kmem_keys(struct mem_cgroup *memcg) @@ -706,7 +700,7 @@ soft_limit_tree_from_page(struct page *page) static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz, - unsigned long long new_usage_in_excess) + unsigned long new_usage_in_excess) { struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; @@ -755,10 +749,21 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, spin_unlock_irqrestore(&mctz->lock, flags); } +static unsigned long soft_limit_excess(struct mem_cgroup *memcg) +{ + unsigned long nr_pages = page_counter_read(&memcg->memory); + unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); + unsigned long excess = 0; + + if (nr_pages > soft_limit) + excess = nr_pages - soft_limit; + + return excess; +} static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) { - unsigned long long excess; + unsigned long excess; struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; @@ -769,7 +774,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { mz = mem_cgroup_page_zoneinfo(memcg, page); - excess = res_counter_soft_limit_excess(&memcg->res); + excess = soft_limit_excess(memcg); /* * We have to update the tree if mz is on RB-tree or * mem is over its softlimit. @@ -825,7 +830,7 @@ retry: * position in the tree. */ __mem_cgroup_remove_exceeded(mz, mctz); - if (!res_counter_soft_limit_excess(&mz->memcg->res) || + if (!soft_limit_excess(mz->memcg) || !css_tryget_online(&mz->memcg->css)) goto retry; done: @@ -1492,7 +1497,7 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return inactive * inactive_ratio < active; } -#define mem_cgroup_from_res_counter(counter, member) \ +#define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) /** @@ -1504,12 +1509,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) */ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) { - unsigned long long margin; + unsigned long margin = 0; + unsigned long count; + unsigned long limit; - margin = res_counter_margin(&memcg->res); - if (do_swap_account) - margin = min(margin, res_counter_margin(&memcg->memsw)); - return margin >> PAGE_SHIFT; + count = page_counter_read(&memcg->memory); + limit = ACCESS_ONCE(memcg->memory.limit); + if (count < limit) + margin = limit - count; + + if (do_swap_account) { + count = page_counter_read(&memcg->memsw); + limit = ACCESS_ONCE(memcg->memsw.limit); + if (count <= limit) + margin = min(margin, limit - count); + } + + return margin; } int mem_cgroup_swappiness(struct mem_cgroup *memcg) @@ -1644,18 +1660,15 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) rcu_read_unlock(); - pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", - res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, - res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, - res_counter_read_u64(&memcg->res, RES_FAILCNT)); - pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", - res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, - res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, - res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); - pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", - res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, - res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, - res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); + pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->memory)), + K((u64)memcg->memory.limit), memcg->memory.failcnt); + pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->memsw)), + K((u64)memcg->memsw.limit), memcg->memsw.failcnt); + pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->kmem)), + K((u64)memcg->kmem.limit), memcg->kmem.failcnt); for_each_mem_cgroup_tree(iter, memcg) { pr_info("Memory cgroup stats for "); @@ -1695,28 +1708,17 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) /* * Return the memory (and swap, if configured) limit for a memcg. */ -static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) +static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) { - u64 limit; + unsigned long limit; - limit = res_counter_read_u64(&memcg->res, RES_LIMIT); - - /* - * Do not consider swap space if we cannot swap due to swappiness - */ + limit = memcg->memory.limit; if (mem_cgroup_swappiness(memcg)) { - u64 memsw; + unsigned long memsw_limit; - limit += total_swap_pages << PAGE_SHIFT; - memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - - /* - * If memsw is finite and limits the amount of swap space - * available to this memcg, return that limit. - */ - limit = min(limit, memsw); + memsw_limit = memcg->memsw.limit; + limit = min(limit + total_swap_pages, memsw_limit); } - return limit; } @@ -1740,7 +1742,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, } check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); - totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; + totalpages = mem_cgroup_get_limit(memcg) ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; struct task_struct *task; @@ -1943,7 +1945,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, .priority = 0, }; - excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; + excess = soft_limit_excess(root_memcg); while (1) { victim = mem_cgroup_iter(root_memcg, victim, &reclaim); @@ -1974,7 +1976,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, zone, &nr_scanned); *total_scanned += nr_scanned; - if (!res_counter_soft_limit_excess(&root_memcg->res)) + if (!soft_limit_excess(root_memcg)) break; } mem_cgroup_iter_break(root_memcg, victim); @@ -2316,33 +2318,31 @@ static DEFINE_MUTEX(percpu_charge_mutex); static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; - bool ret = true; + bool ret = false; if (nr_pages > CHARGE_BATCH) - return false; + return ret; stock = &get_cpu_var(memcg_stock); - if (memcg == stock->cached && stock->nr_pages >= nr_pages) + if (memcg == stock->cached && stock->nr_pages >= nr_pages) { stock->nr_pages -= nr_pages; - else /* need to call res_counter_charge */ - ret = false; + ret = true; + } put_cpu_var(memcg_stock); return ret; } /* - * Returns stocks cached in percpu to res_counter and reset cached information. + * Returns stocks cached in percpu and reset cached information. */ static void drain_stock(struct memcg_stock_pcp *stock) { struct mem_cgroup *old = stock->cached; if (stock->nr_pages) { - unsigned long bytes = stock->nr_pages * PAGE_SIZE; - - res_counter_uncharge(&old->res, bytes); + page_counter_uncharge(&old->memory, stock->nr_pages); if (do_swap_account) - res_counter_uncharge(&old->memsw, bytes); + page_counter_uncharge(&old->memsw, stock->nr_pages); stock->nr_pages = 0; } stock->cached = NULL; @@ -2371,7 +2371,7 @@ static void __init memcg_stock_init(void) } /* - * Cache charges(val) which is from res_counter, to local per_cpu area. + * Cache charges(val) to local per_cpu area. * This will be consumed by consume_stock() function, later. */ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) @@ -2431,8 +2431,7 @@ out: /* * Tries to drain stocked charges in other cpus. This function is asynchronous * and just put a work per cpu for draining localy on each cpu. Caller can - * expects some charges will be back to res_counter later but cannot wait for - * it. + * expects some charges will be back later but cannot wait for it. */ static void drain_all_stock_async(struct mem_cgroup *root_memcg) { @@ -2506,9 +2505,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; - struct res_counter *fail_res; + struct page_counter *counter; unsigned long nr_reclaimed; - unsigned long long size; bool may_swap = true; bool drained = false; int ret = 0; @@ -2519,16 +2517,15 @@ retry: if (consume_stock(memcg, nr_pages)) goto done; - size = batch * PAGE_SIZE; if (!do_swap_account || - !res_counter_charge(&memcg->memsw, size, &fail_res)) { - if (!res_counter_charge(&memcg->res, size, &fail_res)) + !page_counter_try_charge(&memcg->memsw, batch, &counter)) { + if (!page_counter_try_charge(&memcg->memory, batch, &counter)) goto done_restock; if (do_swap_account) - res_counter_uncharge(&memcg->memsw, size); - mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + page_counter_uncharge(&memcg->memsw, batch); + mem_over_limit = mem_cgroup_from_counter(counter, memory); } else { - mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); + mem_over_limit = mem_cgroup_from_counter(counter, memsw); may_swap = false; } @@ -2611,32 +2608,12 @@ done: static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - unsigned long bytes = nr_pages * PAGE_SIZE; - if (mem_cgroup_is_root(memcg)) return; - res_counter_uncharge(&memcg->res, bytes); + page_counter_uncharge(&memcg->memory, nr_pages); if (do_swap_account) - res_counter_uncharge(&memcg->memsw, bytes); -} - -/* - * Cancel chrages in this cgroup....doesn't propagate to parent cgroup. - * This is useful when moving usage to parent cgroup. - */ -static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) -{ - unsigned long bytes = nr_pages * PAGE_SIZE; - - if (mem_cgroup_is_root(memcg)) - return; - - res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); - if (do_swap_account) - res_counter_uncharge_until(&memcg->memsw, - memcg->memsw.parent, bytes); + page_counter_uncharge(&memcg->memsw, nr_pages); } /* @@ -2760,8 +2737,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, unlock_page_lru(page, isolated); } -static DEFINE_MUTEX(set_limit_mutex); - #ifdef CONFIG_MEMCG_KMEM /* * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or @@ -2804,16 +2779,17 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) } #endif -static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, + unsigned long nr_pages) { - struct res_counter *fail_res; + struct page_counter *counter; int ret = 0; - ret = res_counter_charge(&memcg->kmem, size, &fail_res); - if (ret) + ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); + if (ret < 0) return ret; - ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); + ret = try_charge(memcg, gfp, nr_pages); if (ret == -EINTR) { /* * try_charge() chose to bypass to root due to OOM kill or @@ -2830,25 +2806,25 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) * when the allocation triggers should have been already * directed to the root cgroup in memcontrol.h */ - res_counter_charge_nofail(&memcg->res, size, &fail_res); + page_counter_charge(&memcg->memory, nr_pages); if (do_swap_account) - res_counter_charge_nofail(&memcg->memsw, size, - &fail_res); + page_counter_charge(&memcg->memsw, nr_pages); ret = 0; } else if (ret) - res_counter_uncharge(&memcg->kmem, size); + page_counter_uncharge(&memcg->kmem, nr_pages); return ret; } -static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +static void memcg_uncharge_kmem(struct mem_cgroup *memcg, + unsigned long nr_pages) { - res_counter_uncharge(&memcg->res, size); + page_counter_uncharge(&memcg->memory, nr_pages); if (do_swap_account) - res_counter_uncharge(&memcg->memsw, size); + page_counter_uncharge(&memcg->memsw, nr_pages); /* Not down to 0 */ - if (res_counter_uncharge(&memcg->kmem, size)) + if (page_counter_uncharge(&memcg->kmem, nr_pages)) return; /* @@ -3124,19 +3100,21 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) { + unsigned int nr_pages = 1 << order; int res; - res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, - PAGE_SIZE << order); + res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); if (!res) - atomic_add(1 << order, &cachep->memcg_params->nr_pages); + atomic_add(nr_pages, &cachep->memcg_params->nr_pages); return res; } void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) { - memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); - atomic_sub(1 << order, &cachep->memcg_params->nr_pages); + unsigned int nr_pages = 1 << order; + + memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); + atomic_sub(nr_pages, &cachep->memcg_params->nr_pages); } /* @@ -3257,7 +3235,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) return true; } - ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); + ret = memcg_charge_kmem(memcg, gfp, 1 << order); if (!ret) *_memcg = memcg; @@ -3274,7 +3252,7 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, /* The page allocation failed. Revert */ if (!page) { - memcg_uncharge_kmem(memcg, PAGE_SIZE << order); + memcg_uncharge_kmem(memcg, 1 << order); return; } /* @@ -3307,7 +3285,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) return; VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - memcg_uncharge_kmem(memcg, PAGE_SIZE << order); + memcg_uncharge_kmem(memcg, 1 << order); } #else static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) @@ -3485,8 +3463,12 @@ static int mem_cgroup_move_parent(struct page *page, ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent); - if (!ret) - __mem_cgroup_cancel_local_charge(child, nr_pages); + if (!ret) { + /* Take charge off the local counters */ + page_counter_cancel(&child->memory, nr_pages); + if (do_swap_account) + page_counter_cancel(&child->memsw, nr_pages); + } if (nr_pages > 1) compound_unlock_irqrestore(page, flags); @@ -3516,7 +3498,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, * * Returns 0 on success, -EINVAL on failure. * - * The caller must have charged to @to, IOW, called res_counter_charge() about + * The caller must have charged to @to, IOW, called page_counter_charge() about * both res and memsw, and called css_get(). */ static int mem_cgroup_move_swap_account(swp_entry_t entry, @@ -3532,7 +3514,7 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, mem_cgroup_swap_statistics(to, true); /* * This function is only called from task migration context now. - * It postpones res_counter and refcount handling till the end + * It postpones page_counter and refcount handling till the end * of task migration(mem_cgroup_clear_mc()) for performance * improvement. But we cannot postpone css_get(to) because if * the process that has been moved to @to does swap-in, the @@ -3590,60 +3572,57 @@ void mem_cgroup_print_bad_page(struct page *page) } #endif +static DEFINE_MUTEX(memcg_limit_mutex); + static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long long val) + unsigned long limit) { + unsigned long curusage; + unsigned long oldusage; + bool enlarge = false; int retry_count; - int ret = 0; - int children = mem_cgroup_count_children(memcg); - u64 curusage, oldusage; - int enlarge; + int ret; /* * For keeping hierarchical_reclaim simple, how long we should retry * is depends on callers. We set our retry-count to be function * of # of children which we should visit in this loop. */ - retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; + retry_count = MEM_CGROUP_RECLAIM_RETRIES * + mem_cgroup_count_children(memcg); - oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); + oldusage = page_counter_read(&memcg->memory); - enlarge = 0; - while (retry_count) { + do { if (signal_pending(current)) { ret = -EINTR; break; } - /* - * Rather than hide all in some function, I do this in - * open coded manner. You see what this really does. - * We have to guarantee memcg->res.limit <= memcg->memsw.limit. - */ - mutex_lock(&set_limit_mutex); - if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) { + + mutex_lock(&memcg_limit_mutex); + if (limit > memcg->memsw.limit) { + mutex_unlock(&memcg_limit_mutex); ret = -EINVAL; - mutex_unlock(&set_limit_mutex); break; } - - if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val) - enlarge = 1; - - ret = res_counter_set_limit(&memcg->res, val); - mutex_unlock(&set_limit_mutex); + if (limit > memcg->memory.limit) + enlarge = true; + ret = page_counter_limit(&memcg->memory, limit); + mutex_unlock(&memcg_limit_mutex); if (!ret) break; try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); - curusage = res_counter_read_u64(&memcg->res, RES_USAGE); + curusage = page_counter_read(&memcg->memory); /* Usage is reduced ? */ if (curusage >= oldusage) retry_count--; else oldusage = curusage; - } + } while (retry_count); + if (!ret && enlarge) memcg_oom_recover(memcg); @@ -3651,52 +3630,53 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, } static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, - unsigned long long val) + unsigned long limit) { + unsigned long curusage; + unsigned long oldusage; + bool enlarge = false; int retry_count; - u64 oldusage, curusage; - int children = mem_cgroup_count_children(memcg); - int ret = -EBUSY; - int enlarge = 0; + int ret; /* see mem_cgroup_resize_res_limit */ - retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; - oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); - while (retry_count) { + retry_count = MEM_CGROUP_RECLAIM_RETRIES * + mem_cgroup_count_children(memcg); + + oldusage = page_counter_read(&memcg->memsw); + + do { if (signal_pending(current)) { ret = -EINTR; break; } - /* - * Rather than hide all in some function, I do this in - * open coded manner. You see what this really does. - * We have to guarantee memcg->res.limit <= memcg->memsw.limit. - */ - mutex_lock(&set_limit_mutex); - if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) { + + mutex_lock(&memcg_limit_mutex); + if (limit < memcg->memory.limit) { + mutex_unlock(&memcg_limit_mutex); ret = -EINVAL; - mutex_unlock(&set_limit_mutex); break; } - if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) - enlarge = 1; - ret = res_counter_set_limit(&memcg->memsw, val); - mutex_unlock(&set_limit_mutex); + if (limit > memcg->memsw.limit) + enlarge = true; + ret = page_counter_limit(&memcg->memsw, limit); + mutex_unlock(&memcg_limit_mutex); if (!ret) break; try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); - curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + curusage = page_counter_read(&memcg->memsw); /* Usage is reduced ? */ if (curusage >= oldusage) retry_count--; else oldusage = curusage; - } + } while (retry_count); + if (!ret && enlarge) memcg_oom_recover(memcg); + return ret; } @@ -3709,7 +3689,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, unsigned long reclaimed; int loop = 0; struct mem_cgroup_tree_per_zone *mctz; - unsigned long long excess; + unsigned long excess; unsigned long nr_scanned; if (order > 0) @@ -3763,7 +3743,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, } while (1); } __mem_cgroup_remove_exceeded(mz, mctz); - excess = res_counter_soft_limit_excess(&mz->memcg->res); + excess = soft_limit_excess(mz->memcg); /* * One school of thought says that we should not add * back the node to the tree if reclaim returns 0. @@ -3856,7 +3836,6 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) { int node, zid; - u64 usage; do { /* This is for making all *used* pages to be on LRU. */ @@ -3888,9 +3867,8 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) * right after the check. RES_USAGE should be safe as we always * charge before adding to the LRU. */ - usage = res_counter_read_u64(&memcg->res, RES_USAGE) - - res_counter_read_u64(&memcg->kmem, RES_USAGE); - } while (usage > 0); + } while (page_counter_read(&memcg->memory) - + page_counter_read(&memcg->kmem) > 0); } /* @@ -3930,7 +3908,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) /* we call try-to-free pages for make this cgroup empty */ lru_add_drain_all(); /* try to free all pages in this cgroup */ - while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { + while (nr_retries && page_counter_read(&memcg->memory)) { int progress; if (signal_pending(current)) @@ -4001,8 +3979,8 @@ out: return retval; } -static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) +static unsigned long tree_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx) { struct mem_cgroup *iter; long val = 0; @@ -4020,55 +3998,72 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { u64 val; - if (!mem_cgroup_is_root(memcg)) { + if (mem_cgroup_is_root(memcg)) { + val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); + val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); + if (swap) + val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); + } else { if (!swap) - return res_counter_read_u64(&memcg->res, RES_USAGE); + val = page_counter_read(&memcg->memory); else - return res_counter_read_u64(&memcg->memsw, RES_USAGE); + val = page_counter_read(&memcg->memsw); } - - /* - * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS - * as well as in MEM_CGROUP_STAT_RSS_HUGE. - */ - val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); - - if (swap) - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); - return val << PAGE_SHIFT; } +enum { + RES_USAGE, + RES_LIMIT, + RES_MAX_USAGE, + RES_FAILCNT, + RES_SOFT_LIMIT, +}; static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - enum res_type type = MEMFILE_TYPE(cft->private); - int name = MEMFILE_ATTR(cft->private); + struct page_counter *counter; - switch (type) { + switch (MEMFILE_TYPE(cft->private)) { case _MEM: - if (name == RES_USAGE) - return mem |
