summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorJohannes Weiner <hannes@cmpxchg.org>2020-12-29 02:38:22 +0000
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2021-01-09 13:37:37 +0100
commitaa93e9471afe103812badfd10be6509554312e72 (patch)
treee7f2af89648742c30e59f080045c56ab040cb487 /mm
parent955f8bc9eb69b3be9a7785015c726f7004ec36b9 (diff)
downloadlinux-aa93e9471afe103812badfd10be6509554312e72.tar.gz
linux-aa93e9471afe103812badfd10be6509554312e72.tar.bz2
linux-aa93e9471afe103812badfd10be6509554312e72.zip
mm: memcontrol: fix excessive complexity in memory.stat reporting
commit a983b5ebee57209c99f68c8327072f25e0e6e3da upstream We've seen memory.stat reads in top-level cgroups take up to fourteen seconds during a userspace bug that created tens of thousands of ghost cgroups pinned by lingering page cache. Even with a more reasonable number of cgroups, aggregating memory.stat is unnecessarily heavy. The complexity is this: nr_cgroups * nr_stat_items * nr_possible_cpus where the stat items are ~70 at this point. With 128 cgroups and 128 CPUs - decent, not enormous setups - reading the top-level memory.stat has to aggregate over a million per-cpu counters. This doesn't scale. Instead of spreading the source of truth across all CPUs, use the per-cpu counters merely to batch updates to shared atomic counters. This is the same as the per-cpu stocks we use for charging memory to the shared atomic page_counters, and also the way the global vmstat counters are implemented. Vmstat has elaborate spilling thresholds that depend on the number of CPUs, amount of memory, and memory pressure - carefully balancing the cost of counter updates with the amount of per-cpu error. That's because the vmstat counters are system-wide, but also used for decisions inside the kernel (e.g. NR_FREE_PAGES in the allocator). Neither is true for the memory controller. Use the same static batch size we already use for page_counter updates during charging. The per-cpu error in the stats will be 128k, which is an acceptable ratio of cores to memory accounting granularity. [hannes@cmpxchg.org: fix warning in __this_cpu_xchg() calls] Link: http://lkml.kernel.org/r/20171201135750.GB8097@cmpxchg.org Link: http://lkml.kernel.org/r/20171103153336.24044-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com> Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: stable@vger.kernel.org [shaoyi@amazon.com: resolved the conflict brought by commit 17ffa29c355658c8e9b19f56cbf0388500ca7905 in mm/memcontrol.c by contextual fix] Signed-off-by: Shaoying Xu <shaoyi@amazon.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/memcontrol.c101
1 files changed, 51 insertions, 50 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index eba9dc4795b5..4e763cdccb33 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -542,39 +542,10 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
return mz;
}
-/*
- * Return page count for single (non recursive) @memcg.
- *
- * Implementation Note: reading percpu statistics for memcg.
- *
- * Both of vmstat[] and percpu_counter has threshold and do periodic
- * synchronization to implement "quick" read. There are trade-off between
- * reading cost and precision of value. Then, we may have a chance to implement
- * a periodic synchronization of counter in memcg's counter.
- *
- * But this _read() function is used for user interface now. The user accounts
- * memory usage by memory cgroup and he _always_ requires exact value because
- * he accounts memory. Even if we provide quick-and-fuzzy read, we always
- * have to visit all online cpus and make sum. So, for now, unnecessary
- * synchronization is not implemented. (just implemented for cpu hotplug)
- *
- * If there are kernel internal actions which can make use of some not-exact
- * value, and reading all cpu value can be performance bottleneck in some
- * common workload, threshold and synchronization as vmstat[] should be
- * implemented.
- *
- * The parameter idx can be of type enum memcg_event_item or vm_event_item.
- */
-
static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
int event)
{
- unsigned long val = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- val += per_cpu(memcg->stat->events[event], cpu);
- return val;
+ return atomic_long_read(&memcg->events[event]);
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -606,7 +577,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
nr_pages = -nr_pages; /* for event */
}
- __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
+ __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
}
unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
@@ -642,8 +613,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
{
unsigned long val, next;
- val = __this_cpu_read(memcg->stat->nr_page_events);
- next = __this_cpu_read(memcg->stat->targets[target]);
+ val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
+ next = __this_cpu_read(memcg->stat_cpu->targets[target]);
/* from time_after() in jiffies.h */
if ((long)(next - val) < 0) {
switch (target) {
@@ -659,7 +630,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
default:
break;
}
- __this_cpu_write(memcg->stat->targets[target], next);
+ __this_cpu_write(memcg->stat_cpu->targets[target], next);
return true;
}
return false;
@@ -1726,11 +1697,6 @@ void unlock_page_memcg(struct page *page)
}
EXPORT_SYMBOL(unlock_page_memcg);
-/*
- * size of first charge trial. "32" comes from vmscan.c's magic value.
- * TODO: maybe necessary to use big numbers in big irons.
- */
-#define CHARGE_BATCH 32U
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
@@ -1758,7 +1724,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
unsigned long flags;
bool ret = false;
- if (nr_pages > CHARGE_BATCH)
+ if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
local_irq_save(flags);
@@ -1827,7 +1793,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
}
stock->nr_pages += nr_pages;
- if (stock->nr_pages > CHARGE_BATCH)
+ if (stock->nr_pages > MEMCG_CHARGE_BATCH)
drain_stock(stock);
local_irq_restore(flags);
@@ -1877,9 +1843,44 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
+ struct mem_cgroup *memcg;
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
+
+ for_each_mem_cgroup(memcg) {
+ int i;
+
+ for (i = 0; i < MEMCG_NR_STAT; i++) {
+ int nid;
+ long x;
+
+ x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
+ if (x)
+ atomic_long_add(x, &memcg->stat[i]);
+
+ if (i >= NR_VM_NODE_STAT_ITEMS)
+ continue;
+
+ for_each_node(nid) {
+ struct mem_cgroup_per_node *pn;
+
+ pn = mem_cgroup_nodeinfo(memcg, nid);
+ x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
+ if (x)
+ atomic_long_add(x, &pn->lruvec_stat[i]);
+ }
+ }
+
+ for (i = 0; i < MEMCG_NR_EVENTS; i++) {
+ long x;
+
+ x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
+ if (x)
+ atomic_long_add(x, &memcg->events[i]);
+ }
+ }
+
return 0;
}
@@ -1900,7 +1901,7 @@ static void high_work_func(struct work_struct *work)
struct mem_cgroup *memcg;
memcg = container_of(work, struct mem_cgroup, high_work);
- reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
+ reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
}
/*
@@ -1924,7 +1925,7 @@ void mem_cgroup_handle_over_high(void)
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
- unsigned int batch = max(CHARGE_BATCH, nr_pages);
+ unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
@@ -4203,8 +4204,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return 1;
- pn->lruvec_stat = alloc_percpu(struct lruvec_stat);
- if (!pn->lruvec_stat) {
+ pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
+ if (!pn->lruvec_stat_cpu) {
kfree(pn);
return 1;
}
@@ -4225,7 +4226,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return;
- free_percpu(pn->lruvec_stat);
+ free_percpu(pn->lruvec_stat_cpu);
kfree(pn);
}
@@ -4235,7 +4236,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
- free_percpu(memcg->stat);
+ free_percpu(memcg->stat_cpu);
kfree(memcg);
}
@@ -4264,8 +4265,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
if (memcg->id.id < 0)
goto fail;
- memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!memcg->stat)
+ memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
+ if (!memcg->stat_cpu)
goto fail;
for_each_node(node)
@@ -5686,7 +5687,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
__mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+ __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);