From bee07b33db78d4ee7ed6a2fe810b9473d5471fe4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 30 Aug 2019 16:04:32 -0700 Subject: mm: memcontrol: flush percpu slab vmstats on kmem offlining I've noticed that the "slab" value in memory.stat is sometimes 0, even if some children memory cgroups have a non-zero "slab" value. The following investigation showed that this is the result of the kmem_cache reparenting in combination with the per-cpu batching of slab vmstats. At the offlining some vmstat value may leave in the percpu cache, not being propagated upwards by the cgroup hierarchy. It means that stats on ancestor levels are lower than actual. Later when slab pages are released, the precise number of pages is substracted on the parent level, making the value negative. We don't show negative values, 0 is printed instead. To fix this issue, let's flush percpu slab memcg and lruvec stats on memcg offlining. This guarantees that numbers on all ancestor levels are accurate and match the actual number of outstanding slab pages. Link: http://lkml.kernel.org/r/20190819202338.363363-3-guro@fb.com Fixes: fb2f2b0adb98 ("mm: memcg/slab: reparent memcg kmem_caches on cgroup removal") Signed-off-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 5 +++-- mm/memcontrol.c | 35 +++++++++++++++++++++++++++-------- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d77d717c620c..3f38c30d2f13 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -215,8 +215,9 @@ enum node_stat_item { NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ - NR_SLAB_RECLAIMABLE, - NR_SLAB_UNRECLAIMABLE, + NR_SLAB_RECLAIMABLE, /* Please do not reorder this item */ + NR_SLAB_UNRECLAIMABLE, /* and this one without looking at + * memcg_flush_percpu_vmstats() first. */ NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_NODES, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 26e2999af608..1f585d6c77c1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3260,37 +3260,49 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg) +static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only) { unsigned long stat[MEMCG_NR_STAT]; struct mem_cgroup *mi; int node, cpu, i; + int min_idx, max_idx; - for (i = 0; i < MEMCG_NR_STAT; i++) + if (slab_only) { + min_idx = NR_SLAB_RECLAIMABLE; + max_idx = NR_SLAB_UNRECLAIMABLE; + } else { + min_idx = 0; + max_idx = MEMCG_NR_STAT; + } + + for (i = min_idx; i < max_idx; i++) stat[i] = 0; for_each_online_cpu(cpu) - for (i = 0; i < MEMCG_NR_STAT; i++) + for (i = min_idx; i < max_idx; i++) stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]); for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - for (i = 0; i < MEMCG_NR_STAT; i++) + for (i = min_idx; i < max_idx; i++) atomic_long_add(stat[i], &mi->vmstats[i]); + if (!slab_only) + max_idx = NR_VM_NODE_STAT_ITEMS; + for_each_node(node) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; struct mem_cgroup_per_node *pi; - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + for (i = min_idx; i < max_idx; i++) stat[i] = 0; for_each_online_cpu(cpu) - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + for (i = min_idx; i < max_idx; i++) stat[i] += raw_cpu_read( pn->lruvec_stat_cpu->count[i]); for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + for (i = min_idx; i < max_idx; i++) atomic_long_add(stat[i], &pi->lruvec_stat[i]); } } @@ -3363,7 +3375,14 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) if (!parent) parent = root_mem_cgroup; + /* + * Deactivate and reparent kmem_caches. Then flush percpu + * slab statistics to have precise values at the parent and + * all ancestor levels. It's required to keep slab stats + * accurate after the reparenting of kmem_caches. + */ memcg_deactivate_kmem_caches(memcg, parent); + memcg_flush_percpu_vmstats(memcg, true); kmemcg_id = memcg->kmemcg_id; BUG_ON(kmemcg_id < 0); @@ -4740,7 +4759,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) * Flush percpu vmstats and vmevents to guarantee the value correctness * on parent's and all ancestor levels. */ - memcg_flush_percpu_vmstats(memcg); + memcg_flush_percpu_vmstats(memcg, false); memcg_flush_percpu_vmevents(memcg); for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); -- cgit v1.2.3 From 441e254cd40dc03beec3c650ce6ce6074bc6517f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 30 Aug 2019 16:04:35 -0700 Subject: mm/zsmalloc.c: fix build when CONFIG_COMPACTION=n Fixes: 701d678599d0c1 ("mm/zsmalloc.c: fix race condition in zs_destroy_pool") Link: http://lkml.kernel.org/r/201908251039.5oSbEEUT%25lkp@intel.com Reported-by: kbuild test robot Cc: Sergey Senozhatsky Cc: Henry Burns Cc: Minchan Kim Cc: Shakeel Butt Cc: Jonathan Adams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 08def3a0d200..e98bb6ab4f7e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2412,7 +2412,9 @@ struct zs_pool *zs_create_pool(const char *name) if (!pool->name) goto err; +#ifdef CONFIG_COMPACTION init_waitqueue_head(&pool->migration_wait); +#endif if (create_cache(pool)) goto err; -- cgit v1.2.3 From b4c46484dc3fa3721d68fdfae85c1d7b1f6b5472 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 30 Aug 2019 16:04:39 -0700 Subject: mm, memcg: partially revert "mm/memcontrol.c: keep local VM counters in sync with the hierarchical ones" Commit 766a4c19d880 ("mm/memcontrol.c: keep local VM counters in sync with the hierarchical ones") effectively decreased the precision of per-memcg vmstats_local and per-memcg-per-node lruvec percpu counters. That's good for displaying in memory.stat, but brings a serious regression into the reclaim process. One issue I've discovered and debugged is the following: lruvec_lru_size() can return 0 instead of the actual number of pages in the lru list, preventing the kernel to reclaim last remaining pages. Result is yet another dying memory cgroups flooding. The opposite is also happening: scanning an empty lru list is the waste of cpu time. Also, inactive_list_is_low() can return incorrect values, preventing the active lru from being scanned and freed. It can fail both because the size of active and inactive lists are inaccurate, and because the number of workingset refaults isn't precise. In other words, the result is pretty random. I'm not sure, if using the approximate number of slab pages in count_shadow_number() is acceptable, but issues described above are enough to partially revert the patch. Let's keep per-memcg vmstat_local batched (they are only used for displaying stats to the userspace), but keep lruvec stats precise. This change fixes the dead memcg flooding on my setup. Link: http://lkml.kernel.org/r/20190817004726.2530670-1-guro@fb.com Fixes: 766a4c19d880 ("mm/memcontrol.c: keep local VM counters in sync with the hierarchical ones") Signed-off-by: Roman Gushchin Acked-by: Yafang Shao Cc: Johannes Weiner Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1f585d6c77c1..a247cb163245 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -752,15 +752,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update memcg */ __mod_memcg_state(memcg, idx, val); + /* Update lruvec */ + __this_cpu_add(pn->lruvec_stat_local->count[idx], val); + x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { struct mem_cgroup_per_node *pi; - /* - * Batch local counters to keep them in sync with - * the hierarchical ones. - */ - __this_cpu_add(pn->lruvec_stat_local->count[idx], x); for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) atomic_long_add(x, &pi->lruvec_stat[idx]); x = 0; -- cgit v1.2.3 From 14108b9131a47ff18a3c640f583eb2d625c75c0d Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 30 Aug 2019 16:04:43 -0700 Subject: mm/z3fold.c: fix lock/unlock imbalance in z3fold_page_isolate Fix lock/unlock imbalance by unlocking *zhdr* before return. Addresses Coverity ID 1452811 ("Missing unlock") Link: http://lkml.kernel.org/r/20190826030634.GA4379@embeddedor Fixes: d776aaa9895e ("mm/z3fold.c: fix race between migration and destruction") Signed-off-by: Gustavo A. R. Silva Reviewed-by: Andrew Morton Cc: Henry Burns Cc: Vitaly Wool Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/z3fold.c b/mm/z3fold.c index e31cd9bd4ed5..75b7962439ff 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1406,6 +1406,7 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) * should freak out. */ WARN(1, "Z3fold is experiencing kref problems\n"); + z3fold_page_unlock(zhdr); return false; } z3fold_page_unlock(zhdr); -- cgit v1.2.3 From a6c135bb1a59b5d67c8c45b214d3427d65dd7c00 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Fri, 30 Aug 2019 16:04:46 -0700 Subject: mailmap: add aliases for Dmitry Safonov I don't work for Virtuozzo or Samsung anymore and I've noticed that they have started sending annoying html email-replies. And I prioritize my personal emails over work email box, so while at it add an entry for Arista too - so I can reply faster when needed. Link: http://lkml.kernel.org/r/20190827220346.11123-1-dima@arista.com Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .mailmap | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.mailmap b/.mailmap index ebdca3fba91f..afaad605284a 100644 --- a/.mailmap +++ b/.mailmap @@ -64,6 +64,9 @@ Dengcheng Zhu Dengcheng Zhu Dengcheng Zhu Dmitry Eremin-Solenikov +Dmitry Safonov <0x7f454c46@gmail.com> +Dmitry Safonov <0x7f454c46@gmail.com> +Dmitry Safonov <0x7f454c46@gmail.com> Domen Puncer Douglas Gilbert Ed L. Cashin -- cgit v1.2.3 From d2e5fb927ee7f52c1fe2a98b554881e9dffd8514 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 30 Aug 2019 16:04:50 -0700 Subject: mm, memcg: do not set reclaim_state on soft limit reclaim Adric Blake has noticed[1] the following warning: WARNING: CPU: 7 PID: 175 at mm/vmscan.c:245 set_task_reclaim_state+0x1e/0x40 [...] Call Trace: mem_cgroup_shrink_node+0x9b/0x1d0 mem_cgroup_soft_limit_reclaim+0x10c/0x3a0 balance_pgdat+0x276/0x540 kswapd+0x200/0x3f0 ? wait_woken+0x80/0x80 kthread+0xfd/0x130 ? balance_pgdat+0x540/0x540 ? kthread_park+0x80/0x80 ret_from_fork+0x35/0x40 ---[ end trace 727343df67b2398a ]--- which tells us that soft limit reclaim is about to overwrite the reclaim_state configured up in the call chain (kswapd in this case but the direct reclaim is equally possible). This means that reclaim stats would get misleading once the soft reclaim returns and another reclaim is done. Fix the warning by dropping set_task_reclaim_state from the soft reclaim which is always called with reclaim_state set up. [1] http://lkml.kernel.org/r/CAE1jjeePxYPvw1mw2B3v803xHVR_BNnz0hQUY_JDMN8ny29M6w@mail.gmail.com Link: http://lkml.kernel.org/r/20190828071808.20410-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Adric Blake Acked-by: Yafang Shao Acked-by: Yang Shi Cc: Johannes Weiner Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c77d1e3761a7..a6c5d0b28321 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3220,6 +3220,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_MEMCG +/* Only used by soft limit reclaim. Do not reuse for anything else. */ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, @@ -3235,7 +3236,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, }; unsigned long lru_pages; - set_task_reclaim_state(current, &sc.reclaim_state); + WARN_ON_ONCE(!current->reclaim_state); + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -3253,7 +3255,6 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); - set_task_reclaim_state(current, NULL); *nr_scanned = sc.nr_scanned; return sc.nr_reclaimed; -- cgit v1.2.3 From 6c1c280805ded72eceb2afc1a0d431b256608554 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 30 Aug 2019 16:04:53 -0700 Subject: mm: memcontrol: fix percpu vmstats and vmevents flush Instead of using raw_cpu_read() use per_cpu() to read the actual data of the corresponding cpu otherwise we will be reading the data of the current cpu for the number of online CPUs. Link: http://lkml.kernel.org/r/20190829203110.129263-1-shakeelb@google.com Fixes: bb65f89b7d3d ("mm: memcontrol: flush percpu vmevents before releasing memcg") Fixes: c350a99ea2b1 ("mm: memcontrol: flush percpu vmstats before releasing memcg") Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a247cb163245..9ec5e12486a7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3278,7 +3278,7 @@ static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only) for_each_online_cpu(cpu) for (i = min_idx; i < max_idx; i++) - stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]); + stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu); for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) for (i = min_idx; i < max_idx; i++) @@ -3296,8 +3296,8 @@ static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only) for_each_online_cpu(cpu) for (i = min_idx; i < max_idx; i++) - stat[i] += raw_cpu_read( - pn->lruvec_stat_cpu->count[i]); + stat[i] += per_cpu( + pn->lruvec_stat_cpu->count[i], cpu); for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) for (i = min_idx; i < max_idx; i++) @@ -3316,8 +3316,8 @@ static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg) for_each_online_cpu(cpu) for (i = 0; i < NR_VM_EVENT_ITEMS; i++) - events[i] += raw_cpu_read( - memcg->vmstats_percpu->events[i]); + events[i] += per_cpu(memcg->vmstats_percpu->events[i], + cpu); for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) for (i = 0; i < NR_VM_EVENT_ITEMS; i++) -- cgit v1.2.3