From 44de9d0cad41f2c51ef26916842be046b582dcc9 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Tue, 31 Jul 2012 16:41:49 -0700 Subject: mm: account the total_vm in the vm_stat_account() vm_stat_account() currently accounts shared_vm, stack_vm and reserved_vm. We can account total_vm there as well, which makes the code tidier; even mprotect_fixup() then ends up with the right result. Signed-off-by: Huang Shijie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f9f279cf5b1b..3955bedeeed1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1528,6 +1528,7 @@ void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); static inline void vm_stat_account(struct mm_struct *mm, unsigned long flags, struct file *file, long pages) { + mm->total_vm += pages; } #endif /* CONFIG_PROC_FS */ -- cgit v1.2.3 From 3965c9ae47d64aadf6f13b6fcd37767b83c0689a Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 31 Jul 2012 16:41:52 -0700 Subject: mm: prepare for removal of obsolete /proc/sys/vm/nr_pdflush_threads Since per-BDI flusher threads were introduced in 2.6, the pdflush mechanism is not used any more. But the old interface exported through /proc/sys/vm/nr_pdflush_threads still exists and is obviously useless. For backward compatibility, print a warning and return 2 to notify users that the interface has been removed. Signed-off-by: Wanpeng Li Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 3 +++ include/linux/writeback.h | 5 ----- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 489de625cd25..c97c6b9cd38e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -17,6 +17,7 @@ #include #include #include +#include struct page; struct device; @@ -304,6 +305,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync); void set_bdi_congested(struct backing_dev_info *bdi, int sync); long congestion_wait(int sync, long timeout); long wait_iff_congested(struct zone *zone, int sync, long timeout); +int pdflush_proc_obsolete(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 6d0a0fcd80e7..c66fe3332d83 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -189,9 +189,4 @@ void tag_pages_for_writeback(struct address_space *mapping, void account_page_redirty(struct page *page); -/* pdflush.c */ -extern int nr_pdflush_threads; /* Global so it can be exported to sysctl - read-only. */ - - #endif /* WRITEBACK_H */ -- cgit v1.2.3 From 972dc4de13f667a7df27ee32842be2e6fc6cc8434 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:00 -0700 Subject: hugetlb: add an inline helper for finding hstate index Add an inline helper and use it in the code.
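For reference, hstate_index() just encodes the h - hstates pointer arithmetic that callers previously open-coded; later patches in this series key the hugetlb cgroup charge API by this index. A caller-side sketch (the wrapper function here is illustrative only; hugetlb_cgroup_charge_cgroup() is declared by a later patch below):

        /*
         * Illustrative caller: convert an hstate pointer to its array index
         * once, instead of open-coding h - hstates at every call site.
         */
        static int example_charge_hstate(struct hstate *h, unsigned long nr_pages,
                                         struct hugetlb_cgroup **ptr)
        {
                int idx = hstate_index(h);      /* h - hstates */

                return hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr);
        }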
Signed-off-by: Aneesh Kumar K.V Acked-by: David Rientjes Acked-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d5d6bbe2259e..217f52859fa7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -302,6 +302,11 @@ static inline unsigned hstate_index_to_shift(unsigned index) return hstates[index].order + PAGE_SHIFT; } +static inline int hstate_index(struct hstate *h) +{ + return h - hstates; +} + #else struct hstate {}; #define alloc_huge_page_node(h, nid) NULL @@ -320,6 +325,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h) return 1; } #define hstate_index_to_shift(index) 0 +#define hstate_index(h) 0 #endif #endif /* _LINUX_HUGETLB_H */ -- cgit v1.2.3 From 24669e58477e2752c1fbca9c1c988e9dd0d79d15 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:03 -0700 Subject: hugetlb: use mmu_gather instead of a temporary linked list for accumulating pages Use a mmu_gather instead of a temporary linked list for accumulating pages when we unmap a hugepage range Signed-off-by: Aneesh Kumar K.V Reviewed-by: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Hillf Danton Cc: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 217f52859fa7..0f23c1840c9b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -7,6 +7,7 @@ struct ctl_table; struct user_struct; +struct mmu_gather; #ifdef CONFIG_HUGETLB_PAGE @@ -40,9 +41,10 @@ int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int, unsigned int flags); void unmap_hugepage_range(struct vm_area_struct *, - unsigned long, unsigned long, struct page *); -void __unmap_hugepage_range(struct vm_area_struct *, - unsigned long, unsigned long, struct page *); + unsigned long, unsigned long, struct page *); +void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct page *ref_page); int hugetlb_prefault(struct address_space *, struct vm_area_struct *); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(int, char *); @@ -98,7 +100,6 @@ static inline unsigned long hugetlb_total_pages(void) #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) -#define unmap_hugepage_range(vma, start, end, page) BUG() static inline void hugetlb_report_meminfo(struct seq_file *m) { } @@ -112,13 +113,24 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) #define huge_pte_offset(mm, address) 0 -#define dequeue_hwpoisoned_huge_page(page) 0 +static inline int dequeue_hwpoisoned_huge_page(struct page *page) +{ + return 0; +} + static inline void copy_huge_page(struct page *dst, struct page *src) { } #define hugetlb_change_protection(vma, address, end, newprot) +static inline void 
__unmap_hugepage_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct page *ref_page) +{ + BUG(); +} + #endif /* !CONFIG_HUGETLB_PAGE */ #define HUGETLB_ANON_FILE "anon_hugepage" -- cgit v1.2.3 From 189ebff2894a9d0f4e250dd1e154d282ef0a6779 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:06 -0700 Subject: hugetlb: simplify migrate_huge_page() Since we migrate only one hugepage, don't use a linked list for passing the page around. Directly pass the page that needs to be migrated as the argument. This also removes the usage of page->lru in the migrate path. Signed-off-by: Aneesh Kumar K.V Reviewed-by: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 855c337b20c3..ce7e6671968b 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -15,7 +15,7 @@ extern int migrate_page(struct address_space *, extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode); -extern int migrate_huge_pages(struct list_head *l, new_page_t x, +extern int migrate_huge_page(struct page *, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode); @@ -36,7 +36,7 @@ static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode) { return -ENOSYS; } -static inline int migrate_huge_pages(struct list_head *l, new_page_t x, +static inline int migrate_huge_page(struct page *page, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode) { return -ENOSYS; } -- cgit v1.2.3 From 0edaecfab218d747d30de4575e911907371e2cd2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:07 -0700 Subject: hugetlb: add a list for tracking in-use HugeTLB pages hugepage_activelist will be used to track currently used HugeTLB pages. We need to find the in-use HugeTLB pages to support HugeTLB cgroup removal. On cgroup removal we update the page's HugeTLB cgroup to point to the parent cgroup.
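A sketch of how a cgroup-removal path can walk such a list (hugetlb_cgroup_move_parent() is a hypothetical helper standing in for the reparenting code that later patches add; hugetlb_lock is exported by the next patch below):

        static void example_reparent_active_pages(struct hstate *h)
        {
                struct page *page;

                spin_lock(&hugetlb_lock);
                /* every in-use hugepage of this size is now discoverable... */
                list_for_each_entry(page, &h->hugepage_activelist, lru)
                        /* ...so its charge can be handed to the parent cgroup */
                        hugetlb_cgroup_move_parent(hstate_index(h), page);
                spin_unlock(&hugetlb_lock);
        }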
Signed-off-by: Aneesh Kumar K.V Reviewed-by: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0f23c1840c9b..ed550d819044 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -211,6 +211,7 @@ struct hstate { unsigned long resv_huge_pages; unsigned long surplus_huge_pages; unsigned long nr_overcommit_huge_pages; + struct list_head hugepage_activelist; struct list_head hugepage_freelists[MAX_NUMNODES]; unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; -- cgit v1.2.3 From c3f38a38715e9b4e7b1dda6840a1375f6894744d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:10 -0700 Subject: hugetlb: make some static variables global We will use them later in hugetlb_cgroup.c. Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ed550d819044..3d677dd41898 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -21,6 +21,11 @@ struct hugepage_subpool { long max_hpages, used_hpages; }; +extern spinlock_t hugetlb_lock; +extern int hugetlb_max_hstate __read_mostly; +#define for_each_hstate(h) \ + for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) + struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); void hugepage_put_subpool(struct hugepage_subpool *spool); -- cgit v1.2.3 From 2bc64a2046975410505bb119bba32705892b9255 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:12 -0700 Subject: mm/hugetlb: add new HugeTLB cgroup Implement a new controller that allows us to control HugeTLB allocations. The controller allows limiting HugeTLB usage per control group and enforces the limit during page fault. Since HugeTLB doesn't support page reclaim, enforcing the limit at page fault time implies that the application will get a SIGBUS signal if it tries to access HugeTLB pages beyond its limit. This requires the application to know beforehand how many HugeTLB pages it would require for its use. The charge/uncharge calls will be added to HugeTLB code in a later patch. Support for cgroup removal will be added in later patches.
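The controller state itself lives in mm/hugetlb_cgroup.c, which this include/linux-limited view omits. A sketch of the core structure, reconstructed from the changelog (one resource counter per huge page size; the exact layout is an assumption):

        struct hugetlb_cgroup {
                struct cgroup_subsys_state css;
                /* usage/limit accounting, one counter per hstate */
                struct res_counter hugepage[HUGE_MAX_HSTATE];
        };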
[akpm@linux-foundation.org: s/CONFIG_CGROUP_HUGETLB_RES_CTLR/CONFIG_MEMCG_HUGETLB/g] [akpm@linux-foundation.org: s/CONFIG_MEMCG_HUGETLB/CONFIG_CGROUP_HUGETLB/g] Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup_subsys.h | 6 ++++++ include/linux/hugetlb_cgroup.h | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 include/linux/hugetlb_cgroup.h (limited to 'include/linux') diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 0bd390ce98b2..5b41ce079024 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -72,3 +72,9 @@ SUBSYS(net_prio) #endif /* */ + +#ifdef CONFIG_CGROUP_HUGETLB +SUBSYS(hugetlb) +#endif + +/* */ diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h new file mode 100644 index 000000000000..f19889e56b47 --- /dev/null +++ b/include/linux/hugetlb_cgroup.h @@ -0,0 +1,37 @@ +/* + * Copyright IBM Corporation, 2012 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#ifndef _LINUX_HUGETLB_CGROUP_H +#define _LINUX_HUGETLB_CGROUP_H + +#include + +struct hugetlb_cgroup; + +#ifdef CONFIG_CGROUP_HUGETLB +static inline bool hugetlb_cgroup_disabled(void) +{ + if (hugetlb_subsys.disabled) + return true; + return false; +} + +#else +static inline bool hugetlb_cgroup_disabled(void) +{ + return true; +} + +#endif /* CONFIG_MEM_RES_CTLR_HUGETLB */ +#endif -- cgit v1.2.3 From 9dd540e23111d8884773ab942a736f3aba4040d4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:15 -0700 Subject: hugetlb/cgroup: add the cgroup pointer to page lru Add the hugetlb cgroup pointer to the 3rd page's lru.next. This limits hugetlb cgroup usage to hugepages with 3 or more normal pages. I guess that is an acceptable limitation. Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb_cgroup.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index f19889e56b47..e5451a3b4ebc 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -18,8 +18,34 @@ #include struct hugetlb_cgroup; +/* + * Minimum page order trackable by hugetlb cgroup. + * At least 3 pages are necessary for all the tracking information.
+ */ +#define HUGETLB_CGROUP_MIN_ORDER 2 #ifdef CONFIG_CGROUP_HUGETLB + +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) +{ + VM_BUG_ON(!PageHuge(page)); + + if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) + return NULL; + return (struct hugetlb_cgroup *)page[2].lru.next; +} + +static inline +int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) +{ + VM_BUG_ON(!PageHuge(page)); + + if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) + return -1; + page[2].lru.next = (void *)h_cg; + return 0; +} + static inline bool hugetlb_cgroup_disabled(void) { if (hugetlb_subsys.disabled) @@ -28,6 +54,17 @@ static inline bool hugetlb_cgroup_disabled(void) } #else +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) +{ + return NULL; +} + +static inline +int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) +{ + return 0; +} + static inline bool hugetlb_cgroup_disabled(void) { return true; -- cgit v1.2.3 From 6d76dcf40405144a448040a350fd214ddc243d5e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:18 -0700 Subject: hugetlb/cgroup: add charge/uncharge routines for hugetlb cgroup Add the charge and uncharge routines for hugetlb cgroup. We do cgroup charging in page alloc and uncharge in compound page destructor. Assigning page's hugetlb cgroup is protected by hugetlb_lock. [liwp@linux.vnet.ibm.com: add huge_page_order check to avoid incorrect uncharge] Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Cc: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Aneesh Kumar K.V Signed-off-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb_cgroup.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index e5451a3b4ebc..7d3fde996be3 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -53,6 +53,16 @@ static inline bool hugetlb_cgroup_disabled(void) return false; } +extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup **ptr); +extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg, + struct page *page); +extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, + struct page *page); +extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg); + #else static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) { @@ -70,5 +80,33 @@ static inline bool hugetlb_cgroup_disabled(void) return true; } +static inline int +hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup **ptr) +{ + return 0; +} + +static inline void +hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg, + struct page *page) +{ + return; +} + +static inline void +hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page) +{ + return; +} + +static inline void +hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg) +{ + return; +} + #endif /* CONFIG_MEM_RES_CTLR_HUGETLB */ #endif -- cgit v1.2.3 From abb8206cb07734d0b7bf033c715995d6371a94c3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:24 -0700 Subject: hugetlb/cgroup: add hugetlb cgroup 
control files Add the control files for hugetlb controller [akpm@linux-foundation.org: s/CONFIG_CGROUP_HUGETLB_RES_CTLR/CONFIG_MEMCG_HUGETLB/g] [akpm@linux-foundation.org: s/CONFIG_MEMCG_HUGETLB/CONFIG_CGROUP_HUGETLB/] Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 +++++ include/linux/hugetlb_cgroup.h | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3d677dd41898..f9db20bfa9fc 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -4,6 +4,7 @@ #include #include #include +#include struct ctl_table; struct user_struct; @@ -221,6 +222,10 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; +#ifdef CONFIG_CGROUP_HUGETLB + /* cgroup control files */ + struct cftype cgroup_files[5]; +#endif char name[HSTATE_NAME_LEN]; }; diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 7d3fde996be3..73f1e600fc12 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -62,6 +62,7 @@ extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page); extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); +extern int hugetlb_cgroup_file_init(int idx) __init; #else static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) @@ -108,5 +109,10 @@ hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, return; } +static inline int __init hugetlb_cgroup_file_init(int idx) +{ + return 0; +} + #endif /* CONFIG_MEM_RES_CTLR_HUGETLB */ #endif -- cgit v1.2.3 From 8e6ac7fab374816de9a8b0a8fbb02ef761a30ff4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:27 -0700 Subject: hugetlb/cgroup: migrate hugetlb cgroup info from oldpage to new page during migration With HugeTLB pages, hugetlb cgroup is uncharged in compound page destructor. Since we are holding a hugepage reference, we can be sure that old page won't get uncharged till the last put_page(). 
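Expressed with the helpers declared above, the migration hook needs to do roughly the following (a simplified sketch; the real mm/hugetlb_cgroup.c version also has to coordinate with concurrent cgroup removal):

        static void example_hugetlb_cgroup_migrate(struct page *oldhpage,
                                                   struct page *newhpage)
        {
                struct hugetlb_cgroup *h_cg;
                struct hstate *h = page_hstate(oldhpage);

                if (hugetlb_cgroup_disabled())
                        return;

                spin_lock(&hugetlb_lock);
                h_cg = hugetlb_cgroup_from_page(oldhpage);
                set_hugetlb_cgroup(oldhpage, NULL);
                /* hand the charge over to the new page... */
                set_hugetlb_cgroup(newhpage, h_cg);
                /* ...and keep tracking it as in-use */
                list_move(&newhpage->lru, &h->hugepage_activelist);
                spin_unlock(&hugetlb_lock);
        }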
Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Cc: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb_cgroup.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 73f1e600fc12..d73878c694b3 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -63,6 +63,8 @@ extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); extern int hugetlb_cgroup_file_init(int idx) __init; +extern void hugetlb_cgroup_migrate(struct page *oldhpage, + struct page *newhpage); #else static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) @@ -114,5 +116,11 @@ static inline int __init hugetlb_cgroup_file_init(int idx) return 0; } +static inline void hugetlb_cgroup_migrate(struct page *oldhpage, + struct page *newhpage) +{ + return; +} + #endif /* CONFIG_MEM_RES_CTLR_HUGETLB */ #endif -- cgit v1.2.3 From c59e26104e3e0e952cd7d63e79cd71ee5a9ec25a Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 31 Jul 2012 16:42:49 -0700 Subject: mm/compaction: cleanup on compaction_deferred When CONFIG_COMPACTION is enabled, compaction_deferred() tries to recalculate the deferred limit again, which isn't necessary. When CONFIG_COMPACTION is disabled, compaction_deferred() should return "true" or "false" since it has "bool" for its return value. Signed-off-by: Gavin Shan Acked-by: Minchan Kim Acked-by: Johannes Weiner Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 51a90b7f2d60..133ddcf83397 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -58,7 +58,7 @@ static inline bool compaction_deferred(struct zone *zone, int order) if (++zone->compact_considered > defer_limit) zone->compact_considered = defer_limit; - return zone->compact_considered < (1UL << zone->compact_defer_shift); + return zone->compact_considered < defer_limit; } #else @@ -85,7 +85,7 @@ static inline void defer_compaction(struct zone *zone, int order) static inline bool compaction_deferred(struct zone *zone, int order) { - return 1; + return true; } #endif /* CONFIG_COMPACTION */ -- cgit v1.2.3 From c255a458055e459f65eb7b7f51dc5dbdd0caf1d8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Jul 2012 16:43:02 -0700 Subject: memcg: rename config variables Sanity: CONFIG_CGROUP_MEM_RES_CTLR -> CONFIG_MEMCG CONFIG_CGROUP_MEM_RES_CTLR_SWAP -> CONFIG_MEMCG_SWAP CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED -> CONFIG_MEMCG_SWAP_ENABLED CONFIG_CGROUP_MEM_RES_CTLR_KMEM -> CONFIG_MEMCG_KMEM [mhocko@suse.cz: fix missed bits] Cc: Glauber Costa Acked-by: Michal Hocko Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Aneesh Kumar K.V Cc: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup_subsys.h | 2 +- include/linux/memcontrol.h | 14 +++++++------- include/linux/mmzone.h | 8 ++++---- include/linux/page_cgroup.h | 10 +++++----- include/linux/sched.h | 2 +- include/linux/swap.h | 6 +++--- 6 files changed, 21 insertions(+), 21 deletions(-) (limited to 
'include/linux') diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 5b41ce079024..dfae957398c3 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -31,7 +31,7 @@ SUBSYS(cpuacct) /* */ -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG SUBSYS(mem_cgroup) #endif diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 83e7ba90d6e5..170076222431 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -38,7 +38,7 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG /* * All "charge" functions with gfp_mask should use GFP_KERNEL or * (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't @@ -124,7 +124,7 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, extern void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; #endif @@ -193,7 +193,7 @@ void mem_cgroup_split_huge_fixup(struct page *head); bool mem_cgroup_bad_page_check(struct page *page); void mem_cgroup_print_bad_page(struct page *page); #endif -#else /* CONFIG_CGROUP_MEM_RES_CTLR */ +#else /* CONFIG_MEMCG */ struct mem_cgroup; static inline int mem_cgroup_newpage_charge(struct page *page, @@ -384,9 +384,9 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage) { } -#endif /* CONFIG_CGROUP_MEM_RES_CTLR */ +#endif /* CONFIG_MEMCG */ -#if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) +#if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM) static inline bool mem_cgroup_bad_page_check(struct page *page) { @@ -406,7 +406,7 @@ enum { }; struct sock; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM void sock_update_memcg(struct sock *sk); void sock_release_memcg(struct sock *sk); #else @@ -416,6 +416,6 @@ static inline void sock_update_memcg(struct sock *sk) static inline void sock_release_memcg(struct sock *sk) { } -#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ +#endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 458988bd55a1..3bdfa15b2012 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -201,7 +201,7 @@ struct zone_reclaim_stat { struct lruvec { struct list_head lists[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG struct zone *zone; #endif }; @@ -671,7 +671,7 @@ typedef struct pglist_data { int nr_zones; #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ struct page *node_mem_map; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG struct page_cgroup *node_page_cgroup; #endif #endif @@ -736,7 +736,7 @@ extern void lruvec_init(struct lruvec *lruvec, struct zone *zone); static inline struct zone *lruvec_zone(struct lruvec *lruvec) { -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG return lruvec->zone; #else return container_of(lruvec, struct zone, lruvec); @@ -1052,7 +1052,7 @@ struct mem_section { /* See declaration of similar field in struct zone */ unsigned long *pageblock_flags; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG /* * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use * section. (see memcontrol.h/page_cgroup.h about this.) 
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index a88cdba27809..777a524716db 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -12,7 +12,7 @@ enum { #ifndef __GENERATING_BOUNDS_H #include -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG #include /* @@ -82,7 +82,7 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc) bit_spin_unlock(PCG_LOCK, &pc->flags); } -#else /* CONFIG_CGROUP_MEM_RES_CTLR */ +#else /* CONFIG_MEMCG */ struct page_cgroup; static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) @@ -102,11 +102,11 @@ static inline void __init page_cgroup_init_flatmem(void) { } -#endif /* CONFIG_CGROUP_MEM_RES_CTLR */ +#endif /* CONFIG_MEMCG */ #include -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new); extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); @@ -138,7 +138,7 @@ static inline void swap_cgroup_swapoff(int type) return; } -#endif /* CONFIG_CGROUP_MEM_RES_CTLR_SWAP */ +#endif /* CONFIG_MEMCG_SWAP */ #endif /* !__GENERATING_BOUNDS_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 68dcffaa62a0..865725adb9d3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1584,7 +1584,7 @@ struct task_struct { /* bitmask and counter of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ -#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */ +#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ struct memcg_batch_info { int do_batch; /* incremented when batch uncharge started */ struct mem_cgroup *memcg; /* target memcg of uncharge */ diff --git a/include/linux/swap.h b/include/linux/swap.h index c84ec68eaec9..9a16bb1cefd1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -301,7 +301,7 @@ static inline void scan_unevictable_unregister_node(struct node *node) extern int kswapd_run(int nid); extern void kswapd_stop(int nid); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG extern int mem_cgroup_swappiness(struct mem_cgroup *mem); #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) @@ -309,7 +309,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) return vm_swappiness; } #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_uncharge_swap(swp_entry_t ent); #else static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) @@ -360,7 +360,7 @@ extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG extern void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); #else -- cgit v1.2.3 From ca28ddc908fcfef0e5c1b6e5df632db7fc26de10 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 31 Jul 2012 16:43:04 -0700 Subject: mm: remove unused LRU_ALL_EVICTABLE Signed-off-by: Wanpeng Li Acked-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3bdfa15b2012..1495d952e641 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -209,7 +209,6 @@ struct lruvec { /* Mask used at gathering information at once (see memcontrol.c) */ #define LRU_ALL_FILE 
(BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) -#define LRU_ALL_EVICTABLE (LRU_ALL_FILE | LRU_ALL_ANON) #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) /* Isolate clean file */ -- cgit v1.2.3 From 7db8889ab05b57200158432755af318fb68854a2 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 31 Jul 2012 16:43:12 -0700 Subject: mm: have order > 0 compaction start off where it left Order > 0 compaction stops when enough free pages of the correct page order have been coalesced. When doing subsequent higher order allocations, it is possible for compaction to be invoked many times. However, the compaction code always starts out looking for things to compact at the start of the zone, and for free pages to compact things to at the end of the zone. This can cause quadratic behaviour, with isolate_freepages starting at the end of the zone each time, even though previous invocations of the compaction code already filled up all free memory on that end of the zone. This can cause isolate_freepages to take enormous amounts of CPU with certain workloads on larger memory systems. The obvious solution is to have isolate_freepages remember where it left off last time, and continue at that point the next time it gets invoked for an order > 0 compaction. This could cause compaction to fail if cc->free_pfn and cc->migrate_pfn are close together initially, in that case we restart from the end of the zone and try once more. Forced full (order == -1) compactions are left alone. [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: s/laste/last/, use 80 cols] Signed-off-by: Rik van Riel Reported-by: Jim Schutt Tested-by: Jim Schutt Cc: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1495d952e641..1aeadce4d56e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -368,6 +368,10 @@ struct zone { */ spinlock_t lock; int all_unreclaimable; /* All pages pinned */ +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + /* pfn where the last incremental compaction isolated free pages */ + unsigned long compact_cached_free_pfn; +#endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; -- cgit v1.2.3 From fe03025db3f4ade1f231b174938e0fe224722759 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Tue, 31 Jul 2012 16:43:14 -0700 Subject: mm: CONFIG_HAVE_MEMBLOCK_NODE -> CONFIG_HAVE_MEMBLOCK_NODE_MAP 0ee332c14518699 ("memblock: Kill early_node_map[]") wanted to replace CONFIG_ARCH_POPULATES_NODE_MAP with CONFIG_HAVE_MEMBLOCK_NODE_MAP but ended up replacing one occurence with a reference to the non-existent symbol CONFIG_HAVE_MEMBLOCK_NODE. The resulting omission of code would probably have been causing problems to 32-bit machines with memory hotplug. 
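Returning to the compaction change: conceptually the free scanner's starting point becomes the following (a sketch only; the real logic lives in mm/compaction.c, which this view omits, and also handles restarting when the two scanners meet):

        static unsigned long example_free_scan_start(struct zone *zone, int order)
        {
                unsigned long zone_end = zone->zone_start_pfn + zone->spanned_pages;

                /* forced full (order == -1) compactions rescan the whole zone */
                if (order < 0)
                        return zone_end;

                /* otherwise resume where the last incremental run stopped */
                if (zone->compact_cached_free_pfn > zone->zone_start_pfn)
                        return zone->compact_cached_free_pfn;

                return zone_end;
        }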
Signed-off-by: Rabin Vincent Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1aeadce4d56e..f64afa5929fe 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -776,7 +776,7 @@ extern int movable_zone; static inline int zone_movable_is_highmem(void) { -#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE) +#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) return movable_zone == ZONE_HIGHMEM; #else return 0; -- cgit v1.2.3 From 8e125cd85517c9716695b0abfabc0a4a3fcb94f3 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 31 Jul 2012 16:43:16 -0700 Subject: vmscan: remove obsolete shrink_control comment 09f363c7 ("vmscan: fix shrinker callback bug in fs/super.c") fixed a shrinker callback which was returning -1 when nr_to_scan is zero, which caused excessive slab scanning. But 635697c6 ("vmscan: fix initial shrinker size handling") fixed the problem again, so we can freely return -1 even when nr_to_scan is zero. So let's revert 09f363c7 because the comment added in 09f363c7 made an unnecessary rule. Signed-off-by: Minchan Kim Cc: Al Viro Cc: Mikulas Patocka Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shrinker.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 07ceb97d53fa..ac6b8ee07825 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -20,7 +20,6 @@ struct shrink_control { * 'nr_to_scan' entries and attempt to free them up. It should return * the number of objects which remain in the cache. If it returns -1, it means * it cannot do any scanning at this time (eg. there is a risk of deadlock). - * The callback must not return -1 if nr_to_scan is zero. * * The 'gfpmask' refers to the allocation we are currently trying to * fulfil. -- cgit v1.2.3 From 9adb62a5df9c0fbef7b4665919329f73a34651ed Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 31 Jul 2012 16:43:28 -0700 Subject: mm/hotplug: correctly setup fallback zonelists when creating new pgdat When hotadd_new_pgdat() is called to create a new pgdat for a new node, a fallback zonelist should be created for the new node. There's code to try to achieve that in hotadd_new_pgdat() as below: /* * The node we allocated has no zone fallback lists. For avoiding * to access not-initialized zonelist, build here. */ mutex_lock(&zonelists_mutex); build_all_zonelists(pgdat, NULL); mutex_unlock(&zonelists_mutex); But it doesn't work as expected. When hotadd_new_pgdat() is called, the new node is still in offline state because node_set_online(nid) hasn't been called yet. And build_all_zonelists() only builds zonelists for online nodes as: for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); build_zonelist_cache(pgdat); } So although we hope to create a zonelist for the new pgdat, it never gets built. Add a new parameter "pgdat" to build_all_zonelists() so that it builds zonelists for the new pgdat too.
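The mm/page_alloc.c side implied by the new signature then looks roughly like this (a sketch; the real __build_all_zonelists() also handles zonelist caches and the per-cpu pageset setup tied to the zone argument):

        static int example__build_all_zonelists(void *data)
        {
                pg_data_t *self = data;
                int nid;

                /* the new case: a hot-added node that is not yet online */
                if (self && !node_online(self->node_id))
                        build_zonelists(self);

                for_each_online_node(nid)
                        build_zonelists(NODE_DATA(nid));

                return 0;
        }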
Signed-off-by: Jiang Liu Signed-off-by: Xishi Qiu Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Rusty Russell Cc: Yinghai Lu Cc: Tony Luck Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: David Rientjes Cc: Keping Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f64afa5929fe..98f079bcf399 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -721,7 +721,7 @@ typedef struct pglist_data { #include extern struct mutex zonelists_mutex; -void build_all_zonelists(void *data); +void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); -- cgit v1.2.3 From 340175b7d14d5617559d0c1a54fa0ea204d9edcd Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 31 Jul 2012 16:43:32 -0700 Subject: mm/hotplug: free zone->pageset when a zone becomes empty When a zone becomes empty after memory offlining, free zone->pageset. Otherwise it will cause memory leak when adding memory to the empty zone again because build_all_zonelists() will allocate zone->pageset for an empty zone. Signed-off-by: Jiang Liu Signed-off-by: Wei Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Rusty Russell Cc: Yinghai Lu Cc: Tony Luck Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: David Rientjes Cc: Keping Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3955bedeeed1..7c6dfd2faa69 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1331,6 +1331,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...); extern void setup_per_cpu_pageset(void); extern void zone_pcp_update(struct zone *zone); +extern void zone_pcp_reset(struct zone *zone); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; -- cgit v1.2.3 From 62ce1c706f817cb9defef3ac2dfdd815149f2968 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 31 Jul 2012 16:43:39 -0700 Subject: mm, oom: move declaration for mem_cgroup_out_of_memory to oom.h mem_cgroup_out_of_memory() is defined in mm/oom_kill.c, so declare it in linux/oom.h rather than linux/memcontrol.h. 
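Returning to the empty-zone leak fix: the offline path then ends with roughly the following (a sketch; the populated_zone() check stands in for however emptiness is detected there):

        static void example_offline_pages_tail(struct zone *zone)
        {
                /* the zone lost its last present pages during offlining */
                if (!populated_zone(zone))
                        zone_pcp_reset(zone);   /* free the stale per-cpu pageset */
        }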
Acked-by: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Acked-by: Michal Hocko Signed-off-by: David Rientjes Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 -- include/linux/oom.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 170076222431..c0bff8976a69 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -72,8 +72,6 @@ extern void mem_cgroup_uncharge_end(void); extern void mem_cgroup_uncharge_page(struct page *page); extern void mem_cgroup_uncharge_cache_page(struct page *page); -extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, - int order); bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, struct mem_cgroup *memcg); int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); diff --git a/include/linux/oom.h b/include/linux/oom.h index e4c29bc72e70..eb9dc14362c5 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -49,6 +49,8 @@ extern unsigned long oom_badness(struct task_struct *p, extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); +extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, + int order); extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *mask, bool force_kill); extern int register_oom_notifier(struct notifier_block *nb); -- cgit v1.2.3 From 9cbb78bb314360a860a8b23723971cb6fcb54176 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 31 Jul 2012 16:43:44 -0700 Subject: mm, memcg: introduce own oom handler to iterate only over its own threads The global oom killer is serialized by the per-zonelist try_set_zonelist_oom() which is used in the page allocator. Concurrent oom kills are thus a rare event and only occur in systems using mempolicies and with a large number of nodes. Memory controller oom kills, however, can frequently be concurrent since there is no serialization once the oom killer is called for oom conditions in several different memcgs in parallel. This creates a massive contention on tasklist_lock since the oom killer requires the readside for the tasklist iteration. If several memcgs are calling the oom killer, this lock can be held for a substantial amount of time, especially if threads continue to enter it as other threads are exiting. Since the exit path grabs the writeside of the lock with irqs disabled in a few different places, this can cause a soft lockup on cpus as a result of tasklist_lock starvation. The kernel lacks unfair writelocks, and successful calls to the oom killer usually result in at least one thread entering the exit path, so an alternative solution is needed. This patch introduces a seperate oom handler for memcgs so that they do not require tasklist_lock for as much time. Instead, it iterates only over the threads attached to the oom memcg and grabs a reference to the selected thread before calling oom_kill_process() to ensure it doesn't prematurely exit. This still requires tasklist_lock for the tasklist dump, iterating children of the selected process, and killing all other threads on the system sharing the same memory as the selected victim. So while this isn't a complete solution to tasklist_lock starvation, it significantly reduces the amount of time that it is held. 
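In code, the split looks roughly like this (a sketch based on the changelog, using the oom_scan_t values and helpers declared in the diff below; for_each_memcg_task() is a hypothetical stand-in for the cgroup task iterator actually used):

        static void example_memcg_oom(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                      int order, unsigned long totalpages)
        {
                struct task_struct *task, *chosen = NULL;
                unsigned long points, chosen_points = 0;

                for_each_memcg_task(memcg, task) {      /* hypothetical iterator */
                        switch (oom_scan_process_thread(task, totalpages, NULL,
                                                        false)) {
                        case OOM_SCAN_SELECT:   /* e.g. task is already dying */
                                chosen = task;
                                chosen_points = ULONG_MAX;
                                break;
                        case OOM_SCAN_CONTINUE: /* unkillable or unrelated */
                                continue;
                        case OOM_SCAN_ABORT:    /* a victim is already exiting */
                                return;
                        case OOM_SCAN_OK:
                                break;
                        }
                        points = oom_badness(task, memcg, NULL, totalpages);
                        if (points > chosen_points) {
                                chosen = task;
                                chosen_points = points;
                        }
                }
                if (chosen)
                        oom_kill_process(chosen, gfp_mask, order,
                                         chosen_points * 1000 / totalpages,
                                         totalpages, memcg, NULL,
                                         "Memory cgroup out of memory");
        }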
Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Signed-off-by: David Rientjes Cc: Oleg Nesterov Cc: KOSAKI Motohiro Reviewed-by: Sha Zhengju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 9 ++------- include/linux/oom.h | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c0bff8976a69..2a80544aec99 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -180,7 +180,8 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask, unsigned long *total_scanned); -u64 mem_cgroup_get_limit(struct mem_cgroup *memcg); +extern void __mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, + int order); void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -364,12 +365,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, return 0; } -static inline -u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) -{ - return 0; -} - static inline void mem_cgroup_split_huge_fixup(struct page *head) { } diff --git a/include/linux/oom.h b/include/linux/oom.h index eb9dc14362c5..5dc0e384ae9e 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -40,17 +40,33 @@ enum oom_constraint { CONSTRAINT_MEMCG, }; +enum oom_scan_t { + OOM_SCAN_OK, /* scan thread and find its badness */ + OOM_SCAN_CONTINUE, /* do not consider thread for oom kill */ + OOM_SCAN_ABORT, /* abort the iteration and return */ + OOM_SCAN_SELECT, /* always select this thread first */ +}; + extern void compare_swap_oom_score_adj(int old_val, int new_val); extern int test_set_oom_score_adj(int new_val); extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); +extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, + unsigned int points, unsigned long totalpages, + struct mem_cgroup *memcg, nodemask_t *nodemask, + const char *message); + extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); +extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, + unsigned long totalpages, const nodemask_t *nodemask, + bool force_kill); extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order); + extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *mask, bool force_kill); extern int register_oom_notifier(struct notifier_block *nb); -- cgit v1.2.3 From 876aafbfd9ba5bb352f1b14622c27f3fe9a99013 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 31 Jul 2012 16:43:48 -0700 Subject: mm, memcg: move all oom handling to memcontrol.c By globally defining check_panic_on_oom(), the memcg oom handler can be moved entirely to mm/memcontrol.c. This removes the ugly #ifdef in the oom killer and cleans up the code. 
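With check_panic_on_oom() visible globally, the memcg entry point can honor the panic_on_oom sysctl before scanning, along the lines of the following (a sketch building on the loop sketched earlier; example_memcg_limit_pages() is a hypothetical helper returning the memcg's limit in pages):

        void example_mem_cgroup_out_of_memory(struct mem_cgroup *memcg,
                                              gfp_t gfp_mask, int order)
        {
                unsigned long totalpages = example_memcg_limit_pages(memcg);

                /* panic_on_oom == 2 means "panic even for memcg OOMs" */
                check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
                example_memcg_oom(memcg, gfp_mask, order, totalpages);
        }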
Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Oleg Nesterov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 -- include/linux/oom.h | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2a80544aec99..5a3ee6423634 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -180,8 +180,6 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask, unsigned long *total_scanned); -extern void __mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, - int order); void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/include/linux/oom.h b/include/linux/oom.h index 5dc0e384ae9e..49a3031fda50 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -61,6 +61,9 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); +extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, + int order, const nodemask_t *nodemask); + extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill); -- cgit v1.2.3 From ee6f509c3274014d1f52e7a7a10aee9f85393c5e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 31 Jul 2012 16:43:50 -0700 Subject: mm: factor out memory isolate functions mm/page_alloc.c has some memory isolation functions but they are used only when we enable CONFIG_{CMA|MEMORY_HOTPLUG|MEMORY_FAILURE}. So let's make it configurable by new CONFIG_MEMORY_ISOLATION so that it can reduce binary size and we can check it simple by CONFIG_MEMORY_ISOLATION, not if defined CONFIG_{CMA|MEMORY_HOTPLUG|MEMORY_FAILURE}. Signed-off-by: Minchan Kim Cc: Andi Kleen Cc: Marek Szyprowski Acked-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 3bdcab30ca41..105077aa7685 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -1,6 +1,11 @@ #ifndef __LINUX_PAGEISOLATION_H #define __LINUX_PAGEISOLATION_H + +bool has_unmovable_pages(struct zone *zone, struct page *page, int count); +void set_pageblock_migratetype(struct page *page, int migratetype); +int move_freepages_block(struct zone *zone, struct page *page, + int migratetype); /* * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. * If specified range includes migrate types other than MOVABLE or CMA, @@ -10,7 +15,7 @@ * free all pages in the range. test_page_isolated() can be used for * test it. */ -extern int +int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype); @@ -18,7 +23,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. 
* target range is [start_pfn, end_pfn) */ -extern int +int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype); @@ -30,8 +35,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn); /* * Internal functions. Changes pageblock's migrate type. */ -extern int set_migratetype_isolate(struct page *page); -extern void unset_migratetype_isolate(struct page *page, unsigned migratetype); +int set_migratetype_isolate(struct page *page); +void unset_migratetype_isolate(struct page *page, unsigned migratetype); #endif -- cgit v1.2.3 From 702d1a6e0766d45642c934444fd41f658d251305 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 31 Jul 2012 16:43:56 -0700 Subject: memory-hotplug: fix kswapd looping forever problem When hotplug offlining happens on zone A, it starts to mark freed page as MIGRATE_ISOLATE type in buddy for preventing further allocation. (MIGRATE_ISOLATE is very irony type because it's apparently on buddy but we can't allocate them). When the memory shortage happens during hotplug offlining, current task starts to reclaim, then wake up kswapd. Kswapd checks watermark, then go sleep because current zone_watermark_ok_safe doesn't consider MIGRATE_ISOLATE freed page count. Current task continue to reclaim in direct reclaim path without kswapd's helping. The problem is that zone->all_unreclaimable is set by only kswapd so that current task would be looping forever like below. __alloc_pages_slowpath restart: wake_all_kswapd rebalance: __alloc_pages_direct_reclaim do_try_to_free_pages if global_reclaim && !all_unreclaimable return 1; /* It means we did did_some_progress */ skip __alloc_pages_may_oom should_alloc_retry goto rebalance; If we apply KOSAKI's patch[1] which doesn't depends on kswapd about setting zone->all_unreclaimable, we can solve this problem by killing some task in direct reclaim path. But it doesn't wake up kswapd, still. It could be a problem still if other subsystem needs GFP_ATOMIC request. So kswapd should consider MIGRATE_ISOLATE when it calculate free pages BEFORE going sleep. This patch counts the number of MIGRATE_ISOLATE page block and zone_watermark_ok_safe will consider it if the system has such blocks (fortunately, it's very rare so no problem in POV overhead and kswapd is never hotpath). Copy/modify from Mel's quote " Ideal solution would be "allocating" the pageblock. It would keep the free space accounting as it is but historically, memory hotplug didn't allocate pages because it would be difficult to detect if a pageblock was isolated or if part of some balloon. Allocating just full pageblocks would work around this, However, it would play very badly with CMA. 
" [1] http://lkml.org/lkml/2012/6/14/74 [akpm@linux-foundation.org: simplify nr_zone_isolate_freepages(), rework zone_watermark_ok_safe() comment, simplify set_pageblock_isolate() and restore_pageblock_isolate()] [akpm@linux-foundation.org: fix CONFIG_MEMORY_ISOLATION=n build] Signed-off-by: Minchan Kim Suggested-by: KOSAKI Motohiro Tested-by: Aaditya Kumar Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 98f079bcf399..64b2c3a48286 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -478,6 +478,14 @@ struct zone { * rarely used fields: */ const char *name; +#ifdef CONFIG_MEMORY_ISOLATION + /* + * the number of MIGRATE_ISOLATE *pageblock*. + * We need this for free page counting. Look at zone_watermark_ok_safe. + * It's protected by zone->lock + */ + int nr_pageblock_isolate; +#endif } ____cacheline_internodealigned_in_smp; typedef enum { -- cgit v1.2.3 From 072bb0aa5e062902968c5c1007bba332c7820cf4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:43:58 -0700 Subject: mm: sl[au]b: add knowledge of PFMEMALLOC reserve pages When a user or administrator requires swap for their application, they create a swap partition and file, format it with mkswap and activate it with swapon. Swap over the network is considered as an option in diskless systems. The two likely scenarios are when blade servers are used as part of a cluster where the form factor or maintenance costs do not allow the use of disks and thin clients. The Linux Terminal Server Project recommends the use of the Network Block Device (NBD) for swap according to the manual at https://sourceforge.net/projects/ltsp/files/Docs-Admin-Guide/LTSPManual.pdf/download There is also documentation and tutorials on how to setup swap over NBD at places like https://help.ubuntu.com/community/UbuntuLTSP/EnableNBDSWAP The nbd-client also documents the use of NBD as swap. Despite this, the fact is that a machine using NBD for swap can deadlock within minutes if swap is used intensively. This patch series addresses the problem. The core issue is that network block devices do not use mempools like normal block devices do. As the host cannot control where they receive packets from, they cannot reliably work out in advance how much memory they might need. Some years ago, Peter Zijlstra developed a series of patches that supported swap over an NFS that at least one distribution is carrying within their kernels. This patch series borrows very heavily from Peter's work to support swapping over NBD as a pre-requisite to supporting swap-over-NFS. The bulk of the complexity is concerned with preserving memory that is allocated from the PFMEMALLOC reserves for use by the network layer which is needed for both NBD and NFS. Patch 1 adds knowledge of the PFMEMALLOC reserves to SLAB and SLUB to preserve access to pages allocated under low memory situations to callers that are freeing memory. Patch 2 optimises the SLUB fast path to avoid pfmemalloc checks Patch 3 introduces __GFP_MEMALLOC to allow access to the PFMEMALLOC reserves without setting PFMEMALLOC. Patch 4 opens the possibility for softirqs to use PFMEMALLOC reserves for later use by network packet processing. Patch 5 only sets page->pfmemalloc when ALLOC_NO_WATERMARKS was required Patch 6 ignores memory policies when ALLOC_NO_WATERMARKS is set. 
Patches 7-12 allows network processing to use PFMEMALLOC reserves when the socket has been marked as being used by the VM to clean pages. If packets are received and stored in pages that were allocated under low-memory situations and are unrelated to the VM, the packets are dropped. Patch 11 reintroduces __skb_alloc_page which the networking folk may object to but is needed in some cases to propogate pfmemalloc from a newly allocated page to an skb. If there is a strong objection, this patch can be dropped with the impact being that swap-over-network will be slower in some cases but it should not fail. Patch 13 is a micro-optimisation to avoid a function call in the common case. Patch 14 tags NBD sockets as being SOCK_MEMALLOC so they can use PFMEMALLOC if necessary. Patch 15 notes that it is still possible for the PFMEMALLOC reserve to be depleted. To prevent this, direct reclaimers get throttled on a waitqueue if 50% of the PFMEMALLOC reserves are depleted. It is expected that kswapd and the direct reclaimers already running will clean enough pages for the low watermark to be reached and the throttled processes are woken up. Patch 16 adds a statistic to track how often processes get throttled Some basic performance testing was r