diff options
| author | Mel Gorman <mgorman@techsingularity.net> | 2015-11-06 16:28:21 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-06 17:50:42 -0800 |
| commit | d0164adc89f6bb374d304ffcc375c6d2652fe67d (patch) | |
| tree | de1cbe09c86dcd24a4a476f7e0b41af239bbdc29 | |
| parent | 016c13daa5c9e4827eca703e2f0621c131f2cca3 (diff) | |
| download | linux-d0164adc89f6bb374d304ffcc375c6d2652fe67d.tar.gz linux-d0164adc89f6bb374d304ffcc375c6d2652fe67d.tar.bz2 linux-d0164adc89f6bb374d304ffcc375c6d2652fe67d.zip | |
mm, page_alloc: distinguish between being unable to sleep, unwilling to sleep and avoiding waking kswapd
__GFP_WAIT has been used to identify atomic context in callers that hold
spinlocks or are in interrupts. They are expected to be high priority and
have access one of two watermarks lower than "min" which can be referred
to as the "atomic reserve". __GFP_HIGH users get access to the first
lower watermark and can be called the "high priority reserve".
Over time, callers had a requirement to not block when fallback options
were available. Some have abused __GFP_WAIT leading to a situation where
an optimisitic allocation with a fallback option can access atomic
reserves.
This patch uses __GFP_ATOMIC to identify callers that are truely atomic,
cannot sleep and have no alternative. High priority users continue to use
__GFP_HIGH. __GFP_DIRECT_RECLAIM identifies callers that can sleep and
are willing to enter direct reclaim. __GFP_KSWAPD_RECLAIM to identify
callers that want to wake kswapd for background reclaim. __GFP_WAIT is
redefined as a caller that is willing to enter direct reclaim and wake
kswapd for background reclaim.
This patch then converts a number of sites
o __GFP_ATOMIC is used by callers that are high priority and have memory
pools for those requests. GFP_ATOMIC uses this flag.
o Callers that have a limited mempool to guarantee forward progress clear
__GFP_DIRECT_RECLAIM but keep __GFP_KSWAPD_RECLAIM. bio allocations fall
into this category where kswapd will still be woken but atomic reserves
are not used as there is a one-entry mempool to guarantee progress.
o Callers that are checking if they are non-blocking should use the
helper gfpflags_allow_blocking() where possible. This is because
checking for __GFP_WAIT as was done historically now can trigger false
positives. Some exceptions like dm-crypt.c exist where the code intent
is clearer if __GFP_DIRECT_RECLAIM is used instead of the helper due to
flag manipulations.
o Callers that built their own GFP flags instead of starting with GFP_KERNEL
and friends now also need to specify __GFP_KSWAPD_RECLAIM.
The first key hazard to watch out for is callers that removed __GFP_WAIT
and was depending on access to atomic reserves for inconspicuous reasons.
In some cases it may be appropriate for them to use __GFP_HIGH.
The second key hazard is callers that assembled their own combination of
GFP flags instead of starting with something like GFP_KERNEL. They may
now wish to specify __GFP_KSWAPD_RECLAIM. It's almost certainly harmless
if it's missed in most cases as other activity will wake kswapd.
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Vitaly Wool <vitalywool@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
66 files changed, 210 insertions, 172 deletions
diff --git a/Documentation/vm/balance b/Documentation/vm/balance index c46e68cf9344..964595481af6 100644 --- a/Documentation/vm/balance +++ b/Documentation/vm/balance @@ -1,12 +1,14 @@ Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com> -Memory balancing is needed for non __GFP_WAIT as well as for non -__GFP_IO allocations. +Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +well as for non __GFP_IO allocations. -There are two reasons to be requesting non __GFP_WAIT allocations: -the caller can not sleep (typically intr context), or does not want -to incur cost overheads of page stealing and possible swap io for -whatever reasons. +The first reason why a caller may avoid reclaim is that the caller can not +sleep due to holding a spinlock or is in interrupt context. The second may +be that the caller is willing to fail the allocation without incurring the +overhead of page reclaim. This may happen for opportunistic high-order +allocation requests that have order-0 fallback options. In such cases, +the caller may also wish to avoid waking kswapd. __GFP_IO allocation requests are made to prevent file system deadlocks. diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ad4eb2d26e16..e62400e5fb99 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -651,12 +651,12 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, if (nommu()) addr = __alloc_simple_buffer(dev, size, gfp, &page); - else if (dev_get_cma_area(dev) && (gfp & __GFP_WAIT)) + else if (dev_get_cma_area(dev) && (gfp & __GFP_DIRECT_RECLAIM)) addr = __alloc_from_contiguous(dev, size, prot, &page, caller, want_vaddr); else if (is_coherent) addr = __alloc_simple_buffer(dev, size, gfp, &page); - else if (!(gfp & __GFP_WAIT)) + else if (!gfpflags_allow_blocking(gfp)) addr = __alloc_from_pool(size, &page); else addr = __alloc_remap_buffer(dev, size, gfp, prot, &page, @@ -1363,7 +1363,7 @@ static void *arm_iommu_alloc_attrs(struct device *dev, size_t size, *handle = DMA_ERROR_CODE; size = PAGE_ALIGN(size); - if (!(gfp & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp)) return __iommu_alloc_atomic(dev, size, handle); /* diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 7c34f7126b04..c5f9a9e3d1f3 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -25,7 +25,7 @@ unsigned long xen_get_swiotlb_free_pages(unsigned int order) { struct memblock_region *reg; - gfp_t flags = __GFP_NOWARN; + gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM; for_each_memblock(memory, reg) { if (reg->base < (phys_addr_t)0xffffffff) { diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 6320361d8d4c..bb4bf6a06ad6 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -100,7 +100,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_ZONE_DMA) && dev->coherent_dma_mask <= DMA_BIT_MASK(32)) flags |= GFP_DMA; - if (dev_get_cma_area(dev) && (flags & __GFP_WAIT)) { + if (dev_get_cma_area(dev) && gfpflags_allow_blocking(flags)) { struct page *page; void *addr; @@ -148,7 +148,7 @@ static void *__dma_alloc(struct device *dev, size_t size, size = PAGE_ALIGN(size); - if (!coherent && !(flags & __GFP_WAIT)) { + if (!coherent && !gfpflags_allow_blocking(flags)) { struct page *page = NULL; void *addr = __alloc_from_pool(size, &page, flags); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index cd99433b8ba1..6ba014c61d62 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -90,7 +90,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, again: page = NULL; /* CMA can be used only in the context which permits sleeping */ - if (flag & __GFP_WAIT) { + if (gfpflags_allow_blocking(flag)) { page = dma_alloc_from_contiguous(dev, count, get_order(size)); if (page && page_to_phys(page) + size > dma_mask) { dma_release_from_contiguous(dev, page, count); diff --git a/block/bio.c b/block/bio.c index ad3f276d74bc..4f184d938942 100644 --- a/block/bio.c +++ b/block/bio.c @@ -211,7 +211,7 @@ fallback: bvl = mempool_alloc(pool, gfp_mask); } else { struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); /* * Make this allocation restricted and don't dump info on @@ -221,11 +221,11 @@ fallback: __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; /* - * Try a slab allocation. If this fails and __GFP_WAIT + * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM * is set, retry with the 1-entry mempool */ bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); - if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { + if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { *idx = BIOVEC_MAX_IDX; goto fallback; } @@ -395,12 +395,12 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is * backed by the @bs's mempool. * - * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be - * able to allocate a bio. This is due to the mempool guarantees. To make this - * work, callers must never allocate more than 1 bio at a time from this pool. - * Callers that need to allocate more than 1 bio must always submit the - * previously allocated bio for IO before attempting to allocate a new one. - * Failure to do so can cause deadlocks under memory pressure. + * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will + * always be able to allocate a bio. This is due to the mempool guarantees. + * To make this work, callers must never allocate more than 1 bio at a time + * from this pool. Callers that need to allocate more than 1 bio must always + * submit the previously allocated bio for IO before attempting to allocate + * a new one. Failure to do so can cause deadlocks under memory pressure. * * Note that when running under generic_make_request() (i.e. any block * driver), bios are not submitted until after you return - see the code in @@ -459,13 +459,13 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) * We solve this, and guarantee forward progress, with a rescuer * workqueue per bio_set. If we go to allocate and there are * bios on current->bio_list, we first try the allocation - * without __GFP_WAIT; if that fails, we punt those bios we - * would be blocking to the rescuer workqueue before we retry - * with the original gfp_flags. + * without __GFP_DIRECT_RECLAIM; if that fails, we punt those + * bios we would be blocking to the rescuer workqueue before + * we retry with the original gfp_flags. */ if (current->bio_list && !bio_list_empty(current->bio_list)) - gfp_mask &= ~__GFP_WAIT; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(bs->bio_pool, gfp_mask); if (!p && gfp_mask != saved_gfp) { diff --git a/block/blk-core.c b/block/blk-core.c index 89eec7965870..9e32f0868e36 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1206,8 +1206,8 @@ rq_starved: * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * - * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this - * function keeps retrying under memory pressure and fails iff @q is dead. + * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, + * this function keeps retrying under memory pressure and fails iff @q is dead. * * Must be called with @q->queue_lock held and, * Returns ERR_PTR on failure, with @q->queue_lock held. @@ -1227,7 +1227,7 @@ retry: if (!IS_ERR(rq)) return rq; - if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) { + if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; } @@ -1305,11 +1305,11 @@ EXPORT_SYMBOL(blk_get_request); * BUG. * * WARNING: When allocating/cloning a bio-chain, careful consideration should be - * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for - * anything but the first bio in the chain. Otherwise you risk waiting for IO - * completion of a bio that hasn't been submitted yet, thus resulting in a - * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead - * of bio_alloc(), as that avoids the mempool deadlock. + * given to how you allocate bios. In particular, you cannot use + * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise + * you risk waiting for IO completion of a bio that hasn't been submitted yet, + * thus resulting in a deadlock. Alternatively bios should be allocated using + * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock. * If possible a big IO should be split into smaller parts when allocation * fails. Partial allocation should not be an error, or you risk a live-lock. */ diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 1a27f45ec776..381cb50a673c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -289,7 +289,7 @@ struct io_context *get_task_io_context(struct task_struct *task, { struct io_context *ioc; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); do { task_lock(task); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 60ac684c8b8c..a07ca3488d96 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -268,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data, if (tag != -1) return tag; - if (!(data->gfp & __GFP_WAIT)) + if (!gfpflags_allow_blocking(data->gfp)) return -1; bs = bt_wait_ptr(bt, hctx); diff --git a/block/blk-mq.c b/block/blk-mq.c index 1c27b3eaef64..68c0a3416b34 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -244,11 +244,11 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT, + blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM, reserved, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq && (gfp & __GFP_WAIT)) { + if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) { __blk_mq_run_hw_queue(hctx); blk_mq_put_ctx(ctx); @@ -1186,7 +1186,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, - __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx); + __GFP_WAIT|__GFP_HIGH, false, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; hctx = alloc_data.hctx; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c097909c589c..b4b5680ac6ad 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -357,7 +357,8 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto } if (has_payload && data_size) { - page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT)); + page = drbd_alloc_pages(peer_device, nr_pages, + gfpflags_allow_blocking(gfp_mask)); if (!page) goto fail; } diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index e22942596207..1b709a4e3b5e 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -271,7 +271,7 @@ static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask) goto err_out; tmp->bi_bdev = NULL; - gfpmask &= ~__GFP_WAIT; + gfpmask &= ~__GFP_DIRECT_RECLAIM; tmp->bi_next = NULL; if (!new_chain) diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 30f522848c73..d7373ca69c99 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -124,7 +124,8 @@ int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 __group, if (group) return netlink_broadcast(dev->nls, skb, portid, group, gfp_mask); - return netlink_unicast(dev->nls, skb, portid, !(gfp_mask&__GFP_WAIT)); + return netlink_unicast(dev->nls, skb, portid, + !gfpflags_allow_blocking(gfp_mask)); } EXPORT_SYMBOL_GPL(cn_netlink_send_mult); diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 2a3973a7c441..36a7c2d89a01 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -486,7 +486,7 @@ static int ioctl_get_info(struct client *client, union ioctl_arg *arg) static int add_client_resource(struct client *client, struct client_resource *resource, gfp_t gfp_mask) { - bool preload = !!(gfp_mask & __GFP_WAIT); + bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; int ret; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 4d631a946481..d58cb9e034fe 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2215,7 +2215,7 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) */ mapping = file_inode(obj->base.filp)->i_mapping; gfp = mapping_gfp_mask(mapping); - gfp |= __GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD; + gfp |= __GFP_NORETRY | __GFP_NOWARN; gfp &= ~(__GFP_IO | __GFP_WAIT); sg = st->sgl; st->nents = 0; diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 8c014b33d8e0..59ab264c99c4 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1083,7 +1083,7 @@ static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent) static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) { - bool preload = !!(gfp_mask & __GFP_WAIT); + bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; int ret, id; diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 0d533bba4ad1..8b2be1e7714f 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2668,7 +2668,7 @@ static void *alloc_coherent(struct device *dev, size_t size, page = alloc_pages(flag | __GFP_NOWARN, get_order(size)); if (!page) { - if (!(flag & __GFP_WAIT)) + if (!gfpflags_allow_blocking(flag)) return NULL; page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 7cf80c1a8a16..f1042daef9ad 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3647,7 +3647,7 @@ static void *intel_alloc_coherent(struct device *dev, size_t size, flags |= GFP_DMA32; } - if (flags & __GFP_WAIT) { + if (gfpflags_allow_blocking(flags)) { unsigned int count = size >> PAGE_SHIFT; page = dma_alloc_from_contiguous(dev, count, order); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3729b394432c..917d47e290ae 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -994,7 +994,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size) struct bio_vec *bvec; retry: - if (unlikely(gfp_mask & __GFP_WAIT)) + if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) mutex_lock(&cc->bio_alloc_lock); clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); @@ -1010,7 +1010,7 @@ retry: if (!page) { crypt_free_buffer_pages(cc, clone); bio_put(clone); - gfp_mask |= __GFP_WAIT; + gfp_mask |= __GFP_DIRECT_RECLAIM; goto retry; } @@ -1027,7 +1027,7 @@ retry: } return_clone: - if (unlikely(gfp_mask & __GFP_WAIT)) + if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) mutex_unlock(&cc->bio_alloc_lock); return clone; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 3a7cade5e27d..1452ed9aacb4 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -244,7 +244,7 @@ static int kcopyd_get_pages(struct dm_kcopyd_client *kc, *pages = NULL; do { - pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY); + pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM); if (unlikely(!pl)) { /* Use reserved pages */ pl = kc->pages; |
