diff options
-rw-r--r-- | CREDITS | 12 | ||||
-rw-r--r-- | include/linux/slab_def.h | 124 | ||||
-rw-r--r-- | mm/slab.c | 4005 |
3 files changed, 8 insertions, 4133 deletions
@@ -9,10 +9,6 @@ Linus ---------- -N: Matt Mackal -E: mpm@selenic.com -D: SLOB slab allocator - N: Matti Aarnio E: mea@nic.funet.fi D: Alpha systems hacking, IPv6 and other network related stuff @@ -1572,6 +1568,10 @@ S: Ampferstr. 50 / 4 S: 6020 Innsbruck S: Austria +N: Mark Hemment +E: markhe@nextd.demon.co.uk +D: SLAB allocator implementation + N: Richard Henderson E: rth@twiddle.net E: rth@cygnus.com @@ -2437,6 +2437,10 @@ D: work on suspend-to-ram/disk, killing duplicates from ioctl32, D: Altera SoCFPGA and Nokia N900 support. S: Czech Republic +N: Olivia Mackall +E: olivia@selenic.com +D: SLOB slab allocator + N: Paul Mackerras E: paulus@samba.org D: PPP driver diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h deleted file mode 100644 index a61e7d55d0d3..000000000000 --- a/include/linux/slab_def.h +++ /dev/null @@ -1,124 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SLAB_DEF_H -#define _LINUX_SLAB_DEF_H - -#include <linux/kfence.h> -#include <linux/reciprocal_div.h> - -/* - * Definitions unique to the original Linux SLAB allocator. - */ - -struct kmem_cache { - struct array_cache __percpu *cpu_cache; - -/* 1) Cache tunables. Protected by slab_mutex */ - unsigned int batchcount; - unsigned int limit; - unsigned int shared; - - unsigned int size; - struct reciprocal_value reciprocal_buffer_size; -/* 2) touched by every alloc & free from the backend */ - - slab_flags_t flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - -/* 3) cache_grow/shrink */ - /* order of pgs per slab (2^n) */ - unsigned int gfporder; - - /* force GFP flags, e.g. GFP_DMA */ - gfp_t allocflags; - - size_t colour; /* cache colouring range */ - unsigned int colour_off; /* colour offset */ - unsigned int freelist_size; - - /* constructor func */ - void (*ctor)(void *obj); - -/* 4) cache creation/removal */ - const char *name; - struct list_head list; - int refcount; - int object_size; - int align; - -/* 5) statistics */ -#ifdef CONFIG_DEBUG_SLAB - unsigned long num_active; - unsigned long num_allocations; - unsigned long high_mark; - unsigned long grown; - unsigned long reaped; - unsigned long errors; - unsigned long max_freeable; - unsigned long node_allocs; - unsigned long node_frees; - unsigned long node_overflow; - atomic_t allochit; - atomic_t allocmiss; - atomic_t freehit; - atomic_t freemiss; - - /* - * If debugging is enabled, then the allocator can add additional - * fields and/or padding to every object. 'size' contains the total - * object size including these internal fields, while 'obj_offset' - * and 'object_size' contain the offset to the user object and its - * size. - */ - int obj_offset; -#endif /* CONFIG_DEBUG_SLAB */ - -#ifdef CONFIG_KASAN_GENERIC - struct kasan_cache kasan_info; -#endif - -#ifdef CONFIG_SLAB_FREELIST_RANDOM - unsigned int *random_seq; -#endif - -#ifdef CONFIG_HARDENED_USERCOPY - unsigned int useroffset; /* Usercopy region offset */ - unsigned int usersize; /* Usercopy region size */ -#endif - - struct kmem_cache_node *node[MAX_NUMNODES]; -}; - -static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab, - void *x) -{ - void *object = x - (x - slab->s_mem) % cache->size; - void *last_object = slab->s_mem + (cache->num - 1) * cache->size; - - if (unlikely(object > last_object)) - return last_object; - else - return object; -} - -/* - * We want to avoid an expensive divide : (offset / cache->size) - * Using the fact that size is a constant for a particular cache, - * we can replace (offset / cache->size) by - * reciprocal_divide(offset, cache->reciprocal_buffer_size) - */ -static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) -{ - u32 offset = (obj - slab->s_mem); - return reciprocal_divide(offset, cache->reciprocal_buffer_size); -} - -static inline int objs_per_slab(const struct kmem_cache *cache, - const struct slab *slab) -{ - if (is_kfence_address(slab_address(slab))) - return 1; - return cache->num; -} - -#endif /* _LINUX_SLAB_DEF_H */ diff --git a/mm/slab.c b/mm/slab.c deleted file mode 100644 index 37efe3241f9c..000000000000 --- a/mm/slab.c +++ /dev/null @@ -1,4005 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/mm/slab.c - * Written by Mark Hemment, 1996/97. - * (markhe@nextd.demon.co.uk) - * - * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli - * - * Major cleanup, different bufctl logic, per-cpu arrays - * (c) 2000 Manfred Spraul - * - * Cleanup, make the head arrays unconditional, preparation for NUMA - * (c) 2002 Manfred Spraul - * - * An implementation of the Slab Allocator as described in outline in; - * UNIX Internals: The New Frontiers by Uresh Vahalia - * Pub: Prentice Hall ISBN 0-13-101908-2 - * or with a little more detail in; - * The Slab Allocator: An Object-Caching Kernel Memory Allocator - * Jeff Bonwick (Sun Microsystems). - * Presented at: USENIX Summer 1994 Technical Conference - * - * The memory is organized in caches, one cache for each object type. - * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) - * Each cache consists out of many slabs (they are small (usually one - * page long) and always contiguous), and each slab contains multiple - * initialized objects. - * - * This means, that your constructor is used only for newly allocated - * slabs and you must pass objects with the same initializations to - * kmem_cache_free. - * - * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, - * normal). If you need a special memory type, then must create a new - * cache for that memory type. - * - * In order to reduce fragmentation, the slabs are sorted in 3 groups: - * full slabs with 0 free objects - * partial slabs - * empty slabs with no allocated objects - * - * If partial slabs exist, then new allocations come from these slabs, - * otherwise from empty slabs or new slabs are allocated. - * - * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache - * during kmem_cache_destroy(). The caller must prevent concurrent allocs. - * - * Each cache has a short per-cpu head array, most allocs - * and frees go into that array, and if that array overflows, then 1/2 - * of the entries in the array are given back into the global cache. - * The head array is strictly LIFO and should improve the cache hit rates. - * On SMP, it additionally reduces the spinlock operations. - * - * The c_cpuarray may not be read with enabled local interrupts - - * it's changed with a smp_call_function(). - * - * SMP synchronization: - * constructors and destructors are called without any locking. - * Several members in struct kmem_cache and struct slab never change, they - * are accessed without any locking. - * The per-cpu arrays are never accessed from the wrong cpu, no locking, - * and local interrupts are disabled so slab code is preempt-safe. - * The non-constant members are protected with a per-cache irq spinlock. - * - * Many thanks to Mark Hemment, who wrote another per-cpu slab patch - * in 2000 - many ideas in the current implementation are derived from - * his patch. - * - * Further notes from the original documentation: - * - * 11 April '97. Started multi-threading - markhe - * The global cache-chain is protected by the mutex 'slab_mutex'. - * The sem is only needed when accessing/extending the cache-chain, which - * can never happen inside an interrupt (kmem_cache_create(), - * kmem_cache_shrink() and kmem_cache_reap()). - * - * At present, each engine can be growing a cache. This should be blocked. - * - * 15 March 2005. NUMA slab allocator. - * Shai Fultheim <shai@scalex86.org>. - * Shobhit Dayal <shobhit@calsoftinc.com> - * Alok N Kataria <alokk@calsoftinc.com> - * Christoph Lameter <christoph@lameter.com> - * - * Modified the slab allocator to be node aware on NUMA systems. - * Each node has its own list of partial, free and full slabs. - * All object allocations for a node occur from node specific slab lists. - */ - -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/poison.h> -#include <linux/swap.h> -#include <linux/cache.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/compiler.h> -#include <linux/cpuset.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/notifier.h> -#include <linux/kallsyms.h> -#include <linux/kfence.h> -#include <linux/cpu.h> -#include <linux/sysctl.h> -#include <linux/module.h> -#include <linux/rcupdate.h> -#include <linux/string.h> -#include <linux/uaccess.h> -#include <linux/nodemask.h> -#include <linux/kmemleak.h> -#include <linux/mempolicy.h> -#include <linux/mutex.h> -#include <linux/fault-inject.h> -#include <linux/rtmutex.h> -#include <linux/reciprocal_div.h> -#include <linux/debugobjects.h> -#include <linux/memory.h> -#include <linux/prefetch.h> -#include <linux/sched/task_stack.h> - -#include <net/sock.h> - -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include <asm/page.h> - -#include <trace/events/kmem.h> - -#include "internal.h" - -#include "slab.h" - -/* - * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. - * 0 for faster, smaller code (especially in the critical paths). - * - * STATS - 1 to collect stats for /proc/slabinfo. - * 0 for faster, smaller code (especially in the critical paths). - * - * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) - */ - -#ifdef CONFIG_DEBUG_SLAB -#define DEBUG 1 -#define STATS 1 -#define FORCED_DEBUG 1 -#else -#define DEBUG 0 -#define STATS 0 -#define FORCED_DEBUG 0 -#endif - -/* Shouldn't this be in a header file somewhere? */ -#define BYTES_PER_WORD sizeof(void *) -#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) - -#ifndef ARCH_KMALLOC_FLAGS -#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN -#endif - -#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ - <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) - -#if FREELIST_BYTE_INDEX -typedef unsigned char freelist_idx_t; -#else -typedef unsigned short freelist_idx_t; -#endif - -#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) - -/* - * struct array_cache - * - * Purpose: - * - LIFO ordering, to hand out cache-warm objects from _alloc - * - reduce the number of linked list operations - * - reduce spinlock operations - * - * The limit is stored in the per-cpu structure to reduce the data cache - * footprint. - * - */ -struct array_cache { - unsigned int avail; - unsigned int limit; - unsigned int batchcount; - unsigned int touched; - void *entry[]; /* - * Must have this definition in here for the proper - * alignment of array_cache. Also simplifies accessing - * the entries. - */ -}; - -struct alien_cache { - spinlock_t lock; - struct array_cache ac; -}; - -/* - * Need this for bootstrapping a per node allocator. - */ -#define NUM_INIT_LISTS (2 * MAX_NUMNODES) -static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; -#define CACHE_CACHE 0 -#define SIZE_NODE (MAX_NUMNODES) - -static int drain_freelist(struct kmem_cache *cache, - struct kmem_cache_node *n, int tofree); -static void free_block(struct kmem_cache *cachep, void **objpp, int len, - int node, struct list_head *list); -static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); -static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); -static void cache_reap(struct work_struct *unused); - -static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, - void **list); -static inline void fixup_slab_list(struct kmem_cache *cachep, - struct kmem_cache_node *n, struct slab *slab, - void **list); - -#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) - -static void kmem_cache_node_init(struct kmem_cache_node *parent) -{ - INIT_LIST_HEAD(&parent->slabs_full); - INIT_LIST_HEAD(&parent->slabs_partial); - INIT_LIST_HEAD(&parent->slabs_free); - parent->total_slabs = 0; - parent->free_slabs = 0; - parent->shared = NULL; - parent->alien = NULL; - parent->colour_next = 0; - raw_spin_lock_init(&parent->list_lock); - parent->free_objects = 0; - parent->free_touched = 0; -} - -#define MAKE_LIST(cachep, listp, slab, nodeid) \ - do { \ - INIT_LIST_HEAD(listp); \ - list_splice(&get_node(cachep, nodeid)->slab, listp); \ - } while (0) - -#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ - do { \ - MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ - MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ - MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ - } while (0) - -#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000U) -#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U) -#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) -#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) - -#define BATCHREFILL_LIMIT 16 -/* - * Optimization question: fewer reaps means less probability for unnecessary - * cpucache drain/refill cycles. - * - * OTOH the cpuarrays can contain lots of objects, - * which could lock up otherwise freeable slabs. - */ -#define REAPTIMEOUT_AC (2*HZ) -#define REAPTIMEOUT_NODE (4*HZ) - -#if STATS -#define STATS_INC_ACTIVE(x) ((x)->num_active++) -#define STATS_DEC_ACTIVE(x) ((x)->num_active--) -#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) -#define STATS_INC_GROWN(x) ((x)->grown++) -#define STATS_ADD_REAPED(x, y) ((x)->reaped += (y)) -#define STATS_SET_HIGH(x) \ - do { \ - if ((x)->num_active > (x)->high_mark) \ - (x)->high_mark = (x)->num_active; \ - } while (0) -#define STATS_INC_ERR(x) ((x)->errors++) -#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) -#define STATS_INC_NODEFREES(x) ((x)->node_frees++) -#define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) -#define STATS_SET_FREEABLE(x, i) \ - do { \ - if ((x)->max_freeable < i) \ - (x)->max_freeable = i; \ - } while (0) -#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) -#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) -#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) -#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) -#else -#define STATS_INC_ACTIVE(x) do { } while (0) -#define STATS_DEC_ACTIVE(x) do { } while (0) -#define STATS_INC_ALLOCED(x) do { } while (0) -#define STATS_INC_GROWN(x) do { } while (0) -#define STATS_ADD_REAPED(x, y) do { (void)(y); } while (0) -#define STATS_SET_HIGH(x) do { } while (0) -#define STATS_INC_ERR(x) do { } while (0) -#define STATS_INC_NODEALLOCS(x) do { } while (0) -#define STATS_INC_NODEFREES(x) do { } while (0) -#define STATS_INC_ACOVERFLOW(x) do { } while (0) -#define STATS_SET_FREEABLE(x, i) do { } while (0) -#define STATS_INC_ALLOCHIT(x) do { } while (0) -#define STATS_INC_ALLOCMISS(x) do { } while (0) -#define STATS_INC_FREEHIT(x) do { } while (0) -#define STATS_INC_FREEMISS(x) do { } while (0) -#endif - -#if DEBUG - -/* - * memory layout of objects: - * 0 : objp - * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that - * the end of an object is aligned with the end of the real - * allocation. Catches writes behind the end of the allocation. - * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: - * redzone word. - * cachep->obj_offset: The real object. - * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] - * cachep->size - 1* BYTES_PER_WORD: last caller address - * [BYTES_PER_WORD long] - */ -static int obj_offset(struct kmem_cache *cachep) -{ - return cachep->obj_offset; -} - -static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) -{ - BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); - return (unsigned long long *) (objp + obj_offset(cachep) - - sizeof(unsigned long long)); -} - -static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) -{ - BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); - if (cachep->flags & SLAB_STORE_USER) - return (unsigned long long *)(objp + cachep->size - - sizeof(unsigned long long) - - REDZONE_ALIGN); - return (unsigned long long *) (objp + cachep->size - - sizeof(unsigned long long)); -} - -static void **dbg_userword(struct kmem_cache *cachep, void *objp) -{ - BUG_ON(!(cachep->flags & SLAB_STORE_USER)); - return (void **)(objp + cachep->size - BYTES_PER_WORD); -} - -#else - -#define obj_offset(x) 0 -#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) -#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) -#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) - -#endif - -/* - * Do not go above this order unless 0 objects fit into the slab or - * overridden on the command line. - */ -#define SLAB_MAX_ORDER_HI 1 -#define SLAB_MAX_ORDER_LO 0 -static int slab_max_order = SLAB_MAX_ORDER_LO; -static bool slab_max_order_set __initdata; - -static inline void *index_to_obj(struct kmem_cache *cache, - const struct slab *slab, unsigned int idx) -{ - return slab->s_mem + cache->size * idx; -} - -#define BOOT_CPUCACHE_ENTRIES 1 -/* internal cache of cache description objs */ -static struct kmem_cache kmem_cache_boot = { - .batchcount = 1, - .limit = BOOT_CPUCACHE_ENTRIES, - .shared = 1, - .size = sizeof(struct kmem_cache), - .name = "kmem_cache", -}; - -static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); - -static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) -{ - return this_cpu_ptr(cachep->cpu_cache); -} - -/* - * Calculate the number of objects and left-over bytes for a given buffer size. - */ -static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, - slab_flags_t flags, size_t *left_over) -{ - unsigned int num; - size_t slab_size = PAGE_SIZE << gfporder; - - /* - * The slab management structure can be either off the slab or - * on it. For the latter case, the memory allocated for a - * slab is used for: - * - * - @buffer_size bytes for each object - * - One freelist_idx_t for each object - * - * We don't need to consider alignment of freelist because - * freelist will be at the end of slab page. The objects will be - * at the correct alignment. - * - * If the slab management structure is off the slab, then the - * alignment will already be calculated into the size. Because - * the slabs are all pages aligned, the objects will be at the - * correct alignment when allocated. - */ - if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) { - num = slab_size / buffer_size; - *left_over = slab_size % buffer_size; - } else { - num = slab_size / (buffer_size + sizeof(freelist_idx_t)); - *left_over = slab_size % - (buffer_size + sizeof(freelist_idx_t)); - } - - return num; -} - -#if DEBUG -#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) - -static void __slab_error(const char *function, struct kmem_cache *cachep, - char *msg) -{ - pr_err("slab error in %s(): cache `%s': %s\n", - function, cachep->name, msg); - dump_stack(); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); -} -#endif - -/* - * By default on NUMA we use alien caches to stage the freeing of - * objects allocated from other nodes. This causes massive memory - * inefficiencies when using fake NUMA setup to split memory into a - * large number of small nodes, so it can be disabled on the command - * line - */ - -static int use_alien_caches __read_mostly = 1; -static int __init noaliencache_setup(char *s) -{ - use_alien_caches = 0; - return 1; -} -__setup("noaliencache", noaliencache_setup); - -static int __init slab_max_order_setup(char *str) -{ - get_option(&str, &slab_max_order); - slab_max_order = slab_max_order < 0 ? 0 : - min(slab_max_order, MAX_ORDER); - slab_max_order_set = true; - - return 1; -} -__setup("slab_max_order=", slab_max_order_setup); - -#ifdef CONFIG_NUMA -/* - * Special reaping functions for NUMA systems called from cache_reap(). - * These take care of doing round robin flushing of alien caches (containing - * objects freed on different nodes from which they were allocated) and the - * flushing of remote pcps by calling drain_node_pages. - */ -static DEFINE_PER_CPU(unsigned long, slab_reap_node); - -static void init_reap_node(int cpu) -{ - per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu), - node_online_map); -} - -static void next_reap_node(void) -{ - int node = __this_cpu_read(slab_reap_node); - - node = next_node_in(node, node_online_map); - __this_cpu_write(slab_reap_node, node); -} - -#else -#define init_reap_node(cpu) do { } while (0) -#define next_reap_node(void) do { } while (0) -#endif - -/* - * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz - * via the workqueue/eventd. - * Add the CPU number into the expiration time to minimize the possibility of - * the CPUs getting into lockstep and contending for the global cache chain - * lock. - */ -static void start_cpu_timer(int cpu) -{ - struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); - - if (reap_work->work.func == NULL) { - init_reap_node(cpu); - INIT_DEFERRABLE_WORK(reap_work, cache_reap); - schedule_delayed_work_on(cpu, reap_work, - __round_jiffies_relative(HZ, cpu)); - } -} - -static void init_arraycache(struct array_cache *ac, int limit, int batch) -{ - if (ac) { - ac->avail = 0; - ac->limit = limit; - ac->batchcount = batch; - ac->touched = 0; - } -} - -static struct array_cache *alloc_arraycache(int node, int entries, - int batchcount, gfp_t gfp) -{ - size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); - struct array_cache *ac = NULL; - - ac = kmalloc_node(memsize, gfp, node); - /* - * The array_cache structures contain pointers to free object. - * However, when such objects are allocated or transferred to another - * cache the pointers are not cleared and they could be counted as - * valid references during a kmemleak scan. Therefore, kmemleak must - * not scan such objects. - */ - kmemleak_no_scan(ac); - init_arraycache(ac, entries, batchcount); - return ac; -} - -static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, - struct slab *slab, void *objp) -{ - struct kmem_cache_node *n; - int slab_node; - LIST_HEAD(list); - - slab_node = slab_nid(slab); - n = get_node(cachep, slab_node); - - raw_spin_lock(&n->list_lock); - free_block(cachep, &objp, 1, slab_node, &list); - raw_spin_unlock(&n->list_lock); - - slabs_destroy(cachep, &list); -} - -/* - * Transfer objects in one arraycache to another. - * Locking must be handled by the caller. - * - * Return the number of entries transferred. - */ -static int transfer_objects(struct array_cache *to, - struct array_cache *from, unsigned int max) -{ - /* Figure out how many entries to transfer */ - int nr = min3(from->avail, max, to->limit - to->avail); - - if (!nr) - return 0; - - memcpy(to->entry + to->avail, from->entry + from->avail - nr, - sizeof(void *) *nr); - - from->avail -= nr; - to->avail += nr; - return nr; -} - -/* &alien->lock must be held by alien callers. */ -static __always_inline void __free_one(struct array_cache *ac, void *objp) -{ - /* Avoid trivial double-free. */ - if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && - WARN_ON_ONCE(ac->avail > 0 && ac->entry[ac->avail - 1] == objp)) - return; - ac->entry[ac->avail++] = objp; -} - -#ifndef CONFIG_NUMA - -#define drain_alien_cache(cachep, alien) do { } while (0) -#define reap_alien(cachep, n) do { } while (0) - -static inline struct alien_cache **alloc_alien_cache(int node, - int limit, gfp_t gfp) -{ - return NULL; -} - -static inline void free_alien_cache(struct alien_cache **ac_ptr) -{ -} - -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) -{ - return 0; -} - -static inline gfp_t gfp_exact_node(gfp_t flags) -{ - return flags & ~__GFP_NOFAIL; -} - -#else /* CONFIG_NUMA */ - -static struct alien_cache *__alloc_alien_cache(int node, int entries, - int batch, gfp_t gfp) -{ - size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); - struct alien_cache *alc = NULL; - - alc = kmalloc_node(memsize, gfp, node); - if (alc) { - kmemleak_no_scan(alc); - init_arraycache(&alc->ac, entries, batch); - spin_lock_init(&alc->lock); - } - return alc; -} - -static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) -{ - struct alien_cache **alc_ptr; - int i; - - if (limit > 1) - limit = 12; - alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node); - if (!alc_ptr) - return NULL; - - for_each_node(i) { - if (i == node || !node_online(i)) - continue; - alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); - if (!alc_ptr[i]) { - for (i--; i >= 0; i--) - kfree(alc_ptr[i]); - kfree(alc_ptr); - return NULL; - } - } - return alc_ptr; -} - -static void free_alien_cache(struct alien_cache **alc_ptr) -{ - int i; - - if (!alc_ptr) - return; - for_each_node(i) - kfree(alc_ptr[i]); - kfree(alc_ptr); -} - -static void __drain_alien_cache(struct kmem_cache *cachep, - struct array_cache *ac, int node, - struct list_head *list) -{ - struct kmem_cache_node *n = get_node(cachep, node); - - if (ac->avail) { - raw_spin_lock(&n->list_lock); - /* - * Stuff objects into the remote nodes shared array first. - * That way we could avoid the overhead of putting the objects - * into the free lists and getting them back later. - */ - if (n->shared) - transfer_objects(n->shared, ac, ac->limit); - - free_block(cachep, ac->entry, ac->avail, node, list); - ac->avail = 0; - raw_spin_unlock(&n->list_lock); - } -} - -/* - * Called from cache_reap() to regularly drain alien caches round robin. - */ -static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) -{ - int node = __this_cpu_read(slab_reap_node); - - if (n->alien) { - struct alien_cache *alc = n->alien[node]; - struct array_cache *ac; - - if (alc) { - ac = &alc->ac; - if (ac->avail && spin_trylock_irq(&alc->lock)) { - LIST_HEAD(list); - - __drain_alien_cache(cachep, ac, node, &list); - spin_unlock_irq(&alc->lock); - slabs_destroy(cachep, &list); - } - } - } -} - -static void drain_alien_cache(struct kmem_cache *cachep, - struct alien_cache **alien) -{ - int i = 0; - struct alien_cache *alc; - struct array_cache *ac; - unsigned long flags; - - for_each_online_node(i) { - alc = alien[i]; - if (alc) { - LIST_HEAD(list); - - ac = &alc->ac; - spin_lock_irqsave(&alc->lock, flags); - __drain_alien_cache(cachep, ac, i, &list); - spin_unlock_irqrestore(&alc->lock, flags); - slabs_destroy(cachep, &list); - } - } -} - -static int __cache_free_alien(struct kmem_cache *cachep, void *objp, - int node, int slab_node) -{ - struct kmem_cache_node *n; - struct alien_cache *alien = NULL; - struct array_cache *ac; - LIST_HEAD(list); - - n = get_node(cachep, node); - STATS_INC_NODEFREES(cachep); - if (n->alien && n->alien[slab_node]) { - alien = n->alien[slab_node]; - ac = &alien->ac; - spin_lock(&alien->lock); - if (unlikely(ac->avail == ac->limit)) { - STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, ac, slab_node, &list); - } - __free_one(ac, objp); - spin_unlock(&alien->lock); - slabs_destroy(cachep, &list); - } else { - n = get_node(cachep, slab_node); - raw_spin_lock(&n->list_lock); - free_block(cachep, &objp, 1, slab_node, &list); - raw_spin_unlock(&n->list_lock); - slabs_destroy(cachep, &list); - } - return 1; -} - -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) -{ - int slab_node = slab_nid(virt_to_slab(objp)); - int node = numa_mem_id(); - /* - * Make sure we are not freeing an object from another node to the array - * cache on this cpu. - */ - if (likely(node == slab_node)) - return 0; - - return __cache_free_alien(cachep, objp, node, slab_node); -} - -/* - * Construct gfp mask to allocate from a specific node but do not reclaim or - * warn about failures. - */ -static inline gfp_t gfp_exact_node(gfp_t flags) -{ - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL); -} -#endif - -static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) -{ - struct kmem_cache_node *n; - - /* - * Set up the kmem_cache_node for cpu before we can - * begin anything. Make sure some other cpu on this - * node has not already allocated this - */ - n = get_node(cachep, node); - if (n) { - raw_spin_lock_irq(&n->list_lock); - n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + - cachep->num; - raw_spin_unlock_irq(&n->list_lock); - - return 0; - } - - n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); - if (!n) - return -ENOMEM; - - kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_NODE + - ((unsigned long)cachep) % REAPTIMEOUT_NODE; - - n->free_limit = - (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - - /* - * The kmem_cache_nodes don't come and go as CPUs - * come and go. slab_mutex provides sufficient - * protection here. - */ - cachep->node[node] = n; - - return 0; -} - -#if defined(CONFIG_NUMA) || defined(CONFIG_SMP) -/* - * Allocates and initializes node for a node on each slab cache, used for - * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node - * will be allocated off-node since memory is not yet online for the new node. - * When hotplugging memory or a cpu, existing nodes are not replaced if - * already in use. - * - * Must hold slab_mutex. - */ -static int init_cache_node_node(int node) -{ - int ret; - struct kmem_cache *cachep; - - list_for_each_entry(cachep, &slab_caches, list) { - ret = init_cache_node(cachep, node, GFP_KERNEL); - if (ret) - return ret; - } - - return 0; -} -#endif - -static int setup_kmem_cache_node(struct kmem_cache *cachep, - int node, gfp_t gfp, bool force_change) -{ - int ret = -ENOMEM; - struct kmem_cache_node *n; - struct array_cache *old_shared = NULL; - struct array_cache *new_shared = NULL; - struct alien_cache **new_alien = NULL; - LIST_HEAD(list); - - if (use_alien_caches) { - new_alien = alloc_alien_cache(node, cachep->limit, gfp); - if (!new_alien) - goto fail; - } - - if (cachep->shared) { - new_shared = alloc_arraycache(node, - cachep->shared * cachep->batchcount, 0xbaadf00d, gfp); - if (!new_shared) - goto fail; - } - - ret = init_cache_node(cachep, node, gfp); - if (ret) - goto fail; - - n = get_node(cachep, node); - raw_spin_lock_irq(&n->list_lock); - if (n->shared && force_change) { - free_block(cachep, n->shared->entry, - n->shared->avail, node, &list); - n->shared->avail = 0; - } - - if (!n->shared || force_change) { - old_shared = n->shared; - n->shared = new_shared; - new_shared = NULL; - } - - if (!n->alien) { - n->alien = new_alien; - new_alien = NULL; - } - - raw_spin_unlock_irq(&n->list_lock); - slabs_destroy(cachep, &list); - - /* - * To protect lockless access to n->shared during irq disabled context. - * If n->shared isn't NULL in irq disabled context, accessing to it is - * guaranteed to be valid until irq is re-enabled, because it will be - * freed after synchronize_rcu(). - */ - if (old_shared && force_change) - synchronize_rcu(); - -fail: - kfree(old_shared); - kfree(new_shared); - free_alien_cache(new_alien); - - return ret; -} - -#ifdef CONFIG_SMP - -static void cpuup_canceled(long cpu) -{ - struct kmem_cache *cachep; - struct kmem_cache_node *n = NULL; - int node = cpu_to_mem(cpu); - const struct cpumask *mask = cpumask_of_node(node); - - list_for_each_entry(cachep, &slab_caches, list) { - struct array_cache *nc; - struct array_cache *shared; - struct alien_cache **alien; - LIST_HEAD(list); - - n = get_node(cachep, node); - if (!n) - continue; - - raw_spin_lock_irq(&n->list_lock); - - /* Free limit for this kmem_cache_node */ - n->free_limit -= cachep->batchcount; - - /* cpu is dead; no one can alloc from it. */ - nc = per_cpu_ptr(cachep->cpu_cache, cpu); - free_block(cachep, nc->entry, nc->avail, node, &list); - nc->avail = 0; - - if (!cpumask_empty(mask)) { - raw_spin_unlock_irq(&n->list_lock); - goto free_slab; - } - - shared = n->shared; - if (shared) { - free_block(cachep, shared->entry, - shared->avail, node, &list); - n->shared = NULL; - } - - alien = n->alien; - n->alien = NULL; - - raw_spin_unlock_irq(&n->list_lock); - - kfree(shared); - if (alien) { - drain_alien_cache(cachep, alien); - free_alien_cache(alien); - } - -free |