diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-10-02 15:58:05 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-10-02 15:58:05 -0700 |
| commit | 24d9e8b3c9c8a6f72c8b4c196a703e144928d919 (patch) | |
| tree | 81d9a41265b30c776a2a70a517fddb5e5da62ed0 | |
| parent | 07fdad3a93756b872da7b53647715c48d0f4a2d0 (diff) | |
| parent | ca74b8cadaad4b179f77f1f4dc3d288be9a580f1 (diff) | |
| download | linux-24d9e8b3c9c8a6f72c8b4c196a703e144928d919.tar.gz linux-24d9e8b3c9c8a6f72c8b4c196a703e144928d919.tar.bz2 linux-24d9e8b3c9c8a6f72c8b4c196a703e144928d919.zip | |
Merge tag 'slab-for-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab
Pull slab updates from Vlastimil Babka:
- A new layer for caching objects for allocation and free via percpu
arrays called sheaves.
The aim is to combine the good parts of SLAB (lower-overhead and
simpler percpu caching, compared to SLUB) without the past issues
with arrays for freeing remote NUMA node objects and their flushing.
It also allows more efficient kfree_rcu(), and cheaper object
preallocations for cases where the exact number of objects is
unknown, but an upper bound is.
Currently VMAs and maple nodes are using this new caching, with a
plan to enable it for all caches and remove the complex SLUB fastpath
based on cpu (partial) slabs and this_cpu_cmpxchg_double().
(Vlastimil Babka, with Liam Howlett and Pedro Falcato for the maple
tree changes)
- Re-entrant kmalloc_nolock(), which allows opportunistic allocations
from NMI and tracing/kprobe contexts.
Building on prior page allocator and memcg changes, it will result in
removing BPF-specific caches on top of slab (Alexei Starovoitov)
- Various fixes and cleanups. (Kuan-Wei Chiu, Matthew Wilcox, Suren
Baghdasaryan, Ye Liu)
* tag 'slab-for-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab: (40 commits)
slab: Introduce kmalloc_nolock() and kfree_nolock().
slab: Reuse first bit for OBJEXTS_ALLOC_FAIL
slab: Make slub local_(try)lock more precise for LOCKDEP
mm: Introduce alloc_frozen_pages_nolock()
mm: Allow GFP_ACCOUNT to be used in alloc_pages_nolock().
locking/local_lock: Introduce local_lock_is_locked().
maple_tree: Convert forking to use the sheaf interface
maple_tree: Add single node allocation support to maple state
maple_tree: Prefilled sheaf conversion and testing
tools/testing: Add support for prefilled slab sheafs
maple_tree: Replace mt_free_one() with kfree()
maple_tree: Use kfree_rcu in ma_free_rcu
testing/radix-tree/maple: Hack around kfree_rcu not existing
tools/testing: include maple-shim.c in maple.c
maple_tree: use percpu sheaves for maple_node_cache
mm, vma: use percpu sheaves for vm_area_struct cache
tools/testing: Add support for changes to slab for sheaves
slab: allow NUMA restricted allocations to use percpu sheaves
tools/testing/vma: Implement vm_refcnt reset
slab: skip percpu sheaves for remote object freeing
...
| -rw-r--r-- | include/linux/gfp.h | 2 | ||||
| -rw-r--r-- | include/linux/kasan.h | 13 | ||||
| -rw-r--r-- | include/linux/local_lock.h | 2 | ||||
| -rw-r--r-- | include/linux/local_lock_internal.h | 16 | ||||
| -rw-r--r-- | include/linux/maple_tree.h | 6 | ||||
| -rw-r--r-- | include/linux/memcontrol.h | 12 | ||||
| -rw-r--r-- | include/linux/rtmutex.h | 10 | ||||
| -rw-r--r-- | include/linux/slab.h | 51 | ||||
| -rw-r--r-- | kernel/bpf/stream.c | 2 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 2 | ||||
| -rw-r--r-- | kernel/locking/rtmutex_common.h | 9 | ||||
| -rw-r--r-- | lib/maple_tree.c | 667 | ||||
| -rw-r--r-- | lib/test_maple_tree.c | 137 | ||||
| -rw-r--r-- | mm/Kconfig | 1 | ||||
| -rw-r--r-- | mm/internal.h | 4 | ||||
| -rw-r--r-- | mm/kasan/common.c | 5 | ||||
| -rw-r--r-- | mm/page_alloc.c | 55 | ||||
| -rw-r--r-- | mm/slab.h | 20 | ||||
| -rw-r--r-- | mm/slab_common.c | 37 | ||||
| -rw-r--r-- | mm/slub.c | 2357 | ||||
| -rw-r--r-- | mm/vma_init.c | 1 | ||||
| -rw-r--r-- | tools/include/linux/slab.h | 165 | ||||
| -rw-r--r-- | tools/testing/radix-tree/maple.c | 514 | ||||
| -rw-r--r-- | tools/testing/shared/linux.c | 120 | ||||
| -rw-r--r-- | tools/testing/shared/maple-shared.h | 11 | ||||
| -rw-r--r-- | tools/testing/shared/maple-shim.c | 7 | ||||
| -rw-r--r-- | tools/testing/vma/vma_internal.h | 259 |
27 files changed, 2901 insertions, 1584 deletions
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5ebf26fcdcfa..0ceb4e09306c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -354,7 +354,7 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp, } #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) -struct page *alloc_pages_nolock_noprof(int nid, unsigned int order); +struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); #define alloc_pages_nolock(...) alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__)) extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); diff --git a/include/linux/kasan.h b/include/linux/kasan.h index fe5ce9215821..b4973e7c2940 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -200,7 +200,7 @@ static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s, } bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init, - bool still_accessible); + bool still_accessible, bool no_quarantine); /** * kasan_slab_free - Poison, initialize, and quarantine a slab object. * @object: Object to be freed. @@ -226,11 +226,13 @@ bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init, * @Return true if KASAN took ownership of the object; false otherwise. */ static __always_inline bool kasan_slab_free(struct kmem_cache *s, - void *object, bool init, - bool still_accessible) + void *object, bool init, + bool still_accessible, + bool no_quarantine) { if (kasan_enabled()) - return __kasan_slab_free(s, object, init, still_accessible); + return __kasan_slab_free(s, object, init, still_accessible, + no_quarantine); return false; } @@ -427,7 +429,8 @@ static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object) } static inline bool kasan_slab_free(struct kmem_cache *s, void *object, - bool init, bool still_accessible) + bool init, bool still_accessible, + bool no_quarantine) { return false; } diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index 2ba846419524..0d91d060e3e9 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -66,6 +66,8 @@ */ #define local_trylock(lock) __local_trylock(this_cpu_ptr(lock)) +#define local_lock_is_locked(lock) __local_lock_is_locked(lock) + /** * local_trylock_irqsave - Try to acquire a per CPU local lock, save and disable * interrupts if acquired diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index d80b5306a2c0..a4dc479157b5 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -17,7 +17,10 @@ typedef struct { /* local_trylock() and local_trylock_irqsave() only work with local_trylock_t */ typedef struct { - local_lock_t llock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; + struct task_struct *owner; +#endif u8 acquired; } local_trylock_t; @@ -31,7 +34,7 @@ typedef struct { .owner = NULL, # define LOCAL_TRYLOCK_DEBUG_INIT(lockname) \ - .llock = { LOCAL_LOCK_DEBUG_INIT((lockname).llock) }, + LOCAL_LOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { @@ -81,7 +84,7 @@ do { \ local_lock_debug_init(lock); \ } while (0) -#define __local_trylock_init(lock) __local_lock_init(lock.llock) +#define __local_trylock_init(lock) __local_lock_init((local_lock_t *)lock) #define __spinlock_nested_bh_init(lock) \ do { \ @@ -162,6 +165,9 @@ do { \ !!tl; \ }) +/* preemption or migration must be disabled before calling __local_lock_is_locked */ +#define __local_lock_is_locked(lock) READ_ONCE(this_cpu_ptr(lock)->acquired) + #define __local_lock_release(lock) \ do { \ local_trylock_t *tl; \ @@ -282,4 +288,8 @@ do { \ __local_trylock(lock); \ }) +/* migration must be disabled before calling __local_lock_is_locked */ +#define __local_lock_is_locked(__lock) \ + (rt_mutex_owner(&this_cpu_ptr(__lock)->lock) == current) + #endif /* CONFIG_PREEMPT_RT */ diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index bafe143b1f78..51a64ff23b88 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -442,7 +442,9 @@ struct ma_state { struct maple_enode *node; /* The node containing this entry */ unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ - struct maple_alloc *alloc; /* Allocated nodes for this operation */ + struct slab_sheaf *sheaf; /* Allocated nodes for this operation */ + struct maple_node *alloc; /* A single allocated node for fast path writes */ + unsigned long node_request; /* The number of nodes to allocate for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; @@ -490,7 +492,9 @@ struct ma_wr_state { .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ + .sheaf = NULL, \ .alloc = NULL, \ + .node_request = 0, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fb27e3d2fdac..9924f157aae0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -341,17 +341,25 @@ enum page_memcg_data_flags { __NR_MEMCG_DATA_FLAGS = (1UL << 2), }; +#define __OBJEXTS_ALLOC_FAIL MEMCG_DATA_OBJEXTS #define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS #else /* CONFIG_MEMCG */ +#define __OBJEXTS_ALLOC_FAIL (1UL << 0) #define __FIRST_OBJEXT_FLAG (1UL << 0) #endif /* CONFIG_MEMCG */ enum objext_flags { - /* slabobj_ext vector failed to allocate */ - OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG, + /* + * Use bit 0 with zero other bits to signal that slabobj_ext vector + * failed to allocate. The same bit 0 with valid upper bits means + * MEMCG_DATA_OBJEXTS. + */ + OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL, + /* slabobj_ext vector allocated with kmalloc_nolock() */ + OBJEXTS_NOSPIN_ALLOC = __FIRST_OBJEXT_FLAG, /* the next bit after the last actual flag */ __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1), }; diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index fa9f1021541e..ede4c6bf6f22 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -44,6 +44,16 @@ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock) return READ_ONCE(lock->owner) != NULL; } +#ifdef CONFIG_RT_MUTEXES +#define RT_MUTEX_HAS_WAITERS 1UL + +static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) +{ + unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + + return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); +} +#endif extern void rt_mutex_base_init(struct rt_mutex_base *rtb); /** diff --git a/include/linux/slab.h b/include/linux/slab.h index d5a8ab98035c..561597dd2164 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -335,6 +335,37 @@ struct kmem_cache_args { * %NULL means no constructor. */ void (*ctor)(void *); + /** + * @sheaf_capacity: Enable sheaves of given capacity for the cache. + * + * With a non-zero value, allocations from the cache go through caching + * arrays called sheaves. Each cpu has a main sheaf that's always + * present, and a spare sheaf that may be not present. When both become + * empty, there's an attempt to replace an empty sheaf with a full sheaf + * from the per-node barn. + * + * When no full sheaf is available, and gfp flags allow blocking, a + * sheaf is allocated and filled from slab(s) using bulk allocation. + * Otherwise the allocation falls back to the normal operation + * allocating a single object from a slab. + * + * Analogically when freeing and both percpu sheaves are full, the barn + * may replace it with an empty sheaf, unless it's over capacity. In + * that case a sheaf is bulk freed to slab pages. + * + * The sheaves do not enforce NUMA placement of objects, so allocations + * via kmem_cache_alloc_node() with a node specified other than + * NUMA_NO_NODE will bypass them. + * + * Bulk allocation and free operations also try to use the cpu sheaves + * and barn, but fallback to using slab pages directly. + * + * When slub_debug is enabled for the cache, the sheaf_capacity argument + * is ignored. + * + * %0 means no sheaves will be created. + */ + unsigned int sheaf_capacity; }; struct kmem_cache *__kmem_cache_create_args(const char *name, @@ -470,6 +501,7 @@ void * __must_check krealloc_noprof(const void *objp, size_t new_size, #define krealloc(...) alloc_hooks(krealloc_noprof(__VA_ARGS__)) void kfree(const void *objp); +void kfree_nolock(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); @@ -798,6 +830,22 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; #define kmem_cache_alloc_node(...) alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__)) +struct slab_sheaf * +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size); + +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf **sheafp, unsigned int size); + +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf); + +void *kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *cachep, gfp_t gfp, + struct slab_sheaf *sheaf) __assume_slab_alignment __malloc; +#define kmem_cache_alloc_from_sheaf(...) \ + alloc_hooks(kmem_cache_alloc_from_sheaf_noprof(__VA_ARGS__)) + +unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf); + /* * These macros allow declaring a kmem_buckets * parameter alongside size, which * can be compiled out with CONFIG_SLAB_BUCKETS=n so that a large number of call @@ -910,6 +958,9 @@ static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t f } #define kmalloc(...) alloc_hooks(kmalloc_noprof(__VA_ARGS__)) +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node); +#define kmalloc_nolock(...) alloc_hooks(kmalloc_nolock_noprof(__VA_ARGS__)) + #define kmem_buckets_alloc(_b, _size, _flags) \ alloc_hooks(__kmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE)) diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index ab592db4a4bf..eb6c5a21c2ef 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -83,7 +83,7 @@ static struct bpf_stream_page *bpf_stream_page_replace(void) struct bpf_stream_page *stream_page, *old_stream_page; struct page *page; - page = alloc_pages_nolock(NUMA_NO_NODE, 0); + page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0); if (!page) return NULL; stream_page = page_address(page); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a48fa86f82a7..2a9456a3e730 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -583,7 +583,7 @@ static bool can_alloc_pages(void) static struct page *__bpf_alloc_page(int nid) { if (!can_alloc_pages()) - return alloc_pages_nolock(nid, 0); + return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0); return alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 78dd3d8c6554..cf6ddd1b23a2 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -153,15 +153,6 @@ static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) pi_tree.entry); } -#define RT_MUTEX_HAS_WAITERS 1UL - -static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) -{ - unsigned long owner = (unsigned long) READ_ONCE(lock->owner); - - return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); -} - /* * Constants for rt mutex functions which have a selectable deadlock * detection. diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b4ee2d29d7a9..ab4c6c21a625 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -83,13 +83,9 @@ /* * Maple state flags - * * MA_STATE_BULK - Bulk insert mode - * * MA_STATE_REBALANCE - Indicate a rebalance during bulk insert * * MA_STATE_PREALLOC - Preallocated nodes, WARN_ON allocation */ -#define MA_STATE_BULK 1 -#define MA_STATE_REBALANCE 2 -#define MA_STATE_PREALLOC 4 +#define MA_STATE_PREALLOC 1 #define ma_parent_ptr(x) ((struct maple_pnode *)(x)) #define mas_tree_parent(x) ((unsigned long)(x->tree) | MA_ROOT_PARENT) @@ -176,26 +172,25 @@ static inline struct maple_node *mt_alloc_one(gfp_t gfp) return kmem_cache_alloc(maple_node_cache, gfp); } -static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) +static inline void mt_free_bulk(size_t size, void __rcu **nodes) { - return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); + kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); } -static inline void mt_free_one(struct maple_node *node) +static void mt_return_sheaf(struct slab_sheaf *sheaf) { - kmem_cache_free(maple_node_cache, node); + kmem_cache_return_sheaf(maple_node_cache, GFP_NOWAIT, sheaf); } -static inline void mt_free_bulk(size_t size, void __rcu **nodes) +static struct slab_sheaf *mt_get_sheaf(gfp_t gfp, int count) { - kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); + return kmem_cache_prefill_sheaf(maple_node_cache, gfp, count); } -static void mt_free_rcu(struct rcu_head *head) +static int mt_refill_sheaf(gfp_t gfp, struct slab_sheaf **sheaf, + unsigned int size) { - struct maple_node *node = container_of(head, struct maple_node, rcu); - - kmem_cache_free(maple_node_cache, node); + return kmem_cache_refill_sheaf(maple_node_cache, gfp, sheaf, size); } /* @@ -208,7 +203,7 @@ static void mt_free_rcu(struct rcu_head *head) static void ma_free_rcu(struct maple_node *node) { WARN_ON(node->parent != ma_parent_ptr(node)); - call_rcu(&node->rcu, mt_free_rcu); + kfree_rcu(node, rcu); } static void mt_set_height(struct maple_tree *mt, unsigned char height) @@ -591,67 +586,6 @@ static __always_inline bool mte_dead_node(const struct maple_enode *enode) } /* - * mas_allocated() - Get the number of nodes allocated in a maple state. - * @mas: The maple state - * - * The ma_state alloc member is overloaded to hold a pointer to the first - * allocated node or to the number of requested nodes to allocate. If bit 0 is - * set, then the alloc contains the number of requested nodes. If there is an - * allocated node, then the total allocated nodes is in that node. - * - * Return: The total number of nodes allocated - */ -static inline unsigned long mas_allocated(const struct ma_state *mas) -{ - if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) - return 0; - - return mas->alloc->total; -} - -/* - * mas_set_alloc_req() - Set the requested number of allocations. - * @mas: the maple state - * @count: the number of allocations. - * - * The requested number of allocations is either in the first allocated node, - * located in @mas->alloc->request_count, or directly in @mas->alloc if there is - * no allocated node. Set the request either in the node or do the necessary - * encoding to store in @mas->alloc directly. - */ -static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count) -{ - if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) { - if (!count) - mas->alloc = NULL; - else - mas->alloc = (struct maple_alloc *)(((count) << 1U) | 1U); - return; - } - - mas->alloc->request_count = count; -} - -/* - * mas_alloc_req() - get the requested number of allocations. - * @mas: The maple state - * - * The alloc count is either stored directly in @mas, or in - * @mas->alloc->request_count if there is at least one node allocated. Decode - * the request count if it's stored directly in @mas->alloc. - * - * Return: The allocation request count. - */ -static inline unsigned int mas_alloc_req(const struct ma_state *mas) -{ - if ((unsigned long)mas->alloc & 0x1) - return (unsigned long)(mas->alloc) >> 1; - else if (mas->alloc) - return mas->alloc->request_count; - return 0; -} - -/* * ma_pivots() - Get a pointer to the maple node pivots. * @node: the maple node * @type: the node type @@ -1032,24 +966,6 @@ static inline void mas_descend(struct ma_state *mas) } /* - * mte_set_gap() - Set a maple node gap. - * @mn: The encoded maple node - * @gap: The offset of the gap to set - * @val: The gap value - */ -static inline void mte_set_gap(const struct maple_enode *mn, - unsigned char gap, unsigned long val) -{ - switch (mte_node_type(mn)) { - default: - break; - case maple_arange_64: - mte_to_node(mn)->ma64.gap[gap] = val; - break; - } -} - -/* * mas_ascend() - Walk up a level of the tree. * @mas: The maple state * @@ -1152,79 +1068,24 @@ static int mas_ascend(struct ma_state *mas) * * Return: A pointer to a maple node. */ -static inline struct maple_node *mas_pop_node(struct ma_state *mas) +static __always_inline struct maple_node *mas_pop_node(struct ma_state *mas) { - struct maple_alloc *ret, *node = mas->alloc; - unsigned long total = mas_allocated(mas); - unsigned int req = mas_alloc_req(mas); - - /* nothing or a request pending. */ - if (WARN_ON(!total)) - return NULL; + struct maple_node *ret; - if (total == 1) { - /* single allocation in this ma_state */ + if (mas->alloc) { + ret = mas->alloc; mas->alloc = NULL; - ret = node; - goto single_node; + goto out; } - if (node->node_count == 1) { - /* Single allocation in this node. */ - mas->alloc = node->slot[0]; - mas->alloc->total = node->total - 1; - ret = node; - goto new_head; - } - node->total--; - ret = node->slot[--node->node_count]; - node->slot[node->node_count] = NULL; + if (WARN_ON_ONCE(!mas->sheaf)) + return NULL; -single_node: -new_head: - if (req) { - req++; - mas_set_alloc_req(mas, req); - } + ret = kmem_cache_alloc_from_sheaf(maple_node_cache, GFP_NOWAIT, mas->sheaf); +out: memset(ret, 0, sizeof(*ret)); - return (struct maple_node *)ret; -} - -/* - * mas_push_node() - Push a node back on the maple state allocation. - * @mas: The maple state - * @used: The used maple node - * - * Stores the maple node back into @mas->alloc for reuse. Updates allocated and - * requested node count as necessary. - */ -static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) -{ - struct maple_alloc *reuse = (struct maple_alloc *)used; - struct maple_alloc *head = mas->alloc; - unsigned long count; - unsigned int requested = mas_alloc_req(mas); - - count = mas_allocated(mas); - - reuse->request_count = 0; - reuse->node_count = 0; - if (count) { - if (head->node_count < MAPLE_ALLOC_SLOTS) { - head->slot[head->node_count++] = reuse; - head->total++; - goto done; - } - reuse->slot[0] = head; - reuse->node_count = 1; - } - - reuse->total = count + 1; - mas->alloc = reuse; -done: - if (requested > 1) - mas_set_alloc_req(mas, requested - 1); + return ret; } /* @@ -1234,121 +1095,81 @@ done: */ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { - struct maple_alloc *node; - unsigned long allocated = mas_allocated(mas); - unsigned int requested = mas_alloc_req(mas); - unsigned int count; - void **slots = NULL; - unsigned int max_req = 0; - - if (!requested) + if (!mas->node_request) return; - mas_set_alloc_req(mas, 0); - if (mas->mas_flags & MA_STATE_PREALLOC) { - if (allocated) + if (mas->node_request == 1) { + if (mas->sheaf) + goto use_sheaf; + + if (mas->alloc) return; - WARN_ON(!allocated); - } - if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { - node = (struct maple_alloc *)mt_alloc_one(gfp); - if (!node) - goto nomem_one; + mas->alloc = mt_alloc_one(gfp); + if (!mas->alloc) + goto error; - if (allocated) { - node->slot[0] = mas->alloc; - node->node_count = 1; - } else { - node->node_count = 0; - } + mas->node_request = 0; + return; + } - mas->alloc = node; - node->total = ++allocated; - node->request_count = 0; - requested--; +use_sheaf: + if (unlikely(mas->alloc)) { + kfree(mas->alloc); + mas->alloc = NULL; } - node = mas->alloc; - while (requested) { - max_req = MAPLE_ALLOC_SLOTS - node->node_count; - slots = (void **)&node->slot[node->node_count]; - max_req = min(requested, max_req); - count = mt_alloc_bulk(gfp, max_req, slots); - if (!count) - goto nomem_bulk; + if (mas->sheaf) { + unsigned long refill; - if (node->node_count == 0) { - node->slot[0]->node_count = 0; - node->slot[0]->request_count = 0; + refill = mas->node_request; + if (kmem_cache_sheaf_size(mas->sheaf) >= refill) { + mas->node_request = 0; + return; } - node->node_count += count; - allocated += count; - /* find a non-full node*/ - do { - node = node->slot[0]; - } while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS)); - requested -= count; - } - mas->alloc->total = allocated; - return; + if (mt_refill_sheaf(gfp, &mas->sheaf, refill)) + goto error; -nomem_bulk: - /* Clean up potential freed allocations on bulk failure */ - memset(slots, 0, max_req * sizeof(unsigned long)); - mas->alloc->total = allocated; -nomem_one: - mas_set_alloc_req(mas, requested); - mas_set_err(mas, -ENOMEM); -} + mas->node_request = 0; + return; + } -/* - * mas_free() - Free an encoded maple node - * @mas: The maple state - * @used: The encoded maple node to free. - * - * Uses rcu free if necessary, pushes @used back on the maple state allocations - * otherwise. - */ -static inline void mas_free(struct ma_state *mas, struct maple_enode *used) -{ - struct maple_node *tmp = mte_to_node(used); + mas->sheaf = mt_get_sheaf(gfp, mas->node_request); + if (likely(mas->sheaf)) { + mas->node_request = 0; + return; + } - if (mt_in_rcu(mas->tree)) - ma_free_rcu(tmp); - else - mas_push_node(mas, tmp); +error: + mas_set_err(mas, -ENOMEM); } -/* - * mas_node_count_gfp() - Check if enough nodes are allocated and request more - * if there is not enough nodes. - * @mas: The maple state - * @count: The number of nodes needed - * @gfp: the gfp flags - */ -static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp) +static inline void mas_empty_nodes(struct ma_state *mas) { - unsigned long allocated = mas_allocated(mas); + mas->node_request = 0; + if (mas->sheaf) { + mt_return_sheaf(mas->sheaf); + mas->sheaf = NULL; + } - if (allocated < count) { - mas_set_alloc_req(mas, count - allocated); - mas_alloc_nodes(mas, gfp); + if (mas->alloc) { + kfree(mas->alloc); + mas->alloc = NULL; } } /* - * mas_node_count() - Check if enough nodes are allocated and request more if - * there is not enough nodes. + * mas_free() - Free an encoded maple node * @mas: The maple state - * @count: The number of nodes needed + * @used: The encoded maple node to free. * - * Note: Uses GFP_NOWAIT | __GFP_NOWARN for gfp flags. + * Uses rcu free if necessary, pushes @used back on the maple state allocations + * otherwise. */ -static void mas_node_count(struct ma_state *mas, int count) +static inline void mas_free(struct ma_state *mas, struct maple_enode *used) { - return mas_node_count_gfp(mas, count, GFP_NOWAIT | __GFP_NOWARN); + ma_free_rcu(mte_to_node(used)); } /* @@ -1878,21 +1699,7 @@ static inline int mab_calc_split(struct ma_state *mas, * end on a NULL entry, with the exception of the left-most leaf. The * limitation means that the split of a node must be checked for this condition * and be able to put more data in one direction or the other. - */ - if (unlikely((mas->mas_flags & MA_STATE_BULK))) { - *mid_split = 0; - split = b_end - mt_min_slots[bn->type]; - - if (!ma_is_leaf(bn->type)) - return split; - - mas->mas_flags |= MA_STATE_REBALANCE; - if (!bn->slot[split]) - split--; - return split; - } - - /* + * * Although extremely rare, it is possible to enter what is known as the 3-way * split scenario. The 3-way split comes about by means of a store of a range * that overwrites the end and beginning of two full nodes. The result is a set @@ -2040,27 +1847,6 @@ static inline void mab_mas_cp(struct maple |
