diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-10-02 15:58:05 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-10-02 15:58:05 -0700 |
| commit | 24d9e8b3c9c8a6f72c8b4c196a703e144928d919 (patch) | |
| tree | 81d9a41265b30c776a2a70a517fddb5e5da62ed0 /mm | |
| parent | 07fdad3a93756b872da7b53647715c48d0f4a2d0 (diff) | |
| parent | ca74b8cadaad4b179f77f1f4dc3d288be9a580f1 (diff) | |
| download | linux-24d9e8b3c9c8a6f72c8b4c196a703e144928d919.tar.gz linux-24d9e8b3c9c8a6f72c8b4c196a703e144928d919.tar.bz2 linux-24d9e8b3c9c8a6f72c8b4c196a703e144928d919.zip | |
Merge tag 'slab-for-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab
Pull slab updates from Vlastimil Babka:
- A new layer for caching objects for allocation and free via percpu
arrays called sheaves.
The aim is to combine the good parts of SLAB (lower-overhead and
simpler percpu caching, compared to SLUB) without the past issues
with arrays for freeing remote NUMA node objects and their flushing.
It also allows more efficient kfree_rcu(), and cheaper object
preallocations for cases where the exact number of objects is
unknown, but an upper bound is.
Currently VMAs and maple nodes are using this new caching, with a
plan to enable it for all caches and remove the complex SLUB fastpath
based on cpu (partial) slabs and this_cpu_cmpxchg_double().
(Vlastimil Babka, with Liam Howlett and Pedro Falcato for the maple
tree changes)
- Re-entrant kmalloc_nolock(), which allows opportunistic allocations
from NMI and tracing/kprobe contexts.
Building on prior page allocator and memcg changes, it will result in
removing BPF-specific caches on top of slab (Alexei Starovoitov)
- Various fixes and cleanups. (Kuan-Wei Chiu, Matthew Wilcox, Suren
Baghdasaryan, Ye Liu)
* tag 'slab-for-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab: (40 commits)
slab: Introduce kmalloc_nolock() and kfree_nolock().
slab: Reuse first bit for OBJEXTS_ALLOC_FAIL
slab: Make slub local_(try)lock more precise for LOCKDEP
mm: Introduce alloc_frozen_pages_nolock()
mm: Allow GFP_ACCOUNT to be used in alloc_pages_nolock().
locking/local_lock: Introduce local_lock_is_locked().
maple_tree: Convert forking to use the sheaf interface
maple_tree: Add single node allocation support to maple state
maple_tree: Prefilled sheaf conversion and testing
tools/testing: Add support for prefilled slab sheafs
maple_tree: Replace mt_free_one() with kfree()
maple_tree: Use kfree_rcu in ma_free_rcu
testing/radix-tree/maple: Hack around kfree_rcu not existing
tools/testing: include maple-shim.c in maple.c
maple_tree: use percpu sheaves for maple_node_cache
mm, vma: use percpu sheaves for vm_area_struct cache
tools/testing: Add support for changes to slab for sheaves
slab: allow NUMA restricted allocations to use percpu sheaves
tools/testing/vma: Implement vm_refcnt reset
slab: skip percpu sheaves for remote object freeing
...
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/Kconfig | 1 | ||||
| -rw-r--r-- | mm/internal.h | 4 | ||||
| -rw-r--r-- | mm/kasan/common.c | 5 | ||||
| -rw-r--r-- | mm/page_alloc.c | 55 | ||||
| -rw-r--r-- | mm/slab.h | 20 | ||||
| -rw-r--r-- | mm/slab_common.c | 37 | ||||
| -rw-r--r-- | mm/slub.c | 2357 | ||||
| -rw-r--r-- | mm/vma_init.c | 1 |
8 files changed, 2320 insertions, 160 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index e443fe8cd6cf..202e044f2b4d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -194,6 +194,7 @@ menu "Slab allocator options" config SLUB def_bool y + select IRQ_WORK config KVFREE_RCU_BATCHED def_bool y diff --git a/mm/internal.h b/mm/internal.h index 45b725c3dc03..9904421cabc1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -842,6 +842,10 @@ static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int ord #define alloc_frozen_pages(...) \ alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__)) +struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); +#define alloc_frozen_pages_nolock(...) \ + alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__)) + extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 9142964ab9c9..3264900b942f 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -252,7 +252,7 @@ bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, } bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, - bool still_accessible) + bool still_accessible, bool no_quarantine) { if (!kasan_arch_is_ready() || is_kfence_address(object)) return false; @@ -274,6 +274,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, poison_slab_object(cache, object, init); + if (no_quarantine) + return false; + /* * If the object is put into quarantine, do not let slab put the object * onto the freelist for now. The object's metadata is kept until the diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d1d037f97c5f..5a40e2b7d148 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7478,22 +7478,7 @@ static bool __free_unaccepted(struct page *page) #endif /* CONFIG_UNACCEPTED_MEMORY */ -/** - * alloc_pages_nolock - opportunistic reentrant allocation from any context - * @nid: node to allocate from - * @order: allocation order size - * - * Allocates pages of a given order from the given node. This is safe to - * call from any context (from atomic, NMI, and also reentrant - * allocator -> tracepoint -> alloc_pages_nolock_noprof). - * Allocation is best effort and to be expected to fail easily so nobody should - * rely on the success. Failures are not reported via warn_alloc(). - * See always fail conditions below. - * - * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN. - * It means ENOMEM. There is no reason to call it again and expect !NULL. - */ -struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) +struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order) { /* * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed. @@ -7515,12 +7500,13 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) * specify it here to highlight that alloc_pages_nolock() * doesn't want to deplete reserves. */ - gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC - | __GFP_ACCOUNT; + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP + | gfp_flags; unsigned int alloc_flags = ALLOC_TRYLOCK; struct alloc_context ac = { }; struct page *page; + VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT); /* * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is * unsafe in NMI. If spin_trylock() is called from hard IRQ the current @@ -7555,15 +7541,38 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */ - if (page) - set_page_refcounted(page); - - if (memcg_kmem_online() && page && + if (memcg_kmem_online() && page && (gfp_flags & __GFP_ACCOUNT) && unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) { - free_pages_nolock(page, order); + __free_frozen_pages(page, order, FPI_TRYLOCK); page = NULL; } trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); kmsan_alloc_page(page, order, alloc_gfp); return page; } +/** + * alloc_pages_nolock - opportunistic reentrant allocation from any context + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed. + * @nid: node to allocate from + * @order: allocation order size + * + * Allocates pages of a given order from the given node. This is safe to + * call from any context (from atomic, NMI, and also reentrant + * allocator -> tracepoint -> alloc_pages_nolock_noprof). + * Allocation is best effort and to be expected to fail easily so nobody should + * rely on the success. Failures are not reported via warn_alloc(). + * See always fail conditions below. + * + * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN. + * It means ENOMEM. There is no reason to call it again and expect !NULL. + */ +struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order) +{ + struct page *page; + + page = alloc_frozen_pages_nolock_noprof(gfp_flags, nid, order); + if (page) + set_page_refcounted(page); + return page; +} +EXPORT_SYMBOL_GPL(alloc_pages_nolock_noprof); diff --git a/mm/slab.h b/mm/slab.h index 248b34c839b7..d63cc9b5e313 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -57,6 +57,10 @@ struct slab { struct { union { struct list_head slab_list; + struct { /* For deferred deactivate_slab() */ + struct llist_node llnode; + void *flush_freelist; + }; #ifdef CONFIG_SLUB_CPU_PARTIAL struct { struct slab *next; @@ -234,7 +238,9 @@ struct kmem_cache_order_objects { struct kmem_cache { #ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab; + struct lock_class_key lock_key; #endif + struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; unsigned long min_partial; @@ -248,6 +254,7 @@ struct kmem_cache { /* Number of per cpu partial slabs to keep around */ unsigned int cpu_partial_slabs; #endif + unsigned int sheaf_capacity; struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ @@ -433,6 +440,9 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s) return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT)); } +bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj); +void flush_all_rcu_sheaves(void); + #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_PANIC | \ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \ @@ -526,8 +536,12 @@ static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) unsigned long obj_exts = READ_ONCE(slab->obj_exts); #ifdef CONFIG_MEMCG - VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS), - slab_page(slab)); + /* + * obj_exts should be either NULL, a valid pointer with + * MEMCG_DATA_OBJEXTS bit set or be equal to OBJEXTS_ALLOC_FAIL. + */ + VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS) && + obj_exts != OBJEXTS_ALLOC_FAIL, slab_page(slab)); VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab)); #endif return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK); @@ -656,6 +670,8 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) void __check_heap_object(const void *ptr, unsigned long n, const struct slab *slab, bool to_user); +void defer_free_barrier(void); + static inline bool slub_debug_orig_size(struct kmem_cache *s) { return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && diff --git a/mm/slab_common.c b/mm/slab_common.c index bfe7c40eeee1..932d13ada36c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -163,6 +163,9 @@ int slab_unmergeable(struct kmem_cache *s) return 1; #endif + if (s->cpu_sheaves) + return 1; + /* * We may have set a slab to be unmergeable during bootstrap. */ @@ -321,7 +324,7 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, object_size - args->usersize < args->useroffset)) args->usersize = args->useroffset = 0; - if (!args->usersize) + if (!args->usersize && !args->sheaf_capacity) s = __kmem_cache_alias(name, object_size, args->align, flags, args->ctor); if (s) @@ -507,6 +510,9 @@ void kmem_cache_destroy(struct kmem_cache *s) rcu_barrier(); } + /* Wait for deferred work from kmalloc/kfree_nolock() */ + defer_free_barrier(); + cpus_read_lock(); mutex_lock(&slab_mutex); @@ -1605,6 +1611,30 @@ static void kfree_rcu_work(struct work_struct *work) kvfree_rcu_list(head); } +static bool kfree_rcu_sheaf(void *obj) +{ + struct kmem_cache *s; + struct folio *folio; + struct slab *slab; + + if (is_vmalloc_addr(obj)) + return false; + + folio = virt_to_folio(obj); + if (unlikely(!folio_test_slab(folio))) + return false; + + slab = folio_slab(folio); + s = slab->slab_cache; + if (s->cpu_sheaves) { + if (likely(!IS_ENABLED(CONFIG_NUMA) || + slab_nid(slab) == numa_mem_id())) + return __kfree_rcu_sheaf(s, obj); + } + + return false; +} + static bool need_offload_krc(struct kfree_rcu_cpu *krcp) { @@ -1949,6 +1979,9 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) if (!head) might_sleep(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr)) + return; + // Queue the object but don't yet schedule the batch. if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. @@ -2023,6 +2056,8 @@ void kvfree_rcu_barrier(void) bool queued; int i, cpu; + flush_all_rcu_sheaves(); + /* * Firstly we detach objects and queue them over an RCU-batch * for all CPUs. Finally queued works are flushed for each CPU. diff --git a/mm/slub.c b/mm/slub.c index d257141896c9..a585d0ac45d4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -44,7 +44,8 @@ #include <kunit/test.h> #include <kunit/test-bug.h> #include <linux/sort.h> - +#include <linux/irq_work.h> +#include <linux/kprobes.h> #include <linux/debugfs.h> #include <trace/events/kmem.h> @@ -363,8 +364,12 @@ static inline void debugfs_slab_add(struct kmem_cache *s) { } #endif enum stat_item { + ALLOC_PCS, /* Allocation from percpu sheaf */ ALLOC_FASTPATH, /* Allocation from cpu slab */ ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ + FREE_PCS, /* Free to percpu sheaf */ + FREE_RCU_SHEAF, /* Free to rcu_free sheaf */ + FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */ FREE_FASTPATH, /* Free to cpu slab */ FREE_SLOWPATH, /* Freeing not to cpu slab */ FREE_FROZEN, /* Freeing to frozen slab */ @@ -389,6 +394,19 @@ enum stat_item { CPU_PARTIAL_FREE, /* Refill cpu partial on free */ CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ + SHEAF_FLUSH, /* Objects flushed from a sheaf */ + SHEAF_REFILL, /* Objects refilled to a sheaf */ + SHEAF_ALLOC, /* Allocation of an empty sheaf */ + SHEAF_FREE, /* Freeing of an empty sheaf */ + BARN_GET, /* Got full sheaf from barn */ + BARN_GET_FAIL, /* Failed to get full sheaf from barn */ + BARN_PUT, /* Put full sheaf to barn */ + BARN_PUT_FAIL, /* Failed to put full sheaf to barn */ + SHEAF_PREFILL_FAST, /* Sheaf prefill grabbed the spare sheaf */ + SHEAF_PREFILL_SLOW, /* Sheaf prefill found no spare sheaf */ + SHEAF_PREFILL_OVERSIZE, /* Allocation of oversize sheaf for prefill */ + SHEAF_RETURN_FAST, /* Sheaf return reattached spare sheaf */ + SHEAF_RETURN_SLOW, /* Sheaf return could not reattach spare */ NR_SLUB_STAT_ITEMS }; @@ -409,7 +427,7 @@ struct kmem_cache_cpu { #ifdef CONFIG_SLUB_CPU_PARTIAL struct slab *partial; /* Partially allocated slabs */ #endif - local_lock_t lock; /* Protects the fields above */ + local_trylock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS unsigned int stat[NR_SLUB_STAT_ITEMS]; #endif @@ -435,6 +453,37 @@ void stat_add(const struct kmem_cache *s, enum stat_item si, int v) #endif } +#define MAX_FULL_SHEAVES 10 +#define MAX_EMPTY_SHEAVES 10 + +struct node_barn { + spinlock_t lock; + struct list_head sheaves_full; + struct list_head sheaves_empty; + unsigned int nr_full; + unsigned int nr_empty; +}; + +struct slab_sheaf { + union { + struct rcu_head rcu_head; + struct list_head barn_list; + /* only used for prefilled sheafs */ + unsigned int capacity; + }; + struct kmem_cache *cache; + unsigned int size; + int node; /* only used for rcu_sheaf */ + void *objects[]; +}; + +struct slub_percpu_sheaves { + local_trylock_t lock; + struct slab_sheaf *main; /* never NULL when unlocked */ + struct slab_sheaf *spare; /* empty or full, may be NULL */ + struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */ +}; + /* * The slab lists for all objects. */ @@ -447,6 +496,7 @@ struct kmem_cache_node { atomic_long_t total_objects; struct list_head full; #endif + struct node_barn *barn; }; static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) @@ -454,6 +504,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) return s->node[node]; } +/* Get the barn of the current cpu's memory node */ +static inline struct node_barn *get_barn(struct kmem_cache *s) +{ + return get_node(s, numa_mem_id())->barn; +} + /* * Iterator over all nodes. The body will be executed for each node that has * a kmem_cache_node structure allocated (which is true for all online nodes) @@ -470,12 +526,19 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) */ static nodemask_t slab_nodes; -#ifndef CONFIG_SLUB_TINY /* * Workqueue used for flush_cpu_slab(). */ static struct workqueue_struct *flushwq; -#endif + +struct slub_flush_work { + struct work_struct work; + struct kmem_cache *s; + bool skip; +}; + +static DEFINE_MUTEX(flush_lock); +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); /******************************************************************** * Core slab cache functions @@ -822,6 +885,16 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) } #ifdef CONFIG_SLUB_DEBUG + +/* + * For debugging context when we want to check if the struct slab pointer + * appears to be valid. + */ +static inline bool validate_slab_ptr(struct slab *slab) +{ + return PageSlab(slab_page(slab)); +} + static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; static DEFINE_SPINLOCK(object_map_lock); @@ -1449,15 +1522,15 @@ static int check_object(struct kmem_cache *s, struct slab *slab, return ret; } +/* + * Checks if the slab state looks sane. Assumes the struct slab pointer + * was either obtained in a way that ensures it's valid, or validated + * by validate_slab_ptr() + */ static int check_slab(struct kmem_cache *s, struct slab *slab) { int maxobj; - if (!folio_test_slab(slab_folio(slab))) { - slab_err(s, slab, "Not a valid slab page"); - return 0; - } - maxobj = order_objects(slab_order(slab), s->size); if (slab->objects > maxobj) { slab_err(s, slab, "objects %u > max %u", @@ -1653,17 +1726,15 @@ static noinline bool alloc_debug_processing(struct kmem_cache *s, return true; bad: - if (folio_test_slab(slab_folio(slab))) { - /* - * If this is a slab page then lets do the best we can - * to avoid issues in the future. Marking all objects - * as used avoids touching the remaining objects. - */ - slab_fix(s, "Marking all objects used"); - slab->inuse = slab->objects; - slab->freelist = NULL; - slab->frozen = 1; /* mark consistency-failed slab as frozen */ - } + /* + * Let's do the best we can to avoid issues in the future. Marking all + * objects as used avoids touching the remaining objects. + */ + slab_fix(s, "Marking all objects used"); + slab->inuse = slab->objects; + slab->freelist = NULL; + slab->frozen = 1; /* mark consistency-failed slab as frozen */ + return false; } @@ -1684,10 +1755,7 @@ static inline int free_consistency_checks(struct kmem_cache *s, return 0; if (unlikely(s != slab->slab_cache)) { - if (!folio_test_slab(slab_folio(slab))) { - slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", - object); - } else if (!slab->slab_cache) { + if (!slab->slab_cache) { slab_err(NULL, slab, "No slab cache for object 0x%p", object); } else { @@ -1989,7 +2057,7 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts, * objects with no tag reference. Mark all references in this * vector as empty to avoid warnings later on. */ - if (obj_exts & OBJEXTS_ALLOC_FAIL) { + if (obj_exts == OBJEXTS_ALLOC_FAIL) { unsigned int i; for (i = 0; i < objects; i++) @@ -2022,6 +2090,7 @@ static inline void init_slab_obj_exts(struct slab *slab) int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { + bool allow_spin = gfpflags_allow_spinning(gfp); unsigned int objects = objs_per_slab(s, slab); unsigned long new_exts; unsigned long old_exts; @@ -2030,17 +2099,32 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp &= ~OBJCGS_CLEAR_MASK; /* Prevent recursive extension vector allocation */ gfp |= __GFP_NO_OBJ_EXT; - vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, - slab_nid(slab)); + + /* + * Note that allow_spin may be false during early boot and its + * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting + * architectures with cmpxchg16b, early obj_exts will be missing for + * very early allocations on those. + */ + if (unlikely(!allow_spin)) { + size_t sz = objects * sizeof(struct slabobj_ext); + + vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT, + slab_nid(slab)); + } else { + vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, + slab_nid(slab)); + } if (!vec) { /* Mark vectors which failed to allocate */ - if (new_slab) - mark_failed_objexts_alloc(slab); + mark_failed_objexts_alloc(slab); return -ENOMEM; } new_exts = (unsigned long)vec; + if (unlikely(!allow_spin)) + new_exts |= OBJEXTS_NOSPIN_ALLOC; #ifdef CONFIG_MEMCG new_exts |= MEMCG_DATA_OBJEXTS; #endif @@ -2061,7 +2145,10 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, * objcg vector should be reused. */ mark_objexts_empty(vec); - kfree(vec); + if (unlikely(!allow_spin)) + kfree_nolock(vec); + else + kfree(vec); return 0; } @@ -2085,7 +2172,10 @@ static inline void free_slab_obj_exts(struct slab *slab) * the extension for obj_exts is expected to be NULL. */ mark_objexts_empty(obj_exts); - kfree(obj_exts); + if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC)) + kfree_nolock(obj_exts); + else + kfree(obj_exts); slab->obj_exts = 0; } @@ -2419,7 +2509,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init, } /* KASAN might put x into memory quarantine, delaying its reuse. */ - return !kasan_slab_free(s, x, init, still_accessible); + return !kasan_slab_free(s, x, init, still_accessible, false); } static __fastpath_inline @@ -2478,17 +2568,463 @@ static void *setup_object(struct kmem_cache *s, void *object) return object; } +static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects, + s->sheaf_capacity), gfp); + + if (unlikely(!sheaf)) + return NULL; + + sheaf->cache = s; + + stat(s, SHEAF_ALLOC); + + return sheaf; +} + +static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) +{ + kfree(sheaf); + + stat(s, SHEAF_FREE); +} + +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p); + + +static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, + gfp_t gfp) +{ + int to_fill = s->sheaf_capacity - sheaf->size; + int filled; + + if (!to_fill) + return 0; + + filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, + &sheaf->objects[sheaf->size]); + + sheaf->size += filled; + + stat_add(s, SHEAF_REFILL, filled); + + if (filled < to_fill) + return -ENOMEM; + + return 0; +} + + +static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp); + + if (!sheaf) + return NULL; + + if (refill_sheaf(s, sheaf, gfp)) { + free_empty_sheaf(s, sheaf); + return NULL; + } + + return sheaf; +} + +/* + * Maximum number of objects freed during a single flush of main pcs sheaf. + * Translates directly to an on-stack array size. + */ +#define PCS_BATCH_MAX 32U + +static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); + +/* + * Free all objects from the main sheaf. In order to perform + * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where + * object pointers are moved to a on-stack array under the lock. To bound the + * stack usage, limit each batch to PCS_BATCH_MAX. + * + * returns true if at least partially flushed + */ +static bool sheaf_flush_main(struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + unsigned int batch, remaining; + void *objects[PCS_BATCH_MAX]; + struct slab_sheaf *sheaf; + bool ret = false; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + return ret; + + pcs = this_cpu_ptr(s->cpu_sheaves); + sheaf = pcs->main; + + batch = min(PCS_BATCH_MAX, sheaf->size); + + sheaf->size -= batch; + memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *)); + + remaining = sheaf->size; + + local_unlock(&s->cpu_sheaves->lock); + + __kmem_cache_free_bulk(s, batch, &objects[0]); + + stat_add(s, SHEAF_FLUSH, batch); + + ret = true; + + if (remaining) + goto next_batch; + + return ret; +} + +/* + * Free all objects from a sheaf that's unused, i.e. not linked to any + * cpu_sheaves, so we need no locking and batching. The locking is also not + * necessary when flushing cpu's sheaves (both spare and main) during cpu + * hotremove as the cpu is not executing anymore. + */ +static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf) +{ + if (!sheaf->size) + return; + + stat_add(s, SHEAF_FLUSH, sheaf->size); + + __kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]); + + sheaf->size = 0; +} + +static void __rcu_free_sheaf_prepare(struct kmem_cache *s, + struct slab_sheaf *sheaf) +{ + bool init = slab_want_init_on_free(s); + void **p = &sheaf->objects[0]; + unsigned int i = 0; + + while (i < sheaf->size) { + struct slab *slab = virt_to_slab(p[i]); + + memcg_slab_free_hook(s, slab, p + i, 1); + alloc_tagging_slab_free_hook(s, slab, p + i, 1); + + if (unlikely(!slab_free_hook(s, p[i], init, true))) { + p[i] = p[--sheaf->size]; + continue; + } + + i++; + } +} + +static void rcu_free_sheaf_nobarn(struct rcu_head *head) +{ + struct slab_sheaf *sheaf; + struct kmem_cache *s; + + sheaf = container_of(head, struct slab_sheaf, rcu_head); + s = sheaf->cache; + + __rcu_free_sheaf_prepare(s, sheaf); + + sheaf_flush_unused(s, sheaf); + + free_empty_sheaf(s, sheaf); +} + +/* + * Caller needs to make sure migration is disabled in order to fully flush + * single cpu's sheaves + * + * must not be called from an irq + * + * flushing operations are rare so let's keep it simple and flush to slabs + * directly, skipping the barn + */ +static void pcs_flush_all(struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *spare, *rcu_free; + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + spare = pcs->spare; + pcs->spare = NULL; + + rcu_free = pcs->rcu_free; + pcs->rcu_free = NULL; + + local_unlock(&s->cpu_sheaves->lock); + + if (spare) { + sheaf_flush_unused(s, spare); + free_empty_sheaf(s, spare); + } + + if (rcu_free) + call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn); + + sheaf_flush_main(s); +} + +static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu) +{ + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* The cpu is not executing anymore so we don't need pcs->lock */ + sheaf_flush_unused(s, pcs->main); + if (pcs->spare) { + sheaf_flush_unused(s, pcs->spare); + free_empty_sheaf(s, pcs->spare); + pcs->spare = NULL; + } + + if (pcs->rcu_free) { + call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn); + pcs->rcu_free = NULL; + } +} + +static void pcs_destroy(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* can happen when unwinding failed create */ + if (!pcs->main) + continue; + + /* + * We have already passed __kmem_cache_shutdown() so everything + * was flushed and there should be no objects allocated from + * slabs, otherwise kmem_cache_destroy() would have aborted. + * Therefore something would have to be really wrong if the + * warnings here trigger, and we should rather leave objects and + * sheaves to leak in that case. + */ + + WARN_ON(pcs->spare); + WARN_ON(pcs->rcu_free); + + if (!WARN_ON(pcs->main->size)) { + free_empty_sheaf(s, pcs->main); + pcs->main = NULL; + } + } + + free_percpu(s->cpu_sheaves); + s->cpu_sheaves = NULL; +} + +static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *empty = NULL; + unsigned long flags; + + if (!data_race(barn->nr_empty)) + return NULL; + + spin_lock_irqsave(&barn->lock, flags); + + if (likely(barn->nr_empty)) { + empty = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&empty->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return empty; +} + +/* + * The following two functions are used mainly in cases where we have to undo an + * intended action due to a race or cpu migration. Thus they do not check the + * empty or full sheaf limits for simplicity. + */ + +static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_empty); + barn->nr_empty++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_full); + barn->nr_full++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *sheaf = NULL; + unsigned long flags; + + if (!data_race(barn->nr_full) && !data_race(barn->nr_empty)) + return NULL; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_full) { + sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&sheaf->barn_list); + barn->nr_full--; + } else if (barn->nr_empty) { + sheaf = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&sheaf->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return sheaf; +} + +/* + * If a full sheaf is available, return it and put the supplied empty one to + * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't + * change. + */ +static struct slab_sheaf * +barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) +{ + struct slab_sheaf *full = NULL; + unsigned long flags; + + if (!data_race(barn->nr_full)) + return NULL; + + spin_lock_irqsave(&barn->lock, flags); + + if (likely(barn->nr_full)) { + full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&full->barn_list); + list_add(&empty->barn_list, &barn->sheaves_empty); + barn->nr_full--; + barn->nr_empty++; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return full; +} + +/* + * If an empty sheaf is available, return it and put the supplied full one to + * barn. But if there are too many full sheaves, reject this with -E2BIG. + */ +static struct slab_sheaf * +barn_replace_full_sheaf(struct node_barn * |
