summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-11-01 16:15:42 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2023-11-01 16:15:42 -1000
commit05bf73aa27ba89474763cea7b9cd2626eda61e01 (patch)
tree0430a2d85df344f59030ac518a696a63139bfafd
parent1b10d2c8c6219bfc86d8c7d53a4f97a0a706d1ba (diff)
parent4758560fa268cecfa1144f015aa9f2525d164b7e (diff)
downloadlinux-05bf73aa27ba89474763cea7b9cd2626eda61e01.tar.gz
linux-05bf73aa27ba89474763cea7b9cd2626eda61e01.tar.bz2
linux-05bf73aa27ba89474763cea7b9cd2626eda61e01.zip
Merge tag 'probes-v6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace
Pull probes updates from Masami Hiramatsu: "Cleanups: - kprobes: Fixes typo in kprobes samples - tracing/eprobes: Remove 'break' after return kretprobe/fprobe performance improvements: - lib: Introduce new `objpool`, which is a high performance lockless object queue. This uses per-cpu ring array to allocate/release objects from the pre-allocated object pool. Since the index of ring array is a 32bit sequential counter, we can retry to push/pop the object pointer from the ring without lock (as seq-lock does) - lib: Add an objpool test module to test the functionality and evaluate the performance under some circumstances - kprobes/fprobe: Improve kretprobe and rethook scalability performance with objpool. This improves both legacy kretprobe and fprobe exit handler (which is based on rethook) to be scalable on SMP systems. Even with 8-threads parallel test, it shows a great scalability improvement - Remove unneeded freelist.h which is replaced by objpool - objpool: Add maintainers entry for the objpool - objpool: Fix to remove unused include header lines" * tag 'probes-v6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace: kprobes: unused header files removed MAINTAINERS: objpool added kprobes: freelist.h removed kprobes: kretprobe scalability improvement lib: objpool test module added lib: objpool added: ring-array based lockless MPMC tracing/eprobe: drop unneeded breaks samples: kprobes: Fixes a typo
-rw-r--r--MAINTAINERS7
-rw-r--r--include/linux/freelist.h129
-rw-r--r--include/linux/kprobes.h11
-rw-r--r--include/linux/objpool.h181
-rw-r--r--include/linux/rethook.h16
-rw-r--r--kernel/kprobes.c91
-rw-r--r--kernel/trace/fprobe.c28
-rw-r--r--kernel/trace/rethook.c90
-rw-r--r--kernel/trace/trace_eprobe.c5
-rw-r--r--lib/Kconfig.debug11
-rw-r--r--lib/Makefile4
-rw-r--r--lib/objpool.c280
-rw-r--r--lib/test_objpool.c690
-rw-r--r--samples/kprobes/kretprobe_example.c2
14 files changed, 1269 insertions, 276 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 6f0dda0603b8..14e1194faa4b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15553,6 +15553,13 @@ F: include/linux/objagg.h
F: lib/objagg.c
F: lib/test_objagg.c
+OBJPOOL
+M: Matt Wu <wuqiang.matt@bytedance.com>
+S: Supported
+F: include/linux/objpool.h
+F: lib/objpool.c
+F: lib/test_objpool.c
+
OBJTOOL
M: Josh Poimboeuf <jpoimboe@kernel.org>
M: Peter Zijlstra <peterz@infradead.org>
diff --git a/include/linux/freelist.h b/include/linux/freelist.h
deleted file mode 100644
index fc1842b96469..000000000000
--- a/include/linux/freelist.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */
-#ifndef FREELIST_H
-#define FREELIST_H
-
-#include <linux/atomic.h>
-
-/*
- * Copyright: cameron@moodycamel.com
- *
- * A simple CAS-based lock-free free list. Not the fastest thing in the world
- * under heavy contention, but simple and correct (assuming nodes are never
- * freed until after the free list is destroyed), and fairly speedy under low
- * contention.
- *
- * Adapted from: https://moodycamel.com/blog/2014/solving-the-aba-problem-for-lock-free-free-lists
- */
-
-struct freelist_node {
- atomic_t refs;
- struct freelist_node *next;
-};
-
-struct freelist_head {
- struct freelist_node *head;
-};
-
-#define REFS_ON_FREELIST 0x80000000
-#define REFS_MASK 0x7FFFFFFF
-
-static inline void __freelist_add(struct freelist_node *node, struct freelist_head *list)
-{
- /*
- * Since the refcount is zero, and nobody can increase it once it's
- * zero (except us, and we run only one copy of this method per node at
- * a time, i.e. the single thread case), then we know we can safely
- * change the next pointer of the node; however, once the refcount is
- * back above zero, then other threads could increase it (happens under
- * heavy contention, when the refcount goes to zero in between a load
- * and a refcount increment of a node in try_get, then back up to
- * something non-zero, then the refcount increment is done by the other
- * thread) -- so if the CAS to add the node to the actual list fails,
- * decrese the refcount and leave the add operation to the next thread
- * who puts the refcount back to zero (which could be us, hence the
- * loop).
- */
- struct freelist_node *head = READ_ONCE(list->head);
-
- for (;;) {
- WRITE_ONCE(node->next, head);
- atomic_set_release(&node->refs, 1);
-
- if (!try_cmpxchg_release(&list->head, &head, node)) {
- /*
- * Hmm, the add failed, but we can only try again when
- * the refcount goes back to zero.
- */
- if (atomic_fetch_add_release(REFS_ON_FREELIST - 1, &node->refs) == 1)
- continue;
- }
- return;
- }
-}
-
-static inline void freelist_add(struct freelist_node *node, struct freelist_head *list)
-{
- /*
- * We know that the should-be-on-freelist bit is 0 at this point, so
- * it's safe to set it using a fetch_add.
- */
- if (!atomic_fetch_add_release(REFS_ON_FREELIST, &node->refs)) {
- /*
- * Oh look! We were the last ones referencing this node, and we
- * know we want to add it to the free list, so let's do it!
- */
- __freelist_add(node, list);
- }
-}
-
-static inline struct freelist_node *freelist_try_get(struct freelist_head *list)
-{
- struct freelist_node *prev, *next, *head = smp_load_acquire(&list->head);
- unsigned int refs;
-
- while (head) {
- prev = head;
- refs = atomic_read(&head->refs);
- if ((refs & REFS_MASK) == 0 ||
- !atomic_try_cmpxchg_acquire(&head->refs, &refs, refs+1)) {
- head = smp_load_acquire(&list->head);
- continue;
- }
-
- /*
- * Good, reference count has been incremented (it wasn't at
- * zero), which means we can read the next and not worry about
- * it changing between now and the time we do the CAS.
- */
- next = READ_ONCE(head->next);
- if (try_cmpxchg_acquire(&list->head, &head, next)) {
- /*
- * Yay, got the node. This means it was on the list,
- * which means should-be-on-freelist must be false no
- * matter the refcount (because nobody else knows it's
- * been taken off yet, it can't have been put back on).
- */
- WARN_ON_ONCE(atomic_read(&head->refs) & REFS_ON_FREELIST);
-
- /*
- * Decrease refcount twice, once for our ref, and once
- * for the list's ref.
- */
- atomic_fetch_add(-2, &head->refs);
-
- return head;
- }
-
- /*
- * OK, the head must have changed on us, but we still need to decrement
- * the refcount we increased.
- */
- refs = atomic_fetch_add(-1, &prev->refs);
- if (refs == REFS_ON_FREELIST + 1)
- __freelist_add(prev, list);
- }
-
- return NULL;
-}
-
-#endif /* FREELIST_H */
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 85a64cb95d75..365eb092e9c4 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -26,8 +26,7 @@
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/ftrace.h>
-#include <linux/refcount.h>
-#include <linux/freelist.h>
+#include <linux/objpool.h>
#include <linux/rethook.h>
#include <asm/kprobes.h>
@@ -141,7 +140,7 @@ static inline bool kprobe_ftrace(struct kprobe *p)
*/
struct kretprobe_holder {
struct kretprobe *rp;
- refcount_t ref;
+ struct objpool_head pool;
};
struct kretprobe {
@@ -154,7 +153,6 @@ struct kretprobe {
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
struct rethook *rh;
#else
- struct freelist_head freelist;
struct kretprobe_holder *rph;
#endif
};
@@ -165,10 +163,7 @@ struct kretprobe_instance {
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
struct rethook_node node;
#else
- union {
- struct freelist_node freelist;
- struct rcu_head rcu;
- };
+ struct rcu_head rcu;
struct llist_node llist;
struct kretprobe_holder *rph;
kprobe_opcode_t *ret_addr;
diff --git a/include/linux/objpool.h b/include/linux/objpool.h
new file mode 100644
index 000000000000..15aff4a17f0c
--- /dev/null
+++ b/include/linux/objpool.h
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_OBJPOOL_H
+#define _LINUX_OBJPOOL_H
+
+#include <linux/types.h>
+#include <linux/refcount.h>
+
+/*
+ * objpool: ring-array based lockless MPMC queue
+ *
+ * Copyright: wuqiang.matt@bytedance.com,mhiramat@kernel.org
+ *
+ * objpool is a scalable implementation of high performance queue for
+ * object allocation and reclamation, such as kretprobe instances.
+ *
+ * With leveraging percpu ring-array to mitigate hot spots of memory
+ * contention, it delivers near-linear scalability for high parallel
+ * scenarios. The objpool is best suited for the following cases:
+ * 1) Memory allocation or reclamation are prohibited or too expensive
+ * 2) Consumers are of different priorities, such as irqs and threads
+ *
+ * Limitations:
+ * 1) Maximum objects (capacity) is fixed after objpool creation
+ * 2) All pre-allocated objects are managed in percpu ring array,
+ * which consumes more memory than linked lists
+ */
+
+/**
+ * struct objpool_slot - percpu ring array of objpool
+ * @head: head sequence of the local ring array (to retrieve at)
+ * @tail: tail sequence of the local ring array (to append at)
+ * @last: the last sequence number marked as ready for retrieve
+ * @mask: bits mask for modulo capacity to compute array indexes
+ * @entries: object entries on this slot
+ *
+ * Represents a cpu-local array-based ring buffer, its size is specialized
+ * during initialization of object pool. The percpu objpool node is to be
+ * allocated from local memory for NUMA system, and to be kept compact in
+ * continuous memory: CPU assigned number of objects are stored just after
+ * the body of objpool_node.
+ *
+ * Real size of the ring array is far too smaller than the value range of
+ * head and tail, typed as uint32_t: [0, 2^32), so only lower bits (mask)
+ * of head and tail are used as the actual position in the ring array. In
+ * general the ring array is acting like a small sliding window, which is
+ * always moving forward in the loop of [0, 2^32).
+ */
+struct objpool_slot {
+ uint32_t head;
+ uint32_t tail;
+ uint32_t last;
+ uint32_t mask;
+ void *entries[];
+} __packed;
+
+struct objpool_head;
+
+/*
+ * caller-specified callback for object initial setup, it's only called
+ * once for each object (just after the memory allocation of the object)
+ */
+typedef int (*objpool_init_obj_cb)(void *obj, void *context);
+
+/* caller-specified cleanup callback for objpool destruction */
+typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
+
+/**
+ * struct objpool_head - object pooling metadata
+ * @obj_size: object size, aligned to sizeof(void *)
+ * @nr_objs: total objs (to be pre-allocated with objpool)
+ * @nr_cpus: local copy of nr_cpu_ids
+ * @capacity: max objs can be managed by one objpool_slot
+ * @gfp: gfp flags for kmalloc & vmalloc
+ * @ref: refcount of objpool
+ * @flags: flags for objpool management
+ * @cpu_slots: pointer to the array of objpool_slot
+ * @release: resource cleanup callback
+ * @context: caller-provided context
+ */
+struct objpool_head {
+ int obj_size;
+ int nr_objs;
+ int nr_cpus;
+ int capacity;
+ gfp_t gfp;
+ refcount_t ref;
+ unsigned long flags;
+ struct objpool_slot **cpu_slots;
+ objpool_fini_cb release;
+ void *context;
+};
+
+#define OBJPOOL_NR_OBJECT_MAX (1UL << 24) /* maximum numbers of total objects */
+#define OBJPOOL_OBJECT_SIZE_MAX (1UL << 16) /* maximum size of an object */
+
+/**
+ * objpool_init() - initialize objpool and pre-allocated objects
+ * @pool: the object pool to be initialized, declared by caller
+ * @nr_objs: total objects to be pre-allocated by this object pool
+ * @object_size: size of an object (should be > 0)
+ * @gfp: flags for memory allocation (via kmalloc or vmalloc)
+ * @context: user context for object initialization callback
+ * @objinit: object initialization callback for extra setup
+ * @release: cleanup callback for extra cleanup task
+ *
+ * return value: 0 for success, otherwise error code
+ *
+ * All pre-allocated objects are to be zeroed after memory allocation.
+ * Caller could do extra initialization in objinit callback. objinit()
+ * will be called just after slot allocation and called only once for
+ * each object. After that the objpool won't touch any content of the
+ * objects. It's caller's duty to perform reinitialization after each
+ * pop (object allocation) or do clearance before each push (object
+ * reclamation).
+ */
+int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
+ gfp_t gfp, void *context, objpool_init_obj_cb objinit,
+ objpool_fini_cb release);
+
+/**
+ * objpool_pop() - allocate an object from objpool
+ * @pool: object pool
+ *
+ * return value: object ptr or NULL if failed
+ */
+void *objpool_pop(struct objpool_head *pool);
+
+/**
+ * objpool_push() - reclaim the object and return back to objpool
+ * @obj: object ptr to be pushed to objpool
+ * @pool: object pool
+ *
+ * return: 0 or error code (it fails only when user tries to push
+ * the same object multiple times or wrong "objects" into objpool)
+ */
+int objpool_push(void *obj, struct objpool_head *pool);
+
+/**
+ * objpool_drop() - discard the object and deref objpool
+ * @obj: object ptr to be discarded
+ * @pool: object pool
+ *
+ * return: 0 if objpool was released; -EAGAIN if there are still
+ * outstanding objects
+ *
+ * objpool_drop is normally for the release of outstanding objects
+ * after objpool cleanup (objpool_fini). Thinking of this example:
+ * kretprobe is unregistered and objpool_fini() is called to release
+ * all remained objects, but there are still objects being used by
+ * unfinished kretprobes (like blockable function: sys_accept). So
+ * only when the last outstanding object is dropped could the whole
+ * objpool be released along with the call of objpool_drop()
+ */
+int objpool_drop(void *obj, struct objpool_head *pool);
+
+/**
+ * objpool_free() - release objpool forcely (all objects to be freed)
+ * @pool: object pool to be released
+ */
+void objpool_free(struct objpool_head *pool);
+
+/**
+ * objpool_fini() - deref object pool (also releasing unused objects)
+ * @pool: object pool to be dereferenced
+ *
+ * objpool_fini() will try to release all remained free objects and
+ * then drop an extra reference of the objpool. If all objects are
+ * already returned to objpool (so called synchronous use cases),
+ * the objpool itself will be freed together. But if there are still
+ * outstanding objects (so called asynchronous use cases, such like
+ * blockable kretprobe), the objpool won't be released until all
+ * the outstanding objects are dropped, but the caller must assure
+ * there are no concurrent objpool_push() on the fly. Normally RCU
+ * is being required to make sure all ongoing objpool_push() must
+ * be finished before calling objpool_fini(), so does test_objpool,
+ * kretprobe or rethook
+ */
+void objpool_fini(struct objpool_head *pool);
+
+#endif /* _LINUX_OBJPOOL_H */
diff --git a/include/linux/rethook.h b/include/linux/rethook.h
index 26b6f3c81a76..ce69b2b7bc35 100644
--- a/include/linux/rethook.h
+++ b/include/linux/rethook.h
@@ -6,11 +6,10 @@
#define _LINUX_RETHOOK_H
#include <linux/compiler.h>
-#include <linux/freelist.h>
+#include <linux/objpool.h>
#include <linux/kallsyms.h>
#include <linux/llist.h>
#include <linux/rcupdate.h>
-#include <linux/refcount.h>
struct rethook_node;
@@ -30,14 +29,12 @@ typedef void (*rethook_handler_t) (struct rethook_node *, void *, unsigned long,
struct rethook {
void *data;
rethook_handler_t handler;
- struct freelist_head pool;
- refcount_t ref;
+ struct objpool_head pool;
struct rcu_head rcu;
};
/**
* struct rethook_node - The rethook shadow-stack entry node.
- * @freelist: The freelist, linked to struct rethook::pool.
* @rcu: The rcu_head for deferred freeing.
* @llist: The llist, linked to a struct task_struct::rethooks.
* @rethook: The pointer to the struct rethook.
@@ -48,20 +45,16 @@ struct rethook {
* on each entry of the shadow stack.
*/
struct rethook_node {
- union {
- struct freelist_node freelist;
- struct rcu_head rcu;
- };
+ struct rcu_head rcu;
struct llist_node llist;
struct rethook *rethook;
unsigned long ret_addr;
unsigned long frame;
};
-struct rethook *rethook_alloc(void *data, rethook_handler_t handler);
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler, int size, int num);
void rethook_stop(struct rethook *rh);
void rethook_free(struct rethook *rh);
-void rethook_add_node(struct rethook *rh, struct rethook_node *node);
struct rethook_node *rethook_try_get(struct rethook *rh);
void rethook_recycle(struct rethook_node *node);
void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount);
@@ -98,4 +91,3 @@ void rethook_flush_task(struct task_struct *tk);
#endif
#endif
-
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0c6185aefaef..075a632e6c7c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1877,13 +1877,27 @@ static struct notifier_block kprobe_exceptions_nb = {
#ifdef CONFIG_KRETPROBES
#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
+
+/* callbacks for objpool of kretprobe instances */
+static int kretprobe_init_inst(void *nod, void *context)
+{
+ struct kretprobe_instance *ri = nod;
+
+ ri->rph = context;
+ return 0;
+}
+static int kretprobe_fini_pool(struct objpool_head *head, void *context)
+{
+ kfree(context);
+ return 0;
+}
+
static void free_rp_inst_rcu(struct rcu_head *head)
{
struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
+ struct kretprobe_holder *rph = ri->rph;
- if (refcount_dec_and_test(&ri->rph->ref))
- kfree(ri->rph);
- kfree(ri);
+ objpool_drop(ri, &rph->pool);
}
NOKPROBE_SYMBOL(free_rp_inst_rcu);
@@ -1892,7 +1906,7 @@ static void recycle_rp_inst(struct kretprobe_instance *ri)
struct kretprobe *rp = get_kretprobe(ri);
if (likely(rp))
- freelist_add(&ri->freelist, &rp->freelist);
+ objpool_push(ri, &rp->rph->pool);
else
call_rcu(&ri->rcu, free_rp_inst_rcu);
}
@@ -1929,23 +1943,12 @@ NOKPROBE_SYMBOL(kprobe_flush_task);
static inline void free_rp_inst(struct kretprobe *rp)
{
- struct kretprobe_instance *ri;
- struct freelist_node *node;
- int count = 0;
-
- node = rp->freelist.head;
- while (node) {
- ri = container_of(node, struct kretprobe_instance, freelist);
- node = node->next;
-
- kfree(ri);
- count++;
- }
+ struct kretprobe_holder *rph = rp->rph;
- if (refcount_sub_and_test(count, &rp->rph->ref)) {
- kfree(rp->rph);
- rp->rph = NULL;
- }
+ if (!rph)
+ return;
+ rp->rph = NULL;
+ objpool_fini(&rph->pool);
}
/* This assumes the 'tsk' is the current task or the is not running. */
@@ -2087,19 +2090,17 @@ NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+ struct kretprobe_holder *rph = rp->rph;
struct kretprobe_instance *ri;
- struct freelist_node *fn;
- fn = freelist_try_get(&rp->freelist);
- if (!fn) {
+ ri = objpool_pop(&rph->pool);
+ if (!ri) {
rp->nmissed++;
return 0;
}
- ri = container_of(fn, struct kretprobe_instance, freelist);
-
if (rp->entry_handler && rp->entry_handler(ri, regs)) {
- freelist_add(&ri->freelist, &rp->freelist);
+ objpool_push(ri, &rph->pool);
return 0;
}
@@ -2193,7 +2194,6 @@ int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long o
int register_kretprobe(struct kretprobe *rp)
{
int ret;
- struct kretprobe_instance *inst;
int i;
void *addr;
@@ -2227,19 +2227,12 @@ int register_kretprobe(struct kretprobe *rp)
rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
- rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler);
- if (!rp->rh)
- return -ENOMEM;
+ rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler,
+ sizeof(struct kretprobe_instance) +
+ rp->data_size, rp->maxactive);
+ if (IS_ERR(rp->rh))
+ return PTR_ERR(rp->rh);
- for (i = 0; i < rp->maxactive; i++) {
- inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL);
- if (inst == NULL) {
- rethook_free(rp->rh);
- rp->rh = NULL;
- return -ENOMEM;
- }
- rethook_add_node(rp->rh, &inst->node);
- }
rp->nmissed = 0;
/* Establish function entry probe point */
ret = register_kprobe(&rp->kp);
@@ -2248,24 +2241,18 @@ int register_kretprobe(struct kretprobe *rp)
rp->rh = NULL;
}
#else /* !CONFIG_KRETPROBE_ON_RETHOOK */
- rp->freelist.head = NULL;
rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL);
if (!rp->rph)
return -ENOMEM;
- rp->rph->rp = rp;
- for (i = 0; i < rp->maxactive; i++) {
- inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL);
- if (inst == NULL) {
- refcount_set(&rp->rph->ref, i);
- free_rp_inst(rp);
- return -ENOMEM;
- }
- inst->rph = rp->rph;
- freelist_add(&inst->freelist, &rp->freelist);
+ if (objpool_init(&rp->rph->pool, rp->maxactive, rp->data_size +
+ sizeof(struct kretprobe_instance), GFP_KERNEL,
+ rp->rph, kretprobe_init_inst, kretprobe_fini_pool)) {
+ kfree(rp->rph);
+ rp->rph = NULL;
+ return -ENOMEM;
}
- refcount_set(&rp->rph->ref, i);
-
+ rp->rph->rp = rp;
rp->nmissed = 0;
/* Establish function entry probe point */
ret = register_kprobe(&rp->kp);
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 881f90f0cbcf..6cd2a4e3afb8 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -187,7 +187,7 @@ static void fprobe_init(struct fprobe *fp)
static int fprobe_init_rethook(struct fprobe *fp, int num)
{
- int i, size;
+ int size;
if (num <= 0)
return -EINVAL;
@@ -205,26 +205,18 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
if (size <= 0)
return -EINVAL;
- fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler);
- if (!fp->rethook)
- return -ENOMEM;
- for (i = 0; i < size; i++) {
- struct fprobe_rethook_node *node;
-
- node = kzalloc(sizeof(*node) + fp->entry_data_size, GFP_KERNEL);
- if (!node) {
- rethook_free(fp->rethook);
- fp->rethook = NULL;
- return -ENOMEM;
- }
- rethook_add_node(fp->rethook, &node->node);
- }
+ /* Initialize rethook */
+ fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler,
+ sizeof(struct fprobe_rethook_node), size);
+ if (IS_ERR(fp->rethook))
+ return PTR_ERR(fp->rethook);
+
return 0;
}
static void fprobe_fail_cleanup(struct fprobe *fp)
{
- if (fp->rethook) {
+ if (!IS_ERR_OR_NULL(fp->rethook)) {
/* Don't need to cleanup rethook->handler because this is not used. */
rethook_free(fp->rethook);
fp->rethook = NULL;
@@ -379,14 +371,14 @@ int unregister_fprobe(struct fprobe *fp)
if (!fprobe_is_registered(fp))
return -EINVAL;
- if (fp->rethook)
+ if (!IS_ERR_OR_NULL(fp->rethook))
rethook_stop(fp->rethook);
ret = unregister_ftrace_function(&fp->ops);
if (ret < 0)
return ret;
- if (fp->rethook)
+ if (!IS_ERR_OR_NULL(fp->rethook))
rethook_free(fp->rethook);
ftrace_free_filter(&fp->ops);
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index 5eb9b598f4e9..6fd7d4ecbbc6 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -8,7 +8,6 @@
#include <linux/preempt.h>
#include <linux/rethook.h>
#include <linux/slab.h>
-#include <linux/sort.h>
/* Return hook list (shadow stack by list) */
@@ -36,21 +35,7 @@ void rethook_flush_task(struct task_struct *tk)
static void rethook_free_rcu(struct rcu_head *head)
{
struct rethook *rh = container_of(head, struct rethook, rcu);
- struct rethook_node *rhn;
- struct freelist_node *node;
- int count = 1;
-
- node = rh->pool.head;
- while (node) {
- rhn = container_of(node, struct rethook_node, freelist);
- node = node->next;
- kfree(rhn);
- count++;
- }
-
- /* The rh->ref is the number of pooled node + 1 */
- if (refcount_sub_and_test(count, &rh->ref))
- kfree(rh);
+ objpool_fini(&rh->pool);
}
/**
@@ -83,54 +68,62 @@ void rethook_free(struct rethook *rh)
call_rcu(&rh->rcu, rethook_free_rcu);
}
+static int rethook_init_node(void *nod, void *context)
+{
+ struct rethook_node *node = nod;
+
+ node->rethook = context;
+ return 0;
+}
+
+static int rethook_fini_pool(struct objpool_head *head, void *context)
+{
+ kfree(context);
+ return 0;
+}
+
/**
* rethook_alloc() - Allocate struct rethook.
* @data: a data to pass the @handler when hooking the return.
- * @handler: the return hook callback function.
+ * @handler: the return hook callback function, must NOT be NULL
+ * @size: node size: rethook node and additional data
+ * @num: number of rethook nodes to be preallocated
*
* Allocate and initialize a new rethook with @data and @handler.
- * Return NULL if memory allocation fails or @handler is NULL.
+ * Return pointer of new rethook, or error codes for failures.
+ *
* Note that @handler == NULL means this rethook is going to be freed.
*/
-struct rethook *rethook_alloc(void *data, rethook_handler_t handler)
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler,
+ int size, int num)
{
- struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+ struct rethook *rh;
- if (!rh || !handler) {
- kfree(rh);
- return NULL;
- }
+ if (!handler || num <= 0 || size < sizeof(struct rethook_node))
+ return ERR_PTR(-EINVAL);
+
+ rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+ if (!rh)
+ return ERR_PTR(-ENOMEM);
rh->data = data;
rh->handler = handler;
- rh->pool.head = NULL;
- refcount_set(&rh->ref, 1);
+ /* initialize the objpool for rethook nodes */
+ if (objpool_init(&rh->pool, num, size, GFP_KERNEL, rh,
+ rethook_init_node, rethook_fini_pool)) {
+ kfree(rh);
+ return ERR_PTR(-ENOMEM);
+ }
return rh;
}
-/**
- * rethook_add_node() - Add a new node to the rethook.
- * @rh: the struct rethook.
- * @node: the struct rethook_node to be added.
- *
- * Add @node to @rh. User must allocate @node (as a part of user's
- * data structure.) The @node fields are initialized in this function.
- */
-void rethook_add_node(struct rethook *rh, struct rethook_node *node)
-{
- node->rethook = rh;
- freelist_add(&node->freelist, &rh->pool);
- refcount_inc(&rh->ref);
-}
-
static void free_rethook_node_rcu(struct rcu_head *head)
{
struct rethook_node *node = container_of(head, struct rethook_node, rcu);
+ struct rethook *rh = node->rethook;
- if (refcount_dec_and_test(&node->rethook->ref))
- kfree(node->rethook);
- kfree(node);
+ objpool_drop(node, &rh->pool);
}
/**
@@ -145,7 +138,7 @@ void rethook_recycle(struct rethook_node *node)
lockdep_assert_preemption_disabled();
if (likely(READ_ONCE(node->rethook->handler)))
- freelist_add(&node->freelist, &node->rethook->pool);
+ objpool_push(node, &node->rethook->pool);
else
call_rcu(&node->rcu, free_rethook_node_rcu);
}
@@ -161,7 +154,6 @@ NOKPROBE_SYMBOL(rethook_recycle);
struct rethook_node *rethook_try_get(struct rethook *rh)
{
rethook_handler_t handler = READ_ONCE(rh->handler);
- struct freelist_node *fn;
lockdep_assert_preemption_disabled();
@@ -178,11 +170,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh)
if (unlikely(!rcu_is_watching()))
return NULL;
- fn = freelist_try_get(&rh->pool);
- if (!fn)
- return NULL;
-
- return container_of(fn, struct rethook_node, freelist);
+ return (struct rethook_node *)objpool_pop(&rh->pool);
}
NOKPROBE_SYMBOL(rethook_try_get);
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 72714cbf475c..03c851f57969 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -788,12 +788,9 @@ find_and_get_event(const char *system, const char *event_name)
name = trace_event_name(tp_event);
if (!name || strcmp(event_name, name))
continue;
- if (!trace_event_try_get_ref(tp_event)) {
+ if (!trace_event_try_get_ref(tp_event))
return NULL;
- break;
- }
return tp_event;
- break;
}
return NULL;
}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 40b77c437a74..cc7d53d9dc01 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2954,6 +2954,17 @@ config TEST_CLOCKSOURCE_WATCHDOG
If unsure, say N.
+config TEST_OBJPOOL
+ tristate "Test module for correctness and stress of objpool"
+ default n
+ depends on m && DEBUG_KERNEL
+ help
+ This builds the "test_objpool" module that should be used for
+ correctness verification and concurrent testings of objects
+ allocation and reclamation.
+
+ If unsure, say N.
+
endif # RUNTIME_TESTING_MENU
config ARCH_USE_MEMTEST
diff --git a/lib/Makefile b/lib/Makefile
index 1b311c7fd32b..9e0972f7b764 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -34,7 +34,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
nmi_backtrace.o win_minmax.o memcat_p.o \
- buildid.o
+ buildid.o objpool.o
lib-$(CONFIG_PRINTK) += dump_stack.o
lib-$(CONFIG_SMP) += cpumask.o
@@ -107,6 +107,8 @@ obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_TEST_REF_TRACKER) += test_ref_tracker.o
CFLAGS_test_fprobe.o += $(CC_FLAGS_FTRACE)
obj-$(CONFIG_FPROBE_SANITY_TEST) += test_fprobe.o
+obj-$(CONFIG_TEST_OBJPOOL) += test_objpool.o
+
#
# CFLAGS for compiling floating point code inside the kernel. x86/Makefile turns
# off the generation of FPU/SSE* instructions for kernel proper but FPU_FLAGS
diff --git a/lib/objpool.c b/lib/objpool.c
new file mode 100644
index 000000000000..ce0087f64400
--- /dev/null
+++ b/lib/objpool.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/objpool.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/atomic.h>
+#include <linux/irqflags.h>
+#include <linux/cpumask.h>
+#include <linux/log2.h>
+
+/*
+ * objpool: ring-array based lockless MPMC/FIFO queues
+ *
+ * Copyright: wuqiang.matt@bytedance.com,mhiramat@kernel.org
+ */
+
+/* initialize percpu objpool_slot */
+static int
+objpool_init_percpu_slot(struct objpool_head *pool,
+ struct objpool_slot *slot,
+ int nodes, void *context,
+ objpool_init_obj_cb objinit)
+{
+ void *obj = (void *)&slot->entries[pool->capacity];
+ int i;
+
+ /* initialize elements of percpu objpool_slot */
+ slot->mask = pool->capacity - 1;