From 0c93b7d85d40b690f04786ea0f18798b73182e4f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Mar 2016 12:06:51 -0400
Subject: bpf: reject invalid names right in ->lookup()

... and other methods won't see them at all

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/bpf/inode.c | 37 ++++++++-----------------------------
 1 file changed, 8 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index f2ece3c174a5..35d21c189bb0 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -119,18 +119,10 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
 	return 0;
 }
 
-static bool bpf_dname_reserved(const struct dentry *dentry)
-{
-	return strchr(dentry->d_name.name, '.');
-}
-
 static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 
-	if (bpf_dname_reserved(dentry))
-		return -EPERM;
-
 	inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -152,9 +144,6 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
 {
 	struct inode *inode;
 
-	if (bpf_dname_reserved(dentry))
-		return -EPERM;
-
 	inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -187,31 +176,21 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
 	}
 }
 
-static int bpf_link(struct dentry *old_dentry, struct inode *dir,
-		    struct dentry *new_dentry)
+static struct dentry *
+bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
 {
-	if (bpf_dname_reserved(new_dentry))
-		return -EPERM;
-
-	return simple_link(old_dentry, dir, new_dentry);
-}
-
-static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry)
-{
-	if (bpf_dname_reserved(new_dentry))
-		return -EPERM;
-
-	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+	if (strchr(dentry->d_name.name, '.'))
+		return ERR_PTR(-EPERM);
+	return simple_lookup(dir, dentry, flags);
 }
 
 static const struct inode_operations bpf_dir_iops = {
-	.lookup		= simple_lookup,
+	.lookup		= bpf_lookup,
 	.mknod		= bpf_mkobj,
 	.mkdir		= bpf_mkdir,
 	.rmdir		= simple_rmdir,
-	.rename		= bpf_rename,
-	.link		= bpf_link,
+	.rename		= simple_rename,
+	.link		= simple_link,
 	.unlink		= simple_unlink,
 };
 
-- 
cgit v1.2.3


From 63cc787e71ae12976b2116f09677d1f2805df83e Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@ingics.com>
Date: Thu, 17 Mar 2016 12:00:31 +0800
Subject: irqdomain: Export irq_domain_free_irqs_common

Export irq_domain_free_irqs_common so it can be used by modules.

Signed-off-by: Axel Lin <axel.lin@ingics.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 kernel/irq/irqdomain.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3a519a01118b..245a485ffb61 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1099,6 +1099,7 @@ void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq,
 	}
 	irq_domain_free_irqs_parent(domain, virq, nr_irqs);
 }
+EXPORT_SYMBOL_GPL(irq_domain_free_irqs_common);
 
 /**
  * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent
-- 
cgit v1.2.3


From c269cba35b061181bc23c470809c00e8f71e535a Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Mon, 22 Feb 2016 15:02:07 +0100
Subject: memremap: add arch specific hook for MEMREMAP_WB mappings

Currently, the memremap code serves MEMREMAP_WB mappings directly from
the kernel direct mapping, unless the region is in high memory, in which
case it falls back to using ioremap_cache(). However, the semantics of
ioremap_cache() are not unambiguously defined, and on ARM, it will
actually result in a mapping type that differs from the attributes used
for the linear mapping, and for this reason, the ioremap_cache() call
fails if the region is part of the memory managed by the kernel.

So instead, implement an optional hook 'arch_memremap_wb' whose default
implementation calls ioremap_cache() as before, but which can be
overridden by the architecture to do what is appropriate for it.

Acked-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 kernel/memremap.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index a6d382312e6f..017532193fb1 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -27,6 +27,13 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
 }
 #endif
 
+#ifndef arch_memremap_wb
+static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
+{
+	return (__force void *)ioremap_cache(offset, size);
+}
+#endif
+
 static void *try_ram_remap(resource_size_t offset, size_t size)
 {
 	unsigned long pfn = PHYS_PFN(offset);
@@ -34,7 +41,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
 	/* In the simple case just return the existing linear address */
 	if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
 		return __va(offset);
-	return NULL; /* fallback to ioremap_cache */
+	return NULL; /* fallback to arch_memremap_wb */
 }
 
 /**
@@ -90,7 +97,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
 		if (is_ram == REGION_INTERSECTS)
 			addr = try_ram_remap(offset, size);
 		if (!addr)
-			addr = ioremap_cache(offset, size);
+			addr = arch_memremap_wb(offset, size);
 	}
 
 	/*
-- 
cgit v1.2.3


From 0bf676d1fd5144e66ac06939fa3c7c12086c3b18 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Thu, 31 Mar 2016 10:49:28 +0200
Subject: audit: cleanup prune_tree_thread

We can use kthread_run instead of kthread_create+wake_up_process for
creating the thread.

We do not need to set the task state to TASK_RUNNING after schedule(),
the process is in that state already.

And we do not need to set the state to TASK_INTERRUPTIBLE when not
doing schedule() as we set the state to TASK_RUNNING immediately
afterwards.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: <linux-audit@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 5efe9b299a12..25772476fa4a 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -661,10 +661,10 @@ static int tag_mount(struct vfsmount *mnt, void *arg)
 static int prune_tree_thread(void *unused)
 {
 	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (list_empty(&prune_list))
+		if (list_empty(&prune_list)) {
+			set_current_state(TASK_INTERRUPTIBLE);
 			schedule();
-		__set_current_state(TASK_RUNNING);
+		}
 
 		mutex_lock(&audit_cmd_mutex);
 		mutex_lock(&audit_filter_mutex);
@@ -693,16 +693,14 @@ static int audit_launch_prune(void)
 {
 	if (prune_thread)
 		return 0;
-	prune_thread = kthread_create(prune_tree_thread, NULL,
+	prune_thread = kthread_run(prune_tree_thread, NULL,
 				"audit_prune_tree");
 	if (IS_ERR(prune_thread)) {
 		pr_err("cannot start thread audit_prune_tree");
 		prune_thread = NULL;
 		return -ENOMEM;
-	} else {
-		wake_up_process(prune_thread);
-		return 0;
 	}
+	return 0;
 }
 
 /* called with audit_filter_mutex */
-- 
cgit v1.2.3


From 7ffb8e317bae03b8ee5bdcec93dc3723be945e9b Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Mon, 4 Apr 2016 16:44:02 -0400
Subject: audit: we don't need to __set_current_state(TASK_RUNNING)

Remove the calls to __set_current_state() to mark the task as running
and do some related cleanup in wait_for_auditd() to limit the amount
of work we do when we aren't going to reschedule the current task.

Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 3a3e5deeda8d..f52fbefede09 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -430,7 +430,6 @@ restart:
 					attempts, audit_pid);
 				set_current_state(TASK_INTERRUPTIBLE);
 				schedule();
-				__set_current_state(TASK_RUNNING);
 				goto restart;
 			}
 		}
@@ -1324,15 +1323,14 @@ static inline void audit_get_stamp(struct audit_context *ctx,
 static long wait_for_auditd(long sleep_time)
 {
 	DECLARE_WAITQUEUE(wait, current);
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	add_wait_queue_exclusive(&audit_backlog_wait, &wait);
 
 	if (audit_backlog_limit &&
-	    skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+	    skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
+		add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+		set_current_state(TASK_UNINTERRUPTIBLE);
 		sleep_time = schedule_timeout(sleep_time);
-
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&audit_backlog_wait, &wait);
+		remove_wait_queue(&audit_backlog_wait, &wait);
+	}
 
 	return sleep_time;
 }
-- 
cgit v1.2.3


From e68503bd6836ba765dc8e0ee77ea675fedc07e41 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Apr 2016 16:14:24 +0100
Subject: KEYS: Generalise system_verify_data() to provide access to internal
 content

Generalise system_verify_data() to provide access to internal content
through a callback.  This allows all the PKCS#7 stuff to be hidden inside
this function and removed from the PE file parser and the PKCS#7 test key.

If external content is not required, NULL should be passed as data to the
function.  If the callback is not required, that can be set to NULL.

The function is now called verify_pkcs7_signature() to contrast with
verify_pefile_signature() and the definitions of both have been moved into
linux/verification.h along with the key_being_used_for enum.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 kernel/module_signing.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 64b9dead4a07..593aace88a02 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -80,6 +80,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
 		return -EBADMSG;
 	}
 
-	return system_verify_data(mod, modlen, mod + modlen, sig_len,
-				  VERIFYING_MODULE_SIGNATURE);
+	return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
+				      NULL, -ENOKEY, VERIFYING_MODULE_SIGNATURE,
+				      NULL, NULL);
 }
-- 
cgit v1.2.3


From bda850cd214e90b1be0cc25bc48c4f6ac53eb543 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Apr 2016 16:14:24 +0100
Subject: PKCS#7: Make trust determination dependent on contents of trust
 keyring

Make the determination of the trustworthiness of a key dependent on whether
a key that can verify it is present in the supplied ring of trusted keys
rather than whether or not the verifying key has KEY_FLAG_TRUSTED set.

verify_pkcs7_signature() will return -ENOKEY if the PKCS#7 message trust
chain cannot be verified.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 kernel/module_signing.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 593aace88a02..6a64e03b9f44 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -81,6 +81,6 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
 	}
 
 	return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
-				      NULL, -ENOKEY, VERIFYING_MODULE_SIGNATURE,
+				      NULL, VERIFYING_MODULE_SIGNATURE,
 				      NULL, NULL);
 }
-- 
cgit v1.2.3


From a511e1af8b12f44c6e55786c463c9f093c214fb6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Apr 2016 16:14:26 +0100
Subject: KEYS: Move the point of trust determination to __key_link()

Move the point at which a key is determined to be trustworthy to
__key_link() so that we use the contents of the keyring being linked in to
to determine whether the key being linked in is trusted or not.

What is 'trusted' then becomes a matter of what's in the keyring.

Currently, the test is done when the key is parsed, but given that at that
point we can only sensibly refer to the contents of the system trusted
keyring, we can only use that as the basis for working out the
trustworthiness of a new key.

With this change, a trusted keyring is a set of keys that once the
trusted-only flag is set cannot be added to except by verification through
one of the contained keys.

Further, adding a key into a trusted keyring, whilst it might grant
trustworthiness in the context of that keyring, does not automatically
grant trustworthiness in the context of a second keyring to which it could
be secondarily linked.

To accomplish this, the authentication data associated with the key source
must now be retained.  For an X.509 cert, this means the contents of the
AuthorityKeyIdentifier and the signature data.


If system keyrings are disabled then restrict_link_by_builtin_trusted()
resolves to restrict_link_reject().  The integrity digital signature code
still works correctly with this as it was previously using
KEY_FLAG_TRUSTED_ONLY, which doesn't permit anything to be added if there
is no system keyring against which trust can be determined.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 kernel/module_signing.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 6a64e03b9f44..937c844bee4a 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -12,7 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
-#include <keys/system_keyring.h>
+#include <linux/verification.h>
 #include <crypto/public_key.h>
 #include "module-internal.h"
 
-- 
cgit v1.2.3


From 9ebc57cfaad21aacbc363eecead579269a46b493 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 12 Apr 2016 20:39:55 -0400
Subject: tracing: Rename check_ignore_pid() to ignore_this_task()

The name "check_ignore_pid" is confusing in trying to figure out if the pid
should be ignored or not. Rename it to "ignore_this_task" which is pretty
straight forward, as a task (not a pid) is passed in, and should if true
should be ignored.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05ddc0820771..598a18675a6b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -484,7 +484,7 @@ static int cmp_pid(const void *key, const void *elt)
 }
 
 static bool
-check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task)
+ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 {
 	pid_t search_pid;
 	pid_t *pid;
@@ -517,8 +517,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       check_ignore_pid(pid_list, prev) &&
-		       check_ignore_pid(pid_list, next));
+		       ignore_this_task(pid_list, prev) &&
+		       ignore_this_task(pid_list, next));
 }
 
 static void
@@ -531,7 +531,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       check_ignore_pid(pid_list, next));
+		       ignore_this_task(pid_list, next));
 }
 
 static void
@@ -547,7 +547,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       check_ignore_pid(pid_list, task));
+		       ignore_this_task(pid_list, task));
 }
 
 static void
@@ -564,7 +564,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
 
 	/* Set tracing if current is enabled */
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       check_ignore_pid(pid_list, current));
+		       ignore_this_task(pid_list, current));
 }
 
 static void __ftrace_clear_event_pids(struct trace_array *tr)
@@ -1561,7 +1561,7 @@ static void ignore_task_cpu(void *data)
 					     mutex_is_locked(&event_mutex));
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       check_ignore_pid(pid_list, current));
+		       ignore_this_task(pid_list, current));
 }
 
 static ssize_t
-- 
cgit v1.2.3


From f4d34a87e9c10f0ffd03d3548db6bfb200d06cdf Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 13 Apr 2016 16:27:49 -0400
Subject: tracing: Use pid bitmap instead of a pid array for set_event_pid

In order to add the ability to let tasks that are filtered by the events
have their children also be traced on fork (and then not traced on exit),
convert the array into a pid bitmask. Most of the time the number of pids is
only 32768 pids or a 4k bitmask, which is the same size as the default list
currently is, and that list could grow if more pids are listed.

This also greatly simplifies the code.

Suggested-by: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h        |   5 +-
 kernel/trace/trace_events.c | 221 ++++++++++++++++++++------------------------
 2 files changed, 102 insertions(+), 124 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3fff4adfd431..68cbb8e10aea 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -177,9 +177,8 @@ struct trace_options {
 };
 
 struct trace_pid_list {
-	unsigned int			nr_pids;
-	int				order;
-	pid_t				*pids;
+	int				pid_max;
+	unsigned long			*pids;
 };
 
 /*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 598a18675a6b..45f7cc72bf25 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,7 +15,7 @@
 #include <linux/kthread.h>
 #include <linux/tracefs.h>
 #include <linux/uaccess.h>
-#include <linux/bsearch.h>
+#include <linux/vmalloc.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/sort.h>
@@ -471,23 +471,13 @@ static void ftrace_clear_events(struct trace_array *tr)
 	mutex_unlock(&event_mutex);
 }
 
-static int cmp_pid(const void *key, const void *elt)
-{
-	const pid_t *search_pid = key;
-	const pid_t *pid = elt;
-
-	if (*search_pid == *pid)
-		return 0;
-	if (*search_pid < *pid)
-		return -1;
-	return 1;
-}
+/* Shouldn't this be in a header? */
+extern int pid_max;
 
 static bool
 ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 {
-	pid_t search_pid;
-	pid_t *pid;
+	pid_t pid;
 
 	/*
 	 * Return false, because if filtered_pids does not exist,
@@ -496,15 +486,16 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 	if (!filtered_pids)
 		return false;
 
-	search_pid = task->pid;
+	pid = task->pid;
 
-	pid = bsearch(&search_pid, filtered_pids->pids,
-		      filtered_pids->nr_pids, sizeof(pid_t),
-		      cmp_pid);
-	if (!pid)
+	/*
+	 * If pid_max changed after filtered_pids was created, we
+	 * by default ignore all pids greater than the previous pid_max.
+	 */
+	if (task->pid >= filtered_pids->pid_max)
 		return true;
 
-	return false;
+	return !test_bit(task->pid, filtered_pids->pids);
 }
 
 static void
@@ -602,7 +593,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
 	/* Wait till all users are no longer using pid filtering */
 	synchronize_sched();
 
-	free_pages((unsigned long)pid_list->pids, pid_list->order);
+	vfree(pid_list->pids);
 	kfree(pid_list);
 }
 
@@ -946,11 +937,32 @@ static void t_stop(struct seq_file *m, void *p)
 	mutex_unlock(&event_mutex);
 }
 
+static void *
+p_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct trace_array *tr = m->private;
+	struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
+	unsigned long pid = (unsigned long)v;
+
+	(*pos)++;
+
+	/* pid already is +1 of the actual prevous bit */
+	pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+
+	/* Return pid + 1 to allow zero to be represented */
+	if (pid < pid_list->pid_max)
+		return (void *)(pid + 1);
+
+	return NULL;
+}
+
 static void *p_start(struct seq_file *m, loff_t *pos)
 	__acquires(RCU)
 {
 	struct trace_pid_list *pid_list;
 	struct trace_array *tr = m->private;
+	unsigned long pid;
+	loff_t l = 0;
 
 	/*
 	 * Grab the mutex, to keep calls to p_next() having the same
@@ -963,10 +975,18 @@ static void *p_start(struct seq_file *m, loff_t *pos)
 
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
-	if (!pid_list || *pos >= pid_list->nr_pids)
+	if (!pid_list)
+		return NULL;
+
+	pid = find_first_bit(pid_list->pids, pid_list->pid_max);
+	if (pid >= pid_list->pid_max)
 		return NULL;
 
-	return (void *)&pid_list->pids[*pos];
+	/* Return pid + 1 so that zero can be the exit value */
+	for (pid++; pid && l < *pos;
+	     pid = (unsigned long)p_next(m, (void *)pid, &l))
+		;
+	return (void *)pid;
 }
 
 static void p_stop(struct seq_file *m, void *p)
@@ -976,25 +996,11 @@ static void p_stop(struct seq_file *m, void *p)
 	mutex_unlock(&event_mutex);
 }
 
-static void *
-p_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	struct trace_array *tr = m->private;
-	struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
-
-	(*pos)++;
-
-	if (*pos >= pid_list->nr_pids)
-		return NULL;
-
-	return (void *)&pid_list->pids[*pos];
-}
-
 static int p_show(struct seq_file *m, void *v)
 {
-	pid_t *pid = v;
+	unsigned long pid = (unsigned long)v - 1;
 
-	seq_printf(m, "%d\n", *pid);
+	seq_printf(m, "%lu\n", pid);
 	return 0;
 }
 
@@ -1543,11 +1549,6 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
 	return r;
 }
 
-static int max_pids(struct trace_pid_list *pid_list)
-{
-	return (PAGE_SIZE << pid_list->order) / sizeof(pid_t);
-}
-
 static void ignore_task_cpu(void *data)
 {
 	struct trace_array *tr = data;
@@ -1571,7 +1572,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	struct seq_file *m = filp->private_data;
 	struct trace_array *tr = m->private;
 	struct trace_pid_list *filtered_pids = NULL;
-	struct trace_pid_list *pid_list = NULL;
+	struct trace_pid_list *pid_list;
 	struct trace_event_file *file;
 	struct trace_parser parser;
 	unsigned long val;
@@ -1579,7 +1580,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	ssize_t read = 0;
 	ssize_t ret = 0;
 	pid_t pid;
-	int i;
+	int nr_pids = 0;
 
 	if (!cnt)
 		return 0;
@@ -1592,10 +1593,43 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 		return -ENOMEM;
 
 	mutex_lock(&event_mutex);
+	filtered_pids = rcu_dereference_protected(tr->filtered_pids,
+					     lockdep_is_held(&event_mutex));
+
 	/*
-	 * Load as many pids into the array before doing a
-	 * swap from the tr->filtered_pids to the new list.
+	 * Always recreate a new array. The write is an all or nothing
+	 * operation. Always create a new array when adding new pids by
+	 * the user. If the operation fails, then the current list is
+	 * not modified.
 	 */
+	pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+	if (!pid_list) {
+		read = -ENOMEM;
+		goto out;
+	}
+	pid_list->pid_max = READ_ONCE(pid_max);
+	/* Only truncating will shrink pid_max */
+	if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
+		pid_list->pid_max = filtered_pids->pid_max;
+	pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
+	if (!pid_list->pids) {
+		kfree(pid_list);
+		read = -ENOMEM;
+		goto out;
+	}
+	if (filtered_pids) {
+		/* copy the current bits to the new max */
+		pid = find_first_bit(filtered_pids->pids,
+				     filtered_pids->pid_max);
+		while (pid < filtered_pids->pid_max) {
+			set_bit(pid, pid_list->pids);
+			pid = find_next_bit(filtered_pids->pids,
+					    filtered_pids->pid_max,
+					    pid + 1);
+			nr_pids++;
+		}
+	}
+
 	while (cnt > 0) {
 
 		this_pos = 0;
@@ -1613,92 +1647,35 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 		ret = -EINVAL;
 		if (kstrtoul(parser.buffer, 0, &val))
 			break;
-		if (val > INT_MAX)
+		if (val >= pid_list->pid_max)
 			break;
 
 		pid = (pid_t)val;
 
-		ret = -ENOMEM;
-		if (!pid_list) {
-			pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
-			if (!pid_list)
-				break;
-
-			filtered_pids = rcu_dereference_protected(tr->filtered_pids,
-							lockdep_is_held(&event_mutex));
-			if (filtered_pids)
-				pid_list->order = filtered_pids->order;
-			else
-				pid_list->order = 0;
-
-			pid_list->pids = (void *)__get_free_pages(GFP_KERNEL,
-								  pid_list->order);
-			if (!pid_list->pids)
-				break;
-
-			if (filtered_pids) {
-				pid_list->nr_pids = filtered_pids->nr_pids;
-				memcpy(pid_list->pids, filtered_pids->pids,
-				       pid_list->nr_pids * sizeof(pid_t));
-			} else
-				pid_list->nr_pids = 0;
-		}
-
-		if (pid_list->nr_pids >= max_pids(pid_list)) {
-			pid_t *pid_page;
-
-			pid_page = (void *)__get_free_pages(GFP_KERNEL,
-							    pid_list->order + 1);
-			if (!pid_page)
-				break;
-			memcpy(pid_page, pid_list->pids,
-			       pid_list->nr_pids * sizeof(pid_t));
-			free_pages((unsigned long)pid_list->pids, pid_list->order);
-
-			pid_list->order++;
-			pid_list->pids = pid_page;
-		}
+		set_bit(pid, pid_list->pids);
+		nr_pids++;
 
-		pid_list->pids[pid_list->nr_pids++] = pid;
 		trace_parser_clear(&parser);
 		ret = 0;
 	}
 	trace_parser_put(&parser);
 
 	if (ret < 0) {
-		if (pid_list)
-			free_pages((unsigned long)pid_list->pids, pid_list->order);
+		vfree(pid_list->pids);
 		kfree(pid_list);
-		mutex_unlock(&event_mutex);
-		return ret;
-	}
-
-	if (!pid_list) {
-		mutex_unlock(&event_mutex);
-		return ret;
+		read = ret;
+		goto out;
 	}
 
-	sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL);
-
-	/* Remove duplicates */
-	for (i = 1; i < pid_list->nr_pids; i++) {
-		int start = i;
-
-		while (i < pid_list->nr_pids &&
-		       pid_list->pids[i - 1] == pid_list->pids[i])
-			i++;
-
-		if (start != i) {
-			if (i < pid_list->nr_pids) {
-				memmove(&pid_list->pids[start], &pid_list->pids[i],
-					(pid_list->nr_pids - i) * sizeof(pid_t));
-				pid_list->nr_pids -= i - start;
-				i = start;
-			} else
-				pid_list->nr_pids = start;
-		}
+	if (!nr_pids) {
+		/* Cleared the list of pids */
+		vfree(pid_list->pids);
+		kfree(pid_list);
+		read = ret;
+		if (!filtered_pids)
+			goto out;
+		pid_list = NULL;
 	}
-
 	rcu_assign_pointer(tr->filtered_pids, pid_list);
 
 	list_for_each_entry(file, &tr->events, list) {
@@ -1708,7 +1685,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	if (filtered_pids) {
 		synchronize_sched();
 
-		free_pages((unsigned long)filtered_pids->pids, filtered_pids->order);
+		vfree(filtered_pids->pids);
 		kfree(filtered_pids);
 	} else {
 		/*
@@ -1745,10 +1722,12 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	 */
 	on_each_cpu(ignore_task_cpu, tr, 1);
 
+ out:
 	mutex_unlock(&event_mutex);
 
 	ret = read;
-	*ppos += read;
+	if (read > 0)
+		*ppos += read;
 
 	return ret;
 }
-- 
cgit v1.2.3


From c37775d57830a36382a9774bb84eca4ce3d019cc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 13 Apr 2016 16:59:18 -0400
Subject: tracing: Add infrastructure to allow set_event_pid to follow children

Add the infrastructure needed to have the PIDs in set_event_pid to
automatically add PIDs of the children of the tasks that have their PIDs in
set_event_pid. This will also remove PIDs from set_event_pid when a task
exits

This is implemented by adding hooks into the fork and exit tracepoints. On
fork, the PIDs are added to the list, and on exit, they are removed.

Add a new option called event_fork that when set, PIDs in set_event_pid will
automatically get their children PIDs added when they fork, as well as any
task that exits will have its PID removed from set_event_pid.

This works for instances as well.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        |  3 ++
 kernel/trace/trace.h        |  2 ++
 kernel/trace/trace_events.c | 84 +++++++++++++++++++++++++++++++++++++++------
 3 files changed, 79 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a2f0b9f33e9b..0d12dbde8399 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3571,6 +3571,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
 	if (mask == TRACE_ITER_RECORD_CMD)
 		trace_event_enable_cmd_record(enabled);
 
+	if (mask == TRACE_ITER_EVENT_FORK)
+		trace_event_follow_fork(tr, enabled);
+
 	if (mask == TRACE_ITER_OVERWRITE) {
 		ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
 #ifdef CONFIG_TRACER_MAX_TRACE
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 68cbb8e10aea..2525042760e6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -655,6 +655,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
 extern cycle_t ftrace_now(int cpu);
 
 extern void trace_find_cmdline(int pid, char comm[]);
+extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern unsigned long ftrace_update_tot_cnt;
@@ -966,6 +967,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 		C(STOP_ON_FREE,		"disable_on_free"),	\
 		C(IRQ_INFO,		"irq-info"),		\
 		C(MARKERS,		"markers"),		\
+		C(EVENT_FORK,		"event-fork"),		\
 		FUNCTION_FLAGS					\
 		FGRAPH_FLAGS					\
 		STACK_FLAGS					\
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 45f7cc72bf25..add81dff7520 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -474,11 +474,23 @@ static void ftrace_clear_events(struct trace_array *tr)
 /* Shouldn't this be in a header? */
 extern int pid_max;
 
+/* Returns true if found in filter */
 static bool
-ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
 {
-	pid_t pid;
+	/*
+	 * If pid_max changed after filtered_pids was created, we
+	 * by default ignore all pids greater than the previous pid_max.
+	 */
+	if (search_pid >= filtered_pids->pid_max)
+		return false;
+
+	return test_bit(search_pid, filtered_pids->pids);
+}
 
+static bool
+ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+{
 	/*
 	 * Return false, because if filtered_pids does not exist,
 	 * all pids are good to trace.
@@ -486,16 +498,68 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 	if (!filtered_pids)
 		return false;
 
-	pid = task->pid;
+	return !find_filtered_pid(filtered_pids, task->pid);
+}
 
-	/*
-	 * If pid_max changed after filtered_pids was created, we
-	 * by default ignore all pids greater than the previous pid_max.
-	 */
-	if (task->pid >= filtered_pids->pid_max)
-		return true;
+static void filter_add_remove_task(struct trace_pid_list *pid_list,
+				   struct task_struct *self,
+				   struct task_struct *task)
+{
+	if (!pid_list)
+		return;
+
+	/* For forks, we only add if the forking task is listed */
+	if (self) {
+		if (!find_filtered_pid(pid_list, self->pid))
+			return;
+	}
+
+	/* Sorry, but we don't support pid_max changing after setting */
+	if (task->pid >= pid_list->pid_max)
+		return;
+
+	/* "self" is set for forks, and NULL for exits */
+	if (self)
+		set_bit(task->pid, pid_list->pids);
+	else
+		clear_bit(task->pid, pid_list->pids);
+}
+
+static void
+event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
+{
+	struct trace_pid_list *pid_list;
+	struct trace_array *tr = data;
+
+	pid_list = rcu_dereference_sched(tr->filtered_pids);
+	filter_add_remove_task(pid_list, NULL, task);
+}
 
-	return !test_bit(task->pid, filtered_pids->pids);
+static void
+event_filter_pid_sched_process_fork(void *data,
+				    struct task_struct *self,
+				    struct task_struct *task)
+{
+	struct trace_pid_list *pid_list;
+	struct trace_array *tr = data;
+
+	pid_list = rcu_dereference_sched(tr->filtered_pids);
+	filter_add_remove_task(pid_list, self, task);
+}
+
+void trace_event_follow_fork(struct trace_array *tr, bool enable)
+{
+	if (enable) {
+		register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork,
+						       tr, INT_MIN);
+		register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit,
+						       tr, INT_MAX);
+	} else {
+		unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork,
+						    tr);
+		unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit,
+						    tr);
+	}
 }
 
 static void
-- 
cgit v1.2.3


From 08d43a5fa063e03c860f2f391a30c388bcbc948e Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 10 Dec 2015 12:50:50 -0600
Subject: tracing: Add lock-free tracing_map

Add tracing_map, a special-purpose lock-free map for tracing.

tracing_map is designed to aggregate or 'sum' one or more values
associated with a specific object of type tracing_map_elt, which
is associated by the map to a given key.

It provides various hooks allowing per-tracer customization and is
separated out into a separate file in order to allow it to be shared
between multiple tracers, but isn't meant to be generally used outside
of that context.

The tracing_map implementation was inspired by lock-free map
algorithms originated by Dr. Cliff Click:

 http://www.azulsystems.com/blog/cliff/2007-03-26-non-blocking-hashtable
 http://www.azulsystems.com/events/javaone_2007/2007_LockFreeHash.pdf

Link: http://lkml.kernel.org/r/b43d68d1add33582a396f553c8ef705a33a6a748.1449767187.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig       |   13 +
 kernel/trace/Makefile      |    1 +
 kernel/trace/tracing_map.c | 1058 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/tracing_map.h |  282 ++++++++++++
 4 files changed, 1354 insertions(+)
 create mode 100644 kernel/trace/tracing_map.c
 create mode 100644 kernel/trace/tracing_map.h

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e45db6b0d878..48b732943e0e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -528,6 +528,19 @@ config MMIOTRACE
 	  See Documentation/trace/mmiotrace.txt.
 	  If you are not helping to develop drivers, say N.
 
+config TRACING_MAP
+	bool
+	depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
+	default n
+	help
+	  tracing_map is a special-purpose lock-free map for tracing,
+	  separated out as a stand-alone facility in order to allow it
+	  to be shared between multiple tracers.  It isn't meant to be
+	  generally used outside of that context, and is normally
+	  selected by tracers that use it.
+
+	  If in doubt, say N.
+
 config MMIOTRACE_TEST
 	tristate "Test module for mmiotrace"
 	depends on MMIOTRACE && m
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 9b1044e936a6..4255c4057aaa 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_seq.o
 obj-$(CONFIG_TRACING) += trace_stat.o
 obj-$(CONFIG_TRACING) += trace_printk.o
+obj-$(CONFIG_TRACING_MAP) += tracing_map.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
new file mode 100644
index 000000000000..dd6401dd145e
--- /dev/null
+++ b/kernel/trace/tracing_map.c
@@ -0,0 +1,1058 @@
+/*
+ * tracing_map - lock-free map for tracing
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com>
+ *
+ * tracing_map implementation inspired by lock-free map algorithms
+ * originated by Dr. Cliff Click:
+ *
+ * http://www.azulsystems.com/blog/cliff/2007-03-26-non-blocking-hashtable
+ * http://www.azulsystems.com/events/javaone_2007/2007_LockFreeHash.pdf
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+#include "tracing_map.h"
+#include "trace.h"
+
+/*
+ * NOTE: For a detailed description of the data structures used by
+ * these functions (such as tracing_map_elt) please see the overview
+ * of tracing_map data structures at the beginning of tracing_map.h.
+ */
+
+/**
+ * tracing_map_update_sum - Add a value to a tracing_map_elt's sum field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given sum associated with the tracing_map_elt
+ * @n: The value to add to the sum
+ *
+ * Add n to sum i associated with the specified tracing_map_elt
+ * instance.  The index i is the index returned by the call to
+ * tracing_map_add_sum_field() when the tracing map was set up.
+ */
+void tracing_map_update_sum(struct tracing_map_elt *elt, unsigned int i, u64 n)
+{
+	atomic64_add(n, &elt->fields[i].sum);
+}
+
+/**
+ * tracing_map_read_sum - Return the value of a tracing_map_elt's sum field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given sum associated with the tracing_map_elt
+ *
+ * Retrieve the value of the sum i associated with the specified
+ * tracing_map_elt instance.  The index i is the index returned by the
+ * call to tracing_map_add_sum_field() when the tracing map was set
+ * up.
+ *
+ * Return: The sum associated with field i for elt.
+ */
+u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i)
+{
+	return (u64)atomic64_read(&elt->fields[i].sum);
+}
+
+int tracing_map_cmp_string(void *val_a, void *val_b)
+{
+	char *a = val_a;
+	char *b = val_b;
+
+	return strcmp(a, b);
+}
+
+int tracing_map_cmp_none(void *val_a, void *val_b)
+{
+	return 0;
+}
+
+static int tracing_map_cmp_atomic64(void *val_a, void *val_b)
+{
+	u64 a = atomic64_read((atomic64_t *)val_a);
+	u64 b = atomic64_read((atomic64_t *)val_b);
+
+	return (a > b) ? 1 : ((a < b) ? -1 : 0);
+}
+
+#define DEFINE_TRACING_MAP_CMP_FN(type)					\
+static int tracing_map_cmp_##type(void *val_a, void *val_b)		\
+{									\
+	type a = *(type *)val_a;					\
+	type b = *(type *)val_b;					\
+									\
+	return (a > b) ? 1 : ((a < b) ? -1 : 0);			\
+}
+
+DEFINE_TRACING_MAP_CMP_FN(s64);
+DEFINE_TRACING_MAP_CMP_FN(u64);
+DEFINE_TRACING_MAP_CMP_FN(s32);
+DEFINE_TRACING_MAP_CMP_FN(u32);
+DEFINE_TRACING_MAP_CMP_FN(s16);
+DEFINE_TRACING_MAP_CMP_FN(u16);
+DEFINE_TRACING_MAP_CMP_FN(s8);
+DEFINE_TRACING_MAP_CMP_FN(u8);
+
+tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size,
+					 int field_is_signed)
+{
+	tracing_map_cmp_fn_t fn = tracing_map_cmp_none;
+
+	switch (field_size) {
+	case 8:
+		if (field_is_signed)
+			fn = tracing_map_cmp_s64;
+		else
+			fn = tracing_map_cmp_u64;
+		break;
+	case 4:
+		if (field_is_signed)
+			fn = tracing_map_cmp_s32;
+		else
+			fn = tracing_map_cmp_u32;
+		break;
+	case 2:
+		if (field_is_signed)
+			fn = tracing_map_cmp_s16;
+		else
+			fn = tracing_map_cmp_u16;
+		break;
+	case 1:
+		if (field_is_signed)
+			fn = tracing_map_cmp_s8;
+		else
+			fn = tracing_map_cmp_u8;
+		break;
+	}
+
+	return fn;
+}
+
+static int tracing_map_add_field(struct tracing_map *map,
+				 tracing_map_cmp_fn_t cmp_fn)
+{
+	int ret = -EINVAL;
+
+	if (map->n_fields < TRACING_MAP_FIELDS_MAX) {
+		ret = map->n_fields;
+		map->fields[map->n_fields++].cmp_fn = cmp_fn;
+	}
+
+	return ret;
+}
+
+/**
+ * tracing_map_add_sum_field - Add a field describing a tracing_map sum
+ * @map: The tracing_map
+ *
+ * Add a sum field to the key and return the index identifying it in
+ * the map and associated tracing_map_elts.  This is the index used
+ * for instance to update a sum for a particular tracing_map_elt using
+ * tracing_map_update_sum() or reading it via tracing_map_read_sum().
+ *
+ * Return: The index identifying the field in the map and associated
+ * tracing_map_elts.
+ */
+int tracing_map_add_sum_field(struct tracing_map *map)
+{
+	return tracing_map_add_field(map, tracing_map_cmp_atomic64);
+}
+
+/**
+ * tracing_map_add_key_field - Add a field describing a tracing_map key
+ * @map: The tracing_map
+ * @offset: The offset within the key
+ * @cmp_fn: The comparison function that will be used to sort on the key
+ *
+ * Let the map know there is a key and that if it's used as a sort key
+ * to use cmp_fn.
+ *
+ * A key can be a subset of a compound key; for that purpose, the
+ * offset param is used to describe where within the the compound key
+ * the key referenced by this key field resides.
+ *
+ * Return: The index identifying the field in the map and associated
+ * tracing_map_elts.
+ */
+int tracing_map_add_key_field(struct tracing_map *map,
+			      unsigned int offset,
+			      tracing_map_cmp_fn_t cmp_fn)
+
+{
+	int idx = tracing_map_add_field(map, cmp_fn);
+
+	if (idx < 0)
+		return idx;
+
+	map->fields[idx].offset = offset;
+
+	map->key_idx[map->n_keys++] = idx;
+
+	return idx;
+}
+
+void tracing_map_array_clear(struct tracing_map_array *a)
+{
+	unsigned int i;
+
+	if (!a->pages)
+		return;
+
+	for (i = 0; i < a->n_pages; i++)
+		memset(a->pages[i], 0, PAGE_SIZE);
+}
+
+void tracing_map_array_free(struct tracing_map_array *a)
+{
+	unsigned int i;
+
+	if (!a)
+		return;
+
+	if (!a->pages) {
+		kfree(a);
+		return;
+	}
+
+	for (i = 0; i < a->n_pages; i++) {
+		if (!a->pages[i])
+			break;
+		free_page((unsigned long)a->pages[i]);
+	}
+}
+
+struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
+						  unsigned int entry_size)
+{
+	struct tracing_map_array *a;
+	unsigned int i;
+
+	a = kzalloc(sizeof(*a), GFP_KERNEL);
+	if (!a)
+		return NULL;
+
+	a->entry_size_shift = fls(roundup_pow_of_two(entry_size) - 1);
+	a->entries_per_page = PAGE_SIZE / (1 << a->entry_size_shift);
+	a->n_pages = n_elts / a->entries_per_page;
+	if (!a->n_pages)
+		a->n_pages = 1;
+	a->entry_shift = fls(a->entries_per_page) - 1;
+	a->entry_mask = (1 << a->entry_shift) - 1;
+
+	a->pages = kcalloc(a->n_pages, sizeof(void *), GFP_KERNEL);
+	if (!a->pages)
+		goto free;
+
+	for (i = 0; i < a->n_pages; i++) {
+		a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+		if (!a->pages[i])
+			goto free;
+	}
+ out:
+	return a;
+ free:
+	tracing_map_array_free(a);
+	a = NULL;
+
+	goto out;
+}
+
+static void tracing_map_elt_clear(struct tracing_map_elt *elt)
+{
+	unsigned i;
+
+	for (i = 0; i < elt->map->n_fields; i++)
+		if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64)
+			atomic64_set(&elt->fields[i].sum, 0);
+
+	if (elt->map->ops && elt->map->ops->elt_clear)
+		elt->map->ops->elt_clear(elt);
+}
+
+static void tracing_map_elt_init_fields(struct tracing_map_elt *elt)
+{
+	unsigned int i;
+
+	tracing_map_elt_clear(elt);
+
+	for (i = 0; i < elt->map->n_fields; i++) {
+		elt->fields[i].cmp_fn = elt->map->fields[i].cmp_fn;
+
+		if (elt->fields[i].cmp_fn != tracing_map_cmp_atomic64)
+			elt->fields[i].offset = elt->map->fields[i].offset;
+	}
+}
+
+static void tracing_map_elt_free(struct tracing_map_elt *elt)
+{
+	if (!elt)
+		return;
+
+	if (elt->map->ops && elt->map->ops->elt_free)
+		elt->map->ops->elt_free(elt);
+	kfree(elt->fields);
+	kfree(elt->key);
+	kfree(elt);
+}
+
+static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
+{
+	struct tracing_map_elt *elt;
+	int err = 0;
+
+	elt = kzalloc(sizeof(*elt), GFP_KERNEL);
+	if (!elt)
+		return ERR_PTR(-ENOMEM);
+
+	elt->map = map;
+
+	elt->key = kzalloc(map->key_size, GFP_KERNEL);
+	if (!elt->key) {
+		err = -ENOMEM;
+		goto free;
+	}
+
+	elt->fields = kcalloc(map->n_fields, sizeof(*elt->fields), GFP_KERNEL);
+	if (!elt->fields) {
+		err = -ENOMEM;
+		goto free;
+	}
+
+	tracing_map_elt_init_fields(elt);
+
+	if (map->ops && map->ops->elt_alloc) {
+		err = map->ops->elt_alloc(elt);
+		if (err)
+			goto free;
+	}
+	return elt;
+ free:
+	tracing_map_elt_free(elt);
+
+	return ERR_PTR(err);
+}
+
+static struct tracing_map_elt *get_free_elt(struct tracing_map *map)
+{
+	struct tracing_map_elt *elt = NULL;
+	int idx;
+
+	idx = atomic_inc_return(&map->next_elt);
+	if (idx < map->max_elts) {
+		elt = *(TRACING_MAP_ELT(map->elts, idx));
+		if (map->ops && map->ops->elt_init)
+			map->ops->elt_init(elt);
+	}
+
+	return elt;
+}
+
+static void tracing_map_free_elts(struct tracing_map *map)
+{
+	unsigned int i;
+
+	if (!map->elts)
+		return;
+
+	for (i = 0; i < map->max_elts; i++)
+		tracing_map_elt_free(*(TRACING_MAP_ELT(map->elts, i)));
+
+	tracing_map_array_free(map->elts);
+}
+
+static int tracing_map_alloc_elts(struct tracing_map *map)
+{
+	unsigned int i;
+
+	map->elts = tracing_map_array_alloc(map->max_elts,
+					    sizeof(struct tracing_map_elt *));
+	if (!map->elts)
+		return -ENOMEM;
+
+	for (i = 0; i < map->max_elts; i++) {
+		*(TRACING_MAP_ELT(map->elts, i)) = tracing_map_elt_alloc(map);
+		if (!(*(TRACING_MAP_ELT(map->elts, i)))) {
+			tracing_map_free_elts(map);
+
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+static inline bool keys_match(void *key, void *test_key, unsigned key_size)
+{
+	bool match = true;
+
+	if (memcmp(key, test_key, key_size))
+		match = false;
+
+	return match;
+}
+
+static inline struct tracing_map_elt *
+__tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
+{
+	u32 idx, key_hash, test_key;
+	struct tracing_map_entry *entry;
+
+	key_hash = jhash(key, map->key_size, 0);
+	if (key_hash == 0)
+		key_hash = 1;
+	idx = key_hash >> (32 - (map->map_bits + 1));
+
+	while (1) {
+		idx &= (map->map_size - 1);
+		entry = TRACING_MAP_ENTRY(map->map, idx);
+		test_key = entry->key;
+
+		if (test_key && test_key == key_hash && entry->val &&
+		    keys_match(key, entry->val->key, map->key_size)) {
+			atomic64_inc(&map->hits);
+			return entry->val;
+		}
+
+		if (!test_key) {
+			if (lookup_only)
+				break;
+
+			if (!cmpxchg(&entry->key, 0, key_hash)) {
+				struct tracing_map_elt *elt;
+
+				elt = get_free_elt(map);
+				if (!elt) {
+					atomic64_inc(&map->drops);
+					entry->key = 0;
+					break;
+				}
+
+				memcpy(elt->key, key, map->key_size);
+				entry->val = elt;
+				atomic64_inc(&map->hits);
+
+				return entry->val;
+			}
+		}
+
+		idx++;
+	}
+
+	return NULL;
+}
+
+/**
+ * tracing_map_insert - Insert key and/or retrieve val from a tracing_map
+ * @map: The tracing_map to insert into
+ * @key: The key to insert
+ *
+ * Inserts a key into a tracing_map and creates and returns a new
+ * tracing_map_elt for it, or if the key has already been inserted by
+ * a previous call, returns the tracing_map_elt already associated
+ * with it.  When the map was created, the number of elements to be
+ * allocated for the map was specified (internally maintained as
+ * 'max_elts' in struct tracing_map), and that number of
+ * tracing_map_elts was created by tracing_map_init().  This is the
+ * pre-allocated pool of tracing_map_elts that tracing_map_insert()
+ * will allocate from when adding new keys.  Once that pool is
+ * exhausted, tracing_map_insert() is useless and will return NULL to
+ * signal that state.  There are two user-visible tracing_map
+ * variables, 'hits' and 'drops', which are updated by this function.
+ * Every time an element is either successfully inserted or retrieved,
+ * the 'hits' value is incrememented.  Every time an element insertion
+ * fails, the 'drops' value is incremented.
+ *
+ * This is a lock-free tracing map insertion function implementing a
+ * modified form of Cliff Click's basic insertion algorithm.  It
+ * requires the table size be a power of two.  To prevent any
+ * possibility of an infinite loop we always make the internal table
+ * size double the size of the requested table size (max_elts * 2).
+ * Likewise, we never reuse a slot or resize or delete elements - when
+ * we've reached max_elts entries, we simply return NULL once we've
+ * run out of entries.  Readers can at any point in time traverse the
+ * tracing map and safely access the key/val pairs.
+ *
+ * Return: the tracing_map_elt pointer val associated with the key.
+ * If this was a newly inserted key, the val will be a newly allocated
+ * and associated tracing_map_elt pointer val.  If the key wasn't
+ * found and the pool of tracing_map_elts has been exhausted, NULL is
+ * returned and no further insertions will succeed.
+ */
+struct tracing_map_elt *tracing_map_insert(struct tracing_map *map, void *key)
+{
+	return __tracing_map_insert(map, key, false);
+}
+
+/**
+ * tracing_map_lookup - Retrieve val from a tracing_map
+ * @map: The tracing_map to perform the lookup on
+ * @key: The key to look up
+ *
+ * Looks up key in tracing_map and if found returns the matching
+ * tracing_map_elt.  This is a lock-free lookup; see
+ * tracing_map_insert() for details on tracing_map and how it works.
+ * Every time an element is retrieved, the 'hits' value is
+ * incrememented.  There is one user-visible tracing_map variable,
+ * 'hits', which is updated by this function.  Every time an element
+ * is successfully retrieved, the 'hits' value is incrememented.  The
+ * 'drops' value is never updated by this function.
+ *
+ * Return: the tracing_map_elt pointer val associated with the key.
+ * If the key wasn't found, NULL is returned.
+ */
+struct tracing_map_elt *tracing_map_lookup(struct tracing_map *map, void *key)
+{
+	return __tracing_map_insert(map, key, true);
+}
+
+/**
+ * tracing_map_destroy - Destroy a tracing_map
+ * @map: The tracing_map to destroy
+ *
+ * Frees a tracing_map along with its associated array of
+ * tracing_map_elts.
+ *
+ * Callers should make sure there are no readers or writers actively
+ * reading or inserting into the map before calling this.
+ */
+void tracing_map_destroy(struct tracing_map *map)
+{
+	if (!map)
+		return;
+
+	tracing_map_free_elts(map);
+
+	tracing_map_array_free(map->map);
+	kfree(map);
+}
+
+/**
+ * tracing_map_clear - Clear a tracing_map
+ * @map: The tracing_map to clear
+ *
+ * Resets the tracing map to a cleared or initial state.  The
+ * tracing_map_elts are all cleared, and the array of struct
+ * tracing_map_entry is reset to an initialized state.
+ *
+ * Callers should make sure there are no writers actively inserting
+ * into the map before calling this.
+ */
+void tracing_map_clear(struct tracing_map *map)
+{
+	unsigned int i;
+
+	atomic_set(&map->next_elt, -1);
+	atomic64_set(&map->hits, 0);
+	atomic64_set(&map->drops, 0);
+
+	tracing_map_array_clear(map->map);
+
+	for (i = 0; i < map->max_elts; i++)
+		tracing_map_elt_clear(*(TRACING_MAP_ELT(map->elts, i)));
+}
+
+static void set_sort_key(struct tracing_map *map,
+			 struct tracing_map_sort_key *sort_key)
+{
+	map->sort_key = *sort_key;
+}
+
+/**
+ * tracing_map_create - Create a lock-free map and element pool
+ * @map_bits: The size of the map (2 ** map_bits)
+ * @key_size: The size of the key for the map in bytes
+ * @ops: Optional client-defined tracing_map_ops instance
+ * @private_data: Client data associated with the map
+ *
+ * Creates and sets up a map to contain 2 ** map_bits number of
+ * elements (internally maintained as 'max_elts' in struct
+ * tracing_map).  Before using, map fields should be added to the map
+ * with tracing_map_add_sum_field() and tracing_map_add_key_field().
+ * tracing_map_init() should then be called to allocate the array of
+ * tracing_map_elts, in order to avoid allocating anything in the map
+ * insertion path.  The user-specified map size reflects the maximum
+ * number of elements that can be contained in the table requested by
+ * the user - internally we double that in order to keep the table
+ * sparse and keep collisions manageable.
+ *
+ * A tracing_map is a special-purpose map designed to aggregate or
+ * 'sum' one or more values associated with a specific object of type
+ * tracing_map_elt, which is attached by the map to a given key.
+ *
+ * tracing_map_create() sets up the map itself, and provides
+ * operations for inserting tracing_map_elts, but doesn't allocate the
+ * tracing_map_elts themselves, or provide a means for describing the
+ * keys or sums associated with the tracing_map_elts.  All
+ * tracing_map_elts for a given map have the same set of sums and
+ * keys, which are defined by the client using the functions
+ * tracing_map_add_key_field() and tracing_map_add_sum_field().  Once
+ * the fields are defined, the pool of elements allocated for the map
+ * can be created, which occurs when the client code calls
+ * tracing_map_init().
+ *
+ * When tracing_map_init() returns, tracing_map_elt elements can be
+ * inserted into the map using tracing_map_insert().  When called,
+ * tracing_map_insert() grabs a free tracing_map_elt from the pool, or
+ * finds an existing match in the map and in either case returns it.
+ * The client can then use tracing_map_update_sum() and
+ * tracing_map_read_sum() to update or read a given sum field for the
+ * tracing_map_elt.
+ *
+ * The client can at any point retrieve and traverse the current set
+ * of inserted tracing_map_elts in a tracing_map, via
+ * tracing_map_sort_entries().  Sorting can be done on any field,
+ * including keys.
+ *
+ * See tracing_map.h for a description of tracing_map_ops.
+ *
+ * Return: the tracing_map pointer if successful, ERR_PTR if not.
+ */
+struct tracing_map *tracing_map_create(unsigned int map_bits,
+				       unsigned int key_size,
+				       const struct tracing_map_ops *ops,
+				       void *private_data)
+{
+	struct tracing_map *map;
+	unsigned int i;
+
+	if (map_bits < TRACING_MAP_BITS_MIN ||
+	    map_bits > TRACING_MAP_BITS_MAX)
+		return ERR_PTR(-EINVAL);
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return ERR_PTR(-ENOMEM);
+
+	map->map_bits = map_bits;
+	map->max_elts = (1 << map_bits);
+	atomic_set(&map->next_elt, -1);
+
+	map->map_size = (1 << (map_bits + 1));
+	map->ops = ops;
+
+	map->private_data = private_data;
+
+	map->map = tracing_map_array_alloc(map->map_size,
+					   sizeof(struct tracing_map_entry));
+	if (!map->map)
+		goto free;
+
+	map->key_size = key_size;
+	for (i = 0; i < TRACING_MAP_KEYS_MAX; i++)
+		map->key_idx[i] = -1;
+ out:
+	return map;
+ free:
+	tracing_map_destroy(map);
+	map = ERR_PTR(-ENOMEM);
+
+	goto out;
+}
+
+/**
+ * tracing_map_init - Allocate and clear a map's tracing_map_elts
+ * @map: The tracing_map to initialize
+ *
+ * Allocates a clears a pool of tracing_map_elts equal to the
+ * user-specified size of 2 ** map_bits (internally maintained as
+ * 'max_elts' in struct tracing_map).  Before using, the map fields
+ * should be added to the map with tracing_map_add_sum_field() and
+ * tracing_map_add_key_field().  tracing_map_init() should then be
+ * called to allocate the array of tracing_map_elts, in order to avoid
+ * allocating anything in the map insertion path.  The user-specified
+ * map size reflects the max number of elements requested by the user
+ * - internally we double that in order to keep the table sparse and
+ * keep collisions manageable.
+ *
+ * See tracing_map.h for a description of tracing_map_ops.
+ *
+ * Return: the tracing_map pointer if successful, ERR_PTR if not.
+ */
+int tracing_map_init(struct tracing_map *map)
+{
+	int err;
+
+	if (map->n_fields < 2)
+		return -EINVAL; /* need at least 1 key and 1 val */
+
+	err = tracing_map_alloc_elts(map);
+	if (err)
+		return err;
+
+	tracing_map_clear(map);
+
+	return err;
+}
+
+static int cmp_entries_dup(const struct tracing_map_sort_entry **a,
+			   const struct tracing_map_sort_entry **b)
+{
+	int ret = 0;
+
+	if (memcmp((*a)->key, (*b)->key, (*a)->elt->map->key_size))
+		ret = 1;
+
+	return ret;
+}
+
+static int cmp_entries_sum(const struct tracing_map_sort_entry **a,
+			   const struct tracing_map_sort_entry **b)
+{
+	const struct tracing_map_elt *elt_a, *elt_b;
+	struct tracing_map_sort_key *sort_key;
+	struct tracing_map_field *field;
+	tracing_map_cmp_fn_t cmp_fn;
+	void *val_a, *val_b;
+	int ret = 0;
+
+	elt_a = (*a)->elt;
+	elt_b = (*b)->elt;
+
+	sort_key = &elt_a->map->sort_key;
+
+	field = &elt_a->fields[sort_key->field_idx];
+	cmp_fn = field->cmp_fn;
+
+	val_a = &elt_a->fields[sort_key->field_idx].sum;
+	val_b = &elt_b->fields[sort_key->field_idx].sum;
+
+	ret = cmp_fn(val_a, val_b);
+	if (sort_key->descending)
+		ret = -ret;
+
+	return ret;
+}
+
+static int cmp_entries_key(const struct tracing_map_sort_entry **a,
+			   const struct tracing_map_sort_entry **b)
+{
+	const struct tracing_map_elt *elt_a, *elt_b;
+	struct tracing_map_sort_key *sort_key;
+	struct tracing_map_field *field;
+	tracing_map_cmp_fn_t cmp_fn;
+	void *val_a, *val_b;
+	int ret = 0;
+
+	elt_a = (*a)->elt;
+	elt_b = (*b)->elt;
+
+	sort_key = &elt_a->map->sort_key;
+
+	field = &elt_a->fields[sort_key->field_idx];
+
+	cmp_fn = field->cmp_fn;
+
+	val_a = elt_a->key + field->offset;
+	val_b = elt_b->key + field->offset;
+
+	ret = cmp_fn(val_a, val_b);
+	if (sort_key->descending)
+		ret = -ret;
+
+	return ret;
+}
+
+static void destroy_sort_entry(struct tracing_map_sort_entry *entry)
+{
+	if (!entry)
+		return;
+
+	if (entry->elt_copied)
+		tracing_map_elt_free(entry->elt);
+
+	kfree(entry);
+}
+
+/**
+ * tracing_map_destroy_sort_entries - Destroy an array of sort entries
+ * @entries: The entries to destroy
+ * @n_entries: The number of entries in the array
+ *
+ * Destroy the elements returned by a tracing_map_sort_entries() call.
+ */
+void tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries,
+				      unsigned int n_entries)
+{
+	unsigned int i;
+
+	for (i = 0; i < n_entries; i++)
+		destroy_sort_entry(entries[i]);
+
+	vfree(entries);
+}
+
+static struct tracing_map_sort_entry *
+create_sort_entry(void *key, struct tracing_map_elt *elt)
+{
+	struct tracing_map_sort_entry *sort_entry;
+
+	sort_entry = kzalloc(sizeof(*sort_entry), GFP_KERNEL);
+	if (!sort_entry)
+		return NULL;
+
+	sort_entry->key = key;
+	sort_entry->elt = elt;
+
+	return sort_entry;
+}
+
+static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
+{
+	struct tracing_map_elt *dup_elt;
+	unsigned int i;
+
+	dup_elt = tracing_map_elt_alloc(elt->map);
+	if (!dup_elt)
+		return NULL;
+
+	if (elt->map->ops && elt->map->ops->elt_copy)
+		elt->map->ops->elt_copy(dup_elt, elt);
+
+	dup_elt->private_data = elt->private_data;
+	memcpy(dup_elt->key, elt->key, elt->map->key_size);
+
+	for (i = 0; i < elt->map->n_fields; i++) {
+		atomic64_set(&dup_elt->fields[i].sum,
+			     atomic64_read(&elt->fields[i].sum));
+		dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
+	}
+
+	return dup_elt;
+}
+
+static int merge_dup(struct tracing_map_sort_entry **sort_entries,
+		     unsigned int target, unsigned int dup)
+{
+	struct tracing_map_elt *target_elt, *elt;
+	bool first_dup = (target - dup) == 1;
+	int i;
+
+	if (first_dup) {
+		elt = sort_entries[target]->elt;
+		target_elt = copy_elt(elt);
+		if (!target_elt)
+			return -ENOMEM;
+		sort_entries[target]->elt = target_elt;
+		sort_entries[target]->elt_copied = true;
+	} else
+		target_elt = sort_entries[target]->elt;
+
+	elt = sort_entries[dup]->elt;
+
+	for (i = 0; i < elt->map->n_fields; i++)
+		atomic64_add(atomic64_read(&elt->fields[i].sum),
+			     &target_elt->fields[i].sum);
+
+	sort_entries[dup]->dup = true;
+
+	return 0;
+}
+
+static int merge_dups(struct tracing_map_sort_entry **sort_entries,
+		      int n_entries, unsigned int key_size)
+{
+	unsigned int dups = 0, total_dups = 0;
+	int err, i, j;
+	void *key;
+
+	if (n_entries < 2)
+		return total_dups;
+
+	sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
+	     (int (*)(const void *, const void *))cmp_entries_dup, NULL);
+
+	key = sort_entries[0]->key;
+	for (i = 1; i < n_entries; i++) {
+		if (!memcmp(sort_entries[i]->key, key, key_size)) {
+			dups++; total_dups++;
+			err = merge_dup(sort_entries, i - dups, i);
+			if (err)
+				return err;
+			continue;
+		}
+		key = sort_entries[i]->key;
+		dups = 0;
+	}
+
+	if (!total_dups)
+		return total_dups;
+
+	for (i = 0, j = 0; i < n_entries; i++) {
+		if (!sort_entries[i]->dup) {
+			sort_entries[j] = sort_entries[i];
+			if (j++ != i)
+				sort_entries[i] = NULL;
+		} else {
+			destroy_sort_entry(sort_entries[i]);
+			sort_entries[i] = NULL;
+		}
+	}
+
+	return total_dups;
+}
+
+static bool is_key(struct tracing_map *map, unsigned int field_idx)
+{
+	unsigned int i;
+
+	for (i = 0; i < map->n_keys; i++)
+		if (map->key_idx[i] == field_idx)
+			return true;
+	return false;
+}
+
+static void sort_secondary(struct tracing_map *map,
+			   const struct tracing_map_sort_entry **entries,
+			   unsigned int n_entries,
+			   struct tracing_map_sort_key *primary_key,
+			   struct tracing_map_sort_key *secondary_key)
+{
+	int (*primary_fn)(const struct tracing_map_sort_entry **,
+			  const struct tracing_map_sort_entry **);
+	int (*secondary_fn)(const struct tracing_map_sort_entry **,
+			    const struct tracing_map_sort_entry **);
+	unsigned i, start = 0, n_sub = 1;
+
+	if (is_key(map, primary_key->field_idx))
+		primary_fn = cmp_entries_key;
+	else
+		primary_fn = cmp_entries_sum;
+
+	if (is_key(map, secondary_key->field_idx))
+		secondary_fn = cmp_entries_key;
+	else
+		secondary_fn = cmp_entries_sum;
+
+	for (i = 0; i < n_entries - 1; i++) {
+		const struct tracing_map_sort_entry **a = &entries[i];
+		const struct tracing_map_sort_entry **b = &entries[i + 1];
+
+		if (primary_fn(a, b) == 0) {
+			n_sub++;
+			if (i < n_entries - 2)
+				continue;
+		}
+
+		if (n_sub < 2) {
+			start = i + 1;
+			n_sub = 1;
+			continue;
+		}
+
+		set_sort_key(map, secondary_key);
+		sort(&entries[start], n_sub,
+		     sizeof(struct tracing_map_sort_entry *),
+		     (int (*)(const void *, const void *))secondary_fn, NULL);
+		set_sort_key(map, primary_key);
+
+		start = i + 1;
+		n_sub = 1;
+	}
+}
+
+/**
+ * tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map
+ * @map: The tracing_map
+ * @sort_key: The sort key to use for sorting
+ * @sort_entries: outval: pointer to allocated and sorted array of entries
+ *
+ * tracing_map_sort_entries() sorts the current set of entries in the
+ * map and returns the list of tracing_map_sort_entries containing
+ * them to the client in the sort_entries param.  The client can
+ * access the struct tracing_map_elt element of interest directly as
+ * the 'elt' field of a returned struct tracing_map_sort_entry object.
+ *
+ * The sort_key has only two fields: idx and descending.  'idx' refers
+ * to the index of the field added via tracing_map_add_sum_field() or
+ * tracing_map_add_key_field() when the tracing_map was initialized.
+ * 'descending' is a flag that if set reverses the sort order, which
+ * by default is ascending.
+ *
+ * The client should not hold on to the returned array but should use
+ * it and call tracing_map_destroy_sort_entries() when done.
+ *
+ * Return: the number of sort_entries in the struct tracing_map_sort_entry
+ * array, negative on error
+ */
+int tracing_map_sort_entries(struct tracing_map *map,
+			     struct tracing_map_sort_key *sort_keys,
+			     unsigned int n_sort_keys,
+			     struct tracing_map_sort_entry ***sort_entries)
+{
+	int (*cmp_entries_fn)(const struct tracing_map_sort_entry **,
+			      const struct tracing_map_sort_entry **);
+	struct tracing_map_sort_entry *sort_entry, **entries;
+	int i, n_entries, ret;
+
+	entries = vmalloc(map->max_elts * sizeof(sort_entry));
+	if (!entries)
+		return -ENOMEM;
+
+	for (i = 0, n_entries = 0; i < map->map_size; i++) {
+		struct tracing_map_entry *entry;
+
+		entry = TRACING_MAP_ENTRY(map->map, i);
+
+		if (!entry->key || !entry->val)
+			continue;
+
+		entries[n_entries] = create_sort_entry(entry->val->key,
+						       entry->val);
+		if (!entries[n_entries++]) {
+			ret = -ENOMEM;
+			goto free;
+		}
+	}
+
+	if (n_entries == 0) {
+		ret = 0;
+		goto free;
+	}
+
+	if (n_entries == 1) {
+		*sort_entries = entries;
+		return 1;
+	}
+
+	ret = merge_dups(entries, n_entries, map->key_size);
+	if (ret < 0)
+		goto free;
+	n_entries -= ret;
+
+	if (is