summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorViktor Rosendahl <Viktor.Rosendahl@bmw.de>2021-02-12 14:44:21 +0100
committerSteven Rostedt (VMware) <rostedt@goodmis.org>2021-02-12 11:52:59 -0500
commite23db805da2dfc39e5281b5efd3e36d132aa83af (patch)
treeeee683e00a0c0a1d584c0ca4757f3e63cba63b09
parent99e22ce73c59ac2d6d08893af376483ca7d62850 (diff)
downloadlinux-e23db805da2dfc39e5281b5efd3e36d132aa83af.tar.gz
linux-e23db805da2dfc39e5281b5efd3e36d132aa83af.tar.bz2
linux-e23db805da2dfc39e5281b5efd3e36d132aa83af.zip
tracing/tools: Add the latency-collector to tools directory
This is a tool that is intended to work around the fact that the preemptoff, irqsoff, and preemptirqsoff tracers only work in overwrite mode. The idea is to act randomly in such a way that we do not systematically lose any latencies, so that if enough testing is done, all latencies will be captured. If the same burst of latencies is repeated, then sooner or later we will have captured all the latencies. It also works with the wakeup_dl, wakeup_rt, and wakeup tracers. However, in that case it is probably not useful to use the random sleep functionality. The reason why it may be desirable to catch all latencies with a long test campaign is that for some organizations, it's necessary to test the kernel in the field and not practical for developers to work iteratively with field testers. Because of cost and project schedules it is not possible to start a new test campaign every time a latency problem has been fixed. It uses inotify to detect changes to /sys/kernel/tracing/trace. When a latency is detected, it will either sleep or print immediately, depending on a function that act as an unfair coin toss. If immediate print is chosen, it means that we open /sys/kernel/tracing/trace and thereby cause a blackout period that will hide any subsequent latencies. If sleep is chosen, it means that we wait before opening /sys/kernel/tracing/trace, by default for 1000 ms, to see if there is another latency during this period. If there is, then we will lose the previous latency. The coin will be tossed again with a different probability, and we will either print the new latency, or possibly a subsequent one. The probability for the unfair coin toss is chosen so that there is equal probability to obtain any of the latencies in a burst. However, this assumes that we make an assumption of how many latencies there can be. By default the program assumes that there are no more than 2 latencies in a burst, the probability of immediate printout will be: 1/2 and 1 Thus, the probability of getting each of the two latencies will be 1/2. If we ever find that there is more than one latency in a series, meaning that we reach the probability of 1, then the table will be expanded to: 1/3, 1/2, and 1 Thus, we assume that there are no more than three latencies and each with a probability of 1/3 of being captured. If the probability of 1 is reached in the new table, that is we see more than two closely occurring latencies, then the table will again be extended, and so on. On my systems, it seems like this scheme works fairly well, as long as the latencies we trace are long enough, 300 us seems to be enough. This userspace program receive the inotify event at the end of a latency, and it has time until the end of the next latency to react, that is to open /sys/kernel/tracing/trace. Thus, if we trace latencies that are >300 us, then we have at least 300 us to react. The minimum latency will of course not be 300 us on all systems, it will depend on the hardware, kernel version, workload and configuration. Example usage: In one shell, give the following command: sudo latency-collector -rvv -t preemptirqsoff -s 2000 -a 3 This will trace latencies > 2000us with the preemptirqsoff tracer, using random sleep with maximum verbosity, with a probability table initialized to a size of 3. In another shell, generate a few bursts of latencies: root@host:~# modprobe preemptirq_delay_test delay=3000 test_mode=alternate burst_size=3 root@host:~# echo 1 > /sys/kernel/preemptirq_delay_test/trigger root@host:~# echo 1 > /sys/kernel/preemptirq_delay_test/trigger root@host:~# echo 1 > /sys/kernel/preemptirq_delay_test/trigger root@host:~# echo 1 > /sys/kernel/preemptirq_delay_test/trigger If all goes well, you should be getting stack traces that shows all the different latencies, i.e. you should see all the three functions preemptirqtest_0, preemptirqtest_1, preemptirqtest_2 in the stack traces. Link: https://lkml.kernel.org/r/20210212134421.172750-2-Viktor.Rosendahl@bmw.de Signed-off-by: Viktor Rosendahl <Viktor.Rosendahl@bmw.de> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
-rw-r--r--tools/Makefile14
-rw-r--r--tools/tracing/Makefile19
-rw-r--r--tools/tracing/latency/.gitignore2
-rw-r--r--tools/tracing/latency/Makefile24
-rw-r--r--tools/tracing/latency/latency-collector.c2108
5 files changed, 2161 insertions, 6 deletions
diff --git a/tools/Makefile b/tools/Makefile
index 85af6ebbce91..7e9d34ddd74c 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -31,6 +31,7 @@ help:
@echo ' bootconfig - boot config tool'
@echo ' spi - spi tools'
@echo ' tmon - thermal monitoring and tuning tool'
+ @echo ' tracing - misc tracing tools'
@echo ' turbostat - Intel CPU idle stats and freq reporting tool'
@echo ' usb - USB testing tools'
@echo ' virtio - vhost test module'
@@ -64,7 +65,7 @@ acpi: FORCE
cpupower: FORCE
$(call descend,power/$@)
-cgroup firewire hv guest bootconfig spi usb virtio vm bpf iio gpio objtool leds wmi pci firmware debugging: FORCE
+cgroup firewire hv guest bootconfig spi usb virtio vm bpf iio gpio objtool leds wmi pci firmware debugging tracing: FORCE
$(call descend,$@)
bpf/%: FORCE
@@ -103,7 +104,7 @@ all: acpi cgroup cpupower gpio hv firewire liblockdep \
perf selftests bootconfig spi turbostat usb \
virtio vm bpf x86_energy_perf_policy \
tmon freefall iio objtool kvm_stat wmi \
- pci debugging
+ pci debugging tracing
acpi_install:
$(call descend,power/$(@:_install=),install)
@@ -111,7 +112,7 @@ acpi_install:
cpupower_install:
$(call descend,power/$(@:_install=),install)
-cgroup_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install vm_install bpf_install objtool_install wmi_install pci_install debugging_install:
+cgroup_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install vm_install bpf_install objtool_install wmi_install pci_install debugging_install tracing_install:
$(call descend,$(@:_install=),install)
liblockdep_install:
@@ -137,7 +138,8 @@ install: acpi_install cgroup_install cpupower_install gpio_install \
perf_install selftests_install turbostat_install usb_install \
virtio_install vm_install bpf_install x86_energy_perf_policy_install \
tmon_install freefall_install objtool_install kvm_stat_install \
- wmi_install pci_install debugging_install intel-speed-select_install
+ wmi_install pci_install debugging_install intel-speed-select_install \
+ tracing_install
acpi_clean:
$(call descend,power/acpi,clean)
@@ -145,7 +147,7 @@ acpi_clean:
cpupower_clean:
$(call descend,power/cpupower,clean)
-cgroup_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean vm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean:
+cgroup_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean vm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean tracing_clean:
$(call descend,$(@:_clean=),clean)
liblockdep_clean:
@@ -184,6 +186,6 @@ clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \
vm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \
freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \
gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \
- intel-speed-select_clean
+ intel-speed-select_clean tracing_clean
.PHONY: FORCE
diff --git a/tools/tracing/Makefile b/tools/tracing/Makefile
new file mode 100644
index 000000000000..87e0ec48e2e7
--- /dev/null
+++ b/tools/tracing/Makefile
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+include ../scripts/Makefile.include
+
+all: latency
+
+clean: latency_clean
+
+install: latency_install
+
+latency:
+ $(call descend,latency)
+
+latency_install:
+ $(call descend,latency,install)
+
+latency_clean:
+ $(call descend,latency,clean)
+
+.PHONY: all install clean latency latency_install latency_clean
diff --git a/tools/tracing/latency/.gitignore b/tools/tracing/latency/.gitignore
new file mode 100644
index 000000000000..0863960761e7
--- /dev/null
+++ b/tools/tracing/latency/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+latency-collector
diff --git a/tools/tracing/latency/Makefile b/tools/tracing/latency/Makefile
new file mode 100644
index 000000000000..40c4ddaf8be1
--- /dev/null
+++ b/tools/tracing/latency/Makefile
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for vm tools
+#
+VAR_CFLAGS := $(shell pkg-config --cflags libtracefs 2>/dev/null)
+VAR_LDLIBS := $(shell pkg-config --libs libtracefs 2>/dev/null)
+
+TARGETS = latency-collector
+CFLAGS = -Wall -Wextra -g -O2 $(VAR_CFLAGS)
+LDFLAGS = -lpthread $(VAR_LDLIBS)
+
+all: $(TARGETS)
+
+%: %.c
+ $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+
+clean:
+ $(RM) latency-collector
+
+prefix ?= /usr/local
+sbindir ?= ${prefix}/sbin
+
+install: all
+ install -d $(DESTDIR)$(sbindir)
+ install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir)
diff --git a/tools/tracing/latency/latency-collector.c b/tools/tracing/latency/latency-collector.c
new file mode 100644
index 000000000000..57b20802e71b
--- /dev/null
+++ b/tools/tracing/latency/latency-collector.c
@@ -0,0 +1,2108 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2017, 2018, 2019, 2021 BMW Car IT GmbH
+ * Author: Viktor Rosendahl (viktor.rosendahl@bmw.de)
+ */
+
+#define _GNU_SOURCE
+#define _POSIX_C_SOURCE 200809L
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <sched.h>
+#include <linux/unistd.h>
+#include <signal.h>
+#include <sys/inotify.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <tracefs.h>
+
+static const char *prg_name;
+static const char *prg_unknown = "unknown program name";
+
+static int fd_stdout;
+
+static int sched_policy;
+static bool sched_policy_set;
+
+static int sched_pri;
+static bool sched_pri_set;
+
+static bool trace_enable = true;
+static bool setup_ftrace = true;
+static bool use_random_sleep;
+
+#define TRACE_OPTS \
+ C(FUNC_TR, "function-trace"), \
+ C(DISP_GR, "display-graph"), \
+ C(NR, NULL)
+
+#undef C
+#define C(a, b) OPTIDX_##a
+
+enum traceopt {
+ TRACE_OPTS
+};
+
+#undef C
+#define C(a, b) b
+
+static const char *const optstr[] = {
+ TRACE_OPTS
+};
+
+enum errhandling {
+ ERR_EXIT = 0,
+ ERR_WARN,
+ ERR_CLEANUP,
+};
+
+static bool use_options[OPTIDX_NR];
+
+static char inotify_buffer[655360];
+
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#define bool2str(x) (x ? "true":"false")
+
+#define DEFAULT_NR_PRINTER_THREADS (3)
+static unsigned int nr_threads = DEFAULT_NR_PRINTER_THREADS;
+
+#define DEFAULT_TABLE_SIZE (2)
+static unsigned int table_startsize = DEFAULT_TABLE_SIZE;
+
+static int verbosity;
+
+#define verbose_sizechange() (verbosity >= 1)
+#define verbose_lostevent() (verbosity >= 2)
+#define verbose_ftrace() (verbosity >= 1)
+
+#define was_changed(ORIG, CUR) (strcmp(ORIG, CUR) != 0)
+#define needs_change(CUR, WANTED) (strcmp(CUR, WANTED) != 0)
+
+static const char *debug_tracefile;
+static const char *debug_tracefile_dflt;
+static const char *debug_maxlat;
+static const char *debug_maxlat_dflt;
+static const char * const DEBUG_NOFILE = "[file not found]";
+
+static const char * const TR_MAXLAT = "tracing_max_latency";
+static const char * const TR_THRESH = "tracing_thresh";
+static const char * const TR_CURRENT = "current_tracer";
+static const char * const TR_OPTIONS = "trace_options";
+
+static const char * const NOP_TRACER = "nop";
+
+static const char * const OPT_NO_PREFIX = "no";
+
+#define DFLT_THRESHOLD_US "0"
+static const char *threshold = DFLT_THRESHOLD_US;
+
+#define DEV_URANDOM "/dev/urandom"
+#define RT_DEFAULT_PRI (99)
+#define DEFAULT_PRI (0)
+
+#define USEC_PER_MSEC (1000L)
+#define NSEC_PER_USEC (1000L)
+#define NSEC_PER_MSEC (USEC_PER_MSEC * NSEC_PER_USEC)
+
+#define MSEC_PER_SEC (1000L)
+#define USEC_PER_SEC (USEC_PER_MSEC * MSEC_PER_SEC)
+#define NSEC_PER_SEC (NSEC_PER_MSEC * MSEC_PER_SEC)
+
+#define SLEEP_TIME_MS_DEFAULT (1000L)
+#define TRY_PRINTMUTEX_MS (1000)
+
+static long sleep_time = (USEC_PER_MSEC * SLEEP_TIME_MS_DEFAULT);
+
+static const char * const queue_full_warning =
+"Could not queue trace for printing. It is likely that events happen faster\n"
+"than what they can be printed. Probably partly because of random sleeping\n";
+
+static const char * const no_tracer_msg =
+"Could not find any tracers! Running this program as root may help!\n";
+
+static const char * const no_latency_tr_msg =
+"No latency tracers are supported by your kernel!\n";
+
+struct policy {
+ const char *name;
+ int policy;
+ int default_pri;
+};
+
+static const struct policy policies[] = {
+ { "other", SCHED_OTHER, DEFAULT_PRI },
+ { "batch", SCHED_BATCH, DEFAULT_PRI },
+ { "idle", SCHED_IDLE, DEFAULT_PRI },
+ { "rr", SCHED_RR, RT_DEFAULT_PRI },
+ { "fifo", SCHED_FIFO, RT_DEFAULT_PRI },
+ { NULL, 0, DEFAULT_PRI }
+};
+
+/*
+ * The default tracer will be the first on this list that is supported by the
+ * currently running Linux kernel.
+ */
+static const char * const relevant_tracers[] = {
+ "preemptirqsoff",
+ "preemptoff",
+ "irqsoff",
+ "wakeup",
+ "wakeup_rt",
+ "wakeup_dl",
+ NULL
+};
+
+/* This is the list of tracers for which random sleep makes sense */
+static const char * const random_tracers[] = {
+ "preemptirqsoff",
+ "preemptoff",
+ "irqsoff",
+ NULL
+};
+
+static const char *current_tracer;
+static bool force_tracer;
+
+struct ftrace_state {
+ char *tracer;
+ char *thresh;
+ bool opt[OPTIDX_NR];
+ bool opt_valid[OPTIDX_NR];
+ pthread_mutex_t mutex;
+};
+
+struct entry {
+ int ticket;
+ int ticket_completed_ref;
+};
+
+struct print_state {
+ int ticket_counter;
+ int ticket_completed;
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ int cnt;
+ pthread_mutex_t cnt_mutex;
+};
+
+struct short_msg {
+ char buf[160];
+ int len;
+};
+
+static struct print_state printstate;
+static struct ftrace_state save_state;
+volatile sig_atomic_t signal_flag;
+
+#define PROB_TABLE_MAX_SIZE (1000)
+
+int probabilities[PROB_TABLE_MAX_SIZE];
+
+struct sleep_table {
+ int *table;
+ int size;
+ pthread_mutex_t mutex;
+};
+
+static struct sleep_table sleeptable;
+
+#define QUEUE_SIZE (10)
+
+struct queue {
+ struct entry entries[QUEUE_SIZE];
+ int next_prod_idx;
+ int next_cons_idx;
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+};
+
+#define MAX_THREADS (40)
+
+struct queue printqueue;
+pthread_t printthread[MAX_THREADS];
+pthread_mutex_t print_mtx;
+#define PRINT_BUFFER_SIZE (16 * 1024 * 1024)
+
+static void cleanup_exit(int status);
+static int set_trace_opt(const char *opt, bool value);
+
+static __always_inline void *malloc_or_die(size_t size)
+{
+ void *ptr = malloc(size);
+
+ if (unlikely(ptr == NULL)) {
+ warn("malloc() failed");
+ cleanup_exit(EXIT_FAILURE);
+ }
+ return ptr;
+}
+
+static __always_inline void *malloc_or_die_nocleanup(size_t size)
+{
+ void *ptr = malloc(size);
+
+ if (unlikely(ptr == NULL))
+ err(0, "malloc() failed");
+ return ptr;
+}
+
+static __always_inline void write_or_die(int fd, const char *buf, size_t count)
+{
+ ssize_t r;
+
+ do {
+ r = write(fd, buf, count);
+ if (unlikely(r < 0)) {
+ if (errno == EINTR)
+ continue;
+ warn("write() failed");
+ cleanup_exit(EXIT_FAILURE);
+ }
+ count -= r;
+ buf += r;
+ } while (count > 0);
+}
+
+static __always_inline void clock_gettime_or_die(clockid_t clk_id,
+ struct timespec *tp)
+{
+ int r = clock_gettime(clk_id, tp);
+
+ if (unlikely(r != 0))
+ err(EXIT_FAILURE, "clock_gettime() failed");
+}
+
+static __always_inline void sigemptyset_or_die(sigset_t *s)
+{
+ if (unlikely(sigemptyset(s) != 0)) {
+ warn("sigemptyset() failed");
+ cleanup_exit(EXIT_FAILURE);
+ }
+}
+
+static __always_inline void sigaddset_or_die(sigset_t *s, int signum)
+{
+ if (unlikely(sigaddset(s, signum) != 0)) {
+ warn("sigemptyset() failed");
+ cleanup_exit(EXIT_FAILURE);
+ }
+}
+
+static __always_inline void sigaction_or_die(int signum,
+ const struct sigaction *act,
+ struct sigaction *oldact)
+{
+ if (unlikely(sigaction(signum, act, oldact) != 0)) {
+ warn("sigaction() failed");
+ cleanup_exit(EXIT_FAILURE);
+ }
+}
+
+static void open_stdout(void)
+{
+ if (setvbuf(stdout, NULL, _IONBF, 0) != 0)
+ err(EXIT_FAILURE, "setvbuf() failed");
+ fd_stdout = fileno(stdout);
+ if (fd_stdout < 0)
+ err(EXIT_FAILURE, "fileno() failed");
+}
+
+/*
+ * It's not worth it to call cleanup_exit() from mutex functions because
+ * cleanup_exit() uses mutexes.
+ */
+static __always_inline void mutex_lock(pthread_mutex_t *mtx)
+{
+ errno = pthread_mutex_lock(mtx);
+ if (unlikely(errno))
+ err(EXIT_FAILURE, "pthread_mutex_lock() failed");
+}
+
+
+static __always_inline void mutex_unlock(pthread_mutex_t *mtx)
+{
+ errno = pthread_mutex_unlock(mtx);
+ if (unlikely(errno))
+ err(EXIT_FAILURE, "pthread_mutex_unlock() failed");
+}
+
+static __always_inline void cond_signal(pthread_cond_t *cond)
+{
+ errno = pthread_cond_signal(cond);
+ if (unlikely(errno))
+ err(EXIT_FAILURE, "pthread_cond_signal() failed");
+}
+
+static __always_inline void cond_wait(pthread_cond_t *restrict cond,
+ pthread_mutex_t *restrict mutex)
+{
+ errno = pthread_cond_wait(cond, mutex);
+ if (unlikely(errno))
+ err(EXIT_FAILURE, "pthread_cond_wait() failed");
+}
+
+static __always_inline void cond_broadcast(pthread_cond_t *cond)
+{
+ errno = pthread_cond_broadcast(cond);
+ if (unlikely(errno))
+ err(EXIT_FAILURE, "pthread_cond_broadcast() failed");
+}
+
+static __always_inline void
+mutex_init(pthread_mutex_t *mutex,
+ const pthread_mutexattr_t *attr)
+{
+ errno = pthread_mutex_init(mutex, attr);
+ if (errno)
+ err(EXIT_FAILURE, "pthread_mutex_init() failed");
+}
+
+static __always_inline void mutexattr_init(pthread_mutexattr_t *attr)
+{
+ errno = pthread_mutexattr_init(attr);
+ if (errno)
+ err(EXIT_FAILURE, "pthread_mutexattr_init() failed");
+}
+
+static __always_inline void mutexattr_destroy(pthread_mutexattr_t *attr)
+{
+ errno = pthread_mutexattr_destroy(attr);
+ if (errno)
+ err(EXIT_FAILURE, "pthread_mutexattr_destroy() failed");
+}
+
+static __always_inline void mutexattr_settype(pthread_mutexattr_t *attr,
+ int type)
+{
+ errno = pthread_mutexattr_settype(attr, type);
+ if (errno)
+ err(EXIT_FAILURE, "pthread_mutexattr_settype() failed");
+}
+
+static __always_inline void condattr_init(pthread_condattr_t *attr)
+{
+ errno = pthread_condattr_init(attr);
+ if (errno)
+ err(EXIT_FAILURE, "pthread_condattr_init() failed");
+}
+
+static __always_inline void condattr_destroy(pthread_condattr_t *attr)
+{
+ errno = pthread_condattr_destroy(attr);
+ if (errno)
+ err(EXIT_FAILURE, "pthread_condattr_destroy() failed");
+}
+
+static __always_inline void condattr_setclock(pthread_condattr_t *attr,
+ clockid_t clock_id)
+{
+ errno = pthread_condattr_setclock(attr, clock_id);
+ if (unlikely(errno))
+ err(EXIT_FAILURE, "pthread_condattr_setclock() failed");
+}
+
+static __always_inline void cond_init(pthread_cond_t *cond,
+ const pthread_condattr_t *attr)
+{
+ errno = pthread_cond_init(cond, attr);
+ if (errno)
+ err(EXIT_FAILURE, "pthread_cond_init() failed");
+}
+
+static __always_inline int
+cond_timedwait(pthread_cond_t *restrict cond,
+ pthread_mutex_t *restrict mutex,
+ const struct timespec *restrict abstime)
+{
+ errno = pthread_cond_timedwait(cond, mutex, abstime);
+ if (errno && errno != ETIMEDOUT)
+ err(EXIT_FAILURE, "pthread_cond_timedwait() failed");
+ return errno;
+}
+
+static void init_printstate(void)
+{
+ pthread_condattr_t cattr;
+
+ printstate.ticket_counter = 0;
+ printstate.ticket_completed = 0;
+ printstate.cnt = 0;
+
+ mutex_init(&printstate.mutex, NULL);
+
+ condattr_init(&cattr);
+ condattr_setclock(&cattr, CLOCK_MONOTONIC);
+ cond_init(&printstate.cond, &cattr);
+ condattr_destroy(&cattr);
+}
+
+static void init_print_mtx(void)
+{
+ pthread_mutexattr_t mattr;
+
+ mutexattr_init(&mattr);
+ mutexattr_settype(&mattr, PTHREAD_MUTEX_RECURSIVE);
+ mutex_init(&print_mtx, &mattr);
+ mutexattr_destroy(&mattr);
+
+}
+
+static void signal_blocking(int how)
+{
+ sigset_t s;
+
+ sigemptyset_or_die(&s);
+ sigaddset_or_die(&s, SIGHUP);
+ sigaddset_or_die(&s, SIGTERM);
+ sigaddset_or_die(&s, SIGINT);
+
+ errno = pthread_sigmask(how, &s, NULL);
+ if (unlikely(errno)) {
+ warn("pthread_sigmask() failed");
+ cleanup_exit(EXIT_FAILURE);
+ }
+}
+
+static void signal_handler(int num)
+{
+ signal_flag = num;
+}
+
+static void setup_sig_handler(void)
+{
+ struct sigaction sa;
+
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = signal_handler;
+
+ sigaction_or_die(SIGHUP, &sa, NULL);
+ sigaction_or_die(SIGTERM, &sa, NULL);
+ sigaction_or_die(SIGINT, &sa, NULL);
+}
+
+static void process_signal(int signal)
+{
+ char *name;
+
+ name = strsignal(signal);
+ if (name == NULL)
+ printf("Received signal %d\n", signal);
+ else
+ printf("Received signal %d (%s)\n", signal, name);
+ cleanup_exit(EXIT_SUCCESS);
+}
+
+static __always_inline void check_signals(void)
+{
+ int signal = signal_flag;
+
+ if (unlikely(signal))
+ process_signal(signal);
+}
+
+static __always_inline void get_time_in_future(struct timespec *future,
+ long time_us)
+{
+ long nsec;
+
+ clock_gettime_or_die(CLOCK_MONOTONIC, future);
+ future->tv_sec += time_us / USEC_PER_SEC;
+ nsec = future->tv_nsec + (time_us * NSEC_PER_USEC) % NSEC_PER_SEC;
+ if (nsec >= NSEC_PER_SEC) {
+ future->tv_nsec = nsec % NSEC_PER_SEC;
+ future->tv_sec += 1;
+ }
+}
+
+static __always_inline bool time_has_passed(const struct timespec *time)
+{
+ struct timespec now;
+
+ clock_gettime_or_die(CLOCK_MONOTONIC, &now);
+ if (now.tv_sec > time->tv_sec)
+ return true;
+ if (now.tv_sec < time->tv_sec)
+ return false;
+ return (now.tv_nsec >= time->tv_nsec);
+}
+
+static bool mutex_trylock_limit(pthread_mutex_t *mutex, int time_ms)
+{
+ long time_us = time_ms * USEC_PER_MSEC;
+ struct timespec limit;
+
+ get_time_in_future(&limit, time_us);
+ do {
+ errno = pthread_mutex_trylock(mutex);
+ if (errno && errno != EBUSY)
+ err(EXIT_FAILURE, "pthread_mutex_trylock() failed");
+ } while (errno && !time_has_passed(&limit));
+ return errno == 0;
+}
+
+static void restore_trace_opts(const struct ftrace_state *state,
+ const bool *cur)
+{
+ int i;
+ int r;
+
+ for (i = 0; i < OPTIDX_NR; i++)
+ if (state->opt_valid[i] && state->opt[i] != cur[i]) {
+ r = set_trace_opt(optstr[i], state->opt[i]);
+ if (r < 0)
+ warnx("Failed to restore the %s option to %s",
+ optstr[i], bool2str(state->opt[i]));
+ else if (verbose_ftrace())
+ printf("Restored the %s option in %s to %s\n",
+ optstr[i], TR_OPTIONS,
+ bool2str(state->opt[i]));
+ }
+}
+
+static char *read_file(const char *file, enum errhandling h)
+{
+ int psize;
+ char *r;
+ static const char *emsg = "Failed to read the %s file";
+
+ r = tracefs_instance_file_read(NULL, file, &psize);
+ if (!r) {
+ if (h) {
+ warn(emsg, file);
+ if (h == ERR_CLEANUP)
+ cleanup_exit(EXIT_FAILURE);
+ } else
+ errx(EXIT_FAILURE, emsg, file);
+ }
+
+ if (r && r[psize - 1] == '\n')
+ r[psize - 1] = '\0';
+ return r;
+}
+
+static void restore_file(const char *file, char **saved, const char *cur)
+{
+ if (*saved && was_changed(*saved, cur)) {
+ if (tracefs_instance_file_write(NULL, file, *saved) < 0)
+ warnx("Failed to restore %s to %s!", file, *saved);
+ else if (verbose_ftrace())
+ printf("Restored %s to %s\n", file, *saved);
+ free(*saved);
+ *saved = NULL;
+ }
+}
+
+static void restore_ftrace(void)
+{
+ mutex_lock(&save_state.mutex);
+
+ restore_file(TR_CURRENT, &save_state.tracer, current_tracer);
+ restore_file(TR_THRESH, &save_state.thresh, threshold);
+ restore_trace_opts(&save_state, use_options);
+
+ mutex_unlock(&save_state.mutex);
+}
+
+static void cleanup_exit(int status)
+{
+ char *maxlat;
+
+ if (!setup_ftrace)
+ exit(status);
+
+ /*
+ * We try the print_mtx for 1 sec in order to avoid garbled
+ * output if possible, but if it cannot be obtained we proceed anyway.
+ */
+ mutex_trylock_limit(&print_mtx, TRY_PRINTMUTEX_MS);
+
+ maxlat = read_file(TR_MAXLAT, ERR_WARN);
+ if (maxlat) {
+ printf("The maximum detected latency was: %sus\n", maxlat);
+ free(maxlat);
+ }
+
+ restore_ftrace();
+ /*
+ * We do not need to unlock the print_mtx here because we will exit at
+ * the end of this function. Unlocking print_mtx causes problems if a
+ * print thread happens to be waiting for the mutex because we have
+ * just changed the ftrace settings to the original and thus the
+ * print thread would output incorrect data from ftrace.
+ */
+ exit(status);
+}
+
+static void init_save_state(void)
+{
+ pthread_mutexattr_t mattr;
+
+ mutexattr_init(&mattr);
+ mutexattr_settype(&mattr, PTHREAD_MUTEX_RECURSIVE);
+ mutex_init(&save_state.mutex, &mattr);
+ mutexattr_destroy(&mattr);
+
+ save_state.tracer = NULL;
+ save_state.thresh = NULL;
+ save_state.opt_valid[OPTIDX_FUNC_TR] = false;
+ save_state.opt_valid[OPTIDX_DISP_GR] = false;
+}
+
+static int printstate_next_ticket(struct entry *req)
+{
+ int r;
+
+ r = ++(printstate.ticket_counter);
+ req->ticket = r;
+ req->ticket_completed_ref = printstate.ticket_completed;
+ cond_broadcast(&printstate.cond);
+ return r;
+}
+
+static __always_inline
+void printstate_mark_req_completed(const struct entry *req)
+{
+ if (req->ticket > printstate.ticket_completed)
+ printstate.ticket_completed = req->ticket;
+}
+
+static __always_inline
+bool printstate_has_new_req_arrived(const struct entry *req)
+{
+ return (printstate.ticket_counter != req->ticket);
+}
+
+static __always_inline int printstate_cnt_inc(void)
+{
+ int value;
+
+ mutex_lock(&printstate.cnt_mutex);
+ value = ++printstate.cnt;
+ mutex_unlock(&printstate.cnt_mutex);
+ return value;
+}
+
+static __always_inline int printstate_cnt_dec(void)
+{
+ int value;
+
+ mutex_lock(&printstate.cnt_mutex);
+ value = --printstate.cnt;
+ mutex_unlock(&printstate.cnt_mutex);
+ return value;
+}
+
+static __always_inline int printstate_cnt_read(void)
+{
+ int value;
+
+ mutex_lock(&printstate.cnt_mutex);
+ value = printstate.cnt;
+ mutex_unlock(&printstate.cnt_mutex);
+ return value;
+}
+
+static __always_inline
+bool prev_req_won_race(const struct entry *req)
+{
+ return (printstate.ticket_completed != req->ticket_completed_ref);
+}
+
+static void sleeptable_resize(int size, bool printout, struct short_msg *msg)
+{
+ int bytes;
+
+ if (printout) {
+ msg->len = 0;
+ if (unlikely(size > PROB_TABLE_MAX_SIZE))
+ bytes = snprintf(msg->buf, sizeof(msg->buf),
+"Cannot increase probability table to %d (maximum size reached)\n", size);
+ else
+ bytes = snprintf(msg->buf, sizeof(msg->buf),
+"Increasing probability table to %d\n", size);
+ if (bytes < 0)
+ warn("snprintf() failed");
+ else
+ msg->len = bytes;
+ }
+
+ if (unlikely(size < 0)) {
+ /* Should never happen */
+ warnx("Bad program state at %s:%d", __FILE__, __LINE__);
+ cleanup_exit(EXIT_FAILURE);
+ return;
+ }
+ sleeptable.size = size;
+ sleeptable.table = &probabilities[PROB_TABLE_MAX_SIZE - size];
+}
+
+static void init_probabilities(void)
+{
+ int i;
+ int j = 1000;
+
+ for (i = 0; i < PROB_TABLE_MAX_SIZE; i++) {
+ probabilities[i] = 1000 / j;
+ j--;
+ }
+ mutex_init(&sleeptable.mutex, NULL);
+}
+
+static int table_get_probability(const struct entry *req,
+ struct short_msg *msg)
+{
+ int diff = req->ticket - req->ticket_completed_ref;
+ int rval = 0;
+
+ msg->len = 0;
+ diff--;
+ /* Should never happen...*/
+ if (unlikely(diff < 0)) {
+ warnx("Programmer assumption error at %s:%d\n", __FILE__,
+ __LINE__);
+ cleanup_exit(EXIT_FAILURE);
+ }
+ mutex_lock(&sleeptable.mutex);
+ if (diff >= (sleeptable.size - 1)) {
+ rval = sleeptable.table[sleeptable.size - 1];
+ sleeptable_resize(sleeptable.size + 1, verbose_sizechange(),
+ msg);
+ } else {
+ rval = sleeptable.table[diff];
+ }
+ mutex_unlock(&sleeptable.mutex);
+ return rval;
+}
+
+static void init_queue(struct queue *q)
+{
+ q->next_prod_idx = 0;
+ q->next_cons_idx = 0;
+ mutex_init(&q->mutex, NULL);
+ errno = pthread_cond_init(&q->cond, NULL);
+ if (errno)
+ err(EXIT_FAILURE, "pthread_cond_init() failed");
+}
+
+static __always_inline int queue_len(const struct queue *q)
+{
+ if (q->next_prod_idx >= q->next_cons_idx)
+ return q->next_prod_idx - q->next_cons_idx;
+ else
+ return QUEUE_SIZE - q->next_cons_idx + q->next_prod_idx;
+}
+
+static __always_inline int queue_nr_free(const struct queue *q)
+{
+ int nr_free = QUEUE_SIZE - queue_len(q);
+
+ /*
+ * If there is only one slot left we will anyway lie and claim that the
+ * queue is full because adding an element will make it appear empty
+ */
+ if (nr_free == 1)
+ nr_free = 0;
+ return nr_free;
+}
+
+static __always_inline void queue_idx_inc(int *idx)
+{
+ *idx = (*idx + 1) % QUEUE_SIZE;
+}
+
+static __always_inline void queue_push_to_back(struct queue *q,
+ const struct entry *e)
+{
+ q->entries[q->next_prod_idx] = *e;
+ queue_idx_inc(&q->next_prod_idx);
+}
+
+static __always_inline struct entry queue_pop_from_front(struct queue *q)
+{
+ struct entry e = q->entries[q->next_cons_idx];
+
+ queue_idx_inc(&q->next_cons_idx);
+ return e;
+}
+
+static __always_inline void queue_cond_signal(struct queue *q)
+{
+ cond_signal(&q->cond);
+}
+
+static __always_inline void queue_cond_wait(struct queue *q)
+{
+ cond_wait(&q->cond, &q->mutex);
+}
+
+static __always_inline int queue_try_to_add_entry(struct queue *q,
+ const struct entry *e)
+{
+ int r = 0;
+
+ mutex_lock(&q->mutex);
+ if (queue_nr_free(q) > 0) {
+ queue_push_to_back(q, e);
+ cond_signal(&q->cond);
+ } else
+ r = -1;
+ mutex_unlock(&q->mutex);
+ return r;
+}
+
+static struct entry queue_wait_for_entry(struct queue *q)
+{
+ struct entry e;
+
+ mutex_lock(&q->mutex);
+ while (true) {
+ if (queue_len(&printqueue) > 0) {
+ e = queue_pop_from_front(q);
+ break;
+ }
+ queue_cond_wait(q);
+ }
+ mutex_unlock(&q->mutex);
+
+ return e;
+}
+
+static const struct policy *policy_from_name(const char *name)
+{
+ const struct policy *p = &policies[0];
+
+ while (p->name != NULL) {
+ if (!strcmp(name, p->name))
+ return p;
+ p++;
+ }
+ return NULL;
+}
+
+static const char *policy_name(int policy)
+{
+ const struct policy *p = &policies[0];
+ static const char *rval = "unknown";
+
+ while (p->name != NULL) {
+ if (p->policy == policy)
+ return p->name;
+ p++;
+ }
+ return rval;
+}
+
+static bool is_relevant_tracer(const char *name)
+{
+ unsigned int i;
+
+ for (i = 0; relevant_tracers[i]; i++)
+ if (!strcmp(name, relevant_tracers[i]))
+ return true;
+ return false;
+}
+
+static bool random_makes_sense(const char *name)
+{
+ unsigned int i;
+
+ for (i = 0; random_tracers[i]; i++)
+ if (!strcmp(name, random_tracers[i]))
+ return true;
+ return false;
+}
+
+static void show_available(void)
+{
+ char **tracers;
+ int found = 0;
+ int i;
+
+ tracers = tracefs_tracers(NULL);
+ for (i = 0; tracers && tracers[i]; i++) {
+ if (is_relevant_tracer(tracers[i]))
+ found++;
+ }
+
+ if (!tracers) {
+ warnx(no_tracer_msg);
+ return;
+ }
+
+ if (!found) {
+ warnx(no_latency_tr_msg);
+ tracefs_list_free(tracers);
+ return;
+ }
+
+ printf("The following latency tracers are available on your system:\n");
+ for (i = 0; tracers[i]; i++) {