summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/tools/rv/rv-mon-sched.rst69
-rw-r--r--Documentation/trace/rv/monitor_sched.rst171
-rw-r--r--include/linux/rv.h4
-rw-r--r--include/linux/sched.h16
-rw-r--r--include/trace/define_trace.h7
-rw-r--r--include/trace/events/osnoise.h96
-rw-r--r--include/trace/events/sched.h13
-rw-r--r--kernel/sched/core.c23
-rw-r--r--kernel/trace/rv/Kconfig7
-rw-r--r--kernel/trace/rv/Makefile7
-rw-r--r--kernel/trace/rv/monitors/sched/Kconfig11
-rw-r--r--kernel/trace/rv/monitors/sched/sched.c38
-rw-r--r--kernel/trace/rv/monitors/sched/sched.h3
-rw-r--r--kernel/trace/rv/monitors/sco/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/sco/sco.c88
-rw-r--r--kernel/trace/rv/monitors/sco/sco.h47
-rw-r--r--kernel/trace/rv/monitors/sco/sco_trace.h15
-rw-r--r--kernel/trace/rv/monitors/scpd/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/scpd/scpd.c96
-rw-r--r--kernel/trace/rv/monitors/scpd/scpd.h49
-rw-r--r--kernel/trace/rv/monitors/scpd/scpd_trace.h15
-rw-r--r--kernel/trace/rv/monitors/sncid/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/sncid/sncid.c96
-rw-r--r--kernel/trace/rv/monitors/sncid/sncid.h49
-rw-r--r--kernel/trace/rv/monitors/sncid/sncid_trace.h15
-rw-r--r--kernel/trace/rv/monitors/snep/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/snep/snep.c96
-rw-r--r--kernel/trace/rv/monitors/snep/snep.h49
-rw-r--r--kernel/trace/rv/monitors/snep/snep_trace.h15
-rw-r--r--kernel/trace/rv/monitors/snroc/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/snroc/snroc.c85
-rw-r--r--kernel/trace/rv/monitors/snroc/snroc.h47
-rw-r--r--kernel/trace/rv/monitors/snroc/snroc_trace.h15
-rw-r--r--kernel/trace/rv/monitors/tss/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/tss/tss.c91
-rw-r--r--kernel/trace/rv/monitors/tss/tss.h47
-rw-r--r--kernel/trace/rv/monitors/tss/tss_trace.h15
-rw-r--r--kernel/trace/rv/monitors/wip/Kconfig2
-rw-r--r--kernel/trace/rv/monitors/wip/wip.c2
-rw-r--r--kernel/trace/rv/monitors/wip/wip.h1
-rw-r--r--kernel/trace/rv/monitors/wwnr/Kconfig2
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.c2
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.h1
-rw-r--r--kernel/trace/rv/rv.c154
-rw-r--r--kernel/trace/rv/rv.h4
-rw-r--r--kernel/trace/rv/rv_reactors.c28
-rw-r--r--kernel/trace/rv/rv_trace.h6
-rw-r--r--kernel/trace/trace_osnoise.c55
-rw-r--r--tools/verification/dot2/dot2k27
-rw-r--r--tools/verification/dot2/dot2k.py80
-rw-r--r--tools/verification/dot2/dot2k_templates/Kconfig3
-rw-r--r--tools/verification/dot2/dot2k_templates/main.c4
-rw-r--r--tools/verification/dot2/dot2k_templates/main_container.c38
-rw-r--r--tools/verification/dot2/dot2k_templates/main_container.h3
-rw-r--r--tools/verification/models/sched/sco.dot18
-rw-r--r--tools/verification/models/sched/scpd.dot18
-rw-r--r--tools/verification/models/sched/sncid.dot18
-rw-r--r--tools/verification/models/sched/snep.dot18
-rw-r--r--tools/verification/models/sched/snroc.dot18
-rw-r--r--tools/verification/models/sched/tss.dot18
-rw-r--r--tools/verification/rv/include/in_kernel.h2
-rw-r--r--tools/verification/rv/include/rv.h3
-rw-r--r--tools/verification/rv/src/in_kernel.c256
-rw-r--r--tools/verification/rv/src/rv.c38
64 files changed, 2135 insertions, 166 deletions
diff --git a/Documentation/tools/rv/rv-mon-sched.rst b/Documentation/tools/rv/rv-mon-sched.rst
new file mode 100644
index 000000000000..da0fe4c79ae5
--- /dev/null
+++ b/Documentation/tools/rv/rv-mon-sched.rst
@@ -0,0 +1,69 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+rv-mon-sched
+============
+-----------------------------
+Scheduler monitors collection
+-----------------------------
+
+:Manual section: 1
+
+SYNOPSIS
+========
+
+**rv mon sched** [*OPTIONS*]
+
+**rv mon <NESTED_MONITOR>** [*OPTIONS*]
+
+**rv mon sched:<NESTED_MONITOR>** [*OPTIONS*]
+
+DESCRIPTION
+===========
+
+The scheduler monitor collection is a container for several monitors to model
+the behaviour of the scheduler. Each monitor describes a specification that
+the scheduler should follow.
+
+As a monitor container, it will enable all nested monitors and set them
+according to OPTIONS.
+Nevertheless, nested monitors can also be activated independently, either by
+name or with the sched: prefix, e.g. to enable only monitor tss use either of:
+
+ # rv mon sched:tss
+
+ # rv mon tss
+
+See kernel documentation for further information about this monitor:
+<https://docs.kernel.org/trace/rv/monitor_sched.html>
+
+OPTIONS
+=======
+
+.. include:: common_ikm.rst
+
+NESTED MONITOR
+==============
+
+The available nested monitors are:
+ * scpd: schedule called with preemption disabled
+ * snep: schedule does not enable preempt
+ * sncid: schedule not called with interrupt disabled
+ * snroc: set non runnable on its own context
+ * sco: scheduling context operations
+ * tss: task switch while scheduling
+
+SEE ALSO
+========
+
+**rv**\(1), **rv-mon**\(1)
+
+Linux kernel *RV* documentation:
+<https://www.kernel.org/doc/html/latest/trace/rv/index.html>
+
+AUTHOR
+======
+
+Written by Gabriele Monaco <gmonaco@redhat.com>
+
+.. include:: common_appendix.rst
diff --git a/Documentation/trace/rv/monitor_sched.rst b/Documentation/trace/rv/monitor_sched.rst
new file mode 100644
index 000000000000..24b2c62a3bc2
--- /dev/null
+++ b/Documentation/trace/rv/monitor_sched.rst
@@ -0,0 +1,171 @@
+Scheduler monitors
+==================
+
+- Name: sched
+- Type: container for multiple monitors
+- Author: Gabriele Monaco <gmonaco@redhat.com>, Daniel Bristot de Oliveira <bristot@kernel.org>
+
+Description
+-----------
+
+Monitors describing complex systems, such as the scheduler, can easily grow to
+the point where they are just hard to understand because of the many possible
+state transitions.
+Often it is possible to break such descriptions into smaller monitors,
+sharing some or all events. Enabling those smaller monitors concurrently is,
+in fact, testing the system as if we had one single larger monitor.
+Splitting models into multiple specifications is not only easier to
+understand, but also gives some more clues when we see errors.
+
+The sched monitor is a set of specifications to describe the scheduler behaviour.
+It includes several per-cpu and per-task monitors that work independently to verify
+different specifications the scheduler should follow.
+
+To make this system as straightforward as possible, sched specifications are *nested*
+monitors, whereas sched itself is a *container*.
+From the interface perspective, sched includes other monitors as sub-directories;
+enabling/disabling or setting reactors to sched propagates the change to all monitors.
+However, single monitors can be used independently as well.
+
+It is important that future modules are built after their container (sched, in
+this case), otherwise the linker would not respect the order and the nesting
+wouldn't work as expected.
+To do so, simply add them after sched in the Makefile.
+
+Specifications
+--------------
+
+The specifications included in sched are currently a work in progress, adapting the ones
+defined by Daniel Bristot in [1].
+
+Currently the following monitors are included:
+
+Monitor tss
+~~~~~~~~~~~
+
+The task switch while scheduling (tss) monitor ensures a task switch happens
+only in scheduling context, that is inside a call to `__schedule`::
+
+ |
+ |
+ v
+ +-----------------+
+ | thread | <+
+ +-----------------+ |
+ | |
+ | schedule_entry | schedule_exit
+ v |
+ sched_switch |
+ +--------------- |
+ | sched |
+ +--------------> -+
+
+Monitor sco
+~~~~~~~~~~~
+
+The scheduling context operations (sco) monitor ensures changes in a task state
+happen only in thread context::
+
+
+ |
+ |
+ v
+ sched_set_state +------------------+
+ +------------------ | |
+ | | thread_context |
+ +-----------------> | | <+
+ +------------------+ |
+ | |
+ | schedule_entry | schedule_exit
+ v |
+ |
+ scheduling_context -+
+
+Monitor snroc
+~~~~~~~~~~~~~
+
+The set non runnable on its own context (snroc) monitor ensures changes in a
+task state happen only in the respective task's context. This is a per-task
+monitor::
+
+ |
+ |
+ v
+ +------------------+
+ | other_context | <+
+ +------------------+ |
+ | |
+ | sched_switch_in | sched_switch_out
+ v |
+ sched_set_state |
+ +------------------ |
+ | own_context |
+ +-----------------> -+
+
+Monitor scpd
+~~~~~~~~~~~~
+
+The schedule called with preemption disabled (scpd) monitor ensures schedule is
+called with preemption disabled::
+
+ |
+ |
+ v
+ +------------------+
+ | cant_sched | <+
+ +------------------+ |
+ | |
+ | preempt_disable | preempt_enable
+ v |
+ schedule_entry |
+ schedule_exit |
+ +----------------- can_sched |
+ | |
+ +----------------> -+
+
+Monitor snep
+~~~~~~~~~~~~
+
+The schedule does not enable preempt (snep) monitor ensures a schedule call
+does not enable preemption::
+
+ |
+ |
+ v
+ preempt_disable +------------------------+
+ preempt_enable | |
+ +------------------ | non_scheduling_context |
+ | | |
+ +-----------------> | | <+
+ +------------------------+ |
+ | |
+ | schedule_entry | schedule_exit
+ v |
+ |
+ scheduling_contex -+
+
+Monitor sncid
+~~~~~~~~~~~~~
+
+The schedule not called with interrupt disabled (sncid) monitor ensures
+schedule is not called with interrupt disabled::
+
+ |
+ |
+ v
+ schedule_entry +--------------+
+ schedule_exit | |
+ +----------------- | can_sched |
+ | | |
+ +----------------> | | <+
+ +--------------+ |
+ | |
+ | irq_disable | irq_enable
+ v |
+ |
+ cant_sched -+
+
+References
+----------
+
+[1] - https://bristot.me/linux-task-model
diff --git a/include/linux/rv.h b/include/linux/rv.h
index 8883b41d88ec..3452b5e4b29e 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -7,7 +7,7 @@
#ifndef _LINUX_RV_H
#define _LINUX_RV_H
-#define MAX_DA_NAME_LEN 24
+#define MAX_DA_NAME_LEN 32
#ifdef CONFIG_RV
/*
@@ -56,7 +56,7 @@ struct rv_monitor {
bool rv_monitoring_on(void);
int rv_unregister_monitor(struct rv_monitor *monitor);
-int rv_register_monitor(struct rv_monitor *monitor);
+int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent);
int rv_get_task_monitor_slot(void);
void rv_put_task_monitor_slot(int slot);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e5c38718ff5..56ddeb37b5cd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -46,6 +46,7 @@
#include <linux/rv.h>
#include <linux/livepatch_sched.h>
#include <linux/uidgid_types.h>
+#include <linux/tracepoint-defs.h>
#include <asm/kmap_size.h>
/* task_struct member predeclarations (sorted alphabetically): */
@@ -187,6 +188,12 @@ struct user_event_mm;
# define debug_rtlock_wait_restore_state() do { } while (0)
#endif
+#define trace_set_current_state(state_value) \
+ do { \
+ if (tracepoint_enabled(sched_set_state_tp)) \
+ __trace_set_current_state(state_value); \
+ } while (0)
+
/*
* set_current_state() includes a barrier so that the write of current->__state
* is correctly serialised wrt the caller's subsequent test of whether to
@@ -227,12 +234,14 @@ struct user_event_mm;
#define __set_current_state(state_value) \
do { \
debug_normal_state_change((state_value)); \
+ trace_set_current_state(state_value); \
WRITE_ONCE(current->__state, (state_value)); \
} while (0)
#define set_current_state(state_value) \
do { \
debug_normal_state_change((state_value)); \
+ trace_set_current_state(state_value); \
smp_store_mb(current->__state, (state_value)); \
} while (0)
@@ -248,6 +257,7 @@ struct user_event_mm;
\
raw_spin_lock_irqsave(&current->pi_lock, flags); \
debug_special_state_change((state_value)); \
+ trace_set_current_state(state_value); \
WRITE_ONCE(current->__state, (state_value)); \
raw_spin_unlock_irqrestore(&current->pi_lock, flags); \
} while (0)
@@ -283,6 +293,7 @@ struct user_event_mm;
raw_spin_lock(&current->pi_lock); \
current->saved_state = current->__state; \
debug_rtlock_wait_set_state(); \
+ trace_set_current_state(TASK_RTLOCK_WAIT); \
WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \
raw_spin_unlock(&current->pi_lock); \
} while (0);
@@ -292,6 +303,7 @@ struct user_event_mm;
lockdep_assert_irqs_disabled(); \
raw_spin_lock(&current->pi_lock); \
debug_rtlock_wait_restore_state(); \
+ trace_set_current_state(current->saved_state); \
WRITE_ONCE(current->__state, current->saved_state); \
current->saved_state = TASK_RUNNING; \
raw_spin_unlock(&current->pi_lock); \
@@ -328,6 +340,10 @@ extern void io_schedule_finish(int token);
extern long io_schedule_timeout(long timeout);
extern void io_schedule(void);
+/* wrapper function to trace from this header file */
+DECLARE_TRACEPOINT(sched_set_state_tp);
+extern void __trace_set_current_state(int state_value);
+
/**
* struct prev_cputime - snapshot of system and user cputime
* @utime: time spent in user mode
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index e1c1079f8c8d..ed52d0506c69 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -76,6 +76,10 @@
#define DECLARE_TRACE(name, proto, args) \
DEFINE_TRACE(name, PARAMS(proto), PARAMS(args))
+#undef DECLARE_TRACE_CONDITION
+#define DECLARE_TRACE_CONDITION(name, proto, args, cond) \
+ DEFINE_TRACE(name, PARAMS(proto), PARAMS(args))
+
/* If requested, create helpers for calling these tracepoints from Rust. */
#ifdef CREATE_RUST_TRACE_POINTS
#undef DEFINE_RUST_DO_TRACE
@@ -108,6 +112,8 @@
/* Make all open coded DECLARE_TRACE nops */
#undef DECLARE_TRACE
#define DECLARE_TRACE(name, proto, args)
+#undef DECLARE_TRACE_CONDITION
+#define DECLARE_TRACE_CONDITION(name, proto, args, cond)
#ifdef TRACEPOINTS_ENABLED
#include <trace/trace_events.h>
@@ -129,6 +135,7 @@
#undef DEFINE_EVENT_CONDITION
#undef TRACE_HEADER_MULTI_READ
#undef DECLARE_TRACE
+#undef DECLARE_TRACE_CONDITION
/* Only undef what we defined in this file */
#ifdef UNDEF_TRACE_INCLUDE_FILE
diff --git a/include/trace/events/osnoise.h b/include/trace/events/osnoise.h
index a2379a4f0684..3f4273623801 100644
--- a/include/trace/events/osnoise.h
+++ b/include/trace/events/osnoise.h
@@ -3,9 +3,105 @@
#define TRACE_SYSTEM osnoise
#if !defined(_OSNOISE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+
+#ifndef _OSNOISE_TRACE_H
#define _OSNOISE_TRACE_H
+/*
+ * osnoise sample structure definition. Used to store the statistics of a
+ * sample run.
+ */
+struct osnoise_sample {
+ u64 runtime; /* runtime */
+ u64 noise; /* noise */
+ u64 max_sample; /* max single noise sample */
+ int hw_count; /* # HW (incl. hypervisor) interference */
+ int nmi_count; /* # NMIs during this sample */
+ int irq_count; /* # IRQs during this sample */
+ int softirq_count; /* # softirqs during this sample */
+ int thread_count; /* # threads during this sample */
+};
+
+#ifdef CONFIG_TIMERLAT_TRACER
+/*
+ * timerlat sample structure definition. Used to store the statistics of
+ * a sample run.
+ */
+struct timerlat_sample {
+ u64 timer_latency; /* timer_latency */
+ unsigned int seqnum; /* unique sequence */
+ int context; /* timer context */
+};
+#endif // CONFIG_TIMERLAT_TRACER
+#endif // _OSNOISE_TRACE_H
#include <linux/tracepoint.h>
+TRACE_EVENT(osnoise_sample,
+
+ TP_PROTO(struct osnoise_sample *s),
+
+ TP_ARGS(s),
+
+ TP_STRUCT__entry(
+ __field( u64, runtime )
+ __field( u64, noise )
+ __field( u64, max_sample )
+ __field( int, hw_count )
+ __field( int, irq_count )
+ __field( int, nmi_count )
+ __field( int, softirq_count )
+ __field( int, thread_count )
+ ),
+
+ TP_fast_assign(
+ __entry->runtime = s->runtime;
+ __entry->noise = s->noise;
+ __entry->max_sample = s->max_sample;
+ __entry->hw_count = s->hw_count;
+ __entry->irq_count = s->irq_count;
+ __entry->nmi_count = s->nmi_count;
+ __entry->softirq_count = s->softirq_count;
+ __entry->thread_count = s->thread_count;
+ ),
+
+ TP_printk("runtime=%llu noise=%llu max_sample=%llu hw_count=%d"
+ " irq_count=%d nmi_count=%d softirq_count=%d"
+ " thread_count=%d",
+ __entry->runtime,
+ __entry->noise,
+ __entry->max_sample,
+ __entry->hw_count,
+ __entry->irq_count,
+ __entry->nmi_count,
+ __entry->softirq_count,
+ __entry->thread_count)
+);
+
+#ifdef CONFIG_TIMERLAT_TRACER
+TRACE_EVENT(timerlat_sample,
+
+ TP_PROTO(struct timerlat_sample *s),
+
+ TP_ARGS(s),
+
+ TP_STRUCT__entry(
+ __field( u64, timer_latency )
+ __field( unsigned int, seqnum )
+ __field( int, context )
+ ),
+
+ TP_fast_assign(
+ __entry->timer_latency = s->timer_latency;
+ __entry->seqnum = s->seqnum;
+ __entry->context = s->context;
+ ),
+
+ TP_printk("timer_latency=%llu seqnum=%u context=%d",
+ __entry->timer_latency,
+ __entry->seqnum,
+ __entry->context)
+);
+#endif // CONFIG_TIMERLAT_TRACER
+
TRACE_EVENT(thread_noise,
TP_PROTO(struct task_struct *t, u64 start, u64 duration),
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index bfd97cce40a1..8994e97d86c1 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -822,6 +822,19 @@ DECLARE_TRACE(sched_compute_energy_tp,
unsigned long max_util, unsigned long busy_time),
TP_ARGS(p, dst_cpu, energy, max_util, busy_time));
+DECLARE_TRACE(sched_entry_tp,
+ TP_PROTO(bool preempt, unsigned long ip),
+ TP_ARGS(preempt, ip));
+
+DECLARE_TRACE(sched_exit_tp,
+ TP_PROTO(bool is_switch, unsigned long ip),
+ TP_ARGS(is_switch, ip));
+
+DECLARE_TRACE_CONDITION(sched_set_state_tp,
+ TP_PROTO(struct task_struct *tsk, int state),
+ TP_ARGS(tsk, state),
+ TP_CONDITION(!!(tsk->__state) != !!state));
+
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 87540217fc09..cfaca3040b2f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -488,6 +488,16 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
#endif /* CONFIG_SCHED_CORE */
+/* need a wrapper since we may need to trace from modules */
+EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
+
+/* Call via the helper macro trace_set_current_state. */
+void __trace_set_current_state(int state_value)
+{
+ trace_sched_set_state_tp(current, state_value);
+}
+EXPORT_SYMBOL(__trace_set_current_state);
+
/*
* Serialization rules:
*
@@ -5295,6 +5305,12 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
*/
finish_task_switch(prev);
+ /*
+ * This is a special case: the newly created task has just
+ * switched the context for the first time. It is returning from
+ * schedule for the first time in this path.
+ */
+ trace_sched_exit_tp(true, CALLER_ADDR0);
preempt_enable();
if (current->set_child_tid)
@@ -6634,12 +6650,15 @@ static void __sched notrace __schedule(int sched_mode)
* as a preemption by schedule_debug() and RCU.
*/
bool preempt = sched_mode > SM_NONE;
+ bool is_switch = false;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
struct rq *rq;
int cpu;
+ trace_sched_entry_tp(preempt, CALLER_ADDR0);
+
cpu = smp_processor_id();
rq = cpu_rq(cpu);
prev = rq->curr;
@@ -6705,7 +6724,8 @@ picked:
clear_preempt_need_resched();
rq->last_seen_need_resched_ns = 0;
- if (likely(prev != next)) {
+ is_switch = prev != next;
+ if (likely(is_switch)) {
rq->nr_switches++;
/*
* RCU users of rcu_dereference(rq->curr) may not see