diff options
| -rw-r--r-- | Documentation/RCU/Design/Requirements/Requirements.html | 35 | ||||
| -rw-r--r-- | Documentation/RCU/stallwarn.txt | 2 | ||||
| -rw-r--r-- | Documentation/RCU/whatisRCU.txt | 3 | ||||
| -rw-r--r-- | Documentation/sysctl/kernel.txt | 12 | ||||
| -rw-r--r-- | include/linux/kernel.h | 1 | ||||
| -rw-r--r-- | include/linux/rcupdate.h | 23 | ||||
| -rw-r--r-- | include/linux/torture.h | 4 | ||||
| -rw-r--r-- | init/Kconfig | 1 | ||||
| -rw-r--r-- | kernel/rcu/rcuperf.c | 25 | ||||
| -rw-r--r-- | kernel/rcu/rcutorture.c | 9 | ||||
| -rw-r--r-- | kernel/rcu/tree.c | 586 | ||||
| -rw-r--r-- | kernel/rcu/tree.h | 15 | ||||
| -rw-r--r-- | kernel/rcu/tree_exp.h | 656 | ||||
| -rw-r--r-- | kernel/rcu/tree_plugin.h | 95 | ||||
| -rw-r--r-- | kernel/rcu/update.c | 7 | ||||
| -rw-r--r-- | kernel/sysctl.c | 11 | ||||
| -rw-r--r-- | kernel/torture.c | 176 | ||||
| -rw-r--r-- | lib/Kconfig.debug | 33 | ||||
| -rw-r--r-- | tools/testing/selftests/rcutorture/bin/functions.sh | 12 | ||||
| -rwxr-xr-x | tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 34 | ||||
| -rwxr-xr-x | tools/testing/selftests/rcutorture/bin/kvm.sh | 2 | ||||
| -rwxr-xr-x | tools/testing/selftests/rcutorture/bin/parse-console.sh | 7 | ||||
| -rw-r--r-- | tools/testing/selftests/rcutorture/doc/initrd.txt | 22 |
23 files changed, 978 insertions, 793 deletions
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index e7e24b3e86e2..ece410f40436 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -2391,6 +2391,41 @@ and <tt>RCU_NONIDLE()</tt> on the other while inspecting idle-loop code. Steven Rostedt supplied <tt>_rcuidle</tt> event tracing, which is used quite heavily in the idle loop. +However, there are some restrictions on the code placed within +<tt>RCU_NONIDLE()</tt>: + +<ol> +<li> Blocking is prohibited. + In practice, this is not a serious restriction given that idle + tasks are prohibited from blocking to begin with. +<li> Although nesting <tt>RCU_NONIDLE()</tt> is permited, they cannot + nest indefinitely deeply. + However, given that they can be nested on the order of a million + deep, even on 32-bit systems, this should not be a serious + restriction. + This nesting limit would probably be reached long after the + compiler OOMed or the stack overflowed. +<li> Any code path that enters <tt>RCU_NONIDLE()</tt> must sequence + out of that same <tt>RCU_NONIDLE()</tt>. + For example, the following is grossly illegal: + + <blockquote> + <pre> + 1 RCU_NONIDLE({ + 2 do_something(); + 3 goto bad_idea; /* BUG!!! */ + 4 do_something_else();}); + 5 bad_idea: + </pre> + </blockquote> + + <p> + It is just as illegal to transfer control into the middle of + <tt>RCU_NONIDLE()</tt>'s argument. + Yes, in theory, you could transfer in as long as you also + transferred out, but in practice you could also expect to get sharply + worded review comments. +</ol> <p> It is similarly socially unacceptable to interrupt an diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 0f7fb4298e7e..e93d04133fe7 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -49,7 +49,7 @@ rcupdate.rcu_task_stall_timeout This boot/sysfs parameter controls the RCU-tasks stall warning interval. A value of zero or less suppresses RCU-tasks stall warnings. A positive value sets the stall-warning interval - in jiffies. An RCU-tasks stall warning starts wtih the line: + in jiffies. An RCU-tasks stall warning starts with the line: INFO: rcu_tasks detected stalls on tasks: diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 111770ffa10e..204422719197 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -5,6 +5,9 @@ to start learning about RCU: 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/ 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/ 4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/ + 2010 Big API Table http://lwn.net/Articles/419086/ +5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/ + 2014 Big API Table http://lwn.net/Articles/609973/ What is RCU? diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index a3683ce2a2f3..33204604de6c 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -58,6 +58,7 @@ show up in /proc/sys/kernel: - panic_on_stackoverflow - panic_on_unrecovered_nmi - panic_on_warn +- panic_on_rcu_stall - perf_cpu_time_max_percent - perf_event_paranoid - perf_event_max_stack @@ -618,6 +619,17 @@ a kernel rebuild when attempting to kdump at the location of a WARN(). ============================================================== +panic_on_rcu_stall: + +When set to 1, calls panic() after RCU stall detection messages. This +is useful to define the root cause of RCU stalls using a vmcore. + +0: do not panic() when RCU stall takes place, default behavior. + +1: panic() after printing RCU stall messages. + +============================================================== + perf_cpu_time_max_percent: Hints to the kernel how much CPU time it should be allowed to diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 94aa10ffe156..c42082112ec8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -451,6 +451,7 @@ extern int panic_on_oops; extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; extern int panic_on_warn; +extern int sysctl_panic_on_rcu_stall; extern int sysctl_panic_on_stackoverflow; extern bool crash_kexec_post_notifiers; diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 5f1533e3d032..3bc5de08c0b7 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -45,6 +45,7 @@ #include <linux/bug.h> #include <linux/compiler.h> #include <linux/ktime.h> +#include <linux/irqflags.h> #include <asm/barrier.h> @@ -379,12 +380,13 @@ static inline void rcu_init_nohz(void) * in the inner idle loop. * * This macro provides the way out: RCU_NONIDLE(do_something_with_RCU()) - * will tell RCU that it needs to pay attending, invoke its argument - * (in this example, a call to the do_something_with_RCU() function), + * will tell RCU that it needs to pay attention, invoke its argument + * (in this example, calling the do_something_with_RCU() function), * and then tell RCU to go back to ignoring this CPU. It is permissible - * to nest RCU_NONIDLE() wrappers, but the nesting level is currently - * quite limited. If deeper nesting is required, it will be necessary - * to adjust DYNTICK_TASK_NESTING_VALUE accordingly. + * to nest RCU_NONIDLE() wrappers, but not indefinitely (but the limit is + * on the order of a million or so, even on 32-bit systems). It is + * not legal to block within RCU_NONIDLE(), nor is it permissible to + * transfer control either into or out of RCU_NONIDLE()'s statement. */ #define RCU_NONIDLE(a) \ do { \ @@ -649,7 +651,16 @@ static inline void rcu_preempt_sleep_check(void) * please be careful when making changes to rcu_assign_pointer() and the * other macros that it invokes. */ -#define rcu_assign_pointer(p, v) smp_store_release(&p, RCU_INITIALIZER(v)) +#define rcu_assign_pointer(p, v) \ +({ \ + uintptr_t _r_a_p__v = (uintptr_t)(v); \ + \ + if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \ + WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ + else \ + smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \ + _r_a_p__v; \ +}) /** * rcu_access_pointer() - fetch RCU pointer with no dereferencing diff --git a/include/linux/torture.h b/include/linux/torture.h index 7759fc3c622d..6685a73736a2 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -50,6 +50,10 @@ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); } while (0) /* Definitions for online/offline exerciser. */ +bool torture_offline(int cpu, long *n_onl_attempts, long *n_onl_successes, + unsigned long *sum_offl, int *min_onl, int *max_onl); +bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, + unsigned long *sum_onl, int *min_onl, int *max_onl); int torture_onoff_init(long ooholdoff, long oointerval); void torture_onoff_stats(void); bool torture_onoff_failures(void); diff --git a/init/Kconfig b/init/Kconfig index f755a602d4a1..a068265fbcaf 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -517,6 +517,7 @@ config SRCU config TASKS_RCU bool default n + depends on !UML select SRCU help This option enables a task-based RCU implementation that uses diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 3cee0d8393ed..d38ab08a3fe7 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -58,7 +58,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) -torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); @@ -96,12 +96,7 @@ static int rcu_perf_writer_state; #define MAX_MEAS 10000 #define MIN_MEAS 100 -#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) -#define RCUPERF_RUNNABLE_INIT 1 -#else -#define RCUPERF_RUNNABLE_INIT 0 -#endif -static int perf_runnable = RCUPERF_RUNNABLE_INIT; +static int perf_runnable = IS_ENABLED(MODULE); module_param(perf_runnable, int, 0444); MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); @@ -363,8 +358,6 @@ rcu_perf_writer(void *arg) u64 *wdpp = writer_durations[me]; VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); - WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); - WARN_ON(rcu_gp_is_normal() && gp_exp); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); sp.sched_priority = 1; @@ -631,12 +624,24 @@ rcu_perf_init(void) firsterr = -ENOMEM; goto unwind; } + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { + VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + firsterr = -EINVAL; + goto unwind; + } + if (rcu_gp_is_normal() && gp_exp) { + VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + firsterr = -EINVAL; + goto unwind; + } for (i = 0; i < nrealwriters; i++) { writer_durations[i] = kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), GFP_KERNEL); - if (!writer_durations[i]) + if (!writer_durations[i]) { + firsterr = -ENOMEM; goto unwind; + } firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, writer_tasks[i]); if (firsterr) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 084a28a732eb..971e2b138063 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -182,12 +182,7 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) -#define RCUTORTURE_RUNNABLE_INIT 1 -#else -#define RCUTORTURE_RUNNABLE_INIT 0 -#endif -static int torture_runnable = RCUTORTURE_RUNNABLE_INIT; +static int torture_runnable = IS_ENABLED(MODULE); module_param(torture_runnable, int, 0444); MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); @@ -1476,7 +1471,7 @@ static int rcu_torture_barrier_cbs(void *arg) break; /* * The above smp_load_acquire() ensures barrier_phase load - * is ordered before the folloiwng ->call(). + * is ordered before the following ->call(). */ local_irq_disable(); /* Just to test no-irq call_rcu(). */ cur_ops->call(&rcu, rcu_torture_barrier_cbf); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c7f1bc4f817c..f433959e9322 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -125,12 +125,14 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ +/* panic() on RCU Stall sysctl. */ +int sysctl_panic_on_rcu_stall __read_mostly; /* * The rcu_scheduler_active variable transitions from zero to one just * before the first task is spawned. So when this variable is zero, RCU * can assume that there is but one task, allowing RCU to (for example) - * optimize synchronize_sched() to a simple barrier(). When this variable + * optimize synchronize_rcu() to a simple barrier(). When this variable * is one, RCU must actually do all the hard work required to detect real * grace periods. This variable is also used to suppress boot-time false * positives from lockdep-RCU error checking. @@ -159,6 +161,7 @@ static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake); +static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ #ifdef CONFIG_RCU_KTHREAD_PRIO @@ -1284,9 +1287,9 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) - dump_cpu_task(rnp->grplo + cpu); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + dump_cpu_task(cpu); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -1311,6 +1314,12 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) } } +static inline void panic_on_rcu_stall(void) +{ + if (sysctl_panic_on_rcu_stall) + panic("RCU Stall\n"); +} + static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; @@ -1351,10 +1360,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) { - print_cpu_stall_info(rsp, - rnp->grplo + cpu); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + print_cpu_stall_info(rsp, cpu); ndetected++; } } @@ -1390,6 +1398,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rcu_check_gp_kthread_starvation(rsp); + panic_on_rcu_stall(); + force_quiescent_state(rsp); /* Kick them all. */ } @@ -1430,6 +1440,8 @@ static void print_cpu_stall(struct rcu_state *rsp) jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + panic_on_rcu_stall(); + /* * Attempt to revive the RCU machinery by forcing a context switch. * @@ -1989,8 +2001,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * of the tree within the rsp->node[] array. Note that other CPUs * will access only the leaves of the hierarchy, thus seeing that no * grace period is in progress, at least until the corresponding - * leaf node has been initialized. In addition, we have excluded - * CPU-hotplug operations. + * leaf node has been initialized. * * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. @@ -2872,7 +2883,6 @@ static void force_qs_rnp(struct rcu_state *rsp, unsigned long *maxj), bool *isidle, unsigned long *maxj) { - unsigned long bit; int cpu; unsigned long flags; unsigned long mask; @@ -2907,9 +2917,8 @@ static void force_qs_rnp(struct rcu_state *rsp, continue; } } - cpu = rnp->grplo; - bit = 1; - for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; @@ -3448,549 +3457,6 @@ static bool rcu_seq_done(unsigned long *sp, unsigned long s) return ULONG_CMP_GE(READ_ONCE(*sp), s); } -/* Wrapper functions for expedited grace periods. */ -static void rcu_exp_gp_seq_start(struct rcu_state *rsp) -{ - rcu_seq_start(&rsp->expedited_sequence); -} -static void rcu_exp_gp_seq_end(struct rcu_state *rsp) -{ - rcu_seq_end(&rsp->expedited_sequence); - smp_mb(); /* Ensure that consecutive grace periods serialize. */ -} -static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) -{ - unsigned long s; - - smp_mb(); /* Caller's modifications seen first by other CPUs. */ - s = rcu_seq_snap(&rsp->expedited_sequence); - trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - return s; -} -static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) -{ - return rcu_seq_done(&rsp->expedited_sequence, s); -} - -/* - * Reset the ->expmaskinit values in the rcu_node tree to reflect any - * recent CPU-online activity. Note that these masks are not cleared - * when CPUs go offline, so they reflect the union of all CPUs that have - * ever been online. This means that this function normally takes its - * no-work-to-do fastpath. - */ -static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) -{ - bool done; - unsigned long flags; - unsigned long mask; - unsigned long oldmask; - int ncpus = READ_ONCE(rsp->ncpus); - struct rcu_node *rnp; - struct rcu_node *rnp_up; - - /* If no new CPUs onlined since last time, nothing to do. */ - if (likely(ncpus == rsp->ncpus_snap)) - return; - rsp->ncpus_snap = ncpus; - - /* - * Each pass through the following loop propagates newly onlined - * CPUs for the current rcu_node structure up the rcu_node tree. - */ - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rnp->expmaskinit == rnp->expmaskinitnext) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - continue; /* No new CPUs, nothing to do. */ - } - - /* Update this node's mask, track old value for propagation. */ - oldmask = rnp->expmaskinit; - rnp->expmaskinit = rnp->expmaskinitnext; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - /* If was already nonzero, nothing to propagate. */ - if (oldmask) - continue; - - /* Propagate the new CPU up the tree. */ - mask = rnp->grpmask; - rnp_up = rnp->parent; - done = false; - while (rnp_up) { - raw_spin_lock_irqsave_rcu_node(rnp_up, flags); - if (rnp_up->expmaskinit) - done = true; - rnp_up->expmaskinit |= mask; - raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); - if (done) - break; - mask = rnp_up->grpmask; - rnp_up = rnp_up->parent; - } - } -} - -/* - * Reset the ->expmask values in the rcu_node tree in preparation for - * a new expedited grace period. - */ -static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) -{ - unsigned long flags; - struct rcu_node *rnp; - - sync_exp_reset_tree_hotplug(rsp); - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - WARN_ON_ONCE(rnp->expmask); - rnp->expmask = rnp->expmaskinit; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } -} - -/* - * Return non-zero if there is no RCU expedited grace period in progress - * for the specified rcu_node structure, in other words, if all CPUs and - * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the rcu_state's exp_mutex. - */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) -{ - return rnp->exp_tasks == NULL && - READ_ONCE(rnp->expmask) == 0; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. This event is reported either to the rcu_node structure on - * which the task was queued or to one of that rcu_node structure's ancestors, - * recursively up the tree. (Calm down, calm down, we do the recursion - * iteratively!) - * - * Caller must hold the rcu_state's exp_mutex and the specified rcu_node - * structure's ->lock. - */ -static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake, unsigned long flags) - __releases(rnp->lock) -{ - unsigned long mask; - - for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { - if (!rnp->expmask) - rcu_initiate_boost(rnp, flags); - else - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - break; - } - if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ - swake_up(&rsp->expedited_wq); - } - break; - } - mask = rnp->grpmask; - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ - rnp = rnp->parent; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ - WARN_ON_ONCE(!(rnp->expmask & mask)); - rnp->expmask &= ~mask; - } -} - -/* - * Report expedited quiescent state for specified node. This is a - * lock-acquisition wrapper function for __rcu_report_exp_rnp(). - * - * Caller must hold the rcu_state's exp_mutex. - */ -static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, bool wake) -{ - unsigned long flags; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - __rcu_report_exp_rnp(rsp, rnp, wake, flags); -} - -/* - * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the rcu_state's - * exp_mutex. - */ -static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, - unsigned long mask, bool wake) -{ - unsigned long flags; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (!(rnp->expmask & mask)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - rnp->expmask &= ~mask; - __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ -} - -/* - * Report expedited quiescent state for specified rcu_data (CPU). - */ -static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, - bool wake) -{ - rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); -} - -/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, - unsigned long s) -{ - if (rcu_exp_gp_seq_done(rsp, s)) { - trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); - /* Ensure test happens before caller kfree(). */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(stat); - return true; - } - return false; -} - -/* - * Funnel-lock acquisition for expedited grace periods. Returns true - * if some other task completed an expedited grace period that this task - * can piggy-back on, and with no mutex held. Otherwise, returns false - * with the mutex held, indicating that the caller must actually do the - * expedited grace period. - */ -static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) -{ - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - struct rcu_node *rnp = rdp->mynode; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - /* Low-contention fastpath. */ - if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && - (rnp == rnp_root || - ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && - !mutex_is_locked(&rsp->exp_mutex) && - mutex_trylock(&rsp->exp_mutex)) - goto fastpath; - - /* - * Each pass through the following loop works its way up - * the rcu_node tree, returning if others have done the work or - * otherwise falls through to acquire rsp->exp_mutex. The mapping - * from CPU to rcu_node structure can be inexact, as it is just - * promoting locality and is not strictly needed for correctness. - */ - for (; rnp != NULL; rnp = rnp->parent) { - if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) - return true; - - /* Work not done, either wait here or go up. */ - spin_lock(&rnp->exp_lock); - if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { - - /* Someone else doing GP, so wait for them. */ - spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, - rnp->grplo, rnp->grphi, - TPS("wait")); - wait_event(rnp->exp_wq[(s >> 1) & 0x3], - sync_exp_work_done(rsp, - &rdp->exp_workdone2, s)); - return true; - } - rnp->exp_seq_rq = s; /* Followers can wait on us. */ - spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, - rnp->grphi, TPS("nxtlvl")); - } - mutex_lock(&rsp->exp_mutex); -fastpath: - if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { - mutex_unlock(&rsp->exp_mutex); - return true; - } - rcu_exp_gp_seq_start(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); - return false; -} - -/* Invoked on each online non-idle CPU for expedited quiescent state. */ -static void sync_sched_exp_handler(void *data) -{ - struct rcu_data *rdp; - struct rcu_node *rnp; - struct rcu_state *rsp = data; - - rdp = this_cpu_ptr(rsp->rda); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - if (rcu_is_cpu_rrupt_from_idle()) { - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), true); - return; - } - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); - resched_cpu(smp_processor_id()); -} - -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ - struct rcu_data *rdp; - int ret; - struct rcu_node *rnp; - struct rcu_state *rsp = &rcu_sched_state; - - rdp = per_cpu_ptr(rsp->rda, cpu); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) - return; - ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); - WARN_ON_ONCE(ret); -} - -/* - * Select the nodes that the upcoming expedited grace period needs - * to wait for. - */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, - smp_call_func_t func) -{ - int cpu; - unsigned long flags; - unsigned long mask; - unsigned long mask_ofl_test; - unsigned long mask_ofl_ipi; - int ret; - struct rcu_node *rnp; - - sync_exp_reset_tree(rsp); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - - /* Each pass checks a CPU for identity, offline, and idle. */ - mask_ofl_test = 0; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - mask_ofl_test |= rdp->grpmask; - } - mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited - * GP until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - /* IPI the remaining CPUs for expedited quiescent state. */ - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - if (!(mask_ofl_ipi & mask)) - continue; -retry_ipi: - ret = smp_call_function_single(cpu, func, rsp, 0); - if (!ret) { - mask_ofl_ipi &= ~mask; - continue; - } - /* Failed, raced with offline. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (cpu_online(cpu) && - (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - } - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } - /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); - } -} - -static void synchronize_sched_expedited_wait(struct rcu_state *rsp) -{ - int cpu; - unsigned long jiffies_stall; - unsigned long jiffies_start; - unsigned long mask; - int ndetected; - struct rcu_node *rnp; - struct rcu_node *rnp_root = rcu_get_root(rsp); - int ret; - - jiffies_stall = rcu_jiffies_till_stall_check(); - jiffies_start = jiffies; - - for (;;) { - ret = swait_event_timeout( - rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root), - jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) - return; - if (ret < 0) { - /* Hit a signal, disable CPU stall warnings. */ - swait_event(rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root)); - return; - } - pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", - rsp->name); - ndetected = 0; - rcu_for_each_leaf_node(rsp, rnp) { - ndetected += rcu_print_task_exp_stall(rnp); - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - struct rcu_data *rdp; - - if (!(rnp->expmask & mask)) - continue; - ndetected++; - rdp = per_cpu_ptr(rsp->rda, cpu); - pr_cont(" %d-%c%c%c", cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); - } - mask <<= 1; - } - pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - jiffies - jiffies_start, rsp->expedited_sequence, - rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (ndetected) { - pr_err("blocking rcu_node structures:"); - rcu_for_each_node_breadth_first(rsp, rnp) { - if (rnp == rnp_root) - continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done(rnp)) - continue; - |
