From e2c73a6860bdf54f2c6bf8cddc34ddc91a1343e1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Sep 2021 14:18:51 -0700 Subject: rcu: Remove the RCU_FAST_NO_HZ Kconfig option All of the uses of CONFIG_RCU_FAST_NO_HZ=y that I have seen involve systems with RCU callbacks offloaded. In this situation, all that this Kconfig option does is slow down idle entry/exit with an additional always-taken early exit. If this is the only use case, then this Kconfig option is nothing but an attractive nuisance that needs to go away. This commit therefore removes the RCU_FAST_NO_HZ Kconfig option. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 18 ----- kernel/rcu/tree.c | 11 --- kernel/rcu/tree.h | 7 -- kernel/rcu/tree_plugin.h | 185 +---------------------------------------------- kernel/rcu/tree_stall.h | 27 +------ 5 files changed, 4 insertions(+), 244 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 3128b7cf8e1f..4005f2728f1a 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -169,24 +169,6 @@ config RCU_FANOUT_LEAF Take the default if unsure. -config RCU_FAST_NO_HZ - bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on NO_HZ_COMMON && SMP && RCU_EXPERT - default n - help - This option permits CPUs to enter dynticks-idle state even if - they have RCU callbacks queued, and prevents RCU from waking - these CPUs up more than roughly once every four jiffies (by - default, you can adjust this using the rcutree.rcu_idle_gp_delay - parameter), thus improving energy efficiency. On the other - hand, this option increases the duration of RCU grace periods, - for example, slowing down synchronize_rcu(). - - Say Y if energy efficiency is critically important, and you - don't care about increased grace-period durations. - - Say N if you are unsure. - config RCU_BOOST bool "Enable RCU priority boosting" depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ef8d36f580fc..5e98257c2910 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -624,7 +624,6 @@ static noinstr void rcu_eqs_enter(bool user) instrumentation_begin(); trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); // instrumentation for the noinstr rcu_dynticks_eqs_enter() @@ -768,9 +767,6 @@ noinstr void rcu_nmi_exit(void) trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ - if (!in_nmi()) - rcu_prepare_for_idle(); - // instrumentation for the noinstr rcu_dynticks_eqs_enter() instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); instrumentation_end(); @@ -872,7 +868,6 @@ static void noinstr rcu_eqs_exit(bool user) // instrumentation for the noinstr rcu_dynticks_eqs_exit() instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); - rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); @@ -1014,12 +1009,6 @@ noinstr void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); // ... but is watching here. 
- if (!in_nmi()) { - instrumentation_begin(); - rcu_cleanup_after_idle(); - instrumentation_end(); - } - instrumentation_begin(); // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks)); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 305cf6aeb408..7ca1aa46083c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -189,11 +189,6 @@ struct rcu_data { bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */ -#ifdef CONFIG_RCU_FAST_NO_HZ - unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ - unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ - int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ -#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ /* 4) rcu_barrier(), OOM callbacks, and expediting. */ struct rcu_head barrier_head; @@ -419,8 +414,6 @@ static bool rcu_is_callbacks_kthread(void); static void rcu_cpu_kthread_setup(unsigned int cpu); static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); static void __init rcu_spawn_boost_kthreads(void); -static void rcu_cleanup_after_idle(void); -static void rcu_prepare_for_idle(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5199559fbbf0..19f7d578cedb 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -51,8 +51,6 @@ static void __init rcu_bootup_announce_oddness(void) RCU_FANOUT); if (rcu_fanout_exact) pr_info("\tHierarchical RCU autobalancing is disabled.\n"); - if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) - pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) @@ -1253,16 +1251,14 @@ static void __init rcu_spawn_boost_kthreads(void) #endif /* #else #ifdef CONFIG_RCU_BOOST */ -#if !defined(CONFIG_RCU_FAST_NO_HZ) - /* * Check to see if any future non-offloaded RCU-related work will need * to be done by the current CPU, even if none need be done immediately, * returning 1 if so. This function is part of the RCU implementation; * it is -not- an exported member of the RCU API. * - * Because we not have RCU_FAST_NO_HZ, just check whether or not this - * CPU has RCU callbacks queued. + * Just check whether or not this CPU has non-offloaded RCU callbacks + * queued. */ int rcu_needs_cpu(u64 basemono, u64 *nextevt) { @@ -1271,183 +1267,6 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); } -/* - * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up - * after it. - */ -static void rcu_cleanup_after_idle(void) -{ -} - -/* - * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, - * is nothing. - */ -static void rcu_prepare_for_idle(void) -{ -} - -#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ - -/* - * This code is invoked when a CPU goes idle, at which point we want - * to have the CPU do everything required for RCU so that it can enter - * the energy-efficient dyntick-idle mode. 
- * - * The following preprocessor symbol controls this: - * - * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted - * to sleep in dyntick-idle mode with RCU callbacks pending. This - * is sized to be roughly one RCU grace period. Those energy-efficiency - * benchmarkers who might otherwise be tempted to set this to a large - * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your - * system. And if you are -that- concerned about energy efficiency, - * just power the system down and be done with it! - * - * The value below works well in practice. If future workloads require - * adjustment, they can be converted into kernel config parameters, though - * making the state machine smarter might be a better option. - */ -#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ - -static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; -module_param(rcu_idle_gp_delay, int, 0644); - -/* - * Try to advance callbacks on the current CPU, but only if it has been - * awhile since the last time we did so. Afterwards, if there are any - * callbacks ready for immediate invocation, return true. - */ -static bool __maybe_unused rcu_try_advance_all_cbs(void) -{ - bool cbs_ready = false; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - struct rcu_node *rnp; - - /* Exit early if we advanced recently. */ - if (jiffies == rdp->last_advance_all) - return false; - rdp->last_advance_all = jiffies; - - rnp = rdp->mynode; - - /* - * Don't bother checking unless a grace period has - * completed since we last checked and there are - * callbacks not yet ready to invoke. - */ - if ((rcu_seq_completed_gp(rdp->gp_seq, - rcu_seq_current(&rnp->gp_seq)) || - unlikely(READ_ONCE(rdp->gpwrap))) && - rcu_segcblist_pend_cbs(&rdp->cblist)) - note_gp_changes(rdp); - - if (rcu_segcblist_ready_cbs(&rdp->cblist)) - cbs_ready = true; - return cbs_ready; -} - -/* - * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready - * to invoke. If the CPU has callbacks, try to advance them. Tell the - * caller about what to set the timeout. - * - * The caller must have disabled interrupts. - */ -int rcu_needs_cpu(u64 basemono, u64 *nextevt) -{ - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - unsigned long dj; - - lockdep_assert_irqs_disabled(); - - /* If no non-offloaded callbacks, RCU doesn't need the CPU. */ - if (rcu_segcblist_empty(&rdp->cblist) || - rcu_rdp_is_offloaded(rdp)) { - *nextevt = KTIME_MAX; - return 0; - } - - /* Attempt to advance callbacks. */ - if (rcu_try_advance_all_cbs()) { - /* Some ready to invoke, so initiate later invocation. */ - invoke_rcu_core(); - return 1; - } - rdp->last_accelerate = jiffies; - - /* Request timer and round. */ - dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; - - *nextevt = basemono + dj * TICK_NSEC; - return 0; -} - -/* - * Prepare a CPU for idle from an RCU perspective. The first major task is to - * sense whether nohz mode has been enabled or disabled via sysfs. The second - * major task is to accelerate (that is, assign grace-period numbers to) any - * recently arrived callbacks. - * - * The caller must have disabled interrupts. - */ -static void rcu_prepare_for_idle(void) -{ - bool needwake; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - struct rcu_node *rnp; - int tne; - - lockdep_assert_irqs_disabled(); - if (rcu_rdp_is_offloaded(rdp)) - return; - - /* Handle nohz enablement switches conservatively. 
*/ - tne = READ_ONCE(tick_nohz_active); - if (tne != rdp->tick_nohz_enabled_snap) { - if (!rcu_segcblist_empty(&rdp->cblist)) - invoke_rcu_core(); /* force nohz to see update. */ - rdp->tick_nohz_enabled_snap = tne; - return; - } - if (!tne) - return; - - /* - * If we have not yet accelerated this jiffy, accelerate all - * callbacks on this CPU. - */ - if (rdp->last_accelerate == jiffies) - return; - rdp->last_accelerate = jiffies; - if (rcu_segcblist_pend_cbs(&rdp->cblist)) { - rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - needwake = rcu_accelerate_cbs(rnp, rdp); - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ - if (needwake) - rcu_gp_kthread_wake(); - } -} - -/* - * Clean up for exit from idle. Attempt to advance callbacks based on - * any grace periods that elapsed while the CPU was idle, and if any - * callbacks are now ready to invoke, initiate invocation. - */ -static void rcu_cleanup_after_idle(void) -{ - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - lockdep_assert_irqs_disabled(); - if (rcu_rdp_is_offloaded(rdp)) - return; - if (rcu_try_advance_all_cbs()) - invoke_rcu_core(); -} - -#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 5e2fa6fd97f1..21bebf7c9030 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -347,26 +347,6 @@ static void rcu_dump_cpu_stacks(void) } } -#ifdef CONFIG_RCU_FAST_NO_HZ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - - sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d", - rdp->last_accelerate & 0xffff, jiffies & 0xffff, - !!rdp->tick_nohz_enabled_snap); -} - -#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - *cp = '\0'; -} - -#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ - static const char * const gp_state_names[] = { [RCU_GP_IDLE] = "RCU_GP_IDLE", [RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS", @@ -408,13 +388,12 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp) * of RCU grace periods that this CPU is ignorant of, for example, "1" * if the CPU was aware of the previous grace period. * - * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. + * Also print out idle info. */ static void print_cpu_stall_info(int cpu) { unsigned long delta; bool falsepositive; - char fast_no_hz[72]; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); char *ticks_title; unsigned long ticks_value; @@ -432,11 +411,10 @@ static void print_cpu_stall_info(int cpu) ticks_title = "ticks this GP"; ticks_value = rdp->ticks_this_gp; } - print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n", + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -449,7 +427,6 @@ static void print_cpu_stall_info(int cpu) rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, - fast_no_hz, falsepositive ? 
" (false positive?)" : ""); } -- cgit v1.2.3 From bc849e9192c75833a85f2e9376a265ab31f8eec7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Sep 2021 14:30:20 -0700 Subject: rcu: Move rcu_needs_cpu() to tree.c Now that RCU_FAST_NO_HZ is no more, there is but one implementation of the rcu_needs_cpu() function. This commit therefore moves this function from kernel/rcu/tree_plugin.c to kernel/rcu/tree.c. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 ++++++++++++++++++ kernel/rcu/tree_plugin.h | 16 ---------------- 2 files changed, 18 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5e98257c2910..4ac019e9b25f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1075,6 +1075,24 @@ void rcu_irq_enter_irqson(void) local_irq_restore(flags); } +/* + * Check to see if any future non-offloaded RCU-related work will need + * to be done by the current CPU, even if none need be done immediately, + * returning 1 if so. This function is part of the RCU implementation; + * it is -not- an exported member of the RCU API. This is used by + * the idle-entry code to figure out whether it is safe to disable the + * scheduler-clock interrupt. + * + * Just check whether or not this CPU has non-offloaded RCU callbacks + * queued. + */ +int rcu_needs_cpu(u64 basemono, u64 *nextevt) +{ + *nextevt = KTIME_MAX; + return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && + !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); +} + /* * If any sort of urgency was applied to the current CPU (for example, * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 19f7d578cedb..0575757a0f8f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1251,22 +1251,6 @@ static void __init rcu_spawn_boost_kthreads(void) #endif /* #else #ifdef CONFIG_RCU_BOOST */ -/* - * Check to see if any future non-offloaded RCU-related work will need - * to be done by the current CPU, even if none need be done immediately, - * returning 1 if so. This function is part of the RCU implementation; - * it is -not- an exported member of the RCU API. - * - * Just check whether or not this CPU has non-offloaded RCU callbacks - * queued. - */ -int rcu_needs_cpu(u64 basemono, u64 *nextevt) -{ - *nextevt = KTIME_MAX; - return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && - !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); -} - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? -- cgit v1.2.3 From 2407a64f8045552203ee5cb9904ce75ce2fceef4 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 28 Sep 2021 08:21:28 +0800 Subject: rcu: in_irq() cleanup This commit replaces the obsolete and ambiguous macro in_irq() with its shiny new in_hardirq() equivalent. Signed-off-by: Changbin Du Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ef8d36f580fc..f0f19dc7f19e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1467,7 +1467,7 @@ static void rcu_gp_kthread_wake(void) { struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); - if ((current == t && !in_irq() && !in_serving_softirq()) || + if ((current == t && !in_hardirq() && !in_serving_softirq()) || !READ_ONCE(rcu_state.gp_flags) || !t) return; WRITE_ONCE(rcu_state.gp_wake_time, jiffies); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5199559fbbf0..599084c4c21f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -642,7 +642,7 @@ static void rcu_read_unlock_special(struct task_struct *t) (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node); // Need to defer quiescent state until everything is enabled. - if (use_softirq && (in_irq() || (expboost && !irqs_were_disabled))) { + if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) { // Using softirq, safe to awaken, and either the // wakeup is free or there is either an expedited // GP in flight or a potential need to deboost. -- cgit v1.2.3 From 17ea3718824912e773b0fd78579694b2e75ee597 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Sun, 24 Oct 2021 08:36:34 +0800 Subject: rcu: Improve tree_plugin.h comments and add code cleanups This commit cleans up some comments and code in kernel/rcu/tree_plugin.h. Signed-off-by: Zhouyi Zhou Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 599084c4c21f..f44580276fbd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -16,7 +16,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) { /* - * In order to read the offloaded state of an rdp is a safe + * In order to read the offloaded state of an rdp in a safe * and stable way and prevent from its value to be changed * under us, we must either hold the barrier mutex, the cpu * hotplug lock (read or write) or the nocb lock. 
Local @@ -56,7 +56,7 @@ static void __init rcu_bootup_announce_oddness(void) if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) - pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n"); + pr_info("\tRCU strict (and thus non-scalable) grace periods are enabled.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) @@ -88,13 +88,13 @@ static void __init rcu_bootup_announce_oddness(void) if (rcu_kick_kthreads) pr_info("\tKick kthreads if too-long grace period.\n"); if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) - pr_info("\tRCU callback double-/use-after-free debug enabled.\n"); + pr_info("\tRCU callback double-/use-after-free debug is enabled.\n"); if (gp_preinit_delay) pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay); if (gp_init_delay) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); if (gp_cleanup_delay) - pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); + pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay); if (!use_softirq) pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) @@ -1153,7 +1153,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) /* * Create an RCU-boost kthread for the specified node if one does not * already exist. We only create this kthread for preemptible RCU. - * Returns zero if all is well, a negated errno otherwise. */ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { @@ -1455,7 +1454,7 @@ static void rcu_cleanup_after_idle(void) * CPU unless the grace period has extended for too long. * * This code relies on the fact that all NO_HZ_FULL CPUs are also - * CONFIG_RCU_NOCB_CPU CPUs. + * RCU_NOCB_CPU CPUs. */ static bool rcu_nohz_full_cpu(void) { -- cgit v1.2.3 From c2cf0767e98eb4487444e5c7ebba491a866811ce Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 15 Nov 2021 13:15:46 +0800 Subject: rcu: Avoid running boost kthreads on isolated CPUs When the boost kthreads are created on systems with nohz_full CPUs, the cpus_allowed_ptr is set to housekeeping_cpumask(HK_FLAG_KTHREAD). However, when rcu_boost_kthread_setaffinity() is called, the original affinity will be changed and these kthreads can subsequently run on nohz_full CPUs. This commit makes rcu_boost_kthread_setaffinity() restrict these boost kthreads to housekeeping CPUs. Signed-off-by: Zqiang Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f44580276fbd..87fc4609b756 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1203,8 +1203,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) if ((mask & leaf_node_cpu_bit(rnp, cpu)) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); + cpumask_and(cm, cm, housekeeping_cpumask(HK_FLAG_RCU)); if (cpumask_weight(cm) == 0) - cpumask_setall(cm); + cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU)); set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } -- cgit v1.2.3 From 300c0c5e721834f484b03fa3062602dd8ff48413 Mon Sep 17 00:00:00 2001 From: Jun Miao Date: Tue, 16 Nov 2021 07:23:02 +0800 Subject: rcu: Avoid alloc_pages() when recording stack The default kasan_record_aux_stack() calls stack_depot_save() with GFP_NOWAIT, which in turn can then call alloc_pages(GFP_NOWAIT, ...). In general, however, it is not even possible to use either GFP_ATOMIC or GFP_NOWAIT in certain non-preemptive contexts/RT kernel including raw_spin_locks (see gfp.h and ab00db216c9c7). Fix it by instructing stackdepot to not expand stack storage via alloc_pages() in case it runs out by using kasan_record_aux_stack_noalloc(). Jianwei Hu reported: BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:969 in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 15319, name: python3 INFO: lockdep is turned off. irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0xaf3/0x2590 softirqs last enabled at (0): [] copy_process+0xaf3/0x2590 softirqs last disabled at (0): [<0000000000000000>] 0x0 CPU: 6 PID: 15319 Comm: python3 Tainted: G W O 5.15-rc7-preempt-rt #1 Hardware name: Supermicro SYS-E300-9A-8C/A2SDi-8C-HLN4F, BIOS 1.1b 12/17/2018 Call Trace: show_stack+0x52/0x58 dump_stack+0xa1/0xd6 ___might_sleep.cold+0x11c/0x12d rt_spin_lock+0x3f/0xc0 rmqueue+0x100/0x1460 rmqueue+0x100/0x1460 mark_usage+0x1a0/0x1a0 ftrace_graph_ret_addr+0x2a/0xb0 rmqueue_pcplist.constprop.0+0x6a0/0x6a0 __kasan_check_read+0x11/0x20 __zone_watermark_ok+0x114/0x270 get_page_from_freelist+0x148/0x630 is_module_text_address+0x32/0xa0 __alloc_pages_nodemask+0x2f6/0x790 __alloc_pages_slowpath.constprop.0+0x12d0/0x12d0 create_prof_cpu_mask+0x30/0x30 alloc_pages_current+0xb1/0x150 stack_depot_save+0x39f/0x490 kasan_save_stack+0x42/0x50 kasan_save_stack+0x23/0x50 kasan_record_aux_stack+0xa9/0xc0 __call_rcu+0xff/0x9c0 call_rcu+0xe/0x10 put_object+0x53/0x70 __delete_object+0x7b/0x90 kmemleak_free+0x46/0x70 slab_free_freelist_hook+0xb4/0x160 kfree+0xe5/0x420 kfree_const+0x17/0x30 kobject_cleanup+0xaa/0x230 kobject_put+0x76/0x90 netdev_queue_update_kobjects+0x17d/0x1f0 ... ... ksys_write+0xd9/0x180 __x64_sys_write+0x42/0x50 do_syscall_64+0x38/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Links: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/include/linux/kasan.h?id=7cb3007ce2da27ec02a1a3211941e7fe6875b642 Fixes: 84109ab58590 ("rcu: Record kvfree_call_rcu() call stack for KASAN") Fixes: 26e760c9a7c8 ("rcu: kasan: record and print call_rcu() call stack") Reported-by: Jianwei Hu Reviewed-by: Uladzislau Rezki (Sony) Acked-by: Marco Elver Tested-by: Juri Lelli Signed-off-by: Jun Miao Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f0f19dc7f19e..c8beea8dd3a0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2982,7 +2982,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) head->func = func; head->next = NULL; local_irq_save(flags); - kasan_record_aux_stack(head); + kasan_record_aux_stack_noalloc(head); rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ @@ -3547,7 +3547,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) return; } - kasan_record_aux_stack(ptr); + kasan_record_aux_stack_noalloc(ptr); success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); if (!success) { run_page_cache_worker(krcp); -- cgit v1.2.3 From 1f8da406a964dcc01121385a54fafd28e1c6a796 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 23 Sep 2021 10:07:14 -0700 Subject: srcu: Prevent redundant __srcu_read_unlock() wakeup Tiny SRCU readers can appear at task level, but also in interrupt and softirq handlers. Because Tiny SRCU is selected only in kernels built with CONFIG_SMP=n and CONFIG_PREEMPTION=n, it is not possible for a grace period to start while there is a non-task-level SRCU reader executing. This means that it does not make sense for __srcu_read_unlock() to awaken the Tiny SRCU grace period, because that can only happen when the grace period is waiting for one value of ->srcu_idx and __srcu_read_unlock() is ending the last reader for some other value of ->srcu_idx. After all, any such wakeup will be redundant. Worse yet, in some cases, such wakeups generate lockdep splats: ====================================================== WARNING: possible circular locking dependency detected 5.15.0-rc1+ #3758 Not tainted ------------------------------------------------------ rcu_torture_rea/53 is trying to acquire lock: ffffffff9514e6a8 (srcu_ctl.srcu_wq.lock){..-.}-{2:2}, at: xa/0x30 but task is already holding lock: ffff95c642479d80 (&p->pi_lock){-.-.}-{2:2}, at: _extend+0x370/0x400 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (&p->pi_lock){-.-.}-{2:2}: _raw_spin_lock_irqsave+0x2f/0x50 try_to_wake_up+0x50/0x580 swake_up_locked.part.7+0xe/0x30 swake_up_one+0x22/0x30 rcutorture_one_extend+0x1b6/0x400 rcu_torture_one_read+0x290/0x5d0 rcu_torture_timer+0x1a/0x70 call_timer_fn+0xa6/0x230 run_timer_softirq+0x493/0x4c0 __do_softirq+0xc0/0x371 irq_exit+0x73/0x90 sysvec_apic_timer_interrupt+0x63/0x80 asm_sysvec_apic_timer_interrupt+0x12/0x20 default_idle+0xb/0x10 default_idle_call+0x5e/0x170 do_idle+0x18a/0x1f0 cpu_startup_entry+0xa/0x10 start_kernel+0x678/0x69f secondary_startup_64_no_verify+0xc2/0xcb -> #0 (srcu_ctl.srcu_wq.lock){..-.}-{2:2}: __lock_acquire+0x130c/0x2440 lock_acquire+0xc2/0x270 _raw_spin_lock_irqsave+0x2f/0x50 swake_up_one+0xa/0x30 rcutorture_one_extend+0x387/0x400 rcu_torture_one_read+0x290/0x5d0 rcu_torture_reader+0xac/0x200 kthread+0x12d/0x150 ret_from_fork+0x22/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&p->pi_lock); lock(srcu_ctl.srcu_wq.lock); lock(&p->pi_lock); lock(srcu_ctl.srcu_wq.lock); *** DEADLOCK *** 1 lock held by rcu_torture_rea/53: #0: ffff95c642479d80 (&p->pi_lock){-.-.}-{2:2}, at: _extend+0x370/0x400 stack backtrace: CPU: 0 PID: 53 Comm: rcu_torture_rea Not tainted 5.15.0-rc1+ Hardware name: Red Hat KVM/RHEL-AV, BIOS e_el8.5.0+746+bbd5d70c 04/01/2014 Call Trace: check_noncircular+0xfe/0x110 ? find_held_lock+0x2d/0x90 __lock_acquire+0x130c/0x2440 lock_acquire+0xc2/0x270 ? swake_up_one+0xa/0x30 ? find_held_lock+0x72/0x90 _raw_spin_lock_irqsave+0x2f/0x50 ? swake_up_one+0xa/0x30 swake_up_one+0xa/0x30 rcutorture_one_extend+0x387/0x400 rcu_torture_one_read+0x290/0x5d0 rcu_torture_reader+0xac/0x200 ? rcutorture_oom_notify+0xf0/0xf0 ? __kthread_parkme+0x61/0x90 ? rcu_torture_one_read+0x5d0/0x5d0 kthread+0x12d/0x150 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 This is a false positive because there is only one CPU, and both locks are raw (non-preemptible) spinlocks. However, it is worthwhile getting rid of the redundant wakeup, which has the side effect of breaking the theoretical deadlock cycle. This commit therefore eliminates the redundant wakeups. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutiny.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index a0ba2ed49bc6..92c002d65482 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -99,7 +99,7 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); - if (!newval && READ_ONCE(ssp->srcu_gp_waiting)) + if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task()) swake_up_one(&ssp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); -- cgit v1.2.3 From f5dbc594b5bac1fa694174032b8d3d0249945fd3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 18 Sep 2021 20:40:48 -0700 Subject: rcu-tasks: Don't remove tasks with pending IPIs from holdout list Currently, the check_all_holdout_tasks_trace() function removes all tasks marked with ->trc_reader_checked from the holdout list, including those with IPIs pending. This means that the IPI handler might arrive at a task that has already been removed from the list, which is at best an accident waiting to happen. This commit therefore avoids removing tasks with IPIs pending from the holdout list. 
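For illustration, the underlying rule is the usual one for asynchronous agents: never release an object that an in-flight agent might still reference. Below is a minimal standalone sketch of the resulting check, using hypothetical names and C11 atomics in place of the kernel's actual task fields and primitives:

	#include <stdatomic.h>
	#include <stdbool.h>

	struct holdout_entry {
		atomic_int ipi_to_cpu;       /* -1 once the IPI handler has finished. */
		atomic_bool reader_checked;  /* Set once the reader check succeeds. */
	};

	/*
	 * Remove the entry only when BOTH conditions hold: the pending
	 * IPI has completed (the acquire load pairs with the handler's
	 * release store) and the reader check has succeeded.  Acting on
	 * reader_checked alone would let a still-pending IPI handler
	 * reference an entry that has already been removed.
	 */
	static bool may_remove_from_holdout(struct holdout_entry *t)
	{
		return atomic_load_explicit(&t->ipi_to_cpu,
					    memory_order_acquire) == -1 &&
		       atomic_load_explicit(&t->reader_checked,
					    memory_order_relaxed);
	}
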
This in turn means that the "if" condition in the for_each_online_cpu() loop in rcu_tasks_trace_postgp() should always evaluate to false, so a WARN_ON_ONCE() is added to check that. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 7da3c81c3f59..bd44cd4794d3 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1121,7 +1121,8 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, trc_wait_for_one_reader(t, hop); // If check succeeded, remove this task from the list. - if (READ_ONCE(t->trc_reader_checked)) + if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 && + READ_ONCE(t->trc_reader_checked)) trc_del_holdout(t); else if (needreport) show_stalled_task_trace(t, firstreport); @@ -1156,7 +1157,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) // Yes, this assumes that CPUs process IPIs in order. If that ever // changes, there will need to be a recheck and/or timed wait. for_each_online_cpu(cpu) - if (smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))) + if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu)))) smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1); // Remove the safety count. -- cgit v1.2.3 From 902d82e6299631ec6b6c759ac4b45e44b45cc832 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 Sep 2021 20:31:44 -0700 Subject: rcutorture: Sanitize RCUTORTURE_RDR_MASK RCUTORTURE_RDR_MASK is currently not the bit indicated by RCUTORTURE_RDR_SHIFT, but is instead all the bits less significant than that one. This is an accident waiting to happen, so this commit makes RCUTORTURE_RDR_MASK be that one bit and adjusts uses accordingly. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8b410d982990..1e03fe681f02 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -54,7 +54,7 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) +#define RCUTORTURE_RDR_MASK (1 << RCUTORTURE_RDR_SHIFT) #define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ #define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ #define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ @@ -1466,6 +1466,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (lockit) raw_spin_lock_irqsave(¤t->pi_lock, flags); cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); + idxold = 0; if (lockit) raw_spin_unlock_irqrestore(¤t->pi_lock, flags); } @@ -1476,7 +1477,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, /* Update the reader state. 
*/ if (idxnew == -1) - idxnew = idxold & ~RCUTORTURE_RDR_MASK; + idxnew = idxold & RCUTORTURE_RDR_MASK; WARN_ON_ONCE(idxnew < 0); WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1); *readstate = idxnew | newstate; @@ -1626,7 +1627,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) rcu_torture_writer_state, cookie, cur_ops->get_gp_state()); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); - WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); + WARN_ON_ONCE(readstate); // This next splat is expected behavior if leakpointer, especially // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1); -- cgit v1.2.3 From 1c3d53986f744c469f0e0b136b8922c83363bfde Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 Sep 2021 20:49:12 -0700 Subject: rcutorture: More thoroughly test nested readers Currently, nested readers occur only when a timer handler interrupts a reader. This is rare, and is thus insufficient testing of the transition between nesting levels. This commit therefore causes rcutorture nested readers to be the rule rather than the exception. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 73 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1e03fe681f02..d295da380fb4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -53,15 +53,18 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); /* Bits for ->extendables field, extendables param, and related definitions. */ -#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK (1 << RCUTORTURE_RDR_SHIFT) +#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2) #define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ #define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ #define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ #define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */ #define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */ -#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */ +#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */ +#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */ +#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */ #define RCUTORTURE_MAX_EXTEND \ (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \ RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) @@ -1420,13 +1423,15 @@ static void rcutorture_one_extend(int *readstate, int newstate, struct rt_read_seg *rtrsp) { unsigned long flags; - int idxnew = -1; - int idxold = *readstate; + int idxnew1 = -1; + int idxnew2 = -1; + int idxold1 = *readstate; + int idxold2 = idxold1; int statesnew = ~*readstate & newstate; int statesold = *readstate & ~newstate; - WARN_ON_ONCE(idxold < 0); - WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1); + WARN_ON_ONCE(idxold2 < 0); + WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. 
*/ @@ -1440,8 +1445,10 @@ static void rcutorture_one_extend(int *readstate, int newstate, preempt_disable(); if (statesnew & RCUTORTURE_RDR_SCHED) rcu_read_lock_sched(); - if (statesnew & RCUTORTURE_RDR_RCU) - idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT; + if (statesnew & RCUTORTURE_RDR_RCU_1) + idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1; + if (statesnew & RCUTORTURE_RDR_RCU_2) + idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2; /* * Next, remove old protection, in decreasing order of strength @@ -1460,13 +1467,19 @@ static void rcutorture_one_extend(int *readstate, int newstate, local_bh_enable(); if (statesold & RCUTORTURE_RDR_RBH) rcu_read_unlock_bh(); - if (statesold & RCUTORTURE_RDR_RCU) { + if (statesold & RCUTORTURE_RDR_RCU_2) { + cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1); + WARN_ON_ONCE(idxnew2 != -1); + idxold2 = 0; + } + if (statesold & RCUTORTURE_RDR_RCU_1) { bool lockit = !statesnew && !(torture_random(trsp) & 0xffff); if (lockit) raw_spin_lock_irqsave(¤t->pi_lock, flags); - cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); - idxold = 0; + cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1); + WARN_ON_ONCE(idxnew1 != -1); + idxold1 = 0; if (lockit) raw_spin_unlock_irqrestore(¤t->pi_lock, flags); } @@ -1476,13 +1489,19 @@ static void rcutorture_one_extend(int *readstate, int newstate, cur_ops->read_delay(trsp, rtrsp); /* Update the reader state. */ - if (idxnew == -1) - idxnew = idxold & RCUTORTURE_RDR_MASK; - WARN_ON_ONCE(idxnew < 0); - WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1); - *readstate = idxnew | newstate; - WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0); - WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1); + if (idxnew1 == -1) + idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1; + WARN_ON_ONCE(idxnew1 < 0); + if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1)) + pr_info("Unexpected idxnew1 value of %#x\n", idxnew1); + if (idxnew2 == -1) + idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2; + WARN_ON_ONCE(idxnew2 < 0); + WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1); + *readstate = idxnew1 | idxnew2 | newstate; + WARN_ON_ONCE(*readstate < 0); + if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1)) + pr_info("Unexpected idxnew2 value of %#x\n", idxnew2); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -1492,7 +1511,7 @@ static int rcutorture_extend_mask_max(void) WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND); mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables; - mask = mask | RCUTORTURE_RDR_RCU; + mask = mask | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; return mask; } @@ -1507,13 +1526,21 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ; unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; - WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); + WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; else mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS)); + // Can't have nested RCU reader without outer RCU reader. + if (!(mask & RCUTORTURE_RDR_RCU_1) && (mask & RCUTORTURE_RDR_RCU_2)) { + if (oldmask & RCUTORTURE_RDR_RCU_1) + mask &= ~RCUTORTURE_RDR_RCU_2; + else + mask |= RCUTORTURE_RDR_RCU_1; + } + /* * Can't enable bh w/irq disabled. 
*/ @@ -1533,7 +1560,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) mask |= oldmask & bhs; } - return mask ?: RCUTORTURE_RDR_RCU; + return mask ?: RCUTORTURE_RDR_RCU_1; } /* -- cgit v1.2.3 From 340170fef01b18631128006de03522b671e6e2b5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Sep 2021 21:30:26 -0700 Subject: rcutorture: Suppress pi-lock-across read-unlock testing for Tiny SRCU Because Tiny srcu_read_unlock() directly calls swake_up_one(), lockdep complains when a pi lock is held across that srcu_read_unlock(). Although this is a lockdep false positive (there is no other CPU to complete the deadlock cycle), lockdep is what it is at the moment. This commit therefore prevents rcutorture from holding pi lock across a Tiny srcu_read_unlock(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d295da380fb4..503e14e62e8f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -349,6 +349,7 @@ struct rcu_torture_ops { int can_boost; int extendables; int slow_gps; + int no_pi_lock; const char *name; }; @@ -670,6 +671,7 @@ static struct rcu_torture_ops srcu_ops = { .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcu" }; @@ -703,6 +705,7 @@ static struct rcu_torture_ops srcud_ops = { .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcud" }; @@ -723,6 +726,7 @@ static struct rcu_torture_ops busted_srcud_ops = { .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted_srcud" }; @@ -1473,8 +1477,9 @@ static void rcutorture_one_extend(int *readstate, int newstate, idxold2 = 0; } if (statesold & RCUTORTURE_RDR_RCU_1) { - bool lockit = !statesnew && !(torture_random(trsp) & 0xffff); + bool lockit; + lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff); if (lockit) raw_spin_lock_irqsave(¤t->pi_lock, flags); cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1); -- cgit v1.2.3 From c30c876312f66b0422a1d632b8c45b6416deeec0 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Mon, 25 Oct 2021 11:26:56 +0800 Subject: refscale: Simplify the errexit checkpoint There is only the one OOM error case in main_func(), so this commit eliminates the errexit local variable in favor of a branch to cleanup code. Signed-off-by: Li Zhijian Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 1631ef8a138d..b59457cef88d 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -637,7 +637,6 @@ static u64 process_durations(int n) // point all the timestamps are printed. 
static int main_func(void *arg) { - bool errexit = false; int exp, r; char buf1[64]; char *buf; @@ -651,7 +650,7 @@ static int main_func(void *arg) buf = kzalloc(64 + nruns * 32, GFP_KERNEL); if (!result_avg || !buf) { VERBOSE_SCALEOUT_ERRSTRING("out of memory"); - errexit = true; + goto oom_exit; } if (holdoff) schedule_timeout_interruptible(holdoff * HZ); @@ -663,8 +662,6 @@ static int main_func(void *arg) // Start exp readers up per experiment for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { - if (errexit) - break; if (torture_must_stop()) goto end; @@ -698,26 +695,22 @@ static int main_func(void *arg) // Print the average of all experiments SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); - if (!errexit) { - buf[0] = 0; - strcat(buf, "\n"); - strcat(buf, "Runs\tTime(ns)\n"); - } + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Runs\tTime(ns)\n"); for (exp = 0; exp < nruns; exp++) { u64 avg; u32 rem; - if (errexit) - break; avg = div_u64_rem(result_avg[exp], 1000, &rem); sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); strcat(buf, buf1); } - if (!errexit) - SCALEOUT("%s", buf); + SCALEOUT("%s", buf); +oom_exit: // This will shutdown everything including us. if (shutdown) { shutdown_start = 1; -- cgit v1.2.3 From 9880eb878c315b241002d43d7ff89df23713a51b Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Mon, 25 Oct 2021 11:26:57 +0800 Subject: refscale: Prevent buffer to pr_alert() being too long 0Day/LKP observed that the refscale results fail to complete when larger values of nruns (such as 300) are specified. The problem is that printk() can accept at most a 1024-byte buffer. This commit therefore prints the buffer whenever its length exceeds 800 bytes. CC: Philip Li Reported-by: kernel test robot Signed-off-by: Li Zhijian Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index b59457cef88d..d8ed1ca3adc0 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -604,7 +604,7 @@ static u64 process_durations(int n) char *buf; u64 sum = 0; - buf = kmalloc(128 + nreaders * 32, GFP_KERNEL); + buf = kmalloc(800 + 64, GFP_KERNEL); if (!buf) return 0; buf[0] = 0; @@ -617,13 +617,15 @@ static u64 process_durations(int n) if (i % 5 == 0) strcat(buf, "\n"); + if (strlen(buf) >= 800) { + pr_alert("%s", buf); + buf[0] = 0; + } strcat(buf, buf1); sum += rt->last_duration_ns; } - strcat(buf, "\n"); - - SCALEOUT("%s\n", buf); + pr_alert("%s\n", buf); kfree(buf); return sum; @@ -647,7 +649,7 @@ static int main_func(void *arg) VERBOSE_SCALEOUT("main_func task started"); result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); - buf = kzalloc(64 + nruns * 32, GFP_KERNEL); + buf = kzalloc(800 + 64, GFP_KERNEL); if (!result_avg || !buf) { VERBOSE_SCALEOUT_ERRSTRING("out of memory"); goto oom_exit; @@ -695,10 +697,7 @@ static int main_func(void *arg) // Print the average of all experiments SCALEOUT("END OF TEST. 
Calculating average duration per loop (nanoseconds)...\n"); - buf[0] = 0; - strcat(buf, "\n"); - strcat(buf, "Runs\tTime(ns)\n"); - + pr_alert("Runs\tTime(ns)\n"); for (exp = 0; exp < nruns; exp++) { u64 avg; u32 rem; @@ -706,9 +705,13 @@ static int main_func(void *arg) avg = div_u64_rem(result_avg[exp], 1000, &rem); sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); strcat(buf, buf1); + if (strlen(buf) >= 800) { + pr_alert("%s", buf); + buf[0] = 0; + } } - SCALEOUT("%s", buf); + pr_alert("%s", buf); oom_exit: // This will shutdown everything including us. -- cgit v1.2.3 From a4382659487f84c00b5fbb61df25a9ad59396789 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Sep 2021 14:10:45 +0200 Subject: rcu: Ignore rdp.cpu_no_qs.b.exp on preemptible RCU's rcu_qs() Preemptible RCU does not use the rcu_data structure's ->cpu_no_qs.b.exp, instead using a separate ->exp_deferred_qs field to record the need for an expedited quiescent state. In fact ->cpu_no_qs.b.exp should never be set in preemptible RCU because preemptible RCU's expedited grace periods use other mechanisms to record quiescent states. This commit therefore removes the implicit rcu_qs() reference to ->cpu_no_qs.b.exp in favor of a direct reference to ->cpu_no_qs.b.norm. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5199559fbbf0..113c19b086ac 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -277,12 +277,16 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * current task, there might be any number of other tasks blocked while * in an RCU read-side critical section. * + * Unlike non-preemptible-RCU, quiescent state reports for expedited + * grace periods are handled separately via deferred quiescent states + * and context switch events. + * * Callers to this function must disable preemption. */ static void rcu_qs(void) { RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n"); - if (__this_cpu_read(rcu_data.cpu_no_qs.s)) { + if (__this_cpu_read(rcu_data.cpu_no_qs.b.norm)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); -- cgit v1.2.3 From 6e16b0f7bae3817ea67f4bef4f84298e880fbf66 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Sep 2021 14:10:47 +0200 Subject: rcu: Move rcu_data.cpu_no_qs.b.exp reset to rcu_report_exp_rdp() On non-preemptible RCU, move clearing of the rcu_data structure's ->cpu_no_qs.b.exp field to the actual expedited quiescent state report function, matching how preemptible RCU handles the ->exp_deferred_qs field. This prepares for removing ->exp_deferred_qs in favor of ->cpu_no_qs.b.exp for both preemptible and non-preemptible RCU. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_exp.h | 1 + kernel/rcu/tree_plugin.h | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index f3947c49eee7..6c6eb3220385 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -256,6 +256,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, static void rcu_report_exp_rdp(struct rcu_data *rdp) { WRITE_ONCE(rdp->exp_deferred_qs, false); + WRITE_ONCE(rdp->cpu_no_qs.b.exp, false); rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 113c19b086ac..6d58b75d2782 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -849,10 +849,8 @@ static void rcu_qs(void) trace_rcu_grace_period(TPS("rcu_sched"), __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp)) - return; - __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); + if (__this_cpu_read(rcu_data.cpu_no_qs.b.exp)) + rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); } /* -- cgit v1.2.3 From 6120b72e25e195b6fa15b0a674479a38166c392a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Sep 2021 14:10:48 +0200 Subject: rcu: Remove rcu_data.exp_deferred_qs and convert to rcu_data.cpu_no_qs.b.exp Having two fields for the same purpose with subtle differences on different RCU flavours is confusing, especially when both fields always exist on both RCU flavours. Fortunately, it is now safe for preemptible RCU to rely on the rcu_data structure's ->cpu_no_qs.b.exp field, just like non-preemptible RCU. This commit therefore removes the ad-hoc ->exp_deferred_qs field. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - kernel/rcu/tree_exp.h | 5 ++--- kernel/rcu/tree_plugin.h | 12 ++++++------ 3 files changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 305cf6aeb408..ea46ed40f6bc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -157,7 +157,6 @@ struct rcu_data { bool core_needs_qs; /* Core waits for quiescent state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ - bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ bool cpu_started; /* RCU watching this onlining CPU. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. 
*/ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6c6eb3220385..fc2ee326a6f7 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -255,7 +255,6 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, */ static void rcu_report_exp_rdp(struct rcu_data *rdp) { - WRITE_ONCE(rdp->exp_deferred_qs, false); WRITE_ONCE(rdp->cpu_no_qs.b.exp, false); rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); } @@ -656,7 +655,7 @@ static void rcu_exp_handler(void *unused) rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); } else { - rdp->exp_deferred_qs = true; + WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); set_tsk_need_resched(t); set_preempt_need_resched(); } @@ -678,7 +677,7 @@ static void rcu_exp_handler(void *unused) if (depth > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { - rdp->exp_deferred_qs = true; + WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); t->rcu_read_unlock_special.b.exp_hint = true; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6d58b75d2782..e1a9fb96e0b9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -260,10 +260,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) */ - if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs) + if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); else - WARN_ON_ONCE(rdp->exp_deferred_qs); + WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); } /* @@ -354,7 +354,7 @@ void rcu_note_context_switch(bool preempt) * means that we continue to block the current grace period. */ rcu_qs(); - if (rdp->exp_deferred_qs) + if (rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); @@ -481,7 +481,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ special = t->rcu_read_unlock_special; rdp = this_cpu_ptr(&rcu_data); - if (!special.s && !rdp->exp_deferred_qs) { + if (!special.s && !rdp->cpu_no_qs.b.exp) { local_irq_restore(flags); return; } @@ -501,7 +501,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * tasks are handled when removing the task from the * blocked-tasks list below. */ - if (rdp->exp_deferred_qs) + if (rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); /* Clean up if blocked during RCU read-side critical section. */ @@ -584,7 +584,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { - return (__this_cpu_read(rcu_data.exp_deferred_qs) || + return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) || READ_ONCE(t->rcu_read_unlock_special.s)) && rcu_preempt_depth() == 0; } -- cgit v1.2.3 From 5401cc5264fff75aea10cf44b380b265d18d17ba Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Sep 2021 11:06:35 -0700 Subject: rcu: Mark sync_sched_exp_online_cleanup() ->cpu_no_qs.b.exp load The sync_sched_exp_online_cleanup() is called from rcutree_online_cpu(), which can be invoked with interrupts enabled. This means that the ->cpu_no_qs.b.exp field is subject to data races from the rcu_exp_handler() IPI handler, so this commit marks the load from that field. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index fc2ee326a6f7..12f1a18d7b6d 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -759,7 +759,7 @@ static void sync_sched_exp_online_cleanup(int cpu) my_cpu = get_cpu(); /* Quiescent state either not needed or already requested, leave. */ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - rdp->cpu_no_qs.b.exp) { + READ_ONCE(rdp->cpu_no_qs.b.exp)) { put_cpu(); return; } -- cgit v1.2.3 From 147f04b14adde831eb4a0a1e378667429732f9e8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Sep 2021 09:21:34 -0700 Subject: rcu: Prevent expedited GP from enabling tick on offline CPU If an RCU expedited grace period starts just when a CPU is in the process of going offline, so that the outgoing CPU has completed its pass through stop-machine but has not yet completed its final dive into the idle loop, RCU will attempt to enable that CPU's scheduling-clock tick via a call to tick_dep_set_cpu(). For this to happen, that CPU has to have been online when the expedited grace period completed its CPU-selection phase. This is pointless: The outgoing CPU has interrupts disabled, so it cannot take a scheduling-clock tick anyway. In addition, the tick_dep_set_cpu() function's eventual call to irq_work_queue_on() will splat as follows: smpboot: CPU 1 is now offline WARNING: CPU: 6 PID: 124 at kernel/irq_work.c:95 +irq_work_queue_on+0x57/0x60 Modules linked in: CPU: 6 PID: 124 Comm: kworker/6:2 Not tainted 5.15.0-rc1+ #3 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS +rel-1.14.0-0-g155821a-rebuilt.opensuse.org 04/01/2014 Workqueue: rcu_gp wait_rcu_exp_gp RIP: 0010:irq_work_queue_on+0x57/0x60 Code: 8b 05 1d c7 ea 62 a9 00 00 f0 00 75 21 4c 89 ce 44 89 c7 e8 +9b 37 fa ff ba 01 00 00 00 89 d0 c3 4c 89 cf e8 3b ff ff ff eb ee <0f> 0b eb b7 +0f 0b eb db 90 48 c7 c0 98 2a 02 00 65 48 03 05 91 6f RSP: 0000:ffffb12cc038fe48 EFLAGS: 00010282 RAX: 0000000000000001 RBX: 0000000000005208 RCX: 0000000000000020 RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff9ad01f45a680 RBP: 000000000004c990 R08: 0000000000000001 R09: ffff9ad01f45a680 R10: ffffb12cc0317db0 R11: 0000000000000001 R12: 00000000fffecee8 R13: 0000000000000001 R14: 0000000000026980 R15: ffffffff9e53ae00 FS: 0000000000000000(0000) GS:ffff9ad01f580000(0000) +knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000000de0c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tick_nohz_dep_set_cpu+0x59/0x70 rcu_exp_wait_wake+0x54e/0x870 ? sync_rcu_exp_select_cpus+0x1fc/0x390 process_one_work+0x1ef/0x3c0 ? process_one_work+0x3c0/0x3c0 worker_thread+0x28/0x3c0 ? process_one_work+0x3c0/0x3c0 kthread+0x115/0x140 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 ---[ end trace c5bf75eb6aa80bc6 ]--- This commit therefore avoids invoking tick_dep_set_cpu() on offlined CPUs to limit both futility and false-positive splats. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_exp.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 12f1a18d7b6d..a96d17206d87 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -506,7 +506,10 @@ static void synchronize_rcu_expedited_wait(void) if (rdp->rcu_forced_tick_exp) continue; rdp->rcu_forced_tick_exp = true; - tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + preempt_disable(); + if (cpu_online(cpu)) + tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + preempt_enable(); } } j = READ_ONCE(jiffies_till_first_fqs); -- cgit v1.2.3 From 790da248978a0722d92d1471630c881704f7eb0d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Sep 2021 11:09:34 -0700 Subject: rcu: Make idle entry report expedited quiescent states In non-preemptible kernels, an unfortunately timed expedited grace period can result in the rcu_exp_handler() IPI handler setting the rcu_data structure's cpu_no_qs.b.exp field just as the target CPU enters idle. There are situations in which this field will not be checked until after that CPU exits idle. The resulting grace-period latency does not qualify as "expedited". This commit therefore checks this field upon non-preemptible idle entry in the rcu_preempt_deferred_qs() function. It also qualifies the rcu_core() preempt_count() check with IS_ENABLED(CONFIG_PREEMPT_COUNT) to prevent false-positive quiescent states from count-free kernels. Reported-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ef8d36f580fc..a8d1fe35f482 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2714,7 +2714,7 @@ static __latent_entropy void rcu_core(void) WARN_ON_ONCE(!rdp->beenonline); /* Report any deferred quiescent states if preemption enabled. */ - if (!(preempt_count() & PREEMPT_MASK)) { + if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) { rcu_preempt_deferred_qs(current); } else if (rcu_preempt_need_deferred_qs(current)) { set_tsk_need_resched(current); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e1a9fb96e0b9..8fb1612021a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -927,7 +927,18 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return false; } -static void