diff options
-rw-r--r-- | Documentation/RCU/Design/Requirements/Requirements.rst | 36 | ||||
-rw-r--r-- | Documentation/RCU/whatisRCU.rst | 1 | ||||
-rw-r--r-- | Documentation/admin-guide/kernel-parameters.txt | 13 | ||||
-rw-r--r-- | include/linux/notifier.h | 10 | ||||
-rw-r--r-- | include/linux/rcupdate.h | 54 | ||||
-rw-r--r-- | include/linux/srcu.h | 8 | ||||
-rw-r--r-- | kernel/locking/locktorture.c | 51 | ||||
-rw-r--r-- | kernel/rcu/Kconfig | 18 | ||||
-rw-r--r-- | kernel/rcu/rcu.h | 6 | ||||
-rw-r--r-- | kernel/rcu/rcuscale.c | 199 | ||||
-rw-r--r-- | kernel/rcu/tasks.h | 12 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 131 | ||||
-rw-r--r-- | kernel/rcu/tree_exp.h | 2 | ||||
-rw-r--r-- | kernel/rcu/tree_nocb.h | 52 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 4 | ||||
-rw-r--r-- | tools/testing/selftests/rcutorture/bin/functions.sh | 2 | ||||
-rw-r--r-- | tools/testing/selftests/rcutorture/configs/rcu/BUSTED-BOOST.boot | 2 | ||||
-rw-r--r-- | tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot | 2 |
18 files changed, 332 insertions, 271 deletions
diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst index 49387d823619..f3b605285a87 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.rst +++ b/Documentation/RCU/Design/Requirements/Requirements.rst @@ -2071,41 +2071,7 @@ call. Because RCU avoids interrupting idle CPUs, it is illegal to execute an RCU read-side critical section on an idle CPU. (Kernels built with -``CONFIG_PROVE_RCU=y`` will splat if you try it.) The RCU_NONIDLE() -macro and ``_rcuidle`` event tracing is provided to work around this -restriction. In addition, rcu_is_watching() may be used to test -whether or not it is currently legal to run RCU read-side critical -sections on this CPU. I learned of the need for diagnostics on the one -hand and RCU_NONIDLE() on the other while inspecting idle-loop code. -Steven Rostedt supplied ``_rcuidle`` event tracing, which is used quite -heavily in the idle loop. However, there are some restrictions on the -code placed within RCU_NONIDLE(): - -#. Blocking is prohibited. In practice, this is not a serious - restriction given that idle tasks are prohibited from blocking to - begin with. -#. Although nesting RCU_NONIDLE() is permitted, they cannot nest - indefinitely deeply. However, given that they can be nested on the - order of a million deep, even on 32-bit systems, this should not be a - serious restriction. This nesting limit would probably be reached - long after the compiler OOMed or the stack overflowed. -#. Any code path that enters RCU_NONIDLE() must sequence out of that - same RCU_NONIDLE(). For example, the following is grossly - illegal: - - :: - - 1 RCU_NONIDLE({ - 2 do_something(); - 3 goto bad_idea; /* BUG!!! */ - 4 do_something_else();}); - 5 bad_idea: - - - It is just as illegal to transfer control into the middle of - RCU_NONIDLE()'s argument. Yes, in theory, you could transfer in - as long as you also transferred out, but in practice you could also - expect to get sharply worded review comments. +``CONFIG_PROVE_RCU=y`` will splat if you try it.) It is similarly socially unacceptable to interrupt an ``nohz_full`` CPU running in userspace. RCU must therefore track ``nohz_full`` userspace diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index 8eddef28d3a1..e488c8e557a9 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -1117,7 +1117,6 @@ All: lockdep-checked RCU utility APIs:: RCU_LOCKDEP_WARN rcu_sleep_check - RCU_NONIDLE All: Unchecked RCU-protected pointer access:: diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c21245c0cce6..5b544b3f00fc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5094,8 +5094,17 @@ rcutorture.stall_cpu_block= [KNL] Sleep while stalling if set. This will result - in warnings from preemptible RCU in addition - to any other stall-related activity. + in warnings from preemptible RCU in addition to + any other stall-related activity. Note that + in kernels built with CONFIG_PREEMPTION=n and + CONFIG_PREEMPT_COUNT=y, this parameter will + cause the CPU to pass through a quiescent state. + Given CONFIG_PREEMPTION=n, this will suppress + RCU CPU stall warnings, but will instead result + in scheduling-while-atomic splats. + + Use of this module parameter results in splats. + rcutorture.stall_cpu_holdoff= [KNL] Time to wait (s) after boot before inducing stall. diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 2aba75145144..86544707236a 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -106,12 +106,22 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); #define RAW_NOTIFIER_INIT(name) { \ .head = NULL } +#ifdef CONFIG_TREE_SRCU #define SRCU_NOTIFIER_INIT(name, pcpu) \ { \ .mutex = __MUTEX_INITIALIZER(name.mutex), \ .head = NULL, \ + .srcuu = __SRCU_USAGE_INIT(name.srcuu), \ .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \ } +#else +#define SRCU_NOTIFIER_INIT(name, pcpu) \ + { \ + .mutex = __MUTEX_INITIALIZER(name.mutex), \ + .head = NULL, \ + .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \ + } +#endif #define ATOMIC_NOTIFIER_HEAD(name) \ struct atomic_notifier_head name = \ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index dcd2cf1e8326..7d9c2a63b7cd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -156,31 +156,6 @@ static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; } static inline void rcu_nocb_flush_deferred_wakeup(void) { } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ -/** - * RCU_NONIDLE - Indicate idle-loop code that needs RCU readers - * @a: Code that RCU needs to pay attention to. - * - * RCU read-side critical sections are forbidden in the inner idle loop, - * that is, between the ct_idle_enter() and the ct_idle_exit() -- RCU - * will happily ignore any such read-side critical sections. However, - * things like powertop need tracepoints in the inner idle loop. - * - * This macro provides the way out: RCU_NONIDLE(do_something_with_RCU()) - * will tell RCU that it needs to pay attention, invoke its argument - * (in this example, calling the do_something_with_RCU() function), - * and then tell RCU to go back to ignoring this CPU. It is permissible - * to nest RCU_NONIDLE() wrappers, but not indefinitely (but the limit is - * on the order of a million or so, even on 32-bit systems). It is - * not legal to block within RCU_NONIDLE(), nor is it permissible to - * transfer control either into or out of RCU_NONIDLE()'s statement. - */ -#define RCU_NONIDLE(a) \ - do { \ - ct_irq_enter_irqson(); \ - do { a; } while (0); \ - ct_irq_exit_irqson(); \ - } while (0) - /* * Note a quasi-voluntary context switch for RCU-tasks's benefit. * This is a macro rather than an inline function to avoid #include hell. @@ -957,9 +932,8 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) /** * kfree_rcu() - kfree an object after a grace period. - * @ptr: pointer to kfree for both single- and double-argument invocations. - * @rhf: the name of the struct rcu_head within the type of @ptr, - * but only for double-argument invocations. + * @ptr: pointer to kfree for double-argument invocations. + * @rhf: the name of the struct rcu_head within the type of @ptr. * * Many rcu callbacks functions just call kfree() on the base structure. * These functions are trivial, but their size adds up, and furthermore @@ -984,26 +958,18 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * The BUILD_BUG_ON check must not involve any function calls, hence the * checks are done in macros here. */ -#define kfree_rcu(ptr, rhf...) kvfree_rcu(ptr, ## rhf) +#define kfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) +#define kvfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) /** - * kvfree_rcu() - kvfree an object after a grace period. - * - * This macro consists of one or two arguments and it is - * based on whether an object is head-less or not. If it - * has a head then a semantic stays the same as it used - * to be before: - * - * kvfree_rcu(ptr, rhf); - * - * where @ptr is a pointer to kvfree(), @rhf is the name - * of the rcu_head structure within the type of @ptr. + * kfree_rcu_mightsleep() - kfree an object after a grace period. + * @ptr: pointer to kfree for single-argument invocations. * * When it comes to head-less variant, only one argument * is passed and that is just a pointer which has to be * freed after a grace period. Therefore the semantic is * - * kvfree_rcu(ptr); + * kfree_rcu_mightsleep(ptr); * * where @ptr is the pointer to be freed by kvfree(). * @@ -1012,13 +978,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * annotation. Otherwise, please switch and embed the * rcu_head structure within the type of @ptr. */ -#define kvfree_rcu(...) KVFREE_GET_MACRO(__VA_ARGS__, \ - kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__) - +#define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) -#define kfree_rcu_mightsleep(ptr) kvfree_rcu_mightsleep(ptr) -#define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 41c4b26fb1c1..eb92a50a4599 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -212,7 +212,7 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) srcu_check_nmi_safety(ssp, false); retval = __srcu_read_lock(ssp); - srcu_lock_acquire(&(ssp)->dep_map); + srcu_lock_acquire(&ssp->dep_map); return retval; } @@ -229,7 +229,7 @@ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp srcu_check_nmi_safety(ssp, true); retval = __srcu_read_lock_nmisafe(ssp); - rcu_lock_acquire(&(ssp)->dep_map); + rcu_lock_acquire(&ssp->dep_map); return retval; } @@ -284,7 +284,7 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) { WARN_ON_ONCE(idx & ~0x1); srcu_check_nmi_safety(ssp, false); - srcu_lock_release(&(ssp)->dep_map); + srcu_lock_release(&ssp->dep_map); __srcu_read_unlock(ssp, idx); } @@ -300,7 +300,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) { WARN_ON_ONCE(idx & ~0x1); srcu_check_nmi_safety(ssp, true); - rcu_lock_release(&(ssp)->dep_map); + rcu_lock_release(&ssp->dep_map); __srcu_read_unlock_nmisafe(ssp, idx); } diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 153ddc4c47ef..949d3deae506 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -33,24 +33,19 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); -torture_param(int, nwriters_stress, -1, - "Number of write-locking stress-test threads"); -torture_param(int, nreaders_stress, -1, - "Number of read-locking stress-test threads"); +torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); +torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads"); +torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); -torture_param(int, onoff_interval, 0, - "Time between CPU hotplugs (s), 0=disable"); -torture_param(int, shuffle_interval, 3, - "Number of jiffies between shuffles, 0=disable"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable"); torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); -torture_param(int, stat_interval, 60, - "Number of seconds between stats printk()s"); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); torture_param(int, rt_boost, 2, - "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); + "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); -torture_param(int, verbose, 1, - "Enable verbose debugging printk()s"); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); /* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */ #define MAX_NESTED_LOCKS 8 @@ -120,7 +115,7 @@ static int torture_lock_busted_write_lock(int tid __maybe_unused) static void torture_lock_busted_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % @@ -198,16 +193,18 @@ __acquires(torture_spinlock) static void torture_spin_lock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; + unsigned long j; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) { + j = jiffies; mdelay(longdelay_ms); - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2 * shortdelay_us))) + pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j); + } + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ @@ -322,7 +319,7 @@ __acquires(torture_rwlock) static void torture_rwlock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. @@ -455,14 +452,12 @@ __acquires(torture_mutex) static void torture_mutex_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 5); - else - mdelay(longdelay_ms / 5); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -630,7 +625,7 @@ __acquires(torture_rtmutex) static void torture_rtmutex_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* * We want a short delay mostly to emulate likely code, and @@ -640,7 +635,7 @@ static void torture_rtmutex_delay(struct torture_random_state *trsp) (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms); if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2 * shortdelay_us))) + (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ @@ -695,14 +690,12 @@ __acquires(torture_rwsem) static void torture_rwsem_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 10); - else - mdelay(longdelay_ms / 10); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -848,8 +841,8 @@ static int lock_torture_writer(void *arg) lwsp->n_lock_acquired++; } - cxt.cur_ops->write_delay(&rand); if (!skip_main_lock) { + cxt.cur_ops->write_delay(&rand); lock_is_write_held = false; WRITE_ONCE(last_lock_release, jiffies); cxt.cur_ops->writeunlock(tid); diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 9071182b1284..bdd7eadb33d8 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -314,4 +314,22 @@ config RCU_LAZY To save power, batch RCU callbacks and flush after delay, memory pressure, or callback list growing too big. +config RCU_DOUBLE_CHECK_CB_TIME + bool "RCU callback-batch backup time check" + depends on RCU_EXPERT + default n + help + Use this option to provide more precise enforcement of the + rcutree.rcu_resched_ns module parameter in situations where + a single RCU callback might run for hundreds of microseconds, + thus defeating the 32-callback batching used to amortize the + cost of the fine-grained but expensive local_clock() function. + + This option rounds rcutree.rcu_resched_ns up to the next + jiffy, and overrides the 32-callback batching if this limit + is exceeded. + + Say Y here if you need tighter callback-limit enforcement. + Say N here if you are unsure. + endmenu # "RCU Subsystem" diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 4a1b9622598b..98c1544cf572 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -642,4 +642,10 @@ void show_rcu_tasks_trace_gp_kthread(void); static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif +#ifdef CONFIG_TINY_RCU +static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; } +#else +bool rcu_cpu_beenfullyonline(int cpu); +#endif + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index e82ec9f9a5d8..d1221731c7cf 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -522,89 +522,6 @@ rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown); } -static void -rcu_scale_cleanup(void) -{ - int i; - int j; - int ngps = 0; - u64 *wdp; - u64 *wdpp; - - /* - * Would like warning at start, but everything is expedited - * during the mid-boot phase, so have to wait till the end. - */ - if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) - SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); - if (rcu_gp_is_normal() && gp_exp) - SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); - if (gp_exp && gp_async) - SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); - - if (torture_cleanup_begin()) - return; - if (!cur_ops) { - torture_cleanup_end(); - return; - } - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - torture_stop_kthread(rcu_scale_reader, - reader_tasks[i]); - kfree(reader_tasks); - } - - if (writer_tasks) { - for (i = 0; i < nrealwriters; i++) { - torture_stop_kthread(rcu_scale_writer, - writer_tasks[i]); - if (!writer_n_durations) - continue; - j = writer_n_durations[i]; - pr_alert("%s%s writer %d gps: %d\n", - scale_type, SCALE_FLAG, i, j); - ngps += j; - } - pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", - scale_type, SCALE_FLAG, - t_rcu_scale_writer_started, t_rcu_scale_writer_finished, - t_rcu_scale_writer_finished - - t_rcu_scale_writer_started, - ngps, - rcuscale_seq_diff(b_rcu_gp_test_finished, - b_rcu_gp_test_started)); - for (i = 0; i < nrealwriters; i++) { - if (!writer_durations) - break; - if (!writer_n_durations) - continue; - wdpp = writer_durations[i]; - if (!wdpp) - continue; - for (j = 0; j < writer_n_durations[i]; j++) { - wdp = &wdpp[j]; - pr_alert("%s%s %4d writer-duration: %5d %llu\n", - scale_type, SCALE_FLAG, - i, j, *wdp); - if (j % 100 == 0) - schedule_timeout_uninterruptible(1); - } - kfree(writer_durations[i]); - } - kfree(writer_tasks); - kfree(writer_durations); - kfree(writer_n_durations); - } - - /* Do torture-type-specific cleanup operations. */ - if (cur_ops->cleanup != NULL) - cur_ops->cleanup(); - - torture_cleanup_end(); -} - /* * Return the number if non-negative. If -1, the number of CPUs. * If less than -1, that much less than the number of CPUs, but @@ -625,20 +542,6 @@ static int compute_real(int n) } /* - * RCU scalability shutdown kthread. Just waits to be awakened, then shuts - * down system. - */ -static int -rcu_scale_shutdown(void *arg) -{ - wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); - smp_mb(); /* Wake before output. */ - rcu_scale_cleanup(); - kernel_power_off(); - return -EINVAL; -} - -/* * kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number * of iterations and measure total time and number of GP for all iterations to complete. */ @@ -874,6 +777,108 @@ unwind: return firsterr; } +static void +rcu_scale_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + /* + * Would like warning at start, but everything is expedited + * during the mid-boot phase, so have to wait till the end. + */ + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) + SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + if (rcu_gp_is_normal() && gp_exp) + SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (gp_exp && gp_async) + SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); + + if (kfree_rcu_test) { + kfree_scale_cleanup(); + return; + } + + if (torture_cleanup_begin()) + return; + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_scale_reader, + reader_tasks[i]); + kfree(reader_tasks); + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_scale_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + scale_type, SCALE_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + scale_type, SCALE_FLAG, + t_rcu_scale_writer_started, t_rcu_scale_writer_finished, + t_rcu_scale_writer_finished - + t_rcu_scale_writer_started, + ngps, + rcuscale_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j < writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + scale_type, SCALE_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + } + kfree(writer_tasks); + kfree(writer_durations); + kfree(writer_n_durations); + } + + /* Do torture-type-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * RCU scalability shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_scale_shutdown(void *arg) +{ + wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); + smp_mb(); /* Wake before output. */ + rcu_scale_cleanup(); + kernel_power_off(); + return -EINVAL; +} + static int __init rcu_scale_init(void) { diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 5f4fc8184dd0..b770add3f843 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -241,7 +241,6 @@ static void cblist_init_generic(struct rcu_tasks *rtp) if (rcu_task_enqueue_lim < 0) { rcu_task_enqueue_lim = 1; rcu_task_cb_adjust = true; - pr_info("%s: Setting adjustable number of callback queues.\n", __func__); } else if (rcu_task_enqueue_lim == 0) { rcu_task_enqueue_lim = 1; } @@ -272,7 +271,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp) raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); - pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim)); + + pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name, + data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust); } // IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic(). @@ -463,6 +464,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu { int cpu; int cpunext; + int cpuwq; unsigned long flags; int len; struct rcu_head *rhp; @@ -473,11 +475,13 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu cpunext = cpu * 2 + 1; if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); + cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); cpunext++; if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); + cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); } } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f52ff7241041..1449cb69a0e0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2046,19 +2046,35 @@ rcu_check_quiescent_state(struct rcu_data *rdp) rcu_report_qs_rdp(rdp); } +/* Return true if callback-invocation time limit exceeded. */ +static bool rcu_do_batch_check_time(long count, long tlimit, + bool jlimit_check, unsigned long jlimit) +{ + // Invoke local_clock() only once per 32 consecutive callbacks. + return unlikely(tlimit) && + (!likely(count & 31) || + (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) && + jlimit_check && time_after(jiffies, jlimit))) && + local_clock() >= tlimit; +} + /* * Invoke any RCU callbacks that have made it to the end of their grace * period. Throttle as specified by rdp->blimit. */ static void rcu_do_batch(struct rcu_data *rdp) { + long bl; + long count = 0; int div; bool __maybe_unused empty; unsigned long flags; - struct rcu_head *rhp; + unsigned long jlimit; + bool jlimit_check = false; + long pending; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); - long bl, count = 0; - long pending, tlimit = 0; + struct rcu_head *rhp; + long tlimit = 0; /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { @@ -2082,11 +2098,15 @@ static void rcu_do_batch(struct rcu_data *rdp) div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); - if (in_serving_softirq() && unlikely(bl > 100)) { + if ((in_serving_softirq() || rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) && + (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) || unlikely(bl > 100))) { + const long npj = NSEC_PER_SEC / HZ; long rrn = READ_ONCE(rcu_resched_ns); rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; tlimit = local_clock() + rrn; + jlimit = jiffies + (rrn + npj + 1) / npj; + jlimit_check = true; } trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); @@ -2126,21 +2146,23 @@ static void rcu_do_batch(struct rcu_data *rdp) * Make sure we don't spend too much time here and deprive other * softirq vectors of CPU cycles. */ - if (unlikely(tlimit)) { - /* only call local_clock() every 32 callbacks */ - if (likely((count & 31) || local_clock() < tlimit)) - continue; - /* Exceeded the time limit, so leave. */ + if (rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) break; - } } else { - // In rcuoc context, so no worries about depriving - // other softirq vectors of CPU cycles. + // In rcuc/rcuoc context, so no worries about + // depriving other softirq vectors of CPU cycles. local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); lockdep_assert_irqs_enabled(); local_bh_disable(); + // But rcuc kthreads can delay quiescent-state + // reporting, so check time limits for them. + if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING && + rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) { + rdp->rcu_cpu_has_work = 1; + break; + } } } @@ -2459,12 +2481,12 @@ static void rcu_cpu_kthread(unsigned int cpu) *statusp = RCU_KTHREAD_RUNNING; local_irq_disable(); work = *workp; - *workp = 0; + WRITE_ONCE(*workp, 0); local_irq_enable(); if (work) rcu_core(); local_bh_enable(); - if (*workp == 0) { + if (!READ_ONCE(*workp)) { trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); *statusp = RCU_KTHREAD_WAITING; return; @@ -2756,7 +2778,7 @@ EXPORT_SYMBOL_GPL(call_rcu); */ struct kvfree_rcu_bulk_data { struct list_head list; - unsigned long gp_snap; + struct rcu_gp_oldstate gp_snap; unsigned long nr_records; void *records[]; }; @@ -2773,6 +2795,7 @@ struct kvfree_rcu_bulk_data { * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period + * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees. * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ @@ -2780,6 +2803,7 @@ struct kvfree_rcu_bulk_data { struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; + struct rcu_gp_oldstate head_free_gp_snap; struct list_head bulk_head_free[FREE_N_CHANNELS]; struct kfree_rcu_cpu *krcp; }; @@ -2900,6 +2924,9 @@ drain_page_cache(struct kfree_rcu_cpu *krcp) struct llist_node *page_list, *pos, *n; int freed = 0; + if (!rcu_min_cached_objs) + return 0; + raw_spin_lock_irqsave(&krcp->lock, flags); page_list = llist_del_all(&krcp->bkvcache); WRITE_ONCE(krcp->nr_bkv_objs, 0); @@ -2920,24 +2947,25 @@ kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, unsigned long flags; int i; - debug_rcu_bhead_unqueue(bnode); - - rcu_lock_acquire(&rcu_callback_map); - if (idx == 0) { // kmalloc() / kfree(). - trace_rcu_invoke_kfree_bulk_callback( - rcu_state.name, bnode->nr_records, - bnode->records); - - kfree_bulk(bnode->nr_records, bnode->records); - } else { // vmalloc() / vfree(). - for (i = 0; i < bnode->nr_records; i++) { - trace_rcu_invoke_kvfree_callback( - rcu_state.name, bnode->records[i], 0); - - vfree(bnode->records[i]); + if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) { + debug_rcu_bhead_unqueue(bnode); + rcu_lock_acquire(&rcu_callback_map); + if (idx == 0) { // kmalloc() / kfree(). + |