Diffstat (limited to 'kernel')
-rw-r--r--   kernel/locking/locktorture.c |   25
-rw-r--r--   kernel/rcu/rcuscale.c        |    4
-rw-r--r--   kernel/rcu/rcutorture.c      |    7
-rw-r--r--   kernel/rcu/refscale.c        |   36
-rw-r--r--   kernel/rcu/srcutiny.c        |    2
-rw-r--r--   kernel/rcu/tasks.h           |   36
-rw-r--r--   kernel/rcu/tree.c            |  107
-rw-r--r--   kernel/rcu/tree_nocb.h       | 1496
-rw-r--r--   kernel/rcu/tree_plugin.h     | 1506
-rw-r--r--   kernel/rcu/tree_stall.h      |  111
-rw-r--r--   kernel/scftorture.c          |   78
-rw-r--r--   kernel/sched/core.c          |   11
-rw-r--r--   kernel/torture.c             |    6
13 files changed, 1767 insertions, 1658 deletions
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index b3adb40549bf..7c5a4a087cc7 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -59,7 +59,7 @@ static struct task_struct **writer_tasks; static struct task_struct **reader_tasks; static bool lock_is_write_held; -static bool lock_is_read_held; +static atomic_t lock_is_read_held; static unsigned long last_lock_release; struct lock_stress_stats { @@ -682,7 +682,7 @@ static int lock_torture_writer(void *arg) if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; lock_is_write_held = true; - if (WARN_ON_ONCE(lock_is_read_held)) + if (WARN_ON_ONCE(atomic_read(&lock_is_read_held))) lwsp->n_lock_fail++; /* rare, but... */ lwsp->n_lock_acquired++; @@ -717,13 +717,13 @@ static int lock_torture_reader(void *arg) schedule_timeout_uninterruptible(1); cxt.cur_ops->readlock(tid); - lock_is_read_held = true; + atomic_inc(&lock_is_read_held); if (WARN_ON_ONCE(lock_is_write_held)) lrsp->n_lock_fail++; /* rare, but... */ lrsp->n_lock_acquired++; cxt.cur_ops->read_delay(&rand); - lock_is_read_held = false; + atomic_dec(&lock_is_read_held); cxt.cur_ops->readunlock(tid); stutter_wait("lock_torture_reader"); @@ -738,20 +738,22 @@ static int lock_torture_reader(void *arg) static void __torture_print_stats(char *page, struct lock_stress_stats *statp, bool write) { + long cur; bool fail = false; int i, n_stress; - long max = 0, min = statp ? statp[0].n_lock_acquired : 0; + long max = 0, min = statp ? data_race(statp[0].n_lock_acquired) : 0; long long sum = 0; n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress; for (i = 0; i < n_stress; i++) { - if (statp[i].n_lock_fail) + if (data_race(statp[i].n_lock_fail)) fail = true; - sum += statp[i].n_lock_acquired; - if (max < statp[i].n_lock_acquired) - max = statp[i].n_lock_acquired; - if (min > statp[i].n_lock_acquired) - min = statp[i].n_lock_acquired; + cur = data_race(statp[i].n_lock_acquired); + sum += cur; + if (max < cur) + max = cur; + if (min > cur) + min = cur; } page += sprintf(page, "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", @@ -996,7 +998,6 @@ static int __init lock_torture_init(void) } if (nreaders_stress) { - lock_is_read_held = false; cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, sizeof(*cxt.lrsa), GFP_KERNEL); diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index dca51fe9c73f..2cc34a22a506 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -487,7 +487,7 @@ retry: if (gp_async) { cur_ops->gp_barrier(); } - writer_n_durations[me] = i_max; + writer_n_durations[me] = i_max + 1; torture_kthread_stopping("rcu_scale_writer"); return 0; } @@ -561,7 +561,7 @@ rcu_scale_cleanup(void) wdpp = writer_durations[i]; if (!wdpp) continue; - for (j = 0; j <= writer_n_durations[i]; j++) { + for (j = 0; j < writer_n_durations[i]; j++) { wdp = &wdpp[j]; pr_alert("%s%s %4d writer-duration: %5d %llu\n", scale_type, SCALE_FLAG, diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 40ef5417d954..ab4215266ebe 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2022,8 +2022,13 @@ static int rcu_torture_stall(void *args) __func__, raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at)) - if (stall_cpu_block) + if (stall_cpu_block) { +#ifdef CONFIG_PREEMPTION + preempt_schedule(); +#else schedule_timeout_uninterruptible(HZ); +#endif + } if (stall_cpu_irqsoff) local_irq_enable(); else if (!stall_cpu_block) diff --git a/kernel/rcu/refscale.c 
b/kernel/rcu/refscale.c index d998a76fb542..66dc14cf5687 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -467,6 +467,40 @@ static struct ref_scale_ops acqrel_ops = { .name = "acqrel" }; +static volatile u64 stopopts; + +static void ref_clock_section(const int nloops) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) + x += ktime_get_real_fast_ns(); + preempt_enable(); + stopopts = x; +} + +static void ref_clock_delay_section(const int nloops, const int udl, const int ndl) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x += ktime_get_real_fast_ns(); + un_delay(udl, ndl); + } + preempt_enable(); + stopopts = x; +} + +static struct ref_scale_ops clock_ops = { + .readsection = ref_clock_section, + .delaysection = ref_clock_delay_section, + .name = "clock" +}; + static void rcu_scale_one_reader(void) { if (readdelay <= 0) @@ -759,7 +793,7 @@ ref_scale_init(void) int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops, - &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, + &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, }; if (!torture_init_begin(scale_type, verbose)) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 26344dc6483b..a0ba2ed49bc6 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -96,7 +96,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct); */ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { - int newval = ssp->srcu_lock_nesting[idx] - 1; + int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); if (!newval && READ_ONCE(ssp->srcu_gp_waiting)) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8536c55df514..806160c44b17 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -643,8 +643,8 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } // // "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of // passing an empty function to schedule_on_each_cpu(). This approach -// provides an asynchronous call_rcu_tasks_rude() API and batching -// of concurrent calls to the synchronous synchronize_rcu_rude() API. +// provides an asynchronous call_rcu_tasks_rude() API and batching of +// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API. // This invokes schedule_on_each_cpu() in order to send IPIs far and wide // and induces otherwise unnecessary context switches on all online CPUs, // whether idle or not. @@ -785,7 +785,10 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); // set that task's .need_qs flag so that task's next outermost // rcu_read_unlock_trace() will report the quiescent state (in which // case the count of readers is incremented). If both attempts fail, -// the task is added to a "holdout" list. +// the task is added to a "holdout" list. Note that IPIs are used +// to invoke trc_read_check_handler() in the context of running tasks +// in order to avoid ordering overhead on common-case shared-variable +// accessses. // rcu_tasks_trace_postscan(): // Initialize state and attempt to identify an immediate quiescent // state as above (but only for idle tasks), unblock CPU-hotplug @@ -847,7 +850,7 @@ static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); /* If we are the last reader, wake up the grace-period kthread. 
*/ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) { - int nq = t->trc_reader_special.b.need_qs; + int nq = READ_ONCE(t->trc_reader_special.b.need_qs); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb) @@ -894,7 +897,7 @@ static void trc_read_check_handler(void *t_in) // If the task is not in a read-side critical section, and // if this is the last reader, awaken the grace-period kthread. - if (likely(!t->trc_reader_nesting)) { + if (likely(!READ_ONCE(t->trc_reader_nesting))) { if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) wake_up(&trc_wait); // Mark as checked after decrement to avoid false @@ -903,7 +906,7 @@ static void trc_read_check_handler(void *t_in) goto reset_ipi; } // If we are racing with an rcu_read_unlock_trace(), try again later. - if (unlikely(t->trc_reader_nesting < 0)) { + if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) { if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) wake_up(&trc_wait); goto reset_ipi; @@ -913,14 +916,14 @@ static void trc_read_check_handler(void *t_in) // Get here if the task is in a read-side critical section. Set // its state so that it will awaken the grace-period kthread upon // exit from that critical section. - WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); WRITE_ONCE(t->trc_reader_special.b.need_qs, true); reset_ipi: // Allow future IPIs to be sent on CPU and for task. // Also order this IPI handler against any later manipulations of // the intended task. - smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ + smp_store_release(per_cpu_ptr(&trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ } @@ -950,6 +953,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) n_heavy_reader_ofl_updates++; in_qs = true; } else { + // The task is not running, so C-language access is safe. in_qs = likely(!t->trc_reader_nesting); } @@ -964,7 +968,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) // state so that it will awaken the grace-period kthread upon exit // from that critical section. atomic_inc(&trc_n_readers_need_end); // One more to wait on. - WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); WRITE_ONCE(t->trc_reader_special.b.need_qs, true); return true; } @@ -982,7 +986,7 @@ static void trc_wait_for_one_reader(struct task_struct *t, // The current task had better be in a quiescent state. if (t == current) { t->trc_reader_checked = true; - WARN_ON_ONCE(t->trc_reader_nesting); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); return; } @@ -994,6 +998,12 @@ static void trc_wait_for_one_reader(struct task_struct *t, } put_task_struct(t); + // If this task is not yet on the holdout list, then we are in + // an RCU read-side critical section. Otherwise, the invocation of + // rcu_add_holdout() that added it to the list did the necessary + // get_task_struct(). Either way, the task cannot be freed out + // from under this code. + // If currently running, send an IPI, either way, add to list. 
trc_add_holdout(t, bhp); if (task_curr(t) && @@ -1092,8 +1102,8 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0], ".i"[is_idle_task(t)], ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], - t->trc_reader_nesting, - " N"[!!t->trc_reader_special.b.need_qs], + READ_ONCE(t->trc_reader_nesting), + " N"[!!READ_ONCE(t->trc_reader_special.b.need_qs)], cpu); sched_show_task(t); } @@ -1187,7 +1197,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) static void exit_tasks_rcu_finish_trace(struct task_struct *t) { WRITE_ONCE(t->trc_reader_checked, true); - WARN_ON_ONCE(t->trc_reader_nesting); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); WRITE_ONCE(t->trc_reader_nesting, 0); if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) rcu_read_unlock_trace_special(t, 0); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51f24ecd94b2..bce848e50512 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -74,17 +74,10 @@ /* Data structures. */ -/* - * Steal a bit from the bottom of ->dynticks for idle entry/exit - * control. Initially this is for TLB flushing. - */ -#define RCU_DYNTICK_CTRL_MASK 0x1 -#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) - static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, - .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), + .dynticks = ATOMIC_INIT(1), #ifdef CONFIG_RCU_NOCB_CPU .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY, #endif @@ -259,6 +252,15 @@ void rcu_softirq_qs(void) } /* + * Increment the current CPU's rcu_data structure's ->dynticks field + * with ordering. Return the new value. + */ +static noinline noinstr unsigned long rcu_dynticks_inc(int incby) +{ + return arch_atomic_add_return(incby, this_cpu_ptr(&rcu_data.dynticks)); +} + +/* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state, that is, * RCU is watching prior to the call to this function and is no longer @@ -266,7 +268,6 @@ void rcu_softirq_qs(void) */ static noinstr void rcu_dynticks_eqs_enter(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; /* @@ -275,13 +276,9 @@ static noinstr void rcu_dynticks_eqs_enter(void) * next idle sojourn. */ rcu_dynticks_task_trace_enter(); // Before ->dynticks update! - seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = rcu_dynticks_inc(1); // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (seq & RCU_DYNTICK_CTRL_CTR)); - /* Better not have special action (TLB flush) pending! */ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (seq & RCU_DYNTICK_CTRL_MASK)); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1)); } /* @@ -291,7 +288,6 @@ static noinstr void rcu_dynticks_eqs_enter(void) */ static noinstr void rcu_dynticks_eqs_exit(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; /* @@ -299,15 +295,10 @@ static noinstr void rcu_dynticks_eqs_exit(void) * and we also must force ordering with the next RCU read-side * critical section. */ - seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = rcu_dynticks_inc(1); // RCU is now watching. Better not be in an extended quiescent state! rcu_dynticks_task_trace_exit(); // After ->dynticks update! 
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !(seq & RCU_DYNTICK_CTRL_CTR)); - if (seq & RCU_DYNTICK_CTRL_MASK) { - arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); - smp_mb__after_atomic(); /* _exit after clearing mask. */ - } + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1)); } /* @@ -324,9 +315,9 @@ static void rcu_dynticks_eqs_online(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR) + if (atomic_read(&rdp->dynticks) & 0x1) return; - atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + rcu_dynticks_inc(1); } /* @@ -336,9 +327,7 @@ static void rcu_dynticks_eqs_online(void) */ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); + return !(atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1); } /* @@ -347,9 +336,8 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) */ static int rcu_dynticks_snap(struct rcu_data *rdp) { - int snap = atomic_add_return(0, &rdp->dynticks); - - return snap & ~RCU_DYNTICK_CTRL_MASK; + smp_mb(); // Fundamental RCU ordering guarantee. + return atomic_read_acquire(&rdp->dynticks); } /* @@ -358,7 +346,7 @@ static int rcu_dynticks_snap(struct rcu_data *rdp) */ static bool rcu_dynticks_in_eqs(int snap) { - return !(snap & RCU_DYNTICK_CTRL_CTR); + return !(snap & 0x1); } /* Return true if the specified CPU is currently idle from an RCU viewpoint. */ @@ -389,8 +377,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) int snap; // If not quiescent, force back to earlier extended quiescent state. - snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK | - RCU_DYNTICK_CTRL_CTR); + snap = atomic_read(&rdp->dynticks) & ~0x1; smp_rmb(); // Order ->dynticks and *vp reads. if (READ_ONCE(*vp)) @@ -398,32 +385,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) smp_rmb(); // Order *vp read and ->dynticks re-read. // If still in the same extended quiescent state, we are good! - return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK); -} - -/* - * Set the special (bottom) bit of the specified CPU so that it - * will take special action (such as flushing its TLB) on the - * next exit from an extended quiescent state. Returns true if - * the bit was successfully set, or false if the CPU was not in - * an extended quiescent state. - */ -bool rcu_eqs_special_set(int cpu) -{ - int old; - int new; - int new_old; - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - - new_old = atomic_read(&rdp->dynticks); - do { - old = new_old; - if (old & RCU_DYNTICK_CTRL_CTR) - return false; - new = old | RCU_DYNTICK_CTRL_MASK; - new_old = atomic_cmpxchg(&rdp->dynticks, old, new); - } while (new_old != old); - return true; + return snap == atomic_read(&rdp->dynticks); } /* @@ -439,13 +401,12 @@ bool rcu_eqs_special_set(int cpu) */ notrace void rcu_momentary_dyntick_idle(void) { - int special; + int seq; raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); - special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, - &this_cpu_ptr(&rcu_data)->dynticks); + seq = rcu_dynticks_inc(2); /* It is illegal to call this from idle state. 
*/ - WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); + WARN_ON_ONCE(!(seq & 0x1)); rcu_preempt_deferred_qs(current); } EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); @@ -1325,7 +1286,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) */ jtsq = READ_ONCE(jiffies_to_sched_qs); ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu); - rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu); + rnhqp = per_cpu_ptr(&rcu_data.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && (time_after(jiffies, rcu_state.gp_start + jtsq * 2) || time_after(jiffies, rcu_state.jiffies_resched) || @@ -1772,7 +1733,7 @@ static void rcu_strict_gp_boundary(void *unused) /* * Initialize a new grace period. Return false if no grace period required. */ -static bool rcu_gp_init(void) +static noinline_for_stack bool rcu_gp_init(void) { unsigned long firstseq; unsigned long flags; @@ -1966,7 +1927,7 @@ static void rcu_gp_fqs(bool first_time) /* * Loop doing repeated quiescent-state forcing until the grace period ends. */ -static void rcu_gp_fqs_loop(void) +static noinline_for_stack void rcu_gp_fqs_loop(void) { bool first_gp_fqs; int gf = 0; @@ -1993,8 +1954,8 @@ static void rcu_gp_fqs_loop(void) trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswait")); WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS); - ret = swait_event_idle_timeout_exclusive( - rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); + (void)swait_event_idle_timeout_exclusive(rcu_state.gp_wq, + rcu_gp_fqs_check_wake(&gf), j); rcu_gp_torture_wait(); WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS); /* Locking provides needed memory barriers. */ @@ -2471,9 +2432,6 @@ int rcutree_dead_cpu(unsigned int cpu) WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - /* Do any needed no-CB deferred wakeups from this CPU. */ - do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu)); - // Stop-machine done, so allow nohz_full to disable tick. tick_dep_clear(TICK_DEP_BIT_RCU); return 0; @@ -4050,7 +4008,7 @@ void rcu_barrier(void) */ init_completion(&rcu_state.barrier_completion); atomic_set(&rcu_state.barrier_cpu_count, 2); - get_online_cpus(); + cpus_read_lock(); /* * Force each CPU with callbacks to register a new callback. @@ -4081,7 +4039,7 @@ void rcu_barrier(void) rcu_state.barrier_sequence); } } - put_online_cpus(); + cpus_read_unlock(); /* * Now that we have an rcu_barrier_callback() callback on each @@ -4784,4 +4742,5 @@ void __init rcu_init(void) #include "tree_stall.h" #include "tree_exp.h" +#include "tree_nocb.h" #include "tree_plugin.h" diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h new file mode 100644 index 000000000000..8fdf44f8523f --- /dev/null +++ b/kernel/rcu/tree_nocb.h @@ -0,0 +1,1496 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Internal non-public definitions that provide either classic + * or preemptible semantics. + * + * Copyright Red Hat, 2009 + * Copyright IBM Corporation, 2009 + * Copyright SUSE, 2021 + * + * Author: Ingo Molnar <mingo@elte.hu> + * Paul E. McKenney <paulmck@linux.ibm.com> + * Frederic Weisbecker <frederic@kernel.org> + */ + +#ifdef CONFIG_RCU_NOCB_CPU +static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ +static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. 
*/ +static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) +{ + return lockdep_is_held(&rdp->nocb_lock); +} + +static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) +{ + /* Race on early boot between thread creation and assignment */ + if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread) + return true; + + if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread) + if (in_task()) + return true; + return false; +} + +/* + * Offload callback processing from the boot-time-specified set of CPUs + * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads + * created that pull the callbacks from the corresponding CPU, wait for + * a grace period to elapse, and invoke the callbacks. These kthreads + * are organized into GP kthreads, which manage incoming callbacks, wait for + * grace periods, and awaken CB kthreads, and the CB kthreads, which only + * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs + * do a wake_up() on their GP kthread when they insert a callback into any + * empty list, unless the rcu_nocb_poll boot parameter has been specified, + * in which case each kthread actively polls its CPU. (Which isn't so great + * for energy efficiency, but which does reduce RCU's overhead on that CPU.) + * + * This is intended to be used in conjunction with Frederic Weisbecker's + * adaptive-idle work, which would seriously reduce OS jitter on CPUs + * running CPU-bound user-mode computations. + * + * Offloading of callbacks can also be used as an energy-efficiency + * measure because CPUs with no RCU callbacks queued are more aggressive + * about entering dyntick-idle mode. + */ + + +/* + * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. + * If the list is invalid, a warning is emitted and all CPUs are offloaded. + */ +static int __init rcu_nocb_setup(char *str) +{ + alloc_bootmem_cpumask_var(&rcu_nocb_mask); + if (cpulist_parse(str, rcu_nocb_mask)) { + pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); + cpumask_setall(rcu_nocb_mask); + } + return 1; +} +__setup("rcu_nocbs=", rcu_nocb_setup); + +static int __init parse_rcu_nocb_poll(char *arg) +{ + rcu_nocb_poll = true; + return 0; +} +early_param("rcu_nocb_poll", parse_rcu_nocb_poll); + +/* + * Don't bother bypassing ->cblist if the call_rcu() rate is low. + * After all, the main point of bypassing is to avoid lock contention + * on ->nocb_lock, which only can happen at high call_rcu() rates. + */ +static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; +module_param(nocb_nobypass_lim_per_jiffy, int, 0); + +/* + * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the + * lock isn't immediately available, increment ->nocb_lock_contended to + * flag the contention. + */ +static void rcu_nocb_bypass_lock(struct rcu_data *rdp) + __acquires(&rdp->nocb_bypass_lock) +{ + lockdep_assert_irqs_disabled(); + if (raw_spin_trylock(&rdp->nocb_bypass_lock)) + return; + atomic_inc(&rdp->nocb_lock_contended); + WARN_ON_ONCE(smp_processor_id() != rdp->cpu); + smp_mb__after_atomic(); /* atomic_inc() before lock. */ + raw_spin_lock(&rdp->nocb_bypass_lock); + smp_mb__before_atomic(); /* atomic_dec() after lock. */ + atomic_dec(&rdp->nocb_lock_contended); +} + +/* + * Spinwait until the specified rcu_data structure's ->nocb_lock is + * not contended. 
Please note that this is extremely special-purpose, + * relying on the fact that at most two kthreads and one CPU contend for + * this lock, and also that the two kthreads are guaranteed to have frequent + * grace-period-duration time intervals between successive acquisitions + * of the lock. This allows us to use an extremely simple throttling + * mechanism, and further to apply it only to the CPU doing floods of + * call_rcu() invocations. Don't try this at home! + */ +static void rcu_nocb_wait_contended(struct rcu_data *rdp) +{ + WARN_ON_ONCE(smp_processor_id() != rdp->cpu); + while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) + cpu_relax(); +} + +/* + * Conditionally acquire the specified rcu_data structure's + * ->nocb_bypass_lock. + */ +static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + return raw_spin_trylock(&rdp->nocb_bypass_lock); +} + +/* + * Release the specified rcu_data structure's ->nocb_bypass_lock. + */ +static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) + __releases(&rdp->nocb_bypass_lock) +{ + lockdep_assert_irqs_disabled(); + raw_spin_unlock(&rdp->nocb_bypass_lock); +} + +/* + * Acquire the specified rcu_data structure's ->nocb_lock, but only + * if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_lock(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + if (!rcu_rdp_is_offloaded(rdp)) + return; + raw_spin_lock(&rdp->nocb_lock); +} + +/* + * Release the specified rcu_data structure's ->nocb_lock, but only + * if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_unlock(struct rcu_data *rdp) +{ + if (rcu_rdp_is_offloaded(rdp)) { + lockdep_assert_irqs_disabled(); + raw_spin_unlock(&rdp->nocb_lock); + } +} + +/* + * Release the specified rcu_data structure's ->nocb_lock and restore + * interrupts, but only if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, + unsigned long flags) +{ + if (rcu_rdp_is_offloaded(rdp)) { + lockdep_assert_irqs_disabled(); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + } else { + local_irq_restore(flags); + } +} + +/* Lockdep check that ->cblist may be safely accessed. */ +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + if (rcu_rdp_is_offloaded(rdp)) + lockdep_assert_held(&rdp->nocb_lock); +} + +/* + * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended + * grace period. + */ +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) +{ + swake_up_all(sq); +} + +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1]; +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ + init_swait_queue_head(&rnp->nocb_gp_wq[0]); + init_swait_queue_head(&rnp->nocb_gp_wq[1]); +} + +/* Is the specified CPU a no-CBs CPU? 
*/ +bool rcu_is_nocb_cpu(int cpu) +{ + if (cpumask_available(rcu_nocb_mask)) + return cpumask_test_cpu(cpu, rcu_nocb_mask); + return false; +} + +static bool __wake_nocb_gp(struct rcu_data *rdp_gp, + struct rcu_data *rdp, + bool force, unsigned long flags) + __releases(rdp_gp->nocb_gp_lock) +{ + bool needwake = false; + + if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("AlreadyAwake")); + return false; + } + + if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&rdp_gp->nocb_timer); + } + + if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { + WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); + needwake = true; + } + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + if (needwake) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); + wake_up_process(rdp_gp->nocb_gp_kthread); + } + + return needwake; +} + +/* + * Kick the GP kthread for this NOCB group. + */ +static bool wake_nocb_gp(struct rcu_data *rdp, bool force) +{ + unsigned long flags; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + return __wake_nocb_gp(rdp_gp, rdp, force, flags); +} + +/* + * Arrange to wake the GP kthread for this NOCB group at some future + * time when it is safe to do so. + */ +static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, + const char *reason) +{ + unsigned long flags; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + + /* + * Bypass wakeup overrides previous deferments. In case + * of callback storm, no need to wake up too early. + */ + if (waketype == RCU_NOCB_WAKE_BYPASS) { + mod_timer(&rdp_gp->nocb_timer, jiffies + 2); + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); + } else { + if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE) + mod_timer(&rdp_gp->nocb_timer, jiffies + 1); + if (rdp_gp->nocb_defer_wakeup < waketype) + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); + } + + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); +} + +/* + * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. + * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Note that this function always returns true if rhp is NULL. + */ +static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) +{ + struct rcu_cblist rcl; + + WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); + rcu_lockdep_assert_cblist_protected(rdp); + lockdep_assert_held(&rdp->nocb_bypass_lock); + if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { + raw_spin_unlock(&rdp->nocb_bypass_lock); + return false; + } + /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */ + if (rhp) + rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ + rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); + rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl); + WRITE_ONCE(rdp->nocb_bypass_first, j); + rcu_nocb_bypass_unlock(rdp); + return true; +} + +/* + * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. 
+ * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Note that this function always returns true if rhp is NULL. + */ +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j)
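
For context on the first hunks in this series: locktorture's lock_is_read_held flag is converted from bool to atomic_t because several reader kthreads can hold the lock at the same time, so a single flag is both racy and unable to represent more than one reader, whereas a counter lets the writer check that no readers are present. Below is a minimal user-space sketch of that counting pattern; it uses C11 <stdatomic.h> and POSIX rwlocks rather than the kernel's atomic_t and torture APIs, and all names in it are illustrative rather than taken from the patch.

/*
 * Sketch of the reader-count pattern: an atomic counter is incremented
 * on read-lock and decremented on read-unlock, and the writer, which
 * excludes readers, checks that it observes a count of zero.
 */
#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static atomic_int readers_holding;	/* plays the role of lock_is_read_held */

static void *reader(void *arg)
{
	for (int i = 0; i < 100000; i++) {
		pthread_rwlock_rdlock(&lock);
		atomic_fetch_add(&readers_holding, 1);
		/* Read-side critical section would go here. */
		atomic_fetch_sub(&readers_holding, 1);
		pthread_rwlock_unlock(&lock);
	}
	return NULL;
}

static void *writer(void *arg)
{
	for (int i = 0; i < 100000; i++) {
		pthread_rwlock_wrlock(&lock);
		/* No reader may hold the lock while the writer does. */
		assert(atomic_load(&readers_holding) == 0);
		pthread_rwlock_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t r1, r2, w;

	pthread_create(&r1, NULL, reader, NULL);
	pthread_create(&r2, NULL, reader, NULL);
	pthread_create(&w, NULL, writer, NULL);
	pthread_join(r1, NULL);
	pthread_join(r2, NULL);
	pthread_join(w, NULL);
	printf("final reader count: %d\n", atomic_load(&readers_holding));
	return 0;
}

Build with "cc -pthread"; with a plain bool in place of the counter, two overlapping readers would leave the flag false while one of them still held the lock, which is exactly the situation the patch avoids.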
