diff options
Diffstat (limited to 'kernel/rcu')
-rw-r--r-- | kernel/rcu/rcu_segcblist.c | 3 | ||||
-rw-r--r-- | kernel/rcu/rcuscale.c | 15 | ||||
-rw-r--r-- | kernel/rcu/rcutorture.c | 81 | ||||
-rw-r--r-- | kernel/rcu/tasks.h | 40 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 94 | ||||
-rw-r--r-- | kernel/rcu/tree_exp.h | 1 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 252 | ||||
-rw-r--r-- | kernel/rcu/tree_stall.h | 2 |
8 files changed, 342 insertions, 146 deletions
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 7f181c9675f7..aaa111237b60 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -261,8 +261,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) } /* - * Mark the specified rcu_segcblist structure as offloaded. This - * structure must be empty. + * Mark the specified rcu_segcblist structure as offloaded. */ void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) { diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 06491d5530db..dca51fe9c73f 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -625,6 +625,8 @@ rcu_scale_shutdown(void *arg) torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); +torture_param(bool, kfree_rcu_test_double, false, "Do we run a kfree_rcu() double-argument scale test?"); +torture_param(bool, kfree_rcu_test_single, false, "Do we run a kfree_rcu() single-argument scale test?"); static struct task_struct **kfree_reader_tasks; static int kfree_nrealthreads; @@ -644,10 +646,13 @@ kfree_scale_thread(void *arg) struct kfree_obj *alloc_ptr; u64 start_time, end_time; long long mem_begin, mem_during = 0; + bool kfree_rcu_test_both; + DEFINE_TORTURE_RANDOM(tr); VERBOSE_SCALEOUT_STRING("kfree_scale_thread task started"); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); + kfree_rcu_test_both = (kfree_rcu_test_single == kfree_rcu_test_double); start_time = ktime_get_mono_fast_ns(); @@ -670,7 +675,15 @@ kfree_scale_thread(void *arg) if (!alloc_ptr) return -ENOMEM; - kfree_rcu(alloc_ptr, rh); + // By default kfree_rcu_test_single and kfree_rcu_test_double are + // initialized to false. 
If both have the same value (false or true) + // both are randomly tested, otherwise only the one with value true + // is tested. + if ((kfree_rcu_test_single && !kfree_rcu_test_double) || + (kfree_rcu_test_both && torture_random(&tr) & 0x800)) + kfree_rcu(alloc_ptr); + else + kfree_rcu(alloc_ptr, rh); } cond_resched(); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 956e6bfd7e77..29d2f4c647d3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -245,11 +245,11 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) -#define rcu_can_boost() 1 -#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ -#define rcu_can_boost() 0 -#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ +#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_PREEMPT_RT) +# define rcu_can_boost() 1 +#else +# define rcu_can_boost() 0 +#endif #ifdef CONFIG_RCU_TRACE static u64 notrace rcu_trace_clock_local(void) @@ -925,9 +925,13 @@ static void rcu_torture_enable_rt_throttle(void) static bool rcu_torture_boost_failed(unsigned long start, unsigned long end) { + static int dbg_done; + if (end - start > test_boost_duration * HZ - HZ / 2) { VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); n_rcu_torture_boost_failure++; + if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); return true; /* failed */ } @@ -950,8 +954,8 @@ static int rcu_torture_boost(void *arg) init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ do { - /* Track if the test failed already in this test interval? 
*/ - bool failed = false; + bool failed = false; // Test failed already in this test interval + bool firsttime = true; /* Increment n_rcu_torture_boosts once per boost-test */ while (!kthread_should_stop()) { @@ -977,18 +981,17 @@ static int rcu_torture_boost(void *arg) /* Do one boost-test interval. */ endtime = oldstarttime + test_boost_duration * HZ; - call_rcu_time = jiffies; while (time_before(jiffies, endtime)) { /* If we don't have a callback in flight, post one. */ if (!smp_load_acquire(&rbi.inflight)) { /* RCU core before ->inflight = 1. */ smp_store_release(&rbi.inflight, 1); - call_rcu(&rbi.rcu, rcu_torture_boost_cb); + cur_ops->call(&rbi.rcu, rcu_torture_boost_cb); /* Check if the boost test failed */ - failed = failed || - rcu_torture_boost_failed(call_rcu_time, - jiffies); + if (!firsttime && !failed) + failed = rcu_torture_boost_failed(call_rcu_time, jiffies); call_rcu_time = jiffies; + firsttime = false; } if (stutter_wait("rcu_torture_boost")) sched_set_fifo_low(current); @@ -1001,7 +1004,7 @@ static int rcu_torture_boost(void *arg) * this case the boost check would never happen in the above * loop so do another one here. */ - if (!failed && smp_load_acquire(&rbi.inflight)) + if (!firsttime && !failed && smp_load_acquire(&rbi.inflight)) rcu_torture_boost_failed(call_rcu_time, jiffies); /* @@ -1027,6 +1030,9 @@ checkwait: if (stutter_wait("rcu_torture_boost")) sched_set_fifo_low(current); } while (!torture_must_stop()); + while (smp_load_acquire(&rbi.inflight)) + schedule_timeout_uninterruptible(1); // rcu_barrier() deadlocks. + /* Clean up and exit. 
*/ while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) { torture_shutdown_absorb("rcu_torture_boost"); @@ -1791,7 +1797,7 @@ rcu_torture_stats_print(void) WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier() WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio - WARN_ON_ONCE(n_rcu_torture_boost_failure); // RCU boost failed + WARN_ON_ONCE(n_rcu_torture_boost_failure); // boost failed (TIMER_SOFTIRQ RT prio?) WARN_ON_ONCE(i > 1); // Too-short grace period } pr_cont("Reader Pipe: "); @@ -1855,6 +1861,45 @@ rcu_torture_stats(void *arg) torture_shutdown_absorb("rcu_torture_stats"); } while (!torture_must_stop()); torture_kthread_stopping("rcu_torture_stats"); + + { + struct rcu_head *rhp; + struct kmem_cache *kcp; + static int z; + + kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL); + rhp = kmem_cache_alloc(kcp, GFP_KERNEL); + pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z); + pr_alert("mem_dump_obj(ZERO_SIZE_PTR):"); + mem_dump_obj(ZERO_SIZE_PTR); + pr_alert("mem_dump_obj(NULL):"); + mem_dump_obj(NULL); + pr_alert("mem_dump_obj(%px):", &rhp); + mem_dump_obj(&rhp); + pr_alert("mem_dump_obj(%px):", rhp); + mem_dump_obj(rhp); + pr_alert("mem_dump_obj(%px):", &rhp->func); + mem_dump_obj(&rhp->func); + pr_alert("mem_dump_obj(%px):", &z); + mem_dump_obj(&z); + kmem_cache_free(kcp, rhp); + kmem_cache_destroy(kcp); + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); + pr_alert("mem_dump_obj(kmalloc %px):", rhp); + mem_dump_obj(rhp); + pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func); + mem_dump_obj(&rhp->func); + kfree(rhp); + rhp = vmalloc(4096); + pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); + 
pr_alert("mem_dump_obj(vmalloc %px):", rhp); + mem_dump_obj(rhp); + pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func); + mem_dump_obj(&rhp->func); + vfree(rhp); + } + return 0; } @@ -1965,8 +2010,8 @@ static int rcu_torture_stall(void *args) local_irq_disable(); else if (!stall_cpu_block) preempt_disable(); - pr_alert("rcu_torture_stall start on CPU %d.\n", - raw_smp_processor_id()); + pr_alert("%s start on CPU %d.\n", + __func__, raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at)) if (stall_cpu_block) @@ -1977,7 +2022,7 @@ static int rcu_torture_stall(void *args) preempt_enable(); cur_ops->readunlock(idx); } - pr_alert("rcu_torture_stall end.\n"); + pr_alert("%s end.\n", __func__); torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); @@ -2589,6 +2634,8 @@ static bool rcu_torture_can_boost(void) if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2) return false; + if (!cur_ops->call) + return false; prio = rcu_get_gp_kthreads_prio(); if (!prio) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index af7c19439f4e..350ebf5051f9 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -20,7 +20,7 @@ typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); typedef void (*postgp_func_t)(struct rcu_tasks *rtp); /** - * Definition for a Tasks-RCU-like mechanism. + * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. * @cbs_head: Head of callback list. * @cbs_tail: Tail pointer for callback list. * @cbs_wq: Wait queue allowning new callback to get kthread's attention. @@ -38,7 +38,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @pregp_func: This flavor's pre-grace-period function (optional). * @pertask_func: This flavor's per-task scan function (optional). * @postscan_func: This flavor's post-task scan function (optional). - * @holdout_func: This flavor's holdout-list scan function (optional). 
+ * @holdouts_func: This flavor's holdout-list scan function (optional). * @postgp_func: This flavor's post-grace-period function (optional). * @call_func: This flavor's call_rcu()-equivalent function. * @name: This flavor's textual name. @@ -726,6 +726,42 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); // flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace // readers can operate from idle, offline, and exception entry/exit in no // way allows rcu_preempt and rcu_sched readers to also do so. +// +// The implementation uses rcu_tasks_wait_gp(), which relies on function +// pointers in the rcu_tasks structure. The rcu_spawn_tasks_trace_kthread() +// function sets these function pointers up so that rcu_tasks_wait_gp() +// invokes these functions in this order: +// +// rcu_tasks_trace_pregp_step(): +// Initialize the count of readers and block CPU-hotplug operations. +// rcu_tasks_trace_pertask(), invoked on every non-idle task: +// Initialize per-task state and attempt to identify an immediate +// quiescent state for that task, or, failing that, attempt to +// set that task's .need_qs flag so that task's next outermost +// rcu_read_unlock_trace() will report the quiescent state (in which +// case the count of readers is incremented). If both attempts fail, +// the task is added to a "holdout" list. +// rcu_tasks_trace_postscan(): +// Initialize state and attempt to identify an immediate quiescent +// state as above (but only for idle tasks), unblock CPU-hotplug +// operations, and wait for an RCU grace period to avoid races with +// tasks that are in the process of exiting. +// check_all_holdout_tasks_trace(), repeatedly until holdout list is empty: +// Scans the holdout list, attempting to identify a quiescent state +// for each task on the list. If there is a quiescent state, the +// corresponding task is removed from the holdout list. +// rcu_tasks_trace_postgp(): +// Wait for the count of readers to drop to zero, reporting any stalls. 
+// Also execute full memory barriers to maintain ordering with code +// executing after the grace period. +// +// The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks. +// +// Pre-grace-period update-side code is ordered before the grace +// period via the ->cbs_lock and barriers in rcu_tasks_kthread(). +// Pre-grace-period read-side code is ordered before the grace period by +// atomic_dec_and_test() of the count of readers (for IPIed readers) and by +// scheduler context-switch ordering (for locked-down non-running readers). // The lockdep state must be outside of #ifdef to be useful. #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 07e812261474..8e78b2430c16 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -156,6 +156,7 @@ static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); +static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); /* rcuc/rcub kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 
1 : 0; @@ -648,7 +649,6 @@ static noinstr void rcu_eqs_enter(bool user) instrumentation_begin(); trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - rdp = this_cpu_ptr(&rcu_data); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); @@ -1077,7 +1077,6 @@ noinstr void rcu_nmi_enter(void) } else if (!in_nmi()) { instrumentation_begin(); rcu_irq_enter_check_tick(); - instrumentation_end(); } else { instrumentation_begin(); } @@ -1672,7 +1671,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { bool ret = false; bool need_qs; - const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_rdp_is_offloaded(rdp); raw_lockdep_assert_held_rcu_node(rnp); @@ -2128,7 +2127,7 @@ static void rcu_gp_cleanup(void) needgp = true; } /* Advance CBs to reduce false positives below. */ - offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); + offloaded = rcu_rdp_is_offloaded(rdp); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); WRITE_ONCE(rcu_state.gp_req_activity, jiffies); @@ -2327,7 +2326,7 @@ rcu_report_qs_rdp(struct rcu_data *rdp) unsigned long flags; unsigned long mask; bool needwake = false; - const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_rdp_is_offloaded(rdp); struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); @@ -2414,7 +2413,7 @@ int rcutree_dying_cpu(unsigned int cpu) blkd = !!(rnp->qsmask & rdp->grpmask); trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), - blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); + blkd ? 
TPS("cpuofl-bgp") : TPS("cpuofl")); return 0; } @@ -2497,7 +2496,7 @@ static void rcu_do_batch(struct rcu_data *rdp) int div; bool __maybe_unused empty; unsigned long flags; - const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_rdp_is_offloaded(rdp); struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count = 0; @@ -3066,7 +3065,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); /* Go handle any RCU core processing required. */ - if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { + if (unlikely(rcu_rdp_is_offloaded(rdp))) { __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ } else { __call_rcu_core(rdp, head, flags); @@ -3229,8 +3228,7 @@ krc_this_cpu_lock(unsigned long *flags) static inline void krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) { - raw_spin_unlock(&krcp->lock); - local_irq_restore(flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } static inline struct kvfree_rcu_bulk_data * @@ -3464,7 +3462,7 @@ static void fill_page_cache_func(struct work_struct *work) for (i = 0; i < rcu_min_cached_objs; i++) { bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_KERNEL | __GFP_NOWARN); + __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); if (bnode) { raw_spin_lock_irqsave(&krcp->lock, flags); @@ -3493,37 +3491,62 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp) } } +// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock() +// state specified by flags. If can_alloc is true, the caller must +// be schedulable and not be holding any locks or mutexes that might be +// acquired by the memory allocator or anything that it might invoke. +// Returns true if ptr was successfully recorded, else the caller must +// use a fallback. 
static inline bool -kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) +add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, + unsigned long *flags, void *ptr, bool can_alloc) { struct kvfree_rcu_bulk_data *bnode; int idx; - if (unlikely(!krcp->initialized)) + *krcp = krc_this_cpu_lock(flags); + if (unlikely(!(*krcp)->initialized)) return false; - lockdep_assert_held(&krcp->lock); idx = !!is_vmalloc_addr(ptr); /* Check if a new block is required. */ - if (!krcp->bkvhead[idx] || - krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { - bnode = get_cached_bnode(krcp); - /* Switch to emergency path. */ + if (!(*krcp)->bkvhead[idx] || + (*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { + bnode = get_cached_bnode(*krcp); + if (!bnode && can_alloc) { + krc_this_cpu_unlock(*krcp, *flags); + + // __GFP_NORETRY - allows a light-weight direct reclaim + // what is OK from minimizing of fallback hitting point of + // view. Apart of that it forbids any OOM invoking what is + // also beneficial since we are about to release memory soon. + // + // __GFP_NOMEMALLOC - prevents from consuming of all the + // memory reserves. Please note we have a fallback path. + // + // __GFP_NOWARN - it is supposed that an allocation can + // be failed under low memory or high memory pressure + // scenarios. + bnode = (struct kvfree_rcu_bulk_data *) + __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + *krcp = krc_this_cpu_lock(flags); + } + if (!bnode) return false; /* Initialize the new block. */ bnode->nr_records = 0; - bnode->next = krcp->bkvhead[idx]; + bnode->next = (*krcp)->bkvhead[idx]; /* Attach it to the head. */ - krcp->bkvhead[idx] = bnode; + (*krcp)->bkvhead[idx] = bnode; } /* Finally insert. 
*/ - krcp->bkvhead[idx]->records - [krcp->bkvhead[idx]->nr_records++] = ptr; + (*krcp)->bkvhead[idx]->records + [(*krcp)->bkvhead[idx]->nr_records++] = ptr; return true; } @@ -3561,8 +3584,6 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) ptr = (unsigned long *) func; } - krcp = krc_this_cpu_lock(&flags); - // Queue the object but don't yet schedule the batch. if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. @@ -3570,12 +3591,11 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) __func__, head); // Mark as success and leave. - success = true; - goto unlock_return; + return; } kasan_record_aux_stack(ptr); - success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); + success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); if (!success) { run_page_cache_worker(krcp); @@ -3904,13 +3924,13 @@ static int rcu_pending(int user) return 1; /* Does this CPU have callbacks ready to invoke? */ - if (!rcu_segcblist_is_offloaded(&rdp->cblist) && + if (!rcu_rdp_is_offloaded(rdp) && rcu_segcblist_ready_cbs(&rdp->cblist)) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) && - !rcu_segcblist_is_offloaded(&rdp->cblist) && + !rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; @@ -4029,7 +4049,7 @@ void rcu_barrier(void) for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); if (cpu_is_offline(cpu) && - !rcu_segcblist_is_offloaded(&rdp->cblist)) + !rcu_rdp_is_offloaded(rdp)) continue; if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) { rcu_barrier_trace(TPS("OnlineQ"), cpu, @@ -4144,15 +4164,13 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. 
*/ + /* - * Lock in case the CB/GP kthreads are still around handling - * old callbacks (longer term we should flush all callbacks - * before completing CPU offline) + * Only non-NOCB CPUs that didn't have early-boot callbacks need to be + * (re-)initialized. */ - rcu_nocb_lock(rdp); - if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? */ + if (!rcu_segcblist_is_enabled(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ - rcu_nocb_unlock(rdp); /* * Add CPU to leaf rcu_node pending-online bitmask. Any needed @@ -4352,7 +4370,7 @@ void rcutree_migrate_callbacks(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); bool needwake; - if (rcu_segcblist_is_offloaded(&rdp->cblist) || + if (rcu_rdp_is_offloaded(rdp) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ @@ -4370,7 +4388,7 @@ void rcutree_migrate_callbacks(int cpu) rcu_segcblist_disable(&rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); - if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) { + if (rcu_rdp_is_offloaded(my_rdp)) { raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ __call_rcu_nocb_wake(my_rdp, true, flags); } else { diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6c6ff06d4ae6..2796084ef85a 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -521,6 +521,7 @@ static void synchronize_rcu_expedited_wait(void) if (rcu_stall_is_suppressed()) continue; panic_on_rcu_stall(); + trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); ndetected = 0; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2d603771c7dc..ad0156b86937 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -16,8 +16,70 @@ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. 
*/ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ +static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) +{ + return lockdep_is_held(&rdp->nocb_lock); +} + +static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) +{ + /* Race on early boot between thread creation and assignment */ + if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread) + return true; + + if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread) + if (in_task()) + return true; + return false; +} + +static inline bool rcu_running_nocb_timer(struct rcu_data *rdp) +{ + return (timer_curr_running(&rdp->nocb_timer) && !in_irq()); +} +#else +static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) +{ + return 0; +} + +static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) +{ + return false; +} + +static inline bool rcu_running_nocb_timer(struct rcu_data *rdp) +{ + return false; +} + #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ +static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) +{ + /* + * In order to read the offloaded state of an rdp in a safe + * and stable way and prevent its value from being changed + * under us, we must either hold the barrier mutex, the cpu + * hotplug lock (read or write) or the nocb lock. Local + * non-preemptible reads are also safe. NOCB kthreads and + * timers have their own means of synchronization against the + * offloaded state updaters. 
+ */ + RCU_LOCKDEP_WARN( + !(lockdep_is_held(&rcu_state.barrier_mutex) || + (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || + rcu_lockdep_is_held_nocb(rdp) || + (rdp == this_cpu_ptr(&rcu_data) && + !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) || + rcu_current_is_nocb_kthread(rdp) || + rcu_running_nocb_timer(rdp)), + "Unsafe read of RCU_NOCB offloaded state" + ); + + return rcu_segcblist_is_offloaded(&rdp->cblist); +} + /* * Check the RCU kernel configuration parameters and print informative * messages about anything out of the ordinary. @@ -393,8 +455,9 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; + barrier(); // critical section before exit code. if (rcu_preempt_read_exit() == 0) { - barrier(); /* critical section before exit code. */ + barrier(); // critical-section exit before .s check. if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); } @@ -598,9 +661,9 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) static void rcu_read_unlock_special(struct task_struct *t) { unsigned long flags; + bool irqs_were_disabled; bool preempt_bh_were_disabled = !!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); - bool irqs_were_disabled; /* NMI handlers cannot block and cannot safely manipulate state. */ if (in_nmi()) @@ -609,30 +672,33 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { - bool exp; + bool expboost; // Expedited GP in flight or possible boosting. 
struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - exp = (t->rcu_blocked_node && - READ_ONCE(t->rcu_blocked_node->exp_tasks)) || - (rdp->grpmask & READ_ONCE(rnp->expmask)); + expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) || + (rdp->grpmask & READ_ONCE(rnp->expmask)) || + IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) || + (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && + t->rcu_blocked_node); // Need to defer quiescent state until everything is enabled. - if (use_softirq && (in_irq() || (exp && !irqs_were_disabled))) { + if (use_softirq && (in_irq() || (expboost && !irqs_were_disabled))) { // Using softirq, safe to awaken, and either the - // wakeup is free or there is an expedited GP. + // wakeup is free or there is either an expedited + // GP in flight or a potential need to deboost. raise_softirq_irqoff(RCU_SOFTIRQ); } else { // Enabling BH or preempt does reschedule, so... - // Also if no expediting, slow is OK. - // Plus nohz_full CPUs eventually get tick enabled. + // Also if no expediting and no possible deboosting, + // slow is OK. Plus nohz_full CPUs eventually get + // tick enabled. set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && - !rdp->defer_qs_iw_pending && exp && cpu_online(rdp->cpu)) { + expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. 
- init_irq_work(&rdp->defer_qs_iw, - rcu_preempt_deferred_qs_handler); + init_irq_work(&rdp->defer_qs_iw, rcu_preempt_deferred_qs_handler); rdp->defer_qs_iw_pending = true; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } @@ -1257,7 +1323,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) { *nextevt = KTIME_MAX; return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && - !rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist); + !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); } /* @@ -1352,7 +1418,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) /* If no non-offloaded callbacks, RCU doesn't need the CPU. */ if (rcu_segcblist_empty(&rdp->cblist) || - rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist)) { + rcu_rdp_is_offloaded(rdp)) { *nextevt = KTIME_MAX; return 0; } @@ -1388,7 +1454,7 @@ static void rcu_prepare_for_idle(void) int tne; lockdep_assert_irqs_disabled(); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) + if (rcu_rdp_is_offloaded(rdp)) return; /* Handle nohz enablement switches conservatively. */ @@ -1429,7 +1495,7 @@ static void rcu_cleanup_after_idle(void) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); lockdep_assert_irqs_disabled(); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) + if (rcu_rdp_is_offloaded(rdp)) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); @@ -1464,14 +1530,12 @@ static void rcu_cleanup_after_idle(void) /* * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. - * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a - * comma-separated list of CPUs and/or CPU ranges. If an invalid list is - * given, a warning is emitted and all CPUs are offloaded. + * If the list is invalid, a warning is emitted and all CPUs are offloaded. 
*/ static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - if (!strcasecmp(str, "all")) + if (!strcasecmp(str, "all")) /* legacy: use "0-N" instead */ cpumask_setall(rcu_nocb_mask); else if (cpulist_parse(str, rcu_nocb_mask)) { @@ -1494,7 +1558,7 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); * After all, the main point of bypassing is to avoid lock contention * on ->nocb_lock, which only can happen at high call_rcu() rates. */ -int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; +static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; module_param(nocb_nobypass_lim_per_jiffy, int, 0); /* @@ -1560,7 +1624,7 @@ static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) static void rcu_nocb_lock(struct rcu_data *rdp) { lockdep_assert_irqs_disabled(); - if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + if (!rcu_rdp_is_offloaded(rdp)) return; raw_spin_lock(&rdp->nocb_lock); } @@ -1571,7 +1635,7 @@ static void rcu_nocb_lock(struct rcu_data *rdp) */ static void rcu_nocb_unlock(struct rcu_data *rdp) { - if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (rcu_rdp_is_offloaded(rdp)) { lockdep_assert_irqs_disabled(); raw_spin_unlock(&rdp->nocb_lock); } @@ -1584,7 +1648,7 @@ static void rcu_nocb_unlock(struct rcu_data *rdp) static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, unsigned long flags) { - if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (rcu_rdp_is_offloaded(rdp)) { lockdep_assert_irqs_disabled(); raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } else { @@ -1596,7 +1660,7 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) { lockdep_assert_irqs_disabled(); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) + if (rcu_rdp_is_offloaded(rdp)) lockdep_assert_held(&rdp->nocb_lock); } @@ -1641,12 +1705,16 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force, lockdep_assert_held(&rdp->nocb_lock); if 
(!READ_ONCE(rdp_gp->nocb_gp_kthread)) { + rcu_nocb_unlock_irqrestore(rdp, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("AlreadyAwake")); - rcu_nocb_unlock_irqrestore(rdp, flags); return false; } - del_timer(&rdp->nocb_timer); + + if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) { + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&rdp->nocb_timer); + } rcu_nocb_unlock_irqrestore(rdp, flags); raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { @@ -1690,7 +1758,7 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, { struct rcu_cblist rcl; - WARN_ON_ONCE(!rcu_segcblist_is_offloaded(&rdp->cblist)); + WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); rcu_lockdep_assert_cblist_protected(rdp); lockdep_assert_held(&rdp->nocb_bypass_lock); if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { @@ -1718,7 +1786,7 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, unsigned long j) { - if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + if (!rcu_rdp_is_offloaded(rdp)) return true; rcu_lockdep_assert_cblist_protected(rdp); rcu_nocb_bypass_lock(rdp); @@ -1732,7 +1800,7 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) { rcu_lockdep_assert_cblist_protected(rdp); - if (!rcu_segcblist_is_offloaded(&rdp->cblist) || + if (!rcu_rdp_is_offloaded(rdp) || !rcu_nocb_bypass_trylock(rdp)) return; WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j)); @@ -1764,11 +1832,22 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, unsigned long j = jiffies; long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - if (!rcu_segcblist_is_offloaded(&rdp->cblist)) { + lockdep_assert_irqs_disabled(); + + // Pure softirq/rcuc based processing: no bypassing, no + // 
locking. + if (!rcu_rdp_is_offloaded(rdp)) { + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; + } + + // In the process of (de-)offloading: no bypassing, but + // locking. + if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { + rcu_nocb_lock(rdp); *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); return false; /* Not offloaded, no bypassing. */ } - lockdep_assert_irqs_disabled(); // Don't use ->nocb_bypass during early boot. if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { @@ -1878,9 +1957,9 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, // If we are being polled or there is no kthread, just leave. t = READ_ONCE(rdp->nocb_gp_kthread); i |