diff options
29 files changed, 644 insertions, 203 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index dbc22d684627..6d16b78d5875 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3980,6 +3980,15 @@ Set threshold of queued RCU callbacks below which batch limiting is re-enabled. + rcutree.qovld= [KNL] + Set threshold of queued RCU callbacks beyond which + RCU's force-quiescent-state scan will aggressively + enlist help from cond_resched() and sched IPIs to + help CPUs more quickly reach quiescent states. + Set to less than zero to make this be set based + on rcutree.qhimark at boot time and to zero to + disable more aggressive help enlistment. + rcutree.rcu_idle_gp_delay= [KNL] Set wakeup interval for idle CPUs that have RCU callbacks (RCU_FAST_NO_HZ=y). @@ -4195,6 +4204,12 @@ rcupdate.rcu_cpu_stall_suppress= [KNL] Suppress RCU CPU stall warning messages. + rcupdate.rcu_cpu_stall_suppress_at_boot= [KNL] + Suppress RCU CPU stall warning messages and + rcutorture writer stall warnings that occur + during early boot, that is, during the time + before the init task is spawned. + rcupdate.rcu_cpu_stall_timeout= [KNL] Set timeout for RCU CPU stall warning messages. @@ -4867,6 +4882,10 @@ topology updates sent by the hypervisor to this LPAR. + torture.disable_onoff_at_boot= [KNL] + Prevent the CPU-hotplug component of torturing + until after init has spawned. + tp720= [HW,PS2] tpm_suspend_pcr=[HW,TPM] diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 1320288ff9ec..55a29b0d52fc 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2383,7 +2383,7 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre rcu_read_lock(); if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) goto out; - lh = rcu_dereference(nfsi->access_cache_entry_lru.prev); + lh = rcu_dereference(list_tail_rcu(&nfsi->access_cache_entry_lru)); cache = list_entry(lh, struct nfs_access_entry, lru); if (lh == &nfsi->access_cache_entry_lru || cred_fscmp(cred, cache->cred) != 0) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 9f313e4999fe..8214cdc715f2 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -60,9 +60,9 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) #define __list_check_rcu(dummy, cond, extra...) \ ({ \ check_arg_count_one(extra); \ - RCU_LOCKDEP_WARN(!cond && !rcu_read_lock_any_held(), \ + RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(), \ "RCU-list traversed in non-reader section!"); \ - }) + }) #else #define __list_check_rcu(dummy, cond, extra...) \ ({ check_arg_count_one(extra); }) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index b2b2dc990da9..045c28b71f4f 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -83,6 +83,7 @@ void rcu_scheduler_starting(void); static inline void rcu_scheduler_starting(void) { } #endif /* #else #ifndef CONFIG_SRCU */ static inline void rcu_end_inkernel_boot(void) { } +static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_dyntick_idle(void) { } static inline void kfree_rcu_scheduler_running(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 2f787b9029d1..45f3f66bb04d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -54,6 +54,7 @@ void exit_rcu(void); void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; void rcu_end_inkernel_boot(void); +bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); #ifndef CONFIG_PREEMPTION void rcu_all_qs(void); diff --git a/include/linux/timer.h b/include/linux/timer.h index 1e6650ed066d..0dc19a8c39c9 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -164,7 +164,7 @@ static inline void destroy_timer_on_stack(struct timer_list *timer) { } */ static inline int timer_pending(const struct timer_list * timer) { - return timer->entry.pprev != NULL; + return !hlist_unhashed_lockless(&timer->entry); } extern void add_timer_on(struct timer_list *timer, int cpu); diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 5e49b06e8104..f9a7811148e2 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -624,6 +624,34 @@ TRACE_EVENT_RCU(rcu_invoke_kfree_callback, ); /* + * Tracepoint for the invocation of a single RCU callback of the special + * kfree_bulk() form. The first argument is the RCU flavor, the second + * argument is a number of elements in array to free, the third is an + * address of the array holding nr_records entries. + */ +TRACE_EVENT_RCU(rcu_invoke_kfree_bulk_callback, + + TP_PROTO(const char *rcuname, unsigned long nr_records, void **p), + + TP_ARGS(rcuname, nr_records, p), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(unsigned long, nr_records) + __field(void **, p) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->nr_records = nr_records; + __entry->p = p; + ), + + TP_printk("%s bulk=0x%p nr_records=%lu", + __entry->rcuname, __entry->p, __entry->nr_records) +); + +/* * Tracepoint for exiting rcu_do_batch after RCU callbacks have been * invoked. The first argument is the name of the RCU flavor, * the second argument is number of callbacks actually invoked, @@ -712,6 +740,7 @@ TRACE_EVENT_RCU(rcu_torture_read, * "Begin": rcu_barrier() started. * "EarlyExit": rcu_barrier() piggybacked, thus early exit. * "Inc1": rcu_barrier() piggyback check counter incremented. + * "OfflineNoCBQ": rcu_barrier() found offline no-CBs CPU with callbacks. * "OnlineQ": rcu_barrier() found online CPU with callbacks. * "OnlineNQ": rcu_barrier() found online CPU, no callbacks. * "IRQ": An rcu_barrier_callback() callback posted on remote CPU. diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 99475a66c94f..5efbfc68ce99 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -618,7 +618,7 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = { static int lock_torture_writer(void *arg) { struct lock_stress_stats *lwsp = arg; - static DEFINE_TORTURE_RANDOM(rand); + DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_writer task started"); set_user_nice(current, MAX_NICE); @@ -655,7 +655,7 @@ static int lock_torture_writer(void *arg) static int lock_torture_reader(void *arg) { struct lock_stress_stats *lrsp = arg; - static DEFINE_TORTURE_RANDOM(rand); + DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_reader task started"); set_user_nice(current, MAX_NICE); @@ -696,15 +696,16 @@ static void __torture_print_stats(char *page, if (statp[i].n_lock_fail) fail = true; sum += statp[i].n_lock_acquired; - if (max < statp[i].n_lock_fail) - max = statp[i].n_lock_fail; - if (min > statp[i].n_lock_fail) - min = statp[i].n_lock_fail; + if (max < statp[i].n_lock_acquired) + max = statp[i].n_lock_acquired; + if (min > statp[i].n_lock_acquired) + min = statp[i].n_lock_acquired; } page += sprintf(page, "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", write ? "Writes" : "Reads ", - sum, max, min, max / 2 > min ? "???" : "", + sum, max, min, + !onoff_interval && max / 2 > min ? "???" : "", fail, fail ? "!!!" : ""); if (fail) atomic_inc(&cxt.n_lock_torture_errors); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 851bbb10819d..c9f090d64f00 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -57,7 +57,7 @@ rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) if (rt_mutex_has_waiters(lock)) val |= RT_MUTEX_HAS_WAITERS; - lock->owner = (struct task_struct *)val; + WRITE_ONCE(lock->owner, (struct task_struct *)val); } static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 82d5fba48b2f..f91f2c2cf138 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,6 +3,10 @@ # and is generally not a function of system call inputs. KCOV_INSTRUMENT := n +ifeq ($(CONFIG_KCSAN),y) +KBUILD_CFLAGS += -g -fno-omit-frame-pointer +endif + obj-y += update.o sync.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 05f936ed167a..00ddc92c5774 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -198,6 +198,13 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +extern int rcu_cpu_stall_suppress_at_boot; + +static inline bool rcu_stall_is_suppressed_at_boot(void) +{ + return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended(); +} + #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_ftrace_dump; @@ -205,6 +212,11 @@ extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); +static inline bool rcu_stall_is_suppressed(void) +{ + return rcu_stall_is_suppressed_at_boot() || rcu_cpu_stall_suppress; +} + #define rcu_ftrace_dump_stall_suppress() \ do { \ if (!rcu_cpu_stall_suppress) \ @@ -218,6 +230,11 @@ do { \ } while (0) #else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */ + +static inline bool rcu_stall_is_suppressed(void) +{ + return rcu_stall_is_suppressed_at_boot(); +} #define rcu_ftrace_dump_stall_suppress() #define rcu_ftrace_dump_stall_unsuppress() #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ @@ -325,7 +342,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * Iterate over all possible CPUs in a leaf RCU node. */ #define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ + for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \ + (cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ (cpu) <= rnp->grphi; \ (cpu) = cpumask_next((cpu), cpu_possible_mask)) @@ -335,7 +353,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) #define rcu_find_next_bit(rnp, cpu, mask) \ ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu))) #define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \ - for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ + for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \ + (cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ (cpu) <= rnp->grphi; \ (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask))) diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 5f4fd3b8777c..9a0f66133b4b 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -182,7 +182,7 @@ void rcu_segcblist_offload(struct rcu_segcblist *rsclp) bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) { return rcu_segcblist_is_enabled(rsclp) && - &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; + &rsclp->head != READ_ONCE(rsclp->tails[RCU_DONE_TAIL]); } /* @@ -381,8 +381,6 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, return; /* Nothing to do. */ WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head); WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail); - rclp->head = NULL; - rclp->tail = &rclp->head; } /* diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index da94b89cd531..a4a8d097d84d 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/init.h> +#include <linux/mm.h> #include <linux/module.h> #include <linux/kthread.h> #include <linux/err.h> @@ -611,6 +612,7 @@ kfree_perf_thread(void *arg) long me = (long)arg; struct kfree_obj *alloc_ptr; u64 start_time, end_time; + long long mem_begin, mem_during = 0; VERBOSE_PERFOUT_STRING("kfree_perf_thread task started"); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); @@ -626,6 +628,12 @@ kfree_perf_thread(void *arg) } do { + if (!mem_during) { + mem_during = mem_begin = si_mem_available(); + } else if (loop % (kfree_loops / 4) == 0) { + mem_during = (mem_during + si_mem_available()) / 2; + } + for (i = 0; i < kfree_alloc_num; i++) { alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL); if (!alloc_ptr) @@ -645,9 +653,11 @@ kfree_perf_thread(void *arg) else b_rcu_gp_test_finished = cur_ops->get_gp_seq(); - pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld\n", + pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", (unsigned long long)(end_time - start_time), kfree_loops, - rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started)); + rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), + (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); + if (shutdown) { smp_mb(); /* Assign before wake. */ wake_up(&shutdown_wq); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1aeecc165b21..5453bd557f43 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -339,7 +339,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!rcu_fwd_cb_nodelay && + if (!READ_ONCE(rcu_fwd_cb_nodelay) && !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); @@ -375,11 +375,12 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp) { int i; - i = rp->rtort_pipe_count; + i = READ_ONCE(rp->rtort_pipe_count); if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + WRITE_ONCE(rp->rtort_pipe_count, i + 1); + if (rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; return true; } @@ -1015,7 +1016,8 @@ rcu_torture_writer(void *arg) if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); - old_rp->rtort_pipe_count++; + WRITE_ONCE(old_rp->rtort_pipe_count, + old_rp->rtort_pipe_count + 1); switch (synctype[torture_random(&rand) % nsynctypes]) { case RTWS_DEF_FREE: rcu_torture_writer_state = RTWS_DEF_FREE; @@ -1067,7 +1069,8 @@ rcu_torture_writer(void *arg) if (stutter_wait("rcu_torture_writer") && !READ_ONCE(rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && - !torture_must_stop()) + !torture_must_stop() && + rcu_inkernel_boot_has_ended()) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != @@ -1290,7 +1293,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) atomic_inc(&n_rcu_torture_mberror); rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp); preempt_disable(); - pipe_count = p->rtort_pipe_count; + pipe_count = READ_ONCE(p->rtort_pipe_count); if (pipe_count > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; @@ -1404,14 +1407,15 @@ rcu_torture_stats_print(void) int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + struct rcu_torture *rtcp; static unsigned long rtcv_snap = ULONG_MAX; static bool splatted; struct task_struct *wtp; for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; - batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; + pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]); + batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]); } } for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { @@ -1420,9 +1424,10 @@ rcu_torture_stats_print(void) } pr_alert("%s%s ", torture_type, TORTURE_FLAG); + rtcp = rcu_access_pointer(rcu_torture_current); pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", - rcu_torture_current, - rcu_torture_current ? "ver" : "VER", + rtcp, + rtcp && !rcu_stall_is_suppressed_at_boot() ? "ver" : "VER", rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), @@ -1478,7 +1483,8 @@ rcu_torture_stats_print(void) if (cur_ops->stats) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && - rcu_torture_current != NULL) { + rcu_access_pointer(rcu_torture_current) && + !rcu_stall_is_suppressed()) { int __maybe_unused flags = 0; unsigned long __maybe_unused gp_seq = 0; @@ -1993,8 +1999,11 @@ static int rcu_torture_fwd_prog(void *args) schedule_timeout_interruptible(fwd_progress_holdoff * HZ); WRITE_ONCE(rcu_fwd_emergency_stop, false); register_oom_notifier(&rcutorture_oom_nb); - rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); - rcu_torture_fwd_prog_cr(rfp); + if (!IS_ENABLED(CONFIG_TINY_RCU) || + rcu_inkernel_boot_has_ended()) + rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); + if (rcu_inkernel_boot_has_ended()) + rcu_torture_fwd_prog_cr(rfp); unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. */ @@ -2044,6 +2053,14 @@ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) atomic_inc(&barrier_cbs_invoked); } +/* IPI handler to get callback posted on desired CPU, if online. */ +static void rcu_torture_barrier1cb(void *rcu_void) +{ + struct rcu_head *rhp = rcu_void; + + cur_ops->call(rhp, rcu_torture_barrier_cbf); +} + /* kthread function to register callbacks used to test RCU barriers. */ static int rcu_torture_barrier_cbs(void *arg) { @@ -2067,9 +2084,11 @@ static int rcu_torture_barrier_cbs(void *arg) * The above smp_load_acquire() ensures barrier_phase load * is ordered before the following ->call(). */ - local_irq_disable(); /* Just to test no-irq call_rcu(). */ - cur_ops->call(&rcu, rcu_torture_barrier_cbf); - local_irq_enable(); + if (smp_call_function_single(myid, rcu_torture_barrier1cb, + &rcu, 1)) { + // IPI failed, so use direct call from current CPU. + cur_ops->call(&rcu, rcu_torture_barrier_cbf); + } if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); @@ -2105,7 +2124,21 @@ static int rcu_torture_barrier(void *arg) pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n", atomic_read(&barrier_cbs_invoked), n_barrier_cbs); - WARN_ON_ONCE(1); + WARN_ON(1); + // Wait manually for the remaining callbacks + i = 0; + do { + if (WARN_ON(i++ > HZ)) + i = INT_MIN; + schedule_timeout_interruptible(1); + cur_ops->cb_barrier(); + } while (atomic_read(&barrier_cbs_invoked) != + n_barrier_cbs && + !torture_must_stop()); + smp_mb(); // Can't trust ordering if broken. + if (!torture_must_stop()) + pr_err("Recovered: barrier_cbs_invoked = %d\n", + atomic_read(&barrier_cbs_invoked)); } else { n_barrier_successes++; } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 657e6a7d1c03..0c71505f0e19 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -5,7 +5,7 @@ * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 * - * Author: Paul McKenney <paulmck@linux.ibm.com> + * Authors: Paul McKenney <paulmck@linux.ibm.com> * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - @@ -450,7 +450,7 @@ static void srcu_gp_start(struct srcu_struct *ssp) spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ rcu_seq_start(&ssp->srcu_gp_seq); - state = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); + state = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } @@ -534,7 +534,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) - ssp->srcu_gp_seq_needed_exp = gpseq; + WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq); spin_unlock_irq_rcu_node(ssp); mutex_unlock(&ssp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -550,7 +550,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) - snp->srcu_gp_seq_needed_exp = gpseq; + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; spin_unlock_irq_rcu_node(snp); @@ -614,7 +614,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp } spin_lock_irqsave_rcu_node(ssp, flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - ssp->srcu_gp_seq_needed_exp = s; + WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(ssp, flags); } @@ -660,7 +660,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, if (snp == sdp->mynode) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) - snp->srcu_gp_seq_needed_exp = s; + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); } @@ -674,7 +674,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/ } if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - ssp->srcu_gp_seq_needed_exp = s; + WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); /* If grace period not already done and none in progress, start it. */ if (!rcu_seq_done(&ssp->srcu_gp_seq, s) && @@ -1079,7 +1079,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return ssp->srcu_idx; + return READ_ONCE(ssp->srcu_idx); } EXPORT_SYMBOL_GPL(srcu_batches_completed); @@ -1130,7 +1130,9 @@ static void srcu_advance_state(struct srcu_struct *ssp) return; /* readers present, retry later. */ } srcu_flip(ssp); + spin_lock_irq_rcu_node(ssp); rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); + spin_unlock_irq_rcu_node(ssp); } if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d91c9156fab2..550193a9ce76 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Read-Copy Update mechanism for mutual exclusion + * Read-Copy Update mechanism for mutual exclusion (tree-based version) * * Copyright IBM Corporation, 2008 * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> - * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical version + * Paul E. McKenney <paulmck@linux.ibm.com> * * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. @@ -150,6 +150,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); +static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); /* rcuc/rcub kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; @@ -342,14 +343,17 @@ bool rcu_eqs_special_set(int cpu) { int old; int new; + int new_old; struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + new_old = atomic_read(&rdp->dynticks); do { - old = atomic_read(&rdp->dynticks); + old = new_old; if (old & RCU_DYNTICK_CTRL_CTR) return false; new = old | RCU_DYNTICK_CTRL_MASK; - } while (atomic_cmpxchg(&rdp->dynticks, old, new) != old); + new_old = atomic_cmpxchg(&rdp->dynticks, old, new); + } while (new_old != old); return true; } @@ -410,10 +414,15 @@ static long blimit = DEFAULT_RCU_BLIMIT; static long qhimark = DEFAULT_RCU_QHIMARK; #define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ static long qlowmark = DEFAULT_RCU_QLOMARK; +#define DEFAULT_RCU_QOVLD_MULT 2 +#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK) +static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */ +static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */ module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); +module_param(qovld, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; @@ -818,11 +827,12 @@ static __always_inline void rcu_nmi_enter_common(bool irq) incby = 1; } else if (tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { + READ_ONCE(rdp->rcu_urgent_qs) && + !READ_ONCE(rdp->rcu_forced_tick)) { raw_spin_lock_rcu_node(rdp->mynode); // Recheck under lock. if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - rdp->rcu_forced_tick = true; + WRITE_ONCE(rdp->rcu_forced_tick, true); tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); } raw_spin_unlock_rcu_node(rdp->mynode); @@ -899,7 +909,7 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) WRITE_ONCE(rdp->rcu_need_heavy_qs, false); if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU); - rdp->rcu_forced_tick = false; + WRITE_ONCE(rdp->rcu_forced_tick, false); } } @@ -1072,7 +1082,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && (time_after(jiffies, rcu_state.gp_start + jtsq * 2) || - time_after(jiffies, rcu_state.jiffies_resched))) { + time_after(jiffies, rcu_state.jiffies_resched) || + rcu_state.cbovld)) { WRITE_ONCE(*rnhqp, true); /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ smp_store_release(ruqp, true); @@ -1089,8 +1100,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * So hit them over the head with the resched_cpu() hammer! */ if (tick_nohz_full_cpu(rdp->cpu) && - time_after(jiffies, - READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) { + (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) || + rcu_state.cbovld)) { WRITE_ONCE(*ruqp, true); resched_cpu(rdp->cp |
