28 files changed, 321 insertions, 146 deletions
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 5a9238a2883c..467251f7fef6 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -2129,6 +2129,8 @@ Some of the relevant points of interest are as follows: <li> <a href="#Hotplug CPU">Hotplug CPU</a>. <li> <a href="#Scheduler and RCU">Scheduler and RCU</a>. <li> <a href="#Tracing and RCU">Tracing and RCU</a>. +<li> <a href="#Accesses to User Memory and RCU"> +Accesses to User Memory and RCU</a>. <li> <a href="#Energy Efficiency">Energy Efficiency</a>. <li> <a href="#Scheduling-Clock Interrupts and RCU"> Scheduling-Clock Interrupts and RCU</a>. @@ -2512,7 +2514,7 @@ disabled across the entire RCU read-side critical section. <p> It is possible to use tracing on RCU code, but tracing itself uses RCU. -For this reason, <tt>rcu_dereference_raw_notrace()</tt> +For this reason, <tt>rcu_dereference_raw_check()</tt> is provided for use by tracing, which avoids the destructive recursion that could otherwise ensue. This API is also used by virtualization in some architectures, @@ -2521,6 +2523,75 @@ cannot be used. The tracing folks both located the requirement and provided the needed fix, so this surprise requirement was relatively painless. +<h3><a name="Accesses to User Memory and RCU"> +Accesses to User Memory and RCU</a></h3> + +<p> +The kernel needs to access user-space memory, for example, to access +data referenced by system-call parameters. +The <tt>get_user()</tt> macro does this job. + +<p> +However, user-space memory might well be paged out, which means +that <tt>get_user()</tt> might well page-fault and thus block while +waiting for the resulting I/O to complete. +It would be a very bad thing for the compiler to reorder +a <tt>get_user()</tt> invocation into an RCU read-side critical +section. +For example, suppose that the source code looked like this: + +<blockquote> +<pre> + 1 rcu_read_lock(); + 2 p = rcu_dereference(gp); + 3 v = p->value; + 4 rcu_read_unlock(); + 5 get_user(user_v, user_p); + 6 do_something_with(v, user_v); +</pre> +</blockquote> + +<p> +The compiler must not be permitted to transform this source code into +the following: + +<blockquote> +<pre> + 1 rcu_read_lock(); + 2 p = rcu_dereference(gp); + 3 get_user(user_v, user_p); // BUG: POSSIBLE PAGE FAULT!!! + 4 v = p->value; + 5 rcu_read_unlock(); + 6 do_something_with(v, user_v); +</pre> +</blockquote> + +<p> +If the compiler did make this transformation in a +<tt>CONFIG_PREEMPT=n</tt> kernel build, and if <tt>get_user()</tt> did +page fault, the result would be a quiescent state in the middle +of an RCU read-side critical section. +This misplaced quiescent state could result in line 4 being +a use-after-free access, which could be bad for your kernel's +actuarial statistics. +Similar examples can be constructed with the call to <tt>get_user()</tt> +preceding the <tt>rcu_read_lock()</tt>. + +<p> +Unfortunately, <tt>get_user()</tt> doesn't have any particular +ordering properties, and in some architectures the underlying <tt>asm</tt> +isn't even marked <tt>volatile</tt>. +And even if it was marked <tt>volatile</tt>, the above access to +<tt>p->value</tt> is not volatile, so the compiler would not have any +reason to keep those two accesses in order. 
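The ordering needed here is exactly what a compiler barrier provides: an empty asm statement with a "memory" clobber may, as far as the compiler knows, read or write any memory, so neither the p->value load nor the get_user() access may be moved across it. A minimal sketch, using hypothetical my_*() names rather than the kernel's actual macros, of what a CONFIG_PREEMPT=n flavor of such definitions could look like:

    #define my_barrier()          __asm__ __volatile__("" : : : "memory")

    /* CONFIG_PREEMPT=n flavor: no preemption bookkeeping, only compiler ordering. */
    #define my_rcu_read_lock()    my_barrier()
    #define my_rcu_read_unlock()  my_barrier()

With definitions of this shape, the problematic transformation shown in the second listing above is forbidden, which leads to the requirement stated next.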
+ +<p> +Therefore, the Linux-kernel definitions of <tt>rcu_read_lock()</tt> +and <tt>rcu_read_unlock()</tt> must act as compiler barriers, +at least for outermost instances of <tt>rcu_read_lock()</tt> and +<tt>rcu_read_unlock()</tt> within a nested set of RCU read-side critical +sections. + <h3><a name="Energy Efficiency">Energy Efficiency</a></h3> <p> diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 13e88fc00f01..f48f4621ccbc 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -57,6 +57,12 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that CONFIG_PREEMPT_RCU case, you might see stall-warning messages. + You can use the rcutree.kthread_prio kernel boot parameter to + increase the scheduling priority of RCU's kthreads, which can + help avoid this problem. However, please note that doing this + can increase your system's context-switch rate and thus degrade + performance. + o A periodic interrupt whose handler takes longer than the time interval between successive pairs of interrupts. This can prevent RCU's kthreads and softirq handlers from running. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 7ccd158b3894..f3fcd6140ee1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4047,6 +4047,10 @@ rcutorture.verbose= [KNL] Enable additional printk() statements. + rcupdate.rcu_cpu_stall_ftrace_dump= [KNL] + Dump ftrace buffer after reporting RCU CPU + stall warning. + rcupdate.rcu_cpu_stall_suppress= [KNL] Suppress RCU CPU stall warning messages. diff --git a/MAINTAINERS b/MAINTAINERS index 6426db5198f0..527317026492 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9326,7 +9326,7 @@ F: drivers/misc/lkdtm/* LINUX KERNEL MEMORY CONSISTENCY MODEL (LKMM) M: Alan Stern <stern@rowland.harvard.edu> -M: Andrea Parri <andrea.parri@amarulasolutions.com> +M: Andrea Parri <parri.andrea@gmail.com> M: Will Deacon <will@kernel.org> M: Peter Zijlstra <peterz@infradead.org> M: Boqun Feng <boqun.feng@gmail.com> diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index aab8ba40ce38..4b0bab2607e4 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -264,15 +264,13 @@ int __cpu_disable(void) return 0; } -static DECLARE_COMPLETION(cpu_died); - /* * called on the thread which is asking for a CPU to be shutdown - * waits until shutdown has completed, or it is timed out. */ void __cpu_die(unsigned int cpu) { - if (!wait_for_completion_timeout(&cpu_died, msecs_to_jiffies(5000))) { + if (!cpu_wait_death(cpu, 5)) { pr_err("CPU%u: cpu didn't die\n", cpu); return; } @@ -319,7 +317,7 @@ void arch_cpu_idle_dead(void) * this returns, power and/or clocks can be removed at any point * from this CPU and its cache by platform_cpu_kill(). 
*/ - complete(&cpu_died); + (void)cpu_report_death(); /* * Ensure that the cache lines associated with that completion are diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index bb7c8cc77f1a..04b2b927bb5a 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -535,7 +535,7 @@ static inline void note_hpte_modification(struct kvm *kvm, */ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm) { - return rcu_dereference_raw_notrace(kvm->memslots[0]); + return rcu_dereference_raw_check(kvm->memslots[0]); } extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 7389db538c30..6fa42e9c4e6f 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -29,6 +29,7 @@ static bool pci_mmcfg_running_state; static bool pci_mmcfg_arch_init_failed; static DEFINE_MUTEX(pci_mmcfg_lock); +#define pci_mmcfg_lock_held() lock_is_held(&(pci_mmcfg_lock).dep_map) LIST_HEAD(pci_mmcfg_list); @@ -54,7 +55,7 @@ static void list_add_sorted(struct pci_mmcfg_region *new) struct pci_mmcfg_region *cfg; /* keep list sorted by segment and starting bus number */ - list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) { + list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held()) { if (cfg->segment > new->segment || (cfg->segment == new->segment && cfg->start_bus >= new->start_bus)) { @@ -118,7 +119,7 @@ struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus) { struct pci_mmcfg_region *cfg; - list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) + list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held()) if (cfg->segment == segment && cfg->start_bus <= bus && bus <= cfg->end_bus) return cfg; diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 9c0edf2fc0dd..2f9d0d20b836 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/mm.h> #include <linux/highmem.h> +#include <linux/lockdep.h> #include <linux/pci.h> #include <linux/interrupt.h> #include <linux/kmod.h> @@ -80,6 +81,7 @@ struct acpi_ioremap { static LIST_HEAD(acpi_ioremaps); static DEFINE_MUTEX(acpi_ioremap_lock); +#define acpi_ioremap_lock_held() lock_is_held(&acpi_ioremap_lock.dep_map) static void __init acpi_request_region (struct acpi_generic_address *gas, unsigned int length, char *desc) @@ -206,7 +208,7 @@ acpi_map_lookup(acpi_physical_address phys, acpi_size size) { struct acpi_ioremap *map; - list_for_each_entry_rcu(map, &acpi_ioremaps, list) + list_for_each_entry_rcu(map, &acpi_ioremaps, list, acpi_ioremap_lock_held()) if (map->phys <= phys && phys + size <= map->phys + map->size) return map; @@ -249,7 +251,7 @@ acpi_map_lookup_virt(void __iomem *virt, acpi_size size) { struct acpi_ioremap *map; - list_for_each_entry_rcu(map, &acpi_ioremaps, list) + list_for_each_entry_rcu(map, &acpi_ioremaps, list, acpi_ioremap_lock_held()) if (map->virt <= virt && virt + size <= map->virt + map->size) return map; diff --git a/drivers/base/base.h b/drivers/base/base.h index b405436ee28e..0d32544b6f91 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -165,6 +165,7 @@ static inline int devtmpfs_init(void) { return 0; } /* Device links support */ extern int device_links_read_lock(void); extern void device_links_read_unlock(int idx); +extern int device_links_read_lock_held(void); extern int device_links_check_suppliers(struct device *dev); extern void 
device_links_driver_bound(struct device *dev); extern void device_links_driver_cleanup(struct device *dev); diff --git a/drivers/base/core.c b/drivers/base/core.c index 636058bbf48a..eede79630ceb 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -68,6 +68,11 @@ void device_links_read_unlock(int idx) { srcu_read_unlock(&device_links_srcu, idx); } + +int device_links_read_lock_held(void) +{ + return srcu_read_lock_held(&device_links_srcu); +} #else /* !CONFIG_SRCU */ static DECLARE_RWSEM(device_links_lock); @@ -91,6 +96,13 @@ void device_links_read_unlock(int not_used) { up_read(&device_links_lock); } + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +int device_links_read_lock_held(void) +{ + return lockdep_is_held(&device_links_lock); +} +#endif #endif /* !CONFIG_SRCU */ /** diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index b75335508d2c..50def99df970 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -287,7 +287,8 @@ static int rpm_get_suppliers(struct device *dev) { struct device_link *link; - list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) { + list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, + device_links_read_lock_held()) { int retval; if (!(link->flags & DL_FLAG_PM_RUNTIME) || @@ -309,7 +310,8 @@ static void rpm_put_suppliers(struct device *dev) { struct device_link *link; - list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) { + list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, + device_links_read_lock_held()) { if (READ_ONCE(link->status) == DL_STATE_SUPPLIER_UNBIND) continue; @@ -1640,7 +1642,8 @@ void pm_runtime_clean_up_links(struct device *dev) idx = device_links_read_lock(); - list_for_each_entry_rcu(link, &dev->links.consumers, s_node) { + list_for_each_entry_rcu(link, &dev->links.consumers, s_node, + device_links_read_lock_held()) { if (link->flags & DL_FLAG_STATELESS) continue; @@ -1662,7 +1665,8 @@ void pm_runtime_get_suppliers(struct device *dev) idx = device_links_read_lock(); - list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) + list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, + device_links_read_lock_held()) if (link->flags & DL_FLAG_PM_RUNTIME) { link->supplier_preactivated = true; refcount_inc(&link->rpm_active); @@ -1683,7 +1687,8 @@ void pm_runtime_put_suppliers(struct device *dev) idx = device_links_read_lock(); - list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) + list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, + device_links_read_lock_held()) if (link->supplier_preactivated) { link->supplier_preactivated = false; if (refcount_dec_not_one(&link->rpm_active)) diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index 9b83865d24f9..0027d4c8087c 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -31,9 +31,7 @@ struct rcu_sync { */ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) { - RCU_LOCKDEP_WARN(!rcu_read_lock_held() && - !rcu_read_lock_bh_held() && - !rcu_read_lock_sched_held(), + RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(), "suspicious rcu_sync_is_idle() usage"); return !READ_ONCE(rsp->gp_state); /* GP_IDLE */ } diff --git a/include/linux/rculist.h b/include/linux/rculist.h index e91ec9ddcd30..4158b7212936 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -41,6 +41,24 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) #define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) /* + * Check during list traversal that we are 
within an RCU reader + */ + +#define check_arg_count_one(dummy) + +#ifdef CONFIG_PROVE_RCU_LIST +#define __list_check_rcu(dummy, cond, extra...) \ + ({ \ + check_arg_count_one(extra); \ + RCU_LOCKDEP_WARN(!cond && !rcu_read_lock_any_held(), \ + "RCU-list traversed in non-reader section!"); \ + }) +#else +#define __list_check_rcu(dummy, cond, extra...) \ + ({ check_arg_count_one(extra); }) +#endif + +/* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know @@ -343,14 +361,16 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. + * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ -#define list_for_each_entry_rcu(pos, head, member) \ - for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \ - &pos->member != (head); \ +#define list_for_each_entry_rcu(pos, head, member, cond...) \ + for (__list_check_rcu(dummy, ## cond, 0), \ + pos = list_entry_rcu((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** @@ -616,13 +636,15 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. + * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ -#define hlist_for_each_entry_rcu(pos, head, member) \ - for (pos = hlist_entry_safe (rcu_dereference_raw(hlist_first_rcu(head)),\ +#define hlist_for_each_entry_rcu(pos, head, member, cond...) \ + for (__list_check_rcu(dummy, ## cond, 0), \ + pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ @@ -642,10 +664,10 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * not do any RCU debugging or tracing. 
*/ #define hlist_for_each_entry_rcu_notrace(pos, head, member) \ - for (pos = hlist_entry_safe (rcu_dereference_raw_notrace(hlist_first_rcu(head)),\ + for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ - pos = hlist_entry_safe(rcu_dereference_raw_notrace(hlist_next_rcu(\ + pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 8f7167478c1d..80d6056f5855 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -221,6 +221,7 @@ int debug_lockdep_rcu_enabled(void); int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); int rcu_read_lock_sched_held(void); +int rcu_read_lock_any_held(void); #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -241,6 +242,12 @@ static inline int rcu_read_lock_sched_held(void) { return !preemptible(); } + +static inline int rcu_read_lock_any_held(void) +{ + return !preemptible(); +} + #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU @@ -476,7 +483,7 @@ do { \ * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). */ -#define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu) +#define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu) /** * rcu_dereference_protected() - fetch RCU pointer when updates prevented diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4861cf8e274b..4aca3f4379d2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -620,7 +620,7 @@ static void print_lock(struct held_lock *hlock) return; } - printk(KERN_CONT "%p", hlock->instance); + printk(KERN_CONT "%px", hlock->instance); print_lock_name(lock); printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 5ec3ea4028e2..4aa02eee8f6c 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -8,6 +8,17 @@ menu "RCU Debugging" config PROVE_RCU def_bool PROVE_LOCKING +config PROVE_RCU_LIST + bool "RCU list lockdep debugging" + depends on PROVE_RCU && RCU_EXPERT + default n + help + Enable RCU lockdep checking for list usages. By default it is + turned off since there are several list RCU users that still + need to be converted to pass a lockdep expression. To prevent + false-positive splats, we keep it default disabled but once all + users are converted, we can remove this config option. + config TORTURE_TEST tristate default n diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 5290b01de534..8fd4f82c9b3d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -227,6 +227,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) #ifdef CONFIG_RCU_STALL_COMMON +extern int rcu_cpu_stall_ftrace_dump; extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 71b64648464e..822a39da0533 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -76,27 +76,6 @@ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) return !*rsclp->tails[seg]; } -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. 
- */ -static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) -{ - return rsclp->head; -} - -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) -{ - WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); - return rsclp->tails[RCU_NEXT_TAIL]; -} - void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index cf0e886314f2..5dffade2d7cd 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1279,8 +1279,9 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) c0 = l0 - u0; c1 = l1 - u1; - pr_cont(" %d(%ld,%ld %1p)", - cpu, c0, c1, rcu_segcblist_head(&sdp->srcu_cblist)); + pr_cont(" %d(%ld,%ld %c)", + cpu, c0, c1, + "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); s0 += c0; s1 += c1; } diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index af7e7b9c86af..d632cd019597 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -781,7 +781,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * other hand, if the CPU is not in an RCU read-side critical section, * the IPI handler reports the quiescent state immediately. * - * Although this is a greate improvement over previous expedited + * Although this is a great improvement over previous expedited * implementations, it is still unfriendly to real-time workloads, so is * thus not recommended for any sort of common-case code. In fact, if * you are using synchronize_rcu_expedited() in a loop, please restructure @@ -792,6 +792,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { + bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT); struct rcu_exp_work rew; struct rcu_node *rnp; unsigned long s; @@ -817,7 +818,7 @@ void synchronize_rcu_expedited(void) return; /* Someone else did our work for us. */ /* Ensure that load happens before action based on it. */ - if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { + if (unlikely(boottime)) { /* Direct call during scheduler init and early_initcalls(). */ rcu_exp_sel_wait_wake(s); } else { @@ -835,5 +836,8 @@ void synchronize_rcu_expedited(void) /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); + + if (likely(!boottime)) + destroy_work_on_stack(&rew.rew_work); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index acb225023ed1..99e9d952827b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -288,7 +288,6 @@ void rcu_note_context_switch(bool preempt) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp; - barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); lockdep_assert_irqs_disabled(); WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); @@ -314,15 +313,6 @@ void rcu_note_context_switch(bool preempt) ? rnp->gp_seq : rcu_seq_snap(&rnp->gp_seq)); rcu_preempt_ctxt_queue(rnp, rdp); - } else if (t->rcu_read_lock_nesting < 0 && - t->rcu_read_unlock_special.s) { - - /* - * Complete exit from RCU read-side critical section on - * behalf of preempted instance of __rcu_read_unlock(). 
- */ - rcu_read_unlock_special(t); - rcu_preempt_deferred_qs(t); } else { rcu_preempt_deferred_qs(t); } @@ -340,7 +330,6 @@ void rcu_note_context_switch(bool preempt) if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); trace_rcu_utilization(TPS("End context switch")); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -626,22 +615,18 @@ static void rcu_read_unlock_special(struct task_struct *t) (rdp->grpmask & rnp->expmask) || tick_nohz_full_cpu(rdp->cpu); // Need to defer quiescent state until everything is enabled. - if ((exp || in_irq()) && irqs_were_disabled && use_softirq && - (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { + if (irqs_were_disabled && use_softirq && + (in_interrupt() || + (exp && !t->rcu_read_unlock_special.b.deferred_qs))) { // Using softirq, safe to awaken, and we get // no help from enabling irqs, unlike bh/preempt. raise_softirq_irqoff(RCU_SOFTIRQ); - } else if (exp && irqs_were_disabled && !use_softirq && - !t->rcu_read_unlock_special.b.deferred_qs) { - // Safe to awaken and we get no help from enabling - // irqs, unlike bh/preempt. - invoke_rcu_core(); } else { // Enabling BH or preempt does reschedule, so... // Also if no expediting or NO_HZ_FULL, slow is OK. set_tsk_need_resched(current); set_preempt_need_resched(); - if (IS_ENABLED(CONFIG_IRQ_WORK) && + if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && !rdp->defer_qs_iw_pending && exp) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. @@ -828,11 +813,6 @@ static void rcu_qs(void) * dyntick-idle quiescent state visible to other CPUs, which will in * some cases serve for expedited as well as normal grace periods. * Either way, register a lightweight quiescent state. - * - * The barrier() calls are redundant in the common case when this is - * called externally, but just in case this is called from within this - * file. - * */ void rcu_all_qs(void) { @@ -847,14 +827,12 @@ void rcu_all_qs(void) return; } this_cpu_write(rcu_data.rcu_urgent_qs, false); - barrier(); /* Avoid RCU read-side critical sections leaking down. */ if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); } rcu_qs(); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ preempt_enable(); } EXPORT_SYMBOL_GPL(rcu_all_qs); @@ -864,7 +842,6 @@ EXPORT_SYMBOL_GPL(rcu_all_qs); */ void rcu_n |
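Returning to the stall-warning changes earlier in this series: the rcutree.kthread_prio parameter recommended in the stallwarn.txt hunk and the new rcupdate.rcu_cpu_stall_ftrace_dump parameter added in the kernel-parameters.txt hunk are both given on the kernel command line. A hypothetical example, where the priority value is illustrative only and 1 is assumed to be the usual boolean-style enable value:

    rcutree.kthread_prio=2 rcupdate.rcu_cpu_stall_ftrace_dump=1

The first raises the scheduling priority of RCU's kthreads (at some cost in context-switch rate, as the hunk notes); the second dumps the ftrace buffer when an RCU CPU stall warning is reported.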
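The recurring pattern in the PCI mmconfig, ACPI osl, and device-links hunks above is to pass a lockdep expression as the new optional fourth argument of list_for_each_entry_rcu(), so that a CONFIG_PROVE_RCU_LIST kernel can verify traversals that are protected by the update-side lock rather than by an RCU reader. A hedged sketch of that usage, with hypothetical my_* names; the list and lockdep-condition APIs are the ones shown in the rculist.h hunk:

    #include <linux/lockdep.h>
    #include <linux/mutex.h>
    #include <linux/rculist.h>
    #include <linux/types.h>

    struct my_node {
            int value;
            struct list_head link;
    };

    static LIST_HEAD(my_head);
    static DEFINE_MUTEX(my_mutex);          /* protects updates to my_head */
    #define my_mutex_held() lockdep_is_held(&my_mutex)

    /* Reader side: the traversal is covered by rcu_read_lock() as usual. */
    static bool my_lookup(int value)
    {
            struct my_node *n;
            bool found = false;

            rcu_read_lock();
            list_for_each_entry_rcu(n, &my_head, link) {
                    if (n->value == value) {
                            found = true;
                            break;
                    }
            }
            rcu_read_unlock();
            return found;
    }

    /* Update side: no RCU reader, but my_mutex is held, so pass the
     * lockdep expression to satisfy CONFIG_PROVE_RCU_LIST. */
    static bool my_lookup_locked(int value)
    {
            struct my_node *n;

            lockdep_assert_held(&my_mutex);
            list_for_each_entry_rcu(n, &my_head, link, my_mutex_held()) {
                    if (n->value == value)
                            return true;
            }
            return false;
    }

The device-links hunks follow the same shape for an SRCU-protected list, with the condition supplied by a helper built on srcu_read_lock_held() instead of on the update-side mutex.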