summaryrefslogtreecommitdiff
path: root/kernel/sched/ext.c
diff options
context:
space:
mode:
authorDavid Dai <david.dai@linux.dev>2025-06-24 15:49:06 -0700
committerTejun Heo <tj@kernel.org>2025-06-24 13:05:26 -1000
commitcb444006a625c60e6d4dd3753863c3c74f96aac3 (patch)
tree3c0f23267768d24423079005885a9670aae1fad5 /kernel/sched/ext.c
parente2a37c277c64078d5439693963fb9813fa1e6e9c (diff)
downloadlinux-cb444006a625c60e6d4dd3753863c3c74f96aac3.tar.gz
linux-cb444006a625c60e6d4dd3753863c3c74f96aac3.tar.bz2
linux-cb444006a625c60e6d4dd3753863c3c74f96aac3.zip
sched_ext, rcu: Eject BPF scheduler on RCU CPU stall panic
For systems that use a sched_ext scheduler and have panic_on_rcu_stall enabled, try kicking out the current scheduler before issuing a panic. While there are numerous reasons for RCU CPU stalls that are not directly attributable to the scheduler, deferring the panic gives sched_ext an opportunity to provide additional debug info when ejecting the current scheduler. Also, handling the event more gracefully allows us to potentially recover the system instead of incurring additional downtime. Suggested-by: Tejun Heo <tj@kernel.org> Reviewed-by: Paul E. McKenney <paulmck@kernel.org> Signed-off-by: David Dai <david.dai@linux.dev> Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r--kernel/sched/ext.c35
1 files changed, 35 insertions, 0 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index bee98fdcdd01..df5b2c952cf7 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4673,6 +4673,41 @@ bool scx_allow_ttwu_queue(const struct task_struct *p)
}
/**
+ * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
+ *
+ * RCU CPU stalls are not always caused by the current BPF scheduler, but try
+ * kicking it out to recover the system to a good state before issuing panics.
+ *
+ * Return: %true if scx_error() was raised on the scheduler, %false otherwise.
+ */
+bool scx_rcu_cpu_stall(void)
+{
+ struct scx_sched *sch;
+
+ rcu_read_lock(); /* scx_root is dereferenced under RCU protection below */
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch)) { /* no root scheduler instance: nothing to report */
+ rcu_read_unlock();
+ return false;
+ }
+
+ switch (scx_enable_state()) {
+ case SCX_ENABLING:
+ case SCX_ENABLED:
+ break; /* proceed to raise the error after the switch */
+ default: /* any other enable state: leave without raising an error */
+ rcu_read_unlock();
+ return false;
+ }
+
+ scx_error(sch, "RCU CPU stall detected!"); /* raised before leaving the RCU read section */
+ rcu_read_unlock();
+
+ return true;
+}
+
+/**
* scx_softlockup - sched_ext softlockup handler
* @dur_s: number of seconds of CPU stuck due to soft lockup
*