diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Kconfig.preempt | 19 | ||||
-rw-r--r-- | kernel/entry/common.c | 17 | ||||
-rw-r--r-- | kernel/events/core.c | 195 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 80 | ||||
-rw-r--r-- | kernel/locking/rtmutex.c | 54 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 53 | ||||
-rw-r--r-- | kernel/rcu/tree.h | 2 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 31 | ||||
-rw-r--r-- | kernel/sched/core.c | 357 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 108 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 94 | ||||
-rw-r--r-- | kernel/sched/debug.c | 2 | ||||
-rw-r--r-- | kernel/sched/fair.c | 322 | ||||
-rw-r--r-- | kernel/sched/features.h | 2 | ||||
-rw-r--r-- | kernel/sched/idle.c | 1 | ||||
-rw-r--r-- | kernel/sched/rt.c | 2 | ||||
-rw-r--r-- | kernel/sched/sched.h | 51 | ||||
-rw-r--r-- | kernel/sched/topology.c | 99 | ||||
-rw-r--r-- | kernel/smp.c | 4 | ||||
-rw-r--r-- | kernel/static_call.c | 60 |
20 files changed, 941 insertions, 612 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index bf82259cff96..416017301660 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -40,6 +40,7 @@ config PREEMPT depends on !ARCH_NO_PREEMPT select PREEMPTION select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK + select PREEMPT_DYNAMIC if HAVE_PREEMPT_DYNAMIC help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) @@ -80,3 +81,21 @@ config PREEMPT_COUNT config PREEMPTION bool select PREEMPT_COUNT + +config PREEMPT_DYNAMIC + bool + help + This option allows to define the preemption model on the kernel + command line parameter and thus override the default preemption + model defined during compile time. + + The feature is primarily interesting for Linux distributions which + provide a pre-built kernel binary to reduce the number of kernel + flavors they offer while still offering different usecases. + + The runtime overhead is negligible with HAVE_STATIC_CALL_INLINE enabled + but if runtime patching is not available for the specific architecture + then the potential overhead should be considered. + + Interesting if you want the same pre-built kernel should be used for + both Server and Desktop workloads. diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f9d491b17b78..8442e5c9cfa2 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -184,6 +184,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, * enabled above. */ local_irq_disable_exit_to_user(); + + /* Check if any of the above work has queued a deferred wakeup */ + rcu_nocb_flush_deferred_wakeup(); + ti_work = READ_ONCE(current_thread_info()->flags); } @@ -197,6 +201,9 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) lockdep_assert_irqs_disabled(); + /* Flush pending rcuog wakeup before the last need_resched() check */ + rcu_nocb_flush_deferred_wakeup(); + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) ti_work = exit_to_user_mode_loop(regs, ti_work); @@ -385,6 +392,9 @@ void irqentry_exit_cond_resched(void) preempt_schedule_irq(); } } +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched); +#endif noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) { @@ -411,8 +421,13 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) } instrumentation_begin(); - if (IS_ENABLED(CONFIG_PREEMPTION)) + if (IS_ENABLED(CONFIG_PREEMPTION)) { +#ifdef CONFIG_PREEMT_DYNAMIC + static_call(irqentry_exit_cond_resched)(); +#else irqentry_exit_cond_resched(); +#endif + } /* Covers both tracing and lockdep */ trace_hardirqs_on(); instrumentation_end(); diff --git a/kernel/events/core.c b/kernel/events/core.c index c37401e3e5f7..5fe7d6346762 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1597,50 +1597,91 @@ static void perf_event_groups_init(struct perf_event_groups *groups) groups->index = 0; } +static inline struct cgroup *event_cgroup(const struct perf_event *event) +{ + struct cgroup *cgroup = NULL; + +#ifdef CONFIG_CGROUP_PERF + if (event->cgrp) + cgroup = event->cgrp->css.cgroup; +#endif + + return cgroup; +} + /* * Compare function for event groups; * * Implements complex key that first sorts by CPU and then by virtual index * which provides ordering when rotating groups for the same CPU. */ -static bool -perf_event_groups_less(struct perf_event *left, struct perf_event *right) +static __always_inline int +perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup, + const u64 left_group_index, const struct perf_event *right) { - if (left->cpu < right->cpu) - return true; - if (left->cpu > right->cpu) - return false; + if (left_cpu < right->cpu) + return -1; + if (left_cpu > right->cpu) + return 1; #ifdef CONFIG_CGROUP_PERF - if (left->cgrp != right->cgrp) { - if (!left->cgrp || !left->cgrp->css.cgroup) { - /* - * Left has no cgroup but right does, no cgroups come - * first. - */ - return true; - } - if (!right->cgrp || !right->cgrp->css.cgroup) { - /* - * Right has no cgroup but left does, no cgroups come - * first. - */ - return false; - } - /* Two dissimilar cgroups, order by id. */ - if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id) - return true; + { + const struct cgroup *right_cgroup = event_cgroup(right); - return false; + if (left_cgroup != right_cgroup) { + if (!left_cgroup) { + /* + * Left has no cgroup but right does, no + * cgroups come first. + */ + return -1; + } + if (!right_cgroup) { + /* + * Right has no cgroup but left does, no + * cgroups come first. + */ + return 1; + } + /* Two dissimilar cgroups, order by id. */ + if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) + return -1; + + return 1; + } } #endif - if (left->group_index < right->group_index) - return true; - if (left->group_index > right->group_index) - return false; + if (left_group_index < right->group_index) + return -1; + if (left_group_index > right->group_index) + return 1; - return false; + return 0; +} + +#define __node_2_pe(node) \ + rb_entry((node), struct perf_event, group_node) + +static inline bool __group_less(struct rb_node *a, const struct rb_node *b) +{ + struct perf_event *e = __node_2_pe(a); + return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index, + __node_2_pe(b)) < 0; +} + +struct __group_key { + int cpu; + struct cgroup *cgroup; +}; + +static inline int __group_cmp(const void *key, const struct rb_node *node) +{ + const struct __group_key *a = key; + const struct perf_event *b = __node_2_pe(node); + + /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */ + return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b); } /* @@ -1652,27 +1693,9 @@ static void perf_event_groups_insert(struct perf_event_groups *groups, struct perf_event *event) { - struct perf_event *node_event; - struct rb_node *parent; - struct rb_node **node; - event->group_index = ++groups->index; - node = &groups->tree.rb_node; - parent = *node; - - while (*node) { - parent = *node; - node_event = container_of(*node, struct perf_event, group_node); - - if (perf_event_groups_less(event, node_event)) - node = &parent->rb_left; - else - node = &parent->rb_right; - } - - rb_link_node(&event->group_node, parent, node); - rb_insert_color(&event->group_node, &groups->tree); + rb_add(&event->group_node, &groups->tree, __group_less); } /* @@ -1720,45 +1743,17 @@ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct cgroup *cgrp) { - struct perf_event *node_event = NULL, *match = NULL; - struct rb_node *node = groups->tree.rb_node; -#ifdef CONFIG_CGROUP_PERF - u64 node_cgrp_id, cgrp_id = 0; - - if (cgrp) - cgrp_id = cgrp->kn->id; -#endif - - while (node) { - node_event = container_of(node, struct perf_event, group_node); - - if (cpu < node_event->cpu) { - node = node->rb_left; - continue; - } - if (cpu > node_event->cpu) { - node = node->rb_right; - continue; - } -#ifdef CONFIG_CGROUP_PERF - node_cgrp_id = 0; - if (node_event->cgrp && node_event->cgrp->css.cgroup) - node_cgrp_id = node_event->cgrp->css.cgroup->kn->id; + struct __group_key key = { + .cpu = cpu, + .cgroup = cgrp, + }; + struct rb_node *node; - if (cgrp_id < node_cgrp_id) { - node = node->rb_left; - continue; - } - if (cgrp_id > node_cgrp_id) { - node = node->rb_right; - continue; - } -#endif - match = node_event; - node = node->rb_left; - } + node = rb_find_first(&key, &groups->tree, __group_cmp); + if (node) + return __node_2_pe(node); - return match; + return NULL; } /* @@ -1767,27 +1762,17 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu, static struct perf_event * perf_event_groups_next(struct perf_event *event) { - struct perf_event *next; -#ifdef CONFIG_CGROUP_PERF - u64 curr_cgrp_id = 0; - u64 next_cgrp_id = 0; -#endif - - next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); - if (next == NULL || next->cpu != event->cpu) - return NULL; - -#ifdef CONFIG_CGROUP_PERF - if (event->cgrp && event->cgrp->css.cgroup) - curr_cgrp_id = event->cgrp->css.cgroup->kn->id; + struct __group_key key = { + .cpu = event->cpu, + .cgroup = event_cgroup(event), + }; + struct rb_node *next; - if (next->cgrp && next->cgrp->css.cgroup) - next_cgrp_id = next->cgrp->css.cgroup->kn->id; + next = rb_next_match(&key, &event->group_node, __group_cmp); + if (next) + return __node_2_pe(next); - if (curr_cgrp_id != next_cgrp_id) - return NULL; -#endif - return next; + return NULL; } /* diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bf9edd8d75be..3ea7f8f92f1d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -613,41 +613,56 @@ static void put_uprobe(struct uprobe *uprobe) } } -static int match_uprobe(struct uprobe *l, struct uprobe *r) +static __always_inline +int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset, + const struct uprobe *r) { - if (l->inode < r->inode) + if (l_inode < r->inode) return -1; - if (l->inode > r->inode) + if (l_inode > r->inode) return 1; - if (l->offset < r->offset) + if (l_offset < r->offset) return -1; - if (l->offset > r->offset) + if (l_offset > r->offset) return 1; return 0; } +#define __node_2_uprobe(node) \ + rb_entry((node), struct uprobe, rb_node) + +struct __uprobe_key { + struct inode *inode; + loff_t offset; +}; + +static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b) +{ + const struct __uprobe_key *a = key; + return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b)); +} + +static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b) +{ + struct uprobe *u = __node_2_uprobe(a); + return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); +} + static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) { - struct uprobe u = { .inode = inode, .offset = offset }; - struct rb_node *n = uprobes_tree.rb_node; - struct uprobe *uprobe; - int match; + struct __uprobe_key key = { + .inode = inode, + .offset = offset, + }; + struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key); - while (n) { - uprobe = rb_entry(n, struct uprobe, rb_node); - match = match_uprobe(&u, uprobe); - if (!match) - return get_uprobe(uprobe); + if (node) + return get_uprobe(__node_2_uprobe(node)); - if (match < 0) - n = n->rb_left; - else - n = n->rb_right; - } return NULL; } @@ -668,32 +683,15 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { - struct rb_node **p = &uprobes_tree.rb_node; - struct rb_node *parent = NULL; - struct uprobe *u; - int match; + struct rb_node *node; - while (*p) { - parent = *p; - u = rb_entry(parent, struct uprobe, rb_node); - match = match_uprobe(uprobe, u); - if (!match) - return get_uprobe(u); + node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); + if (node) + return get_uprobe(__node_2_uprobe(node)); - if (match < 0) - p = &parent->rb_left; - else - p = &parent->rb_right; - - } - - u = NULL; - rb_link_node(&uprobe->rb_node, parent, p); - rb_insert_color(&uprobe->rb_node, &uprobes_tree); /* get access + creation ref */ refcount_set(&uprobe->ref, 2); - - return u; + return NULL; } /* diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 47a6e0b8073d..03b21135313c 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -267,27 +267,18 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, return 1; } +#define __node_2_waiter(node) \ + rb_entry((node), struct rt_mutex_waiter, tree_entry) + +static inline bool __waiter_less(struct rb_node *a, const struct rb_node *b) +{ + return rt_mutex_waiter_less(__node_2_waiter(a), __node_2_waiter(b)); +} + static void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &lock->waiters.rb_root.rb_node; - struct rb_node *parent = NULL; - struct rt_mutex_waiter *entry; - bool leftmost = true; - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry); - if (rt_mutex_waiter_less(waiter, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } - } - - rb_link_node(&waiter->tree_entry, parent, link); - rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost); + rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less); } static void @@ -300,27 +291,18 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) RB_CLEAR_NODE(&waiter->tree_entry); } +#define __node_2_pi_waiter(node) \ + rb_entry((node), struct rt_mutex_waiter, pi_tree_entry) + +static inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b) +{ + return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b)); +} + static void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &task->pi_waiters.rb_root.rb_node; - struct rb_node *parent = NULL; - struct rt_mutex_waiter *entry; - bool leftmost = true; - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry); - if (rt_mutex_waiter_less(waiter, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } - } - - rb_link_node(&waiter->pi_tree_entry, parent, link); - rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost); + rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less); } static void diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0f4a6a3c057b..da6f5213fb74 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -649,7 +649,6 @@ static noinstr void rcu_eqs_enter(bool user) trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); - do_nocb_deferred_wakeup(rdp); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); @@ -683,6 +682,50 @@ void rcu_idle_enter(void) EXPORT_SYMBOL_GPL(rcu_idle_enter); #ifdef CONFIG_NO_HZ_FULL + +#if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK) +/* + * An empty function that will trigger a reschedule on + * IRQ tail once IRQs get re-enabled on userspace/guest resume. + */ +static void late_wakeup_func(struct irq_work *work) +{ +} + +static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) = + IRQ_WORK_INIT(late_wakeup_func); + +/* + * If either: + * + * 1) the task is about to enter in guest mode and $ARCH doesn't support KVM generic work + * 2) the task is about to enter in user mode and $ARCH doesn't support generic entry. + * + * In these cases the late RCU wake ups aren't supported in the resched loops and our + * last resort is to fire a local irq_work that will trigger a reschedule once IRQs + * get re-enabled again. + */ +noinstr static void rcu_irq_work_resched(void) +{ + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + + if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU)) + return; + + if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) + return; + + instrumentation_begin(); + if (do_nocb_deferred_wakeup(rdp) && need_resched()) { + irq_work_queue(this_cpu_ptr(&late_wakeup_work)); + } + instrumentation_end(); +} + +#else +static inline void rcu_irq_work_resched(void) { } +#endif + /** * rcu_user_enter - inform RCU that we are resuming userspace. * @@ -697,8 +740,16 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); noinstr void rcu_user_enter(void) { lockdep_assert_irqs_disabled(); + + /* + * Other than generic entry implementation, we may be past the last + * rescheduling opportunity in the entry code. Trigger a self IPI + * that will fire and reschedule once we resume in user/guest mode. + */ + rcu_irq_work_resched(); rcu_eqs_enter(true); } + #endif /* CONFIG_NO_HZ_FULL */ /** diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 5d359b9f9fec..71821d59d95c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -435,7 +435,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); -static void do_nocb_deferred_wakeup(struct rcu_data *rdp); +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_cpu_nocb_kthread(int cpu); static void __init rcu_spawn_nocb_kthreads(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 231a0c6cf03c..2d603771c7dc 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1632,8 +1632,8 @@ bool rcu_is_nocb_cpu(int cpu) * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock * and this function releases it. */ -static void wake_nocb_gp(struct rcu_data *rdp, bool force, - unsigned long flags) +static bool wake_nocb_gp(struct rcu_data *rdp, bool force, + unsigned long flags) __releases(rdp->nocb_lock) { bool needwake = false; @@ -1644,7 +1644,7 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force, trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("AlreadyAwake")); rcu_nocb_unlock_irqrestore(rdp, flags); - return; + return false; } del_timer(&rdp->nocb_timer); rcu_nocb_unlock_irqrestore(rdp, flags); @@ -1657,6 +1657,8 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force, raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); if (needwake) wake_up_process(rdp_gp->nocb_gp_kthread); + + return needwake; } /* @@ -2251,20 +2253,23 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) } /* Do a deferred wakeup of rcu_nocb_kthread(). */ -static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) +static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp) { unsigned long flags; int ndw; + int ret; rcu_nocb_lock_irqsave(rdp, flags); if (!rcu_nocb_need_deferred_wakeup(rdp)) { rcu_nocb_unlock_irqrestore(rdp, flags); - return; + return false; } ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); + ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); + + return ret; } /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ @@ -2280,11 +2285,18 @@ static void do_nocb_deferred_wakeup_timer(struct timer_list *t) * This means we do an inexact common-case check. Note that if * we miss, ->nocb_timer will eventually clean things up. */ -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) { if (rcu_nocb_need_deferred_wakeup(rdp)) - do_nocb_deferred_wakeup_common(rdp); + return do_nocb_deferred_wakeup_common(rdp); + return false; +} + +void rcu_nocb_flush_deferred_wakeup(void) +{ + do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data)); } +EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup); static int rdp_offload_toggle(struct rcu_data *rdp, bool offload, unsigned long flags) @@ -2835,8 +2847,9 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) return false; } -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) { + return false; } static void rcu_spawn_cpu_nocb_kthread(int cpu) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 22f6748c16f6..7f5ffc878411 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -355,8 +355,9 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = rq->hrtick_time; - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); + hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); } /* @@ -380,7 +381,6 @@ static void __hrtick_start(void *arg) void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time; s64 delta; /* @@ -388,9 +388,7 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense and can cause timer DoS. */ delta = max_t(s64, delay, 10000LL); - time = ktime_add_ns(timer->base->get_time(), delta); - - hrtimer_set_expires(timer, time); + rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); if (rq == this_rq()) __hrtick_restart(rq); @@ -4970,7 +4968,7 @@ static void __sched notrace __schedule(bool preempt) schedule_debug(prev, preempt); - if (sched_feat(HRTICK)) + if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); local_irq_disable(); @@ -5264,6 +5262,12 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); +EXPORT_STATIC_CALL_TRAMP(preempt_schedule); +#endif + + /** * preempt_schedule_notrace - preempt_schedule called by tracing * @@ -5316,8 +5320,197 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); +EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); +#endif + #endif /* CONFIG_PREEMPTION */ +#ifdef CONFIG_PREEMPT_DYNAMIC + +#include <linux/entry-common.h> + +/* + * SC:cond_resched + * SC:might_resched + * SC:preempt_schedule + * SC:preempt_schedule_notrace + * SC:irqentry_exit_cond_resched + * + * + * NONE: + * cond_resched <- __cond_resched + * might_resched <- RET0 + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * VOLUNTARY: + * cond_resched <- __cond_resched + * might_resched <- __cond_resched + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * FULL: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + */ + +enum { + preempt_dynamic_none = 0, + preempt_dynamic_voluntary, + preempt_dynamic_full, +}; + +static int preempt_dynamic_mode = preempt_dynamic_full; + +static int sched_dynamic_mode(const char *str) +{ + if (!strcmp(str, "none")) + return 0; + + if (!strcmp(str, "voluntary")) + return 1; + + if (!strcmp(str, "full")) + return 2; + + return -1; +} + +static void sched_dynamic_update(int mode) +{ + /* + * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in + * the ZERO state, which is invalid. + */ + static_call_update(cond_resched, __cond_resched); + static_call_update(might_resched, __cond_resched); + static_call_update(preempt_schedule, __preempt_schedule_func); + static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); + static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); + + switch (mode) { + case preempt_dynamic_none: + static_call_update(cond_resched, __cond_resched); + static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0); + static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL); + static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL); + static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL); + pr_info("Dynamic Preempt: none\n"); + break; + + case preempt_dynamic_voluntary: + static_call_update(cond_resched, __cond_resched); + static_call_update(might_resched, __cond_resched); + static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL); + static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL); + static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL); + pr_info("Dynamic Preempt: voluntary\n"); + break; + + case preempt_dynamic_full: + static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0); + static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0); + static_call_update(preempt_schedule, __preempt_schedule_func); + static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); + static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: full\n"); + break; + } + + preempt_dynamic_mode = mode; +} + +static int __init setup_preempt_mode(char *str) +{ + int mode = sched_dynamic_mode(str); + if (mode < 0) { + pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); + return 1; + } + + sched_dynamic_update(mode); + return 0; +} +__setup("preempt=", setup_preempt_mode); + +#ifdef CONFIG_SCHED_DEBUG + +static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[16]; + int mode; + + if (cnt > 15) + cnt = 15; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + mode = sched_dynamic_mode(strstrip(buf)); + if (mode < 0) + return mode; + + sched_dynamic_update(mode); + + *ppos += cnt; + + return cnt; +} + +static int sched_dynamic_show(struct seq_file *m, void *v) +{ + static const char * preempt_modes[] = { + "none", "voluntary", "full" + }; + int i; + + for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { + if (preempt_dynamic_mode == i) + seq_puts(m, "("); |