diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-20 17:41:08 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-20 17:41:08 -0800 |
| commit | 1f2d9ffc7a5f916935749ffc6e93fb33bfe94d2f (patch) | |
| tree | a5dabaa924d50867cbe347e20a7643b2850f11c0 /kernel | |
| parent | a2f0e7eee1344eb9f91b22bc72d9eb0a52b849c9 (diff) | |
| parent | 7c4a5b89a0b5a57a64b601775b296abf77a9fe97 (diff) | |
| download | linux-1f2d9ffc7a5f916935749ffc6e93fb33bfe94d2f.tar.gz linux-1f2d9ffc7a5f916935749ffc6e93fb33bfe94d2f.tar.bz2 linux-1f2d9ffc7a5f916935749ffc6e93fb33bfe94d2f.zip | |
Merge tag 'sched-core-2023-02-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- Improve the scalability of the CFS bandwidth unthrottling logic with
large number of CPUs.
- Fix & rework various cpuidle routines, simplify interaction with the
generic scheduler code. Add __cpuidle methods as noinstr to objtool's
noinstr detection and fix boatloads of cpuidle bugs & quirks.
- Add new ABI: introduce MEMBARRIER_CMD_GET_REGISTRATIONS, to query
previously issued registrations.
- Limit scheduler slice duration to the sysctl_sched_latency period, to
improve scheduling granularity with a large number of SCHED_IDLE
tasks.
- Debuggability enhancement on sys_exit(): warn about disabled IRQs,
but also enable them to prevent a cascade of followup problems and
repeat warnings.
- Fix the rescheduling logic in prio_changed_dl().
- Micro-optimize cpufreq and sched-util methods.
- Micro-optimize ttwu_runnable()
- Micro-optimize the idle-scanning in update_numa_stats(),
select_idle_capacity() and steal_cookie_task().
- Update the RSEQ code & self-tests
- Constify various scheduler methods
- Remove unused methods
- Refine __init tags
- Documentation updates
- Misc other cleanups, fixes
* tag 'sched-core-2023-02-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (110 commits)
sched/rt: pick_next_rt_entity(): check list_entry
sched/deadline: Add more reschedule cases to prio_changed_dl()
sched/fair: sanitize vruntime of entity being placed
sched/fair: Remove capacity inversion detection
sched/fair: unlink misfit task from cpu overutilized
objtool: mem*() are not uaccess safe
cpuidle: Fix poll_idle() noinstr annotation
sched/clock: Make local_clock() noinstr
sched/clock/x86: Mark sched_clock() noinstr
x86/pvclock: Improve atomic update of last_value in pvclock_clocksource_read()
x86/atomics: Always inline arch_atomic64*()
cpuidle: tracing, preempt: Squash _rcuidle tracing
cpuidle: tracing: Warn about !rcu_is_watching()
cpuidle: lib/bug: Disable rcu_is_watching() during WARN/BUG
cpuidle: drivers: firmware: psci: Dont instrument suspend code
KVM: selftests: Fix build of rseq test
exit: Detect and fix irq disabled state in oops
cpuidle, arm64: Fix the ARM64 cpuidle logic
cpuidle: mvebu: Fix duplicate flags assignment
sched/fair: Limit sched slice duration
...
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/context_tracking.c | 12 | ||||
| -rw-r--r-- | kernel/cpu_pm.c | 9 | ||||
| -rw-r--r-- | kernel/exit.c | 7 | ||||
| -rw-r--r-- | kernel/fork.c | 8 | ||||
| -rw-r--r-- | kernel/locking/lockdep.c | 3 | ||||
| -rw-r--r-- | kernel/panic.c | 5 | ||||
| -rw-r--r-- | kernel/printk/printk.c | 2 | ||||
| -rw-r--r-- | kernel/ptrace.c | 2 | ||||
| -rw-r--r-- | kernel/rseq.c | 65 | ||||
| -rw-r--r-- | kernel/sched/clock.c | 27 | ||||
| -rw-r--r-- | kernel/sched/core.c | 134 | ||||
| -rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 43 | ||||
| -rw-r--r-- | kernel/sched/cputime.c | 4 | ||||
| -rw-r--r-- | kernel/sched/deadline.c | 42 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 389 | ||||
| -rw-r--r-- | kernel/sched/idle.c | 47 | ||||
| -rw-r--r-- | kernel/sched/membarrier.c | 39 | ||||
| -rw-r--r-- | kernel/sched/rt.c | 5 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 107 | ||||
| -rw-r--r-- | kernel/sched/topology.c | 4 | ||||
| -rw-r--r-- | kernel/signal.c | 2 | ||||
| -rw-r--r-- | kernel/time/tick-broadcast-hrtimer.c | 29 | ||||
| -rw-r--r-- | kernel/time/tick-broadcast.c | 6 | ||||
| -rw-r--r-- | kernel/trace/trace.c | 3 | ||||
| -rw-r--r-- | kernel/trace/trace_preemptirq.c | 61 |
25 files changed, 689 insertions, 366 deletions
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 77978e372377..a09f1c19336a 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -510,7 +510,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * In this we case we don't care about any concurrency/ordering. */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) - atomic_set(&ct->state, state); + arch_atomic_set(&ct->state, state); } else { /* * Even if context tracking is disabled on this CPU, because it's outside @@ -527,7 +527,7 @@ void noinstr __ct_user_enter(enum ctx_state state) */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { /* Tracking for vtime only, no concurrent RCU EQS accounting */ - atomic_set(&ct->state, state); + arch_atomic_set(&ct->state, state); } else { /* * Tracking for vtime and RCU EQS. Make sure we don't race @@ -535,7 +535,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * RCU only requires RCU_DYNTICKS_IDX increments to be fully * ordered. */ - atomic_add(state, &ct->state); + arch_atomic_add(state, &ct->state); } } } @@ -630,12 +630,12 @@ void noinstr __ct_user_exit(enum ctx_state state) * In this we case we don't care about any concurrency/ordering. */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) - atomic_set(&ct->state, CONTEXT_KERNEL); + arch_atomic_set(&ct->state, CONTEXT_KERNEL); } else { if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { /* Tracking for vtime only, no concurrent RCU EQS accounting */ - atomic_set(&ct->state, CONTEXT_KERNEL); + arch_atomic_set(&ct->state, CONTEXT_KERNEL); } else { /* * Tracking for vtime and RCU EQS. Make sure we don't race @@ -643,7 +643,7 @@ void noinstr __ct_user_exit(enum ctx_state state) * RCU only requires RCU_DYNTICKS_IDX increments to be fully * ordered. */ - atomic_sub(state, &ct->state); + arch_atomic_sub(state, &ct->state); } } } diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index ba4ba71facf9..b0f0d15085db 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -30,16 +30,9 @@ static int cpu_pm_notify(enum cpu_pm_event event) { int ret; - /* - * This introduces a RCU read critical section, which could be - * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know - * this. - */ - ct_irq_enter_irqson(); rcu_read_lock(); ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL); rcu_read_unlock(); - ct_irq_exit_irqson(); return notifier_to_errno(ret); } @@ -49,11 +42,9 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev unsigned long flags; int ret; - ct_irq_enter_irqson(); raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL); raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); - ct_irq_exit_irqson(); return notifier_to_errno(ret); } diff --git a/kernel/exit.c b/kernel/exit.c index 15dc2ec80c46..bccfa4218356 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -807,6 +807,8 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead; + WARN_ON(irqs_disabled()); + synchronize_group_exit(tsk, code); WARN_ON(tsk->plug); @@ -938,6 +940,11 @@ void __noreturn make_task_dead(int signr) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); + if (unlikely(irqs_disabled())) { + pr_info("note: %s[%d] exited with irqs disabled\n", + current->comm, task_pid_nr(current)); + local_irq_enable(); + } if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), diff --git a/kernel/fork.c b/kernel/fork.c index d9c97704b7c9..038b898dad52 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1060,6 +1060,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->reported_split_lock = 0; #endif +#ifdef CONFIG_SCHED_MM_CID + tsk->mm_cid = -1; + tsk->mm_cid_active = 0; +#endif return tsk; free_stack: @@ -1169,6 +1173,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); + mm_init_cid(mm); return mm; fail_pcpu: @@ -1601,6 +1606,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) tsk->mm = mm; tsk->active_mm = mm; + sched_mm_cid_fork(tsk); return 0; } @@ -3034,7 +3040,7 @@ void __init mm_cache_init(void) * dynamically sized based on the maximum CPU number this system * can have, taking hotplug into account (nr_cpu_ids). */ - mm_size = sizeof(struct mm_struct) + cpumask_size(); + mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size(); mm_cachep = kmem_cache_create_usercopy("mm_struct", mm_size, ARCH_MIN_MMSTRUCT_ALIGN, diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e3375bc40dad..50d4863974e7 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -55,6 +55,7 @@ #include <linux/rcupdate.h> #include <linux/kprobes.h> #include <linux/lockdep.h> +#include <linux/context_tracking.h> #include <asm/sections.h> @@ -6555,6 +6556,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { struct task_struct *curr = current; int dl = READ_ONCE(debug_locks); + bool rcu = warn_rcu_enter(); /* Note: the following can be executed concurrently, so be careful. */ pr_warn("\n"); @@ -6595,5 +6597,6 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) lockdep_print_held_locks(curr); pr_warn("\nstack backtrace:\n"); dump_stack(); + warn_rcu_exit(rcu); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/panic.c b/kernel/panic.c index 463c9295bc28..487f5b03bf83 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,6 +34,7 @@ #include <linux/ratelimit.h> #include <linux/debugfs.h> #include <linux/sysfs.h> +#include <linux/context_tracking.h> #include <trace/events/error_report.h> #include <asm/sections.h> @@ -679,6 +680,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, void warn_slowpath_fmt(const char *file, int line, unsigned taint, const char *fmt, ...) { + bool rcu = warn_rcu_enter(); struct warn_args args; pr_warn(CUT_HERE); @@ -693,11 +695,13 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint, va_start(args.args, fmt); __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); + warn_rcu_exit(rcu); } EXPORT_SYMBOL(warn_slowpath_fmt); #else void __warn_printk(const char *fmt, ...) { + bool rcu = warn_rcu_enter(); va_list args; pr_warn(CUT_HERE); @@ -705,6 +709,7 @@ void __warn_printk(const char *fmt, ...) va_start(args, fmt); vprintk(fmt, args); va_end(args); + warn_rcu_exit(rcu); } EXPORT_SYMBOL(__warn_printk); #endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a5ed2e53547c..94f136b25f6a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2196,7 +2196,7 @@ static u16 printk_sprint(char *text, u16 size, int facility, } } - trace_console_rcuidle(text, text_len); + trace_console(text, text_len); return text_len; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 54482193e1ed..0786450074c1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -813,7 +813,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, { struct ptrace_rseq_configuration conf = { .rseq_abi_pointer = (u64)(uintptr_t)task->rseq, - .rseq_abi_size = sizeof(*task->rseq), + .rseq_abi_size = task->rseq_len, .signature = task->rseq_sig, .flags = 0, }; diff --git a/kernel/rseq.c b/kernel/rseq.c index d38ab944105d..9de6e35fe679 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -18,6 +18,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/rseq.h> +/* The original rseq structure size (including padding) is 32 bytes. */ +#define ORIG_RSEQ_SIZE 32 + #define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) @@ -82,15 +85,25 @@ * F1. <failure> */ -static int rseq_update_cpu_id(struct task_struct *t) +static int rseq_update_cpu_node_id(struct task_struct *t) { - u32 cpu_id = raw_smp_processor_id(); struct rseq __user *rseq = t->rseq; + u32 cpu_id = raw_smp_processor_id(); + u32 node_id = cpu_to_node(cpu_id); + u32 mm_cid = task_mm_cid(t); - if (!user_write_access_begin(rseq, sizeof(*rseq))) + WARN_ON_ONCE((int) mm_cid < 0); + if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end); unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); + unsafe_put_user(node_id, &rseq->node_id, efault_end); + unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end); + /* + * Additional feature fields added after ORIG_RSEQ_SIZE + * need to be conditionally updated only if + * t->rseq_len != ORIG_RSEQ_SIZE. + */ user_write_access_end(); trace_rseq_update(t); return 0; @@ -101,9 +114,10 @@ efault: return -EFAULT; } -static int rseq_reset_rseq_cpu_id(struct task_struct *t) +static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) { - u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED; + u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, + mm_cid = 0; /* * Reset cpu_id_start to its initial state (0). @@ -117,6 +131,21 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) */ if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; + /* + * Reset node_id to its initial state (0). + */ + if (put_user(node_id, &t->rseq->node_id)) + return -EFAULT; + /* + * Reset mm_cid to its initial state (0). + */ + if (put_user(mm_cid, &t->rseq->mm_cid)) + return -EFAULT; + /* + * Additional feature fields added after ORIG_RSEQ_SIZE + * need to be conditionally reset only if + * t->rseq_len != ORIG_RSEQ_SIZE. + */ return 0; } @@ -301,7 +330,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (unlikely(ret < 0)) goto error; } - if (unlikely(rseq_update_cpu_id(t))) + if (unlikely(rseq_update_cpu_node_id(t))) goto error; return; @@ -344,15 +373,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, /* Unregister rseq for current thread. */ if (current->rseq != rseq || !current->rseq) return -EINVAL; - if (rseq_len != sizeof(*rseq)) + if (rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; - ret = rseq_reset_rseq_cpu_id(current); + ret = rseq_reset_rseq_cpu_node_id(current); if (ret) return ret; current->rseq = NULL; current->rseq_sig = 0; + current->rseq_len = 0; return 0; } @@ -365,7 +395,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, * the provided address differs from the prior * one. */ - if (current->rseq != rseq || rseq_len != sizeof(*rseq)) + if (current->rseq != rseq || rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; @@ -374,15 +404,24 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, } /* - * If there was no rseq previously registered, - * ensure the provided rseq is properly aligned and valid. + * If there was no rseq previously registered, ensure the provided rseq + * is properly aligned, as communcated to user-space through the ELF + * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq + * size, the required alignment is the original struct rseq alignment. + * + * In order to be valid, rseq_len is either the original rseq size, or + * large enough to contain all supported fields, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. */ - if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || - rseq_len != sizeof(*rseq)) + if (rseq_len < ORIG_RSEQ_SIZE || + (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || + (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || + rseq_len < offsetof(struct rseq, end)))) return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; current->rseq = rseq; + current->rseq_len = rseq_len; current->rseq_sig = sig; /* * If rseq was previously inactive, and has just been diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e374c0c923da..5732fa75ebab 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -93,7 +93,7 @@ struct sched_clock_data { static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); -notrace static inline struct sched_clock_data *this_scd(void) +static __always_inline struct sched_clock_data *this_scd(void) { return this_cpu_ptr(&sched_clock_data); } @@ -244,12 +244,12 @@ late_initcall(sched_clock_init_late); * min, max except they take wrapping into account */ -notrace static inline u64 wrap_min(u64 x, u64 y) +static __always_inline u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; } -notrace static inline u64 wrap_max(u64 x, u64 y) +static __always_inline u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; } @@ -260,7 +260,7 @@ notrace static inline u64 wrap_max(u64 x, u64 y) * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -notrace static u64 sched_clock_local(struct sched_clock_data *scd) +static __always_inline u64 sched_clock_local(struct sched_clock_data *scd) { u64 now, clock, old_clock, min_clock, max_clock, gtod; s64 delta; @@ -287,13 +287,28 @@ again: clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (!try_cmpxchg64(&scd->clock, &old_clock, clock)) + if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock)) goto again; return clock; } -notrace static u64 sched_clock_remote(struct sched_clock_data *scd) +noinstr u64 local_clock(void) +{ + u64 clock; + + if (static_branch_likely(&__sched_clock_stable)) + return sched_clock() + __sched_clock_offset; + + preempt_disable_notrace(); + clock = sched_clock_local(this_scd()); + preempt_enable_notrace(); + + return clock; +} +EXPORT_SYMBOL_GPL(local_clock); + +static notrace u64 sched_clock_remote(struct sched_clock_data *scd) { struct sched_clock_data *my_scd = this_scd(); u64 this_clock, remote_clock; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a4918a1faa9..fb49dbf61273 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -152,7 +152,7 @@ __read_mostly int scheduler_running; DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); /* kernel prio, less is more */ -static inline int __task_prio(struct task_struct *p) +static inline int __task_prio(const struct task_struct *p) { if (p->sched_class == &stop_sched_class) /* trumps deadline */ return -2; @@ -174,7 +174,8 @@ static inline int __task_prio(struct task_struct *p) */ /* real prio, less is less */ -static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +static inline bool prio_less(const struct task_struct *a, + const struct task_struct *b, bool in_fi) { int pa = __task_prio(a), pb = __task_prio(b); @@ -194,7 +195,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool return false; } -static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b) +static inline bool __sched_core_less(const struct task_struct *a, + const struct task_struct *b) { if (a->core_cookie < b->core_cookie) return true; @@ -3675,14 +3677,39 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } /* - * Mark the task runnable and perform wakeup-preemption. + * Mark the task runnable. */ -static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, - struct rq_flags *rf) +static inline void ttwu_do_wakeup(struct task_struct *p) { - check_preempt_curr(rq, p, wake_flags); WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); +} + +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + struct rq_flags *rf) +{ + int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; + + lockdep_assert_rq_held(rq); + + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; + +#ifdef CONFIG_SMP + if (wake_flags & WF_MIGRATED) + en_flags |= ENQUEUE_MIGRATED; + else +#endif + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + + activate_task(rq, p, en_flags); + check_preempt_curr(rq, p, wake_flags); + + ttwu_do_wakeup(p); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { @@ -3712,31 +3739,6 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, #endif } -static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, - struct rq_flags *rf) -{ - int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; - - lockdep_assert_rq_held(rq); - - if (p->sched_contributes_to_load) - rq->nr_uninterruptible--; - -#ifdef CONFIG_SMP - if (wake_flags & WF_MIGRATED) - en_flags |= ENQUEUE_MIGRATED; - else -#endif - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - - activate_task(rq, p, en_flags); - ttwu_do_wakeup(rq, p, wake_flags, rf); -} - /* * Consider @p being inside a wait loop: * @@ -3770,9 +3772,15 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { - /* check_preempt_curr() may use rq clock */ - update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags, &rf); + if (!task_on_cpu(rq, p)) { + /* + * When on_rq && !on_cpu the task is preempted, see if + * it should preempt the task that is current now. + */ + update_rq_clock(rq); + check_preempt_curr(rq, p, wake_flags); + } + ttwu_do_wakeup(p); ret = 1; } __task_rq_unlock(rq, &rf); @@ -4138,8 +4146,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) goto out; trace_sched_waking(p); - WRITE_ONCE(p->__state, TASK_RUNNING); - trace_sched_wakeup(p); + ttwu_do_wakeup(p); goto out; } @@ -5104,6 +5111,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); rseq_preempt(prev); + switch_mm_cid(prev, next); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); prepare_task(next); @@ -6260,7 +6268,7 @@ static bool steal_cookie_task(int cpu, struct sched_domain *sd) { int i; - for_each_cpu_wrap(i, sched_domain_span(sd), cpu) { + for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) { if (i == cpu) continue; @@ -11365,3 +11373,53 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) { trace_sched_update_nr_running_tp(rq, count); } + +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_exit_signals(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + unsigned long flags; + + if (!mm) + return; + local_irq_save(flags); + mm_cid_put(mm, t->mm_cid); + t->mm_cid = -1; + t->mm_cid_active = 0; + local_irq_restore(flags); +} + +void sched_mm_cid_before_execve(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + unsigned long flags; + + if (!mm) + return; + local_irq_save(flags); + mm_cid_put(mm, t->mm_cid); + t->mm_cid = -1; + t->mm_cid_active = 0; + local_irq_restore(flags); +} + +void sched_mm_cid_after_execve(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + unsigned long flags; + + if (!mm) + return; + local_irq_save(flags); + t->mm_cid = mm_cid_get(mm); + t->mm_cid_active = 1; + local_irq_restore(flags); + rseq_set_notify_resume(t); +} + +void sched_mm_cid_fork(struct task_struct *t) +{ + WARN_ON_ONCE(!t->mm || t->mm_cid != -1); + t->mm_cid_active = 1; +} +#endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 1207c78f85c1..5c840151f3bb 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -48,7 +48,6 @@ struct sugov_cpu { unsigned long util; unsigned long bw_dl; - unsigned long max; /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON @@ -158,7 +157,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); sg_cpu->bw_dl = cpu_bw_dl(rq); sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), FREQUENCY_UTIL, NULL); @@ -238,6 +236,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * sugov_iowait_apply() - Apply the IO boost to a CPU. * @sg_cpu: the sugov data for the cpu to boost * @time: the update time from the caller + * @max_cap: the max CPU capacity * * A CPU running a task which woken up after an IO operation can have its * utilization boosted to speed up the completion of those IO operations. @@ -251,7 +250,8 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. */ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) +static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + unsigned long max_cap) { unsigned long boost; @@ -280,7 +280,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) * sg_cpu->util is already in capacity scale; convert iowait_boost * into the same scale so we can compare. */ - boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; + boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); if (sg_cpu->util < boost) sg_cpu->util = boost; @@ -310,7 +310,8 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) } static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, - u64 time, unsigned int flags) + u64 time, unsigned long max_cap, + unsigned int flags) { sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -321,7 +322,7 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, return false; sugov_get_util(sg_cpu); - sugov_iowait_apply(sg_cpu, time); + sugov_iowait_apply(sg_cpu, time, max_cap); return true; } @@ -332,12 +333,15 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned int cached_freq = sg_policy->cached_raw_freq; + unsigned long max_cap; unsigned int next_f; - if (!sugov_update_single_common(sg_cpu, time, flags)) + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + + if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) return; - next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); + next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. @@ -374,6 +378,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); unsigned long prev_util = sg_cpu->util; + unsigned long max_cap; /* * Fall back to the "frequency" path if frequency invariance is not @@ -385,7 +390,9 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, return; } - if (!sugov_update_single_common(sg_cpu, time, flags)) + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + + if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) return; /* @@ -399,7 +406,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, sg_cpu->util = prev_util; cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), - map_util_perf(sg_cpu->util), sg_cpu->max); + map_util_perf(sg_cpu->util), max_cap); sg_cpu->sg_policy->last_freq_update_time = time; } @@ -408,25 +415,21 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned long util = 0, max = 1; + unsigned long util = 0, max_cap; unsigned int j; + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); - unsigned long j_util, j_max; sugov_get_util(j_sg_cpu); - sugov_iowait_apply(j_sg_cpu, time); - j_util = j_sg_cpu->util; - j_max = j_sg_cpu->max; + sugov_iowait_apply(j_sg_cpu, time, max_cap); - if (j_util * max > j_max * util) { - util = j_util; |
