diff options
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 2333 |
1 files changed, 377 insertions, 1956 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c56fb57f2991..34e2291a9a6c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1,88 +1,28 @@ /* * kernel/sched/core.c * - * Kernel scheduler and related syscalls + * Core kernel scheduler code and related syscalls * * Copyright (C) 1991-2002 Linus Torvalds - * - * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and - * make semaphores SMP safe - * 1998-11-19 Implemented schedule_timeout() and related stuff - * by Andrea Arcangeli - * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: - * hybrid priority-list and round-robin design with - * an array-switch method of distributing timeslices - * and per-CPU runqueues. Cleanups and useful suggestions - * by Davide Libenzi, preemptible kernel bits by Robert Love. - * 2003-09-03 Interactivity tuning by Con Kolivas. - * 2004-04-02 Scheduler domains code by Nick Piggin - * 2007-04-15 Work begun on replacing all interactivity tuning with a - * fair scheduling design by Con Kolivas. - * 2007-05-05 Load balancing (smp-nice) and other improvements - * by Peter Williams - * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith - * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri - * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, - * Thomas Gleixner, Mike Kravetz */ - -#include <linux/kasan.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/nmi.h> -#include <linux/init.h> -#include <linux/uaccess.h> -#include <linux/highmem.h> -#include <linux/mmu_context.h> -#include <linux/interrupt.h> -#include <linux/capability.h> -#include <linux/completion.h> -#include <linux/kernel_stat.h> -#include <linux/debug_locks.h> -#include <linux/perf_event.h> -#include <linux/security.h> -#include <linux/notifier.h> -#include <linux/profile.h> -#include <linux/freezer.h> -#include <linux/vmalloc.h> -#include <linux/blkdev.h> -#include <linux/delay.h> -#include <linux/pid_namespace.h> -#include <linux/smp.h> -#include <linux/threads.h> -#include <linux/timer.h> -#include <linux/rcupdate.h> -#include <linux/cpu.h> +#include <linux/sched.h> #include <linux/cpuset.h> -#include <linux/percpu.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/sysctl.h> -#include <linux/syscalls.h> -#include <linux/times.h> -#include <linux/tsacct_kern.h> -#include <linux/kprobes.h> #include <linux/delayacct.h> -#include <linux/unistd.h> -#include <linux/pagemap.h> -#include <linux/hrtimer.h> -#include <linux/tick.h> -#include <linux/ctype.h> -#include <linux/ftrace.h> -#include <linux/slab.h> #include <linux/init_task.h> #include <linux/context_tracking.h> -#include <linux/compiler.h> -#include <linux/frame.h> + +#include <linux/blkdev.h> +#include <linux/kprobes.h> +#include <linux/mmu_context.h> +#include <linux/module.h> +#include <linux/nmi.h> #include <linux/prefetch.h> -#include <linux/mutex.h> +#include <linux/profile.h> +#include <linux/security.h> +#include <linux/syscalls.h> #include <asm/switch_to.h> #include <asm/tlb.h> -#include <asm/irq_regs.h> -#ifdef CONFIG_PARAVIRT -#include <asm/paravirt.h> -#endif #include "sched.h" #include "../workqueue_internal.h" @@ -91,27 +31,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static void update_rq_clock_task(struct rq *rq, s64 delta); - -void update_rq_clock(struct rq *rq) -{ - s64 delta; - - lockdep_assert_held(&rq->lock); - - if (rq->clock_skip_update & RQCF_ACT_SKIP) - return; - - delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; - if (delta < 0) - return; - rq->clock += delta; - update_rq_clock_task(rq, delta); -} - /* * Debugging: various feature bits */ @@ -140,7 +61,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; /* - * period over which we measure -rt task cpu usage in us. + * period over which we measure -rt task CPU usage in us. * default: 1s */ unsigned int sysctl_sched_rt_period = 1000000; @@ -153,7 +74,7 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* cpus with isolated domains */ +/* CPUs with isolated domains */ cpumask_var_t cpu_isolated_map; /* @@ -185,7 +106,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) rq = task_rq(p); raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - rf->cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, rf); return rq; } raw_spin_unlock(&rq->lock); @@ -221,11 +142,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) * If we observe the old cpu in task_rq_lock, the acquire of * the old rq->lock will fully serialize against the stores. * - * If we observe the new cpu in task_rq_lock, the acquire will + * If we observe the new CPU in task_rq_lock, the acquire will * pair with the WMB to ensure we must then also see migrating. */ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - rf->cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, rf); return rq; } raw_spin_unlock(&rq->lock); @@ -236,6 +157,84 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) } } +/* + * RQ-clock updating methods: + */ + +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ +/* + * In theory, the compile should just see 0 here, and optimize out the call + * to sched_rt_avg_update. But I don't trust it... + */ +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + s64 steal = 0, irq_delta = 0; +#endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}irq region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight miss-attribution of {soft,}irq + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops. + */ + if (irq_delta > delta) + irq_delta = delta; + + rq->prev_irq_time += irq_delta; + delta -= irq_delta; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + if (static_key_false((¶virt_steal_rq_enabled))) { + steal = paravirt_steal_clock(cpu_of(rq)); + steal -= rq->prev_steal_time_rq; + + if (unlikely(steal > delta)) + steal = delta; + + rq->prev_steal_time_rq += steal; + delta -= steal; + } +#endif + + rq->clock_task += delta; + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) + sched_rt_avg_update(rq, irq_delta + steal); +#endif +} + +void update_rq_clock(struct rq *rq) +{ + s64 delta; + + lockdep_assert_held(&rq->lock); + + if (rq->clock_update_flags & RQCF_ACT_SKIP) + return; + +#ifdef CONFIG_SCHED_DEBUG + rq->clock_update_flags |= RQCF_UPDATED; +#endif + delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + if (delta < 0) + return; + rq->clock += delta; + update_rq_clock_task(rq, delta); +} + + #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. @@ -458,7 +457,7 @@ void wake_up_q(struct wake_q_head *head) task = container_of(node, struct task_struct, wake_q); BUG_ON(!task); - /* task can safely be re-inserted now */ + /* Task can safely be re-inserted now: */ node = node->next; task->wake_q.next = NULL; @@ -516,12 +515,12 @@ void resched_cpu(int cpu) #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ_COMMON /* - * In the semi idle case, use the nearest busy cpu for migrating timers - * from an idle cpu. This is good for power-savings. + * In the semi idle case, use the nearest busy CPU for migrating timers + * from an idle CPU. This is good for power-savings. * * We don't do similar optimization for completely idle system, as - * selecting an idle cpu will add more delays to the timers than intended - * (as that cpu's timer base may not be uptodate wrt jiffies etc). + * selecting an idle CPU will add more delays to the timers than intended + * (as that CPU's timer base may not be uptodate wrt jiffies etc). */ int get_nohz_timer_target(void) { @@ -550,6 +549,7 @@ unlock: rcu_read_unlock(); return cpu; } + /* * When add_timer_on() enqueues a timer into the timer wheel of an * idle CPU then this timer might expire before the next timer event @@ -784,60 +784,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -static void update_rq_clock_task(struct rq *rq, s64 delta) -{ -/* - * In theory, the compile should just see 0 here, and optimize out the call - * to sched_rt_avg_update. But I don't trust it... - */ -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - s64 steal = 0, irq_delta = 0; -#endif -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; - - /* - * Since irq_time is only updated on {soft,}irq_exit, we might run into - * this case when a previous update_rq_clock() happened inside a - * {soft,}irq region. - * - * When this happens, we stop ->clock_task and only update the - * prev_irq_time stamp to account for the part that fit, so that a next - * update will consume the rest. This ensures ->clock_task is - * monotonic. - * - * It does however cause some slight miss-attribution of {soft,}irq - * time, a more accurate solution would be to update the irq_time using - * the current rq->clock timestamp, except that would require using - * atomic ops. - */ - if (irq_delta > delta) - irq_delta = delta; - - rq->prev_irq_time += irq_delta; - delta -= irq_delta; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - if (static_key_false((¶virt_steal_rq_enabled))) { - steal = paravirt_steal_clock(cpu_of(rq)); - steal -= rq->prev_steal_time_rq; - - if (unlikely(steal > delta)) - steal = delta; - - rq->prev_steal_time_rq += steal; - delta -= steal; - } -#endif - - rq->clock_task += delta; - -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) - sched_rt_avg_update(rq, irq_delta + steal); -#endif -} - void sched_set_stop_task(int cpu, struct task_struct *stop) { struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; @@ -1018,7 +964,7 @@ struct migration_arg { }; /* - * Move (not current) task off this cpu, onto dest cpu. We're doing + * Move (not current) task off this CPU, onto the destination CPU. We're doing * this because either it can't run here any more (set_cpus_allowed() * away from this CPU, or CPU going down), or because we're * attempting to rebalance this task on exec (sched_exec). @@ -1052,8 +998,8 @@ static int migration_cpu_stop(void *data) struct rq *rq = this_rq(); /* - * The original target cpu might have gone down and we might - * be on another cpu but it doesn't matter. + * The original target CPU might have gone down and we might + * be on another CPU but it doesn't matter. */ local_irq_disable(); /* @@ -1171,7 +1117,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (p->flags & PF_KTHREAD) { /* * For kernel threads that do indeed end up on online && - * !active we want to ensure they are strict per-cpu threads. + * !active we want to ensure they are strict per-CPU threads. */ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && !cpumask_intersects(new_mask, cpu_active_mask) && @@ -1195,9 +1141,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * OK, since we're going to drop the lock immediately * afterwards anyway. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + rq_unpin_lock(rq, &rf); rq = move_queued_task(rq, p, dest_cpu); - lockdep_repin_lock(&rq->lock, rf.cookie); + rq_repin_lock(rq, &rf); } out: task_rq_unlock(rq, p, &rf); @@ -1276,7 +1222,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) /* * Task isn't running anymore; make it appear like we migrated * it before it went to sleep. This means on wakeup we make the - * previous cpu our target instead of where it really is. + * previous CPU our target instead of where it really is. */ p->wake_cpu = cpu; } @@ -1508,12 +1454,12 @@ EXPORT_SYMBOL_GPL(kick_process); * * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, * see __set_cpus_allowed_ptr(). At this point the newly online - * cpu isn't yet part of the sched domains, and balancing will not + * CPU isn't yet part of the sched domains, and balancing will not * see it. * - * - on cpu-down we clear cpu_active() to mask the sched domains and + * - on CPU-down we clear cpu_active() to mask the sched domains and * avoid the load balancer to place new tasks on the to be removed - * cpu. Existing tasks will remain running there and will be taken + * CPU. Existing tasks will remain running there and will be taken * off. * * This means that fallback selection must not select !active CPUs. @@ -1529,9 +1475,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) int dest_cpu; /* - * If the node that the cpu is on has been offlined, cpu_to_node() - * will return -1. There is no cpu on the node, and we should - * select the cpu on the other node. + * If the node that the CPU is on has been offlined, cpu_to_node() + * will return -1. There is no CPU on the node, and we should + * select the CPU on the other node. */ if (nid != -1) { nodemask = cpumask_of_node(nid); @@ -1563,7 +1509,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) state = possible; break; } - /* fall-through */ + /* Fall-through */ case possible: do_set_cpus_allowed(p, cpu_possible_mask); state = fail; @@ -1607,7 +1553,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) /* * In order not to call set_task_cpu() on a blocking task we need * to rely on ttwu() to place the task on a valid ->cpus_allowed - * cpu. + * CPU. * * Since this is common to all placement strategies, this lives here. * @@ -1681,7 +1627,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl activate_task(rq, p, en_flags); p->on_rq = TASK_ON_RQ_QUEUED; - /* if a worker is waking up, notify workqueue */ + /* If a worker is waking up, notify the workqueue: */ if (p->flags & PF_WQ_WORKER) wq_worker_waking_up(p, cpu_of(rq)); } @@ -1690,7 +1636,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl * Mark the task runnable and perform wakeup-preemption. */ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, - struct pin_cookie cookie) + struct rq_flags *rf) { check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -1702,9 +1648,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, * Our task @p is fully woken up and running; so its safe to * drop the rq->lock, hereafter rq is only used for statistics. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); p->sched_class->task_woken(rq, p); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); } if (rq->idle_stamp) { @@ -1723,7 +1669,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, - struct pin_cookie cookie) + struct rq_flags *rf) { int en_flags = ENQUEUE_WAKEUP; @@ -1738,7 +1684,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, #endif ttwu_activate(rq, p, en_flags); - ttwu_do_wakeup(rq, p, wake_flags, cookie); + ttwu_do_wakeup(rq, p, wake_flags, rf); } /* @@ -1757,7 +1703,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags, rf.cookie); + ttwu_do_wakeup(rq, p, wake_flags, &rf); ret = 1; } __task_rq_unlock(rq, &rf); @@ -1770,15 +1716,15 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); - struct pin_cookie cookie; struct task_struct *p; unsigned long flags; + struct rq_flags rf; if (!llist) return; raw_spin_lock_irqsave(&rq->lock, flags); - cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, &rf); while (llist) { int wake_flags = 0; @@ -1789,10 +1735,10 @@ void sched_ttwu_pending(void) if (p->sched_remote_wakeup) wake_flags = WF_MIGRATED; - ttwu_do_activate(rq, p, wake_flags, cookie); + ttwu_do_activate(rq, p, wake_flags, &rf); } - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, &rf); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1864,7 +1810,7 @@ void wake_up_if_idle(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); if (is_idle_task(rq->curr)) smp_send_reschedule(cpu); - /* Else cpu is not in idle, do nothing here */ + /* Else CPU is not idle, do nothing here: */ raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1881,20 +1827,20 @@ bool cpus_share_cache(int this_cpu, int that_cpu) static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); - struct pin_cookie cookie; + struct rq_flags rf; #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { - sched_clock_cpu(cpu); /* sync clocks x-cpu */ + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ttwu_queue_remote(p, cpu, wake_flags); return; } #endif raw_spin_lock(&rq->lock); - cookie = lockdep_pin_lock(&rq->lock); - ttwu_do_activate(rq, p, wake_flags, cookie); - lockdep_unpin_lock(&rq->lock, cookie); + rq_pin_lock(rq, &rf); + ttwu_do_activate(rq, p, wake_flags, &rf); + rq_unpin_lock(rq, &rf); raw_spin_unlock(&rq->lock); } @@ -1904,8 +1850,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * MIGRATION * * The basic program-order guarantee on SMP systems is that when a task [t] - * migrates, all its activity on its old cpu [c0] happens-before any subsequent - * execution on its new cpu [c1]. + * migrates, all its activity on its old CPU [c0] happens-before any subsequent + * execution on its new CPU [c1]. * * For migration (of runnable tasks) this is provided by the following means: * @@ -1916,7 +1862,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * * Transitivity guarantees that B happens after A and C after B. * Note: we only require RCpc transitivity. - * Note: the cpu doing B need not be c0 or c1 + * Note: the CPU doing B need not be c0 or c1 * * Example: * @@ -2024,7 +1970,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) trace_sched_waking(p); - success = 1; /* we're going to change ->state */ + /* We're going to change ->state: */ + success = 1; cpu = task_cpu(p); /* @@ -2073,7 +2020,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) smp_rmb(); /* - * If the owning (remote) cpu is still in the middle of schedule() with + * If the owning (remote) CPU is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. * * Pairs with the smp_store_release() in finish_lock_switch(). @@ -2086,11 +2033,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; + if (p->in_iowait) { + delayacct_blkio_end(); + atomic_dec(&task_rq(p)->nr_iowait); + } + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); } + +#else /* CONFIG_SMP */ + + if (p->in_iowait) { + delayacct_blkio_end(); + atomic_dec(&task_rq(p)->nr_iowait); + } + #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); @@ -2111,7 +2071,7 @@ out: * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. */ -static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie) +static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) { struct rq *rq = task_rq(p); @@ -2128,11 +2088,11 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); raw_spin_unlock(&rq->lock); raw_spin_lock(&p->pi_lock); raw_spin_lock(&rq->lock); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); } if (!(p->state & TASK_NORMAL)) @@ -2140,10 +2100,15 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie trace_sched_waking(p); - if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + if (p->in_iowait) { + delayacct_blkio_end(); + atomic_dec(&rq->nr_iowait); + } ttwu_activate(rq, p, ENQUEUE_WAKEUP); + } - ttwu_do_wakeup(rq, p, 0, cookie); + ttwu_do_wakeup(rq, p, 0, rf); ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); @@ -2427,7 +2392,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) */ raw_spin_lock_irqsave(&p->pi_lock, flags); /* - * We're setting the cpu for the first time, we don't migrate, + * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). */ __set_task_cpu(p, cpu); @@ -2570,7 +2535,7 @@ void wake_up_new_task(struct task_struct *p) /* * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path - * - any previously selected cpu might disappear through hotplug + * - any previously selected CPU might disappear through hotplug * * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, * as we're not fully set-up yet. @@ -2578,6 +2543,7 @@ void wake_up_new_task(struct task_struct *p) __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); + update_rq_clock(rq); post_init_entity_util_avg(&p->se); activate_task(rq, p, 0); @@ -2590,9 +2556,9 @@ void wake_up_new_task(struct task_struct *p) * Nothing relies on rq->lock after this, so its fine to * drop it. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + rq_unpin_lock(rq, &rf); p->sched_class->task_woken(rq, p); - lockdep_repin_lock(&rq->lock, rf.cookie); + rq_repin_lock(rq, &rf); } #endif task_rq_unlock(rq, p, &rf); @@ -2861,7 +2827,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next, struct pin_cookie cookie) + struct task_struct *next, struct rq_flags *rf) { struct mm_struct *mm, *oldmm; @@ -2887,13 +2853,16 @@ context_switch(struct rq *rq, struct task_struct *prev, prev->active_mm = NULL; rq->prev_mm = oldmm; } + + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); + /* * Since the runqueue lock will be released by the next * task (which is an invalid locking op but in the case * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ @@ -2920,7 +2889,7 @@ unsigned long nr_running(void) } /* - * Check if only the current task is running on the cpu. + * Check if only the current task is running on the CPU. * * Caution: this function does not check that the caller has disabled * preemption, thus the result might have a time-of-check-to-time-of-use @@ -2949,6 +2918,36 @@ unsigned long long nr_context_switches(void) return sum; } +/* + * IO-wait accounting, and how its mostly bollocks (on SMP). + * + * The idea behind IO-wait account is to account the idle time that we could + * have spend running if it were not for IO. That is, if we were to improve the + * storage performance, we'd have a proportional reduction in IO-wait time. + * + * This all works nicely on UP, where, when a task blocks on IO, we account + * idle time as IO-wait, because if the storage were faster, it could've been + * running and we'd not be idle. + * + * This has been extended to SMP, by doing the same for each CPU. This however + * is broken. + * + * Imagine for instance the case where two tasks block on one CPU, only the one + * CPU will have IO-wait accounted, while the other has regular idle. Even + * though, if the storage were faster, both could've ran at the same time, + * utilising both CPUs. + * + * This means, that when looking globally, the current IO-wait accounting on + * SMP is a lower bound, by reason of under accounting. + * + * Worse, since the numbers are provided per CPU, they are sometimes + * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly + * associated with any one particular CPU, it can wake to another CPU than it + * blocked on. This means the per CPU IO-wait number is meaningless. + * + * Task CPU affinities can make all that even more 'interesting'. + */ + unsigned long nr_iowait(void) { unsigned long i, sum = 0; @@ -2959,6 +2958,13 @@ unsigned long nr_iowait(void) return sum; } +/* + * Consumers of these two interfaces, like for example the cpufreq menu + * governor are using nonsensical data. Boosting frequency for a CPU that has + * IO-wait which might not even end up running the task when it does become + * runnable. + */ + unsigned long nr_iowait_cpu(int cpu) { struct rq *this = cpu_rq(cpu); @@ -3042,8 +3048,8 @@ unsigned long long task_sched_runtime(struct task_struct *p) * So we have a optimization chance when the task's delta_exec is 0. * Reading ->on_cpu is racy, but this is ok. * - * If we race with it leaving cpu, we'll take a lock. So we're correct. - * If we race with it entering cpu, unaccounted time is 0. This is + * If we race with it leaving CPU, we'll take a lock. So we're correct. + * If we race with it entering CPU, unaccounted time is 0. This is * indistinguishable from the read occurring a few cycles earlier. * If we see ->on_cpu without ->on_rq, the task is leaving, and has * been accounted, so we're correct here as well. @@ -3257,31 +3263,30 @@ static inline void schedule_debug(struct task_struct *prev) * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - const struct sched_class *class = &fair_sched_class; + const struct sched_class *class; struct task_struct *p; /* * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(prev->sched_class == class && - rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev, cookie); + if (likely(rq->nr_running == rq->cfs.h_nr_running)) { + p = fair_sched_class.pick_next_task(rq, prev, rf); if (unlikely(p == RETRY_TASK)) goto again; - /* assumes fair_sched_class->next == idle_sched_class */ + /* Assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev, cookie); + p = idle_sched_class.pick_next_task(rq, prev, rf); return p; } again: for_each_class(class) { - p = class->pick_next_task(rq, prev, cookie); + p = class->pick_next_task(rq, prev, rf); if (p) { if (unlikely(p == RETRY_TASK)) goto again; @@ -3289,7 +3294,8 @@ again: } } - BUG(); /* the idle class will always have a runnable task */ + /* The idle class should always have a runnable task: */ + BUG(); } /* @@ -3335,7 +3341,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; - struct pin_cookie cookie; + struct rq_flags rf; struct rq *rq; int cpu; @@ -3358,9 +3364,10 @@ static void __sched notrace __schedule(bool preempt) */ smp_mb__before_spinlock(); raw_spin_lock(&rq->lock); - cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, &rf); - rq->clock_skip_update <<= 1; /* promote REQ to ACT */ + /* Promote REQ to ACT */ + rq->clock_update_flags <<= 1; switch_count = &prev->nivcsw; if (!preempt && prev->state) { @@ -3370,6 +3377,11 @@ static void __sched notrace __schedule(bool preempt) deactivate_task(rq, prev, DEQUEUE_SLEEP); prev->on_rq = 0; + if (prev->in_iowait) { + atomic_inc(&rq->nr_iowait); + delayacct_blkio_start(); + } + /* * If a worker went to sleep, notify and ask workqueue * whether it wants to wake up a task to maintain @@ -3380,7 +3392,7 @@ static void __sched notrace __schedule(bool preempt) to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) - try_to_wake_up_local(to_wakeup, cookie); + try_to_wake_up_local(to_wakeup, &rf); } } switch_count = &prev->nvcsw; @@ -3389,10 +3401,9 @@ static void __sched notrace __schedule(bool preempt) if (task_on_rq_queued(prev)) update_rq_clock(rq); - next = pick_next_task(rq, prev, cookie); + next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); - rq->clock_skip_update = 0; if (likely(prev != next)) { rq->nr_switches++; @@ -3400,9 +3411,12 @@ static void __sched notrace __schedule(bool preempt) ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ + + /* Also unlocks the rq: */ + rq = context_switch(rq, prev, next, &rf); } else { - lockdep_unpin_lock(&rq->lock, cookie); + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); + rq_unpin_lock(rq, &rf); raw_spin_unlock_irq(&rq->lock); } @@ -3426,14 +3440,18 @@ void __noreturn do_task_dead(void) smp_mb(); raw_spin_unlock_wait(¤t->pi_lock); - /* causes final put_task_struct in finish_task_switch(). */ + /* Causes final put_task_struct in finish_task_switch(): */ __set_current_state(TASK_DEAD); - current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ + + /* Tell freezer to ignore us: */ + current->flags |= PF_NOFREEZE; + __schedule(false); BUG(); - /* Avoid "noreturn function does return". */ + + /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ for (;;) - cpu_relax(); /* For when BUG is null */ + cpu_relax(); } static inline void sched_submit_work(struct task_struct *tsk) @@ -3651,6 +3669,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio > MAX_PRIO); rq = __task_rq_lock(p, &rf); + update_rq_clock(rq); /* * Idle task boosting is a nono in general. There is one @@ -3725,7 +3744,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) check_class_changed(rq, p, prev_class, oldprio); out_unlock: - preempt_disable(); /* avoid rq from going away on us */ + /* Avoid rq from going away on us: */ + preempt_disable(); __task_rq_unlock(rq, &rf); balance_callback(rq); @@ -3747,6 +3767,8 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -3793,7 +3815,7 @@ EXPORT_SYMBOL(set_user_nice); */ int can_nice(const struct task_struct *p, const int nice) { - /* convert nice value [19,-20] to rlimit style value [1,40] */ + /* Convert nice value [19,-20] to rlimit style value [1,40]: */ int nice_rlim = nice_to_rlimit(nice); return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || @@ -3849,7 +3871,7 @@ int task_prio(const struct task_struct *p) } /** - * idle_cpu - is a given cpu idle currently? + * idle_cpu - is a given CPU idle currently? * @cpu: the processor in question. * * Return: 1 if the CPU is currently idle. 0 otherwise. @@ -3873,10 +3895,10 @@ int idle_cpu(int cpu) } /** - * idle_task - return the idle task for a given cpu. + * idle_task - return the idle task for a given CPU. * @cpu: the processor in question. * - * Return: The idle task for the cpu @cpu. + * Return: The idle task for the CPU @cpu. */ struct task_struct *idle_task(int cpu) { @@ -4042,7 +4064,7 @@ __checkparam_dl(const struct sched_attr *attr) } /* - * check the target process has a UID that matches the current process's + * Check the target process has a UID that matches the current process's: */ static bool check_same_owner(struct task_struct *p) { @@ -4057,8 +4079,7 @@ static bool check_same_owner(struct task_struct *p) return match; } -static bool dl_param_changed(struct task_struct *p, - const struct sched_attr *attr) +static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) { struct sched_dl_entity *dl_se = &p->dl; @@ -4085,10 +4106,10 @@ static int __sched_setscheduler(struct task_struct *p, int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; struct rq *rq; - /* may grab non-irq protected spin_locks */ + /* May grab non-irq protected spin_locks: */ BUG_ON(in_interrupt()); recheck: - /* double check policy once rq lock held */ + /* Double check policy once rq lock held: */ if (policy < 0) { reset_on_fork = p-& |