summary | refs | log | tree | commit | diff
path: root/kernel
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2015-06-22 15:52:04 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2015-06-22 15:52:04 -0700
commit: 23b7776290b10297fe2cae0fb5f166a4f2c68121 (patch)
tree: 73d1e76644a20bc7bff80fbfdb08e8b9a9f28420 /kernel
parent: 6bc4c3ad3619e1bcb4a6330e030007ace8ca465e (diff)
parent: 6fab54101923044712baee429ff573f03b99fc47 (diff)
download: linux-23b7776290b10297fe2cae0fb5f166a4f2c68121.tar.gz
linux-23b7776290b10297fe2cae0fb5f166a4f2c68121.tar.bz2
linux-23b7776290b10297fe2cae0fb5f166a4f2c68121.zip
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The main changes are:
- lockless wakeup support for futexes and IPC message queues (Davidlohr Bueso, Peter Zijlstra)
- Replace spinlocks with atomics in thread_group_cputimer(), to improve scalability (Jason Low)
- NUMA balancing improvements (Rik van Riel)
- SCHED_DEADLINE improvements (Wanpeng Li)
- clean up and reorganize preemption helpers (Frederic Weisbecker)
- decouple page fault disabling machinery from the preemption counter, to improve debuggability and robustness (David Hildenbrand)
- SCHED_DEADLINE documentation updates (Luca Abeni)
- topology CPU masks cleanups (Bartosz Golaszewski)
- /proc/sched_debug improvements (Srikar Dronamraju)"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (79 commits)
sched/deadline: Remove needless parameter in dl_runtime_exceeded()
sched: Remove superfluous resetting of the p->dl_throttled flag
sched/deadline: Drop duplicate init_sched_dl_class() declaration
sched/deadline: Reduce rq lock contention by eliminating locking of non-feasible target
sched/deadline: Make init_sched_dl_class() __init
sched/deadline: Optimize pull_dl_task()
sched/preempt: Add static_key() to preempt_notifiers
sched/preempt: Fix preempt notifiers documentation about hlist_del() within unsafe iteration
sched/stop_machine: Fix deadlock between multiple stop_two_cpus()
sched/debug: Add sum_sleep_runtime to /proc/<pid>/sched
sched/debug: Replace vruntime with wait_sum in /proc/sched_debug
sched/debug: Properly format runnable tasks in /proc/sched_debug
sched/numa: Only consider less busy nodes as numa balancing destinations
Revert 095bebf61a46 ("sched/numa: Do not move past the balance point if unbalanced")
sched/fair: Prevent throttling in early pick_next_task_fair()
preempt: Reorganize the notrace definitions a bit
preempt: Use preempt_schedule_context() as the official tracing preemption point
sched: Make preempt_schedule_context() function-tracing safe
x86: Remove cpu_sibling_mask() and cpu_core_mask()
x86: Replace cpu_**_mask() with topology_**_cpumask()
...
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/fork.c 8
-rw-r--r-- kernel/futex.c 33
-rw-r--r-- kernel/locking/lglock.c 22
-rw-r--r-- kernel/sched/Makefile 2
-rw-r--r-- kernel/sched/auto_group.c 6
-rw-r--r-- kernel/sched/auto_group.h 2
-rw-r--r-- kernel/sched/core.c 136
-rw-r--r-- kernel/sched/cputime.c 2
-rw-r--r-- kernel/sched/deadline.c 51
-rw-r--r-- kernel/sched/debug.c 11
-rw-r--r-- kernel/sched/fair.c 372
-rw-r--r-- kernel/sched/loadavg.c (renamed from kernel/sched/proc.c) 236
-rw-r--r-- kernel/sched/rt.c 2
-rw-r--r-- kernel/sched/sched.h 11
-rw-r--r-- kernel/sched/stats.h 15
-rw-r--r-- kernel/sched/wait.c 4
-rw-r--r-- kernel/signal.c 6
-rw-r--r-- kernel/stop_machine.c 42
-rw-r--r-- kernel/time/posix-cpu-timers.c 87
19 files changed, 592 insertions, 456 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaaa6ef5..0bb88b555550 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1091,10 +1091,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
unsigned long cpu_limit;
- /* Thread group counters. */
- thread_group_cputime_init(sig);
-
- cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
sig->cputimer.running = 1;
@@ -1396,6 +1393,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->hardirq_context = 0;
p->softirq_context = 0;
#endif
+
+ p->pagefault_disabled = 0;
+
#ifdef CONFIG_LOCKDEP
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
diff --git a/kernel/futex.c b/kernel/futex.c
index 55ca63ad9622..aacc706f85fc 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1090,9 +1090,11 @@ static void __unqueue_futex(struct futex_q *q)
/*
* The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
*/
-static void wake_futex(struct futex_q *q)
+static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
{
struct task_struct *p = q->task;
@@ -1100,14 +1102,10 @@ static void wake_futex(struct futex_q *q)
return;
/*
- * We set q->lock_ptr = NULL _before_ we wake up the task. If
- * a non-futex wake up happens on another CPU then the task
- * might exit and p would dereference a non-existing task
- * struct. Prevent this by holding a reference on p across the
- * wake up.
+ * Queue the task for wakeup after we've released the
+ * hb->lock. wake_q_add() grabs a reference to p.
*/
- get_task_struct(p);
-
+ wake_q_add(wake_q, p);
__unqueue_futex(q);
/*
* The waiting task can free the futex_q as soon as
@@ -1117,9 +1115,6 @@ static void wake_futex(struct futex_q *q)
*/
smp_wmb();
q->lock_ptr = NULL;
-
- wake_up_state(p, TASK_NORMAL);
- put_task_struct(p);
}
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -1217,6 +1212,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
int ret;
+ WAKE_Q(wake_q);
if (!bitset)
return -EINVAL;
@@ -1244,13 +1240,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!(this->bitset & bitset))
continue;
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
if (++ret >= nr_wake)
break;
}
}
spin_unlock(&hb->lock);
+ wake_up_q(&wake_q);
out_put_key:
put_futex_key(&key);
out:
@@ -1269,6 +1266,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
int ret, op_ret;
+ WAKE_Q(wake_q);
retry:
ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@ -1320,7 +1318,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
if (++ret >= nr_wake)
break;
}
@@ -1334,7 +1332,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
if (++op_ret >= nr_wake2)
break;
}
@@ -1344,6 +1342,7 @@ retry_private:
out_unlock:
double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
out_put_keys:
put_futex_key(&key2);
out_put_key1:
@@ -1503,6 +1502,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
+ WAKE_Q(wake_q);
if (requeue_pi) {
/*
@@ -1679,7 +1679,7 @@ retry_private:
* woken by futex_unlock_pi().
*/
if (++task_count <= nr_wake && !requeue_pi) {
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
continue;
}
@@ -1719,6 +1719,7 @@ retry_private:
out_unlock:
free_pi_state(pi_state);
double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
hb_waiters_dec(hb2);
/*
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
index 86ae2aebf004..951cfcd10b4a 100644
--- a/kernel/locking/lglock.c
+++ b/kernel/locking/lglock.c
@@ -60,6 +60,28 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
}
EXPORT_SYMBOL(lg_local_unlock_cpu);
+void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
+{
+ BUG_ON(cpu1 == cpu2);
+
+ /* lock in cpu order, just like lg_global_lock */
+ if (cpu2 < cpu1)
+ swap(cpu1, cpu2);
+
+ preempt_disable();
+ lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+ arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
+ arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
+}
+
+void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
+{
+ lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
+ arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
+ preempt_enable();
+}
+
void lg_global_lock(struct lglock *lg)
{
int i;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 46be87024875..67687973ce80 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-obj-y += core.o proc.o clock.o cputime.o
+obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o completion.o idle.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index eae160dd669d..750ed601ddf7 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -1,5 +1,3 @@
-#ifdef CONFIG_SCHED_AUTOGROUP
-
#include "sched.h"
#include <linux/proc_fs.h>
@@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
p->signal->autogroup = autogroup_kref_get(ag);
- if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+ if (!READ_ONCE(sysctl_sched_autogroup_enabled))
goto out;
for_each_thread(p, t)
@@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
#endif /* CONFIG_SCHED_DEBUG */
-
-#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd047142816..890c95f2587a 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
- int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+ int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
if (enabled && task_wants_autogroup(p, tg))
return p->signal->autogroup->tg;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index db9b10a78d74..f89ca9bcf42a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -511,7 +511,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+ typeof(ti->flags) old, val = READ_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
@@ -541,6 +541,52 @@ static bool set_nr_if_polling(struct task_struct *p)
#endif
#endif
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ struct wake_q_node *node = &task->wake_q;
+
+ /*
+ * Atomically grab the task. If ->wake_q is already non-NULL it means
+ * it's already queued (either by us or someone else) and will get the
+ * wakeup due to that.
+ *
+ * This cmpxchg() implies a full barrier, which pairs with the write
+ * barrier implied by the wakeup in wake_up_q().
+ */
+ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+ return;
+
+ get_task_struct(task);
+
+ /*
+ * The head is context local, there can be no concurrency.
+ */
+ *head->lastp = node;
+ head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+ struct wake_q_node *node = head->first;
+
+ while (node != WAKE_Q_TAIL) {
+ struct task_struct *task;
+
+ task = container_of(node, struct task_struct, wake_q);
+ BUG_ON(!task);
+ /* task can safely be re-inserted now */
+ node = node->next;
+ task->wake_q.next = NULL;
+
+ /*
+ * wake_up_process() implies a wmb() to pair with the queueing
+ * in wake_q_add() so as not to miss wakeups.
+ */
+ wake_up_process(task);
+ put_task_struct(task);
+ }
+}
+
/*
* resched_curr - mark rq's current task 'to be rescheduled now'.
*
@@ -2105,12 +2151,15 @@ void wake_up_new_task(struct task_struct *p)
#ifdef CONFIG_PREEMPT_NOTIFIERS
+static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+
/**
* preempt_notifier_register - tell me when current is being preempted & rescheduled
* @notifier: notifier struct to register
*/
void preempt_notifier_register(struct preempt_notifier *notifier)
{
+ static_key_slow_inc(&preempt_notifier_key);
hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);
@@ -2119,15 +2168,16 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register);
* preempt_notifier_unregister - no longer interested in preemption notifications
* @notifier: notifier struct to unregister
*
- * This is safe to call from within a preemption notifier.
+ * This is *not* safe to call from within a preemption notifier.
*/
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
hlist_del(&notifier->link);
+ static_key_slow_dec(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
struct preempt_notifier *notifier;
@@ -2135,9 +2185,15 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ if (static_key_false(&preempt_notifier_key))
+ __fire_sched_in_preempt_notifiers(curr);
+}
+
static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
- struct task_struct *next)
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
{
struct preempt_notifier *notifier;
@@ -2145,13 +2201,21 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
notifier->ops->sched_out(notifier, next);
}
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ if (static_key_false(&preempt_notifier_key))
+ __fire_sched_out_preempt_notifiers(curr, next);
+}
+
#else /* !CONFIG_PREEMPT_NOTIFIERS */
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
-static void
+static inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
@@ -2397,9 +2461,9 @@ unsigned long nr_iowait_cpu(int cpu)
void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
{
- struct rq *this = this_rq();
- *nr_waiters = atomic_read(&this->nr_iowait);
- *load = this->cpu_load[0];
+ struct rq *rq = this_rq();
+ *nr_waiters = atomic_read(&rq->nr_iowait);
+ *load = rq->load.weight;
}
#ifdef CONFIG_SMP
@@ -2497,6 +2561,7 @@ void scheduler_tick(void)
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
update_cpu_load_active(rq);
+ calc_global_load_tick(rq);
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
@@ -2525,7 +2590,7 @@ void scheduler_tick(void)
u64 scheduler_tick_max_deferment(void)
{
struct rq *rq = this_rq();
- unsigned long next, now = ACCESS_ONCE(jiffies);
+ unsigned long next, now = READ_ONCE(jiffies);
next = rq->last_sched_tick + HZ;
@@ -2726,9 +2791,7 @@ again:
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
*
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
+ * WARNING: must be called with preemption disabled!
*/
static void __sched __schedule(void)
{
@@ -2737,7 +2800,6 @@ static void __sched __schedule(void)
struct rq *rq;
int cpu;
- preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
rcu_note_context_switch();
@@ -2801,8 +2863,6 @@ static void __sched __schedule(void)
raw_spin_unlock_irq(&rq->lock);
post_schedule(rq);
-
- sched_preempt_enable_no_resched();
}
static inline void sched_submit_work(struct task_struct *tsk)
@@ -2823,7 +2883,9 @@ asmlinkage __visible void __sched schedule(void)
sched_submit_work(tsk);
do {
+ preempt_disable();
__schedule();
+ sched_preempt_enable_no_resched();
} while (need_resched());
}
EXPORT_SYMBOL(schedule);
@@ -2862,15 +2924,14 @@ void __sched schedule_preempt_disabled(void)
static void __sched notrace preempt_schedule_common(void)
{
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ preempt_active_enter();
__schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
+ preempt_active_exit();
/*
* Check again in case we missed a preemption opportunity
* between schedule and now.
*/
- barrier();
} while (need_resched());
}
@@ -2894,9 +2955,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
-#ifdef CONFIG_CONTEXT_TRACKING
/**
- * preempt_schedule_context - preempt_schedule called by tracing
+ * preempt_schedule_notrace - preempt_schedule called by tracing
*
* The tracing infrastructure uses preempt_enable_notrace to prevent
* recursion and tracing preempt enabling caused by the tracing
@@ -2909,7 +2969,7 @@ EXPORT_SYMBOL(preempt_schedule);
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
{
enum ctx_state prev_ctx;
@@ -2917,7 +2977,13 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
return;
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ /*
+ * Use raw __preempt_count() ops that don't call functions.
+ * We can't call functions before disabling preemption, which
+ * disarms preemption tracing recursions.
+ */
+ __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+ barrier();
/*
* Needs preempt disabled in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing
@@ -2927,12 +2993,11 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
__schedule();
exception_exit(prev_ctx);
- __preempt_count_sub(PREEMPT_ACTIVE);
barrier();
+ __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
} while (need_resched());
}
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_CONTEXT_TRACKING */
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#endif /* CONFIG_PREEMPT */
@@ -2952,17 +3017,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
prev_state = exception_enter();
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ preempt_active_enter();
local_irq_enable();
__schedule();
local_irq_disable();
- __preempt_count_sub(PREEMPT_ACTIVE);
-
- /*
- * Check again in case we missed a preemption opportunity
- * between schedule and now.
- */
- barrier();
+ preempt_active_exit();
} while (need_resched());
exception_exit(prev_state);
@@ -3040,7 +3099,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
- p->dl.dl_throttled = 0;
enqueue_flag = ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
@@ -5314,7 +5372,7 @@ static struct notifier_block migration_notifier = {
.priority = CPU_PRI_MIGRATION,
};
-static void __cpuinit set_cpu_rq_start_time(void)
+static void set_cpu_rq_start_time(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
@@ -7734,11 +7792,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
return rt_runtime_us;
}
-static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
{
u64 rt_runtime, rt_period;
- rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+ rt_period = rt_period_us * NSEC_PER_USEC;
rt_runtime = tg->rt_bandwidth.rt_runtime;
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1ee600c..f5a64ffad176 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
{
cputime_t old;
- while (new > (old = ACCESS_ONCE(*counter)))
+ while (new > (old = READ_ONCE(*counter)))
cmpxchg_cputime(counter, old, new);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5e95145088fd..392e8fb94db3 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -640,7 +640,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
}
static
-int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
+int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
{
return (dl_se->runtime <= 0);
}
@@ -684,7 +684,7 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);
dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
- if (dl_runtime_exceeded(rq, dl_se)) {
+ if (dl_runtime_exceeded(dl_se)) {
dl_se->dl_throttled = 1;
__dequeue_task_dl(rq, curr, 0);
if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
@@ -995,7 +995,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
rq = cpu_rq(cpu);
rcu_read_lock();
- curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ curr = READ_ONCE(rq->curr); /* unlocked access */
/*
* If we are dealing with a -deadline task, we must
@@ -1012,7 +1012,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
(p->nr_cpus_allowed > 1)) {
int target = find_later_rq(p);
- if (target != -1)
+ if (target != -1 &&
+ dl_time_before(p->dl.deadline,
+ cpu_rq(target)->dl.earliest_dl.curr))
cpu = target;
}
rcu_read_unlock();
@@ -1230,6 +1232,32 @@ next_node:
return NULL;
}
+/*
+ * Return the earliest pushable rq's task, which is suitable to be executed
+ * on the CPU, NULL otherwise:
+ */
+static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
+{
+ struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost;
+ struct task_struct *p = NULL;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+next_node:
+ if (next_node) {
+ p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
+
+ if (pick_dl_task(rq, p, cpu))
+ return p;
+
+ next_node = rb_next(next_node);
+ goto next_node;
+ }
+
+ return NULL;
+}
+
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
static int find_later_rq(struct task_struct *task)
@@ -1333,6 +1361,17 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
later_rq = cpu_rq(cpu);
+ if (!dl_time_before(task->dl.deadline,
+ later_rq->dl.earliest_dl.curr)) {
+ /*
+ * Target rq has tasks of equal or earlier deadline,
+ * retrying does not release any lock and is unlikely
+ * to yield a different result.
+ */
+ later_rq = NULL;
+ break;
+ }
+
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
@@ -1514,7 +1553,7 @@ static int pull_dl_task(struct rq *this_rq)
if (src_rq->dl.dl_nr_running <= 1)
goto skip;
- p = pick_next_earliest_dl_task(src_rq, this_cpu);
+ p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
/*
* We found a task to be pulled if:
@@ -1659,7 +1698,7 @@ static void rq_offline_dl(struct rq *rq)
cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
}
-void init_sched_dl_class(void)
+void __init init_sched_dl_class(void)
{
unsigned int i;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index a245c1fc6f0a..704683cc9042 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -132,12 +132,14 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
p->prio);
#ifdef CONFIG_SCHEDSTATS
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(p->se.vruntime),
+ SPLIT_NS(p->se.statistics.wait_sum),
SPLIT_NS(p->se.sum_exec_runtime),
SPLIT_NS(p->se.statistics.sum_sleep_runtime));
#else
- SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
- 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
+ SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+ 0LL, 0L,
+ SPLIT_NS(p->se.sum_exec_runtime),
+ 0LL, 0L);
#endif
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d", task_node(p));
@@ -156,7 +158,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m,
"\nrunnable tasks:\n"
" task PID tree-key switches prio"
- " exec-runtime sum-exec sum-sleep\n"
+ " wait-time sum-exec sum-sleep\n"
"------------------------------------------------------"
"----------------------------------------------------\n");
@@ -582,6 +584,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw;
#ifdef CONFIG_SCHEDSTATS
+ PN(se.statistics.sum_sleep_runtime);
PN(se.statistics.wait_start);
PN(se.statistics.sleep_start);
PN(se.statistics.block_start);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c2980e8733bc..433061d984ea 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
*
* This idea comes from the SD scheduler of Con Kolivas:
*/
-static int get_update_sysctl_factor(void)
+static unsigned int get_update_sysctl_factor(void)
{
- unsigned int cpus = min_t(int, num_online_cpus(), 8);
+ unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
unsigned int factor;
switch (sysctl_sched_tunable_scaling) {
@@ -576,7 +576,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- int factor = get_update_sysctl_factor();
+ unsigned int factor = get_update_sysctl_factor();
if (ret || !write)
return ret;
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
static unsigned int task_scan_min(struct task_struct *p)
{
- unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+ unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;
@@ -1198,11 +1198,9 @@ static void task_numa_assign(struct task_numa_env *env,
static bool load_too_imbalanced(long src_load, long dst_load,
struct task_numa_env *env)
{
+ long imb, old_imb;
+ long orig_src_load, orig_dst_load;
long src_capacity, dst_capacity;
- long orig_src_load;
- long load_a, load_b;
- long moved_load;
- long imb;
/*
* The load is corrected for the CPU capacity available on each node.
@@ -1215,39 +1213,30 @@ static bool load_too_imbalanced(long src_load, long dst_load,
dst_capacity = env->dst_stats.compute_capacity;
/* We care about the slope of the imbalance, not the direction. */
- load_a = dst_load;
- load_b = src_load;
- if (load_a < load_b)
- swap(load_a, load_b);
+ if (dst_load < src_load)
+ swap(dst_load, src_load);
/* Is the difference below the threshold? */
- imb = load_a * src_capacity * 100 -
- load_b * dst_capacity * env->imbalance_pct;
+ imb = dst_load * src_capacity * 100 -
+ src_load * dst_capacity * env->imbalance_pct;
if (imb <= 0)
return false;
/*
* The imbalance is above the allowed threshold.
- * Allow a move that brings us closer to a balanced situation,
- * without moving things past the point of balance.
+ * Compare it with the old imbalance.
*/
orig_src_load = env->src_stats.load;
+ orig_dst_load = env->dst_stats.load;
- /*
- * In a task swap, there will be one load moving from src to dst,
- * and another moving back. This is the net sum of both moves.
- * A simple task move will always have a positive value.
- * Allow the move if it brings the system closer to a balanced
- * situation, without crossing over the balance point.
- */
- moved_load = orig_src_load - src_load;
+ if (orig_dst_load < orig_src_load)
+ swap(orig_dst_load, orig_src_load);
- if (moved_load > 0)
- /* Moving src -> dst. Did we overshoot balance? */
- return src_load * dst_capacity < dst_load * src_capacity;
- else
- /* Moving dst -> src. Did we overshoot balance? */
- return dst_load * src_capacity < src_load * dst_capacity;
+ old_imb = orig_dst_load * src_capacity * 100 -
+ orig_src_load * dst_capacity * env->imbalance_pct;
+
+ /* Would this change make things worse? */
+ return (imb > old_imb);
}
/*
@@ -1409,6 +1398,30 @@ static void task_numa_find_cpu(struct task_numa_env *env,
}
}
+/* Only move tasks to a NUMA node less busy than the current node. */
+static bool numa_has_capacity(struct task_numa_env *env)
+{
+ struct numa_stats *src = &env->src_stats;
+ struct numa_stats *dst = &env->dst_stats;
+
+ if (src->has_free_capacity && !dst->has_free_capacity)
+ return false;
+
+ /*
+ * Only consider a task move if the source has a higher load
+ * than the destination, corrected for CPU capacity on each node.
+ *
+ * src->load dst->load
+ * --------------------- vs ---------------------
+ * src->compute_capacity dst->compute_capacity
+ */
+ if (src->load * dst->compute_capacity >
+ dst->load * src->compute_capacity)
+ return true;