From 39be350127ec60a078edffe5b4915dafba4ba514 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 26 Jan 2012 12:44:34 +0100
Subject: sched, block: Unify cache detection

The block layer has some code trying to determine if two CPUs share a
cache, the scheduler has a similar function. Expose the function used
by the scheduler and make the block layer use it, thereby removing the
block layers usage of CONFIG_SCHED* and topology bits.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Jens Axboe <axboe@kernel.dk>
Link: http://lkml.kernel.org/r/1327579450.2446.95.camel@twins
---
 kernel/sched/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5255c9d2e053..d7c43227311d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1507,7 +1507,7 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
 }
 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
 
-static inline int ttwu_share_cache(int this_cpu, int that_cpu)
+bool cpus_share_cache(int this_cpu, int that_cpu)
 {
 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
@@ -1518,7 +1518,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 	struct rq *rq = cpu_rq(cpu);
 
 #if defined(CONFIG_SMP)
-	if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
+	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
 		sched_clock_cpu(cpu); /* sync clocks x-cpu */
 		ttwu_queue_remote(p, cpu);
 		return;
@@ -5754,7 +5754,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
- * two cpus are in the same cache domain, see ttwu_share_cache().
+ * two cpus are in the same cache domain, see cpus_share_cache().
  */
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_id);
-- 
cgit v1.2.3


From 4ec4412e1e91f44a3dcb97b6c9172a13fc78bac9 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Mon, 12 Dec 2011 20:21:08 +0100
Subject: sched: Ensure cpu_power periodic update

With a lot of small tasks, the softirq sched is nearly never called
when no_hz is enabled. In this case load_balance() is mainly called
with the newly_idle mode which doesn't update the cpu_power.

Add a next_update field which ensure a maximum update period when
there is short activity.

Having stale cpu_power information can skew the load-balancing
decisions, this is cured by the guaranteed update.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1323717668-2143-1-git-send-email-vincent.guittot@linaro.org
---
 kernel/sched/fair.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c6414fc669d..8e77a6bd597b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -215,6 +215,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 
 const struct sched_class fair_sched_class;
 
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
 /**************************************************************
  * CFS operations on generic schedulable entities:
  */
@@ -3776,6 +3778,11 @@ void update_group_power(struct sched_domain *sd, int cpu)
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
 	unsigned long power;
+	unsigned long interval;
+
+	interval = msecs_to_jiffies(sd->balance_interval);
+	interval = clamp(interval, 1UL, max_load_balance_interval);
+	sdg->sgp->next_update = jiffies + interval;
 
 	if (!child) {
 		update_cpu_power(sd, cpu);
@@ -3883,12 +3890,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	 * domains. In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
-	if (idle != CPU_NEWLY_IDLE && local_group) {
-		if (balance_cpu != this_cpu) {
-			*balance = 0;
-			return;
-		}
-		update_group_power(sd, this_cpu);
+	if (local_group) {
+		if (idle != CPU_NEWLY_IDLE) {
+			if (balance_cpu != this_cpu) {
+				*balance = 0;
+				return;
+			}
+			update_group_power(sd, this_cpu);
+		} else if (time_after_eq(jiffies, group->sgp->next_update))
+			update_group_power(sd, this_cpu);
 	}
 
 	/* Adjust by relative CPU power of the group */
@@ -4945,8 +4955,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
 
 static DEFINE_SPINLOCK(balancing);
 
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
-
 /*
  * Scale the max load_balance interval with the number of CPUs in the system.
  * This trades load-balance latency on larger machines for less cross talk.
-- 
cgit v1.2.3


From 30fd049afcfed50e022704036e8629d6bdfe84e6 Mon Sep 17 00:00:00 2001
From: Rakib Mullick <rakib.mullick@gmail.com>
Date: Tue, 24 Jan 2012 22:33:56 +0600
Subject: sched: Remove sched_switch

Currently we don't utilize the sched_switch field anymore.

But, simply removing sched_switch field from the middle of the
sched_stat output will break tools.

So, to stay compatible we hardcode it to zero and remove the
field from the scheduler data structures.

Update the schedstat documentation accordingly.

Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1327422836.27181.5.camel@localhost.localdomain
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/debug.c | 1 -
 kernel/sched/sched.h | 1 -
 kernel/sched/stats.c | 4 ++--
 3 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2a075e10004b..09acaa15161d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -288,7 +288,6 @@ static void print_cpu(struct seq_file *m, int cpu)
 
 	P(yld_count);
 
-	P(sched_switch);
 	P(sched_count);
 	P(sched_goidle);
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 98c0c2623db8..8a2c768d2f98 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -462,7 +462,6 @@ struct rq {
 	unsigned int yld_count;
 
 	/* schedule() stats */
-	unsigned int sched_switch;
 	unsigned int sched_count;
 	unsigned int sched_goidle;
 
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 2a581ba8e190..903ffa9e8872 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -32,9 +32,9 @@ static int show_schedstat(struct seq_file *seq, void *v)
 
 		/* runqueue-specific stats */
 		seq_printf(seq,
-		    "cpu%d %u %u %u %u %u %u %llu %llu %lu",
+		    "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
 		    cpu, rq->yld_count,
-		    rq->sched_switch, rq->sched_count, rq->sched_goidle,
+		    rq->sched_count, rq->sched_goidle,
 		    rq->ttwu_count, rq->ttwu_local,
 		    rq->rq_cpu_time,
 		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
-- 
cgit v1.2.3


From ed387b781ea6e14b78f449aa2ee4f270b60b01ac Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Tue, 31 Jan 2012 11:40:32 +0900
Subject: sched: Move SMP-only variable into the SMP section

This also fixes the following compilation warning on !SMP:

  CC kernel/sched/fair.o
  kernel/sched/fair.c:218:36: warning: 'max_load_balance_interval' defined but not used [-Wunused-variable]

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/4F2754A0.9090306@ct.jp.nec.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8e77a6bd597b..4ab60a24ea49 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -215,8 +215,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 
 const struct sched_class fair_sched_class;
 
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
-
 /**************************************************************
  * CFS operations on generic schedulable entities:
  */
@@ -3086,6 +3084,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  * Fair scheduling class load-balancing methods:
  */
 
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
 /*
  * pull_task - move a task from a remote runqueue to the local runqueue.
  * Both runqueues must be locked.
-- 
cgit v1.2.3


From 62f6536a630affe3176deb48554d27ee58b65077 Mon Sep 17 00:00:00 2001
From: "Nikunj A. Dadhania" <nikunj@linux.vnet.ibm.com>
Date: Fri, 17 Feb 2012 08:34:30 +0530
Subject: sched: Remove rcu_read_lock/unlock() from select_idle_sibling()

select_idle_sibling() is called from select_task_rq_fair(), which
already has the RCU read lock held.

Signed-off-by: Nikunj A. Dadhania <nikunj@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20120217030409.11748.12491.stgit@abhimanyu
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/fair.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4ab60a24ea49..6ce9992926d0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2670,8 +2670,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	/*
 	 * Otherwise, iterate the domains and find an elegible idle cpu.
 	 */
-	rcu_read_lock();
-
 	sd = rcu_dereference(per_cpu(sd_llc, target));
 	for_each_lower_domain(sd) {
 		sg = sd->groups;
@@ -2693,8 +2691,6 @@ next:
 		} while (sg != sd->groups);
 	}
 done:
-	rcu_read_unlock();
-
 	return target;
 }
 
-- 
cgit v1.2.3


From de5bdff7a72acc281219be2b8edeeca1fd81c542 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Thu, 16 Feb 2012 14:52:21 +0900
Subject: sched: Make initial SCHED_RR timeslace DEF_TIMESLICE

Current the initial SCHED_RR timeslice of init_task is HZ, which means
1s, and is not same as the default SCHED_RR timeslice DEF_TIMESLICE.

Change that initial timeslice to the DEF_TIMESLICE.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
[ s/DEF_TIMESLICE/RR_TIMESLICE/g ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/4F3C9995.3010800@ct.jp.nec.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/rt.c    | 4 ++--
 kernel/sched/sched.h | 4 ----
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f42ae7fb5ec5..f70206c2c802 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1972,7 +1972,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 	if (--p->rt.time_slice)
 		return;
 
-	p->rt.time_slice = DEF_TIMESLICE;
+	p->rt.time_slice = RR_TIMESLICE;
 
 	/*
 	 * Requeue to the end of queue if we are not the only element
@@ -2000,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 	 * Time slice is 0 for SCHED_FIFO tasks
 	 */
 	if (task->policy == SCHED_RR)
-		return DEF_TIMESLICE;
+		return RR_TIMESLICE;
 	else
 		return 0;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8a2c768d2f98..c0660a1a0cd1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -36,11 +36,7 @@ extern __read_mostly int scheduler_running;
 
 /*
  * These are the 'tuning knobs' of the scheduler:
- *
- * default timeslice is 100 msecs (used only for SCHED_RR tasks).
- * Timeslices get refilled after they expire.
  */
-#define DEF_TIMESLICE		(100 * HZ / 1000)
 
 /*
  * single value that denotes runtime == period, ie unlimited time.
-- 
cgit v1.2.3


From 42c62a589f1ccbf38a02cb732231f9c2fccc5ab0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 18 Oct 2011 22:03:48 +0200
Subject: sched/rt: Keep period timer ticking when rt throttling is active

When a runqueue is throttled we cannot disable the period timer
because that timer is the only way to undo the throttling.

We got stale throttling entries when a rq was throttled and then the
global sysctl was disabled, which stopped the timer.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
[ Added changelog ]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/n/tip-nuj34q52p6ro7szapuz84i0v@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/rt.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f70206c2c802..6d1eb0be1870 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -778,12 +778,9 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 {
-	int i, idle = 1;
+	int i, idle = 1, throttled = 0;
 	const struct cpumask *span;
 
-	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
-		return 1;
-
 	span = sched_rt_period_mask();
 	for_each_cpu(i, span) {
 		int enqueue = 0;
@@ -818,12 +815,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 			if (!rt_rq_throttled(rt_rq))
 				enqueue = 1;
 		}
+		if (rt_rq->rt_throttled)
+			throttled = 1;
 
 		if (enqueue)
 			sched_rt_rq_enqueue(rt_rq);
 		raw_spin_unlock(&rq->lock);
 	}
 
+	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
+		return 1;
+
 	return idle;
 }
 
@@ -884,7 +886,8 @@ static void update_curr_rt(struct rq *rq)
 	if (unlikely((s64)delta_exec < 0))
 		delta_exec = 0;
 
-	schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
+	schedstat_set(curr->se.statistics.exec_max,
+		      max(curr->se.statistics.exec_max, delta_exec));
 
 	curr->se.sum_exec_runtime += delta_exec;
 	account_group_exec_runtime(curr, delta_exec);
-- 
cgit v1.2.3


From 7abc63b1bd412f7655b62ef3e35c3c11c5134636 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 18 Oct 2011 22:03:48 +0200
Subject: sched/rt: Do not throttle when PI boosting

When a runqueue has rt_runtime_us = 0 then the only way it can
accumulate rt_time is via PI boosting. That causes the runqueue
to be throttled and replenishing does not change anything due to
rt_runtime_us = 0. So avoid that situation by clearing rt_time and
skip the throttling alltogether.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
[ Changelog ]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/n/tip-7x70cypsotjb4jvcor3edctk@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/rt.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 6d1eb0be1870..7f7e7cdcb472 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -857,8 +857,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 		return 0;
 
 	if (rt_rq->rt_time > runtime) {
-		rt_rq->rt_throttled = 1;
-		printk_once(KERN_WARNING "sched: RT throttling activated\n");
+		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+		/*
+		 * Don't actually throttle groups that have no runtime assigned
+		 * but accrue some time due to boosting.
+		 */
+		if (likely(rt_b->rt_runtime)) {
+			rt_rq->rt_throttled = 1;
+			printk_once(KERN_WARNING "sched: RT throttling activated\n");
+		} else {
+			/*
+			 * In case we did anyway, make it go away,
+			 * replenishment is a joke, since it will replenish us
+			 * with exactly 0 ns.
+			 */
+			rt_rq->rt_time = 0;
+		}
+
 		if (rt_rq_throttled(rt_rq)) {
 			sched_rt_rq_dequeue(rt_rq);
 			return 1;
-- 
cgit v1.2.3


From c5491ea779793f977d282754db478157cc409d82 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 21 Mar 2011 12:09:35 +0100
Subject: sched/rt: Add schedule_preempt_disabled()

Add helper to get rid of the ever repeating:

    preempt_enable_no_resched();
    schedule();
    preempt_disable();

patterns.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-wxx7btox7coby6ifv5vzhzgp@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/core.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 91fe6a0e9098..73022395c00e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3246,6 +3246,18 @@ asmlinkage void __sched schedule(void)
 }
 EXPORT_SYMBOL(schedule);
 
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
+ */
+void __sched schedule_preempt_disabled(void)
+{
+	preempt_enable_no_resched();
+	schedule();
+	preempt_disable();
+}
+
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 
 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-- 
cgit v1.2.3


From bd2f55361f18347e890d52ff9cfd8895455ec11b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 21 Mar 2011 12:33:18 +0100
Subject: sched/rt: Use schedule_preempt_disabled()

Coccinelle based conversion.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-24swm5zut3h9c4a6s46x8rws@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/mutex.c   | 4 +---
 kernel/softirq.c | 4 +---
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/mutex.c b/kernel/mutex.c
index 89096dd8786f..a307cc9c9526 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -240,9 +240,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 		/* didn't get the lock, go to sleep: */
 		spin_unlock_mutex(&lock->wait_lock, flags);
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 		spin_lock_mutex(&lock->wait_lock, flags);
 	}
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4eb3a0fa351e..79b524767a24 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -744,9 +744,7 @@ static int run_ksoftirqd(void * __bind_cpu)
 	while (!kthread_should_stop()) {
 		preempt_disable();
 		if (!local_softirq_pending()) {
-			preempt_enable_no_resched();
-			schedule();
-			preempt_disable();
+			schedule_preempt_disabled();
 		}
 
 		__set_current_state(TASK_RUNNING);
-- 
cgit v1.2.3


From ba74c1448f127649046615ec017bded7b2a76f29 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 21 Mar 2011 13:32:17 +0100
Subject: sched/rt: Document scheduler related skip-resched-check sites

Create a distinction between scheduler related preempt_enable_no_resched()
calls and the nearly one hundred other places in the kernel that do not
want to reschedule, for one reason or another.

This distinction matters for -rt, where the scheduler and the non-scheduler
preempt models (and checks) are different. For upstream it's purely
documentational.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/n/tip-gs88fvx2mdv5psnzxnv575ke@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/core.c | 6 +++---
 kernel/softirq.c    | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 73022395c00e..643cc37fcb23 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3220,7 +3220,7 @@ need_resched:
 
 	post_schedule(rq);
 
-	preempt_enable_no_resched();
+	sched_preempt_enable_no_resched();
 	if (need_resched())
 		goto need_resched;
 }
@@ -3253,7 +3253,7 @@ EXPORT_SYMBOL(schedule);
  */
 void __sched schedule_preempt_disabled(void)
 {
-	preempt_enable_no_resched();
+	sched_preempt_enable_no_resched();
 	schedule();
 	preempt_disable();
 }
@@ -4486,7 +4486,7 @@ SYSCALL_DEFINE0(sched_yield)
 	__release(rq->lock);
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 	do_raw_spin_unlock(&rq->lock);
-	preempt_enable_no_resched();
+	sched_preempt_enable_no_resched();
 
 	schedule();
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 79b524767a24..f268369ebe1f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -353,7 +353,7 @@ void irq_exit(void)
 		tick_nohz_irq_exit();
 #endif
 	rcu_irq_exit();
-	preempt_enable_no_resched();
+	sched_preempt_enable_no_resched();
 }
 
 /*
@@ -759,7 +759,7 @@ static int run_ksoftirqd(void * __bind_cpu)
 			if (local_softirq_pending())
 				__do_softirq();
 			local_irq_enable();
-			preempt_enable_no_resched();
+			sched_preempt_enable_no_resched();
 			cond_resched();
 			preempt_disable();
 			rcu_note_context_switch((long)__bind_cpu);
-- 
cgit v1.2.3


From 63b2001169e75cd71e917ec953fdab572e3f944a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 1 Dec 2011 00:04:00 +0100
Subject: sched/wait: Add __wake_up_all_locked() API

For code which protects the waitqueue itself with another lock it
makes no sense to acquire the waitqueue lock for wakeup all. Provide
__wake_up_all_locked().

This is an optimization on the vanilla kernel (to be used by the
PCI code) and an important semantic distinction on -rt.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-ux6m4b8jonb9inx8xafh77ds@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 643cc37fcb23..820f7453fda9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3418,9 +3418,9 @@ EXPORT_SYMBOL(__wake_up);
 /*
  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
  */
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
 {
-	__wake_up_common(q, mode, 1, 0, NULL);
+	__wake_up_common(q, mode, nr, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
-- 
cgit v1.2.3


From 1c4dd99bed5f6f70932bf8dacdd54d04a2619778 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 Jun 2011 20:07:38 +0200
Subject: sched/rt: Prevent idle task boosting

Idle task boosting is a nono in general. There is one
exception, when PREEMPT_RT and NOHZ is active:

The idle task calls get_next_timer_interrupt() and holds
the timer wheel base->lock on the CPU and another CPU wants
to access the timer (probably to cancel it). We can safely
ignore the boosting request, as the idle CPU runs this code
with interrupts disabled and will complete the lock
protected section without being interrupted. So there is no
real need to boost.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-755rvsosz7sdzot12a3gbha6@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/core.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 820f7453fda9..c2bbdecaa27d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3779,6 +3779,24 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	rq = __task_rq_lock(p);
 
+	/*
+	 * Idle task boosting is a nono in general. There is one
+	 * exception, when PREEMPT_RT and NOHZ is active:
+	 *
+	 * The idle task calls get_next_timer_interrupt() and holds
+	 * the timer wheel base->lock on the CPU and another CPU wants
+	 * to access the timer (probably to cancel it). We can safely
+	 * ignore the boosting request, as the idle CPU runs this code
+	 * with interrupts disabled and will complete the lock
+	 * protected section without being interrupted. So there is no
+	 * real need to boost.
+	 */
+	if (unlikely(p == rq->idle)) {
+		WARN_ON(p != rq->curr);
+		WARN_ON(p->pi_blocked_on);
+		goto out_unlock;
+	}
+
 	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
 	prev_class = p->sched_class;
@@ -3802,11 +3820,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
 	check_class_changed(rq, p, prev_class, oldprio);
+out_unlock:
 	__task_rq_unlock(rq);
 }
-
 #endif
-
 void set_user_nice(struct task_struct *p, long nice)
 {
 	int old_prio, delta, on_rq;
-- 
cgit v1.2.3


From 3c7d51843b03a6839e9ec7cda724e54d2319a63a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 17 Jul 2011 20:46:52 +0200
Subject: sched/rt: Do not submit new work when PI-blocked

When we are PI-blocked then we want to get things done ASAP.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-vw8et3445km5b8mpihf4trae@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c2bbdecaa27d..25e06a5e048b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3227,7 +3227,7 @@ need_resched:
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
-	if (!tsk->state)
+	if (!tsk->state || tsk_is_pi_blocked(tsk))
 		return;
 	/*
 	 * If we are going to sleep and we have plugged IO queued,
-- 
cgit v1.2.3


From 8e45cb545d98bc58e75b7de89ec8d3e5c8459ee6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 22 Feb 2012 12:47:19 +0100
Subject: sched: Move load-balancing arguments into helper struct

Passing large sets of similar arguments all around the load-balancer
gets tiresom when you want to modify something. Stick them all in a
helper structure and pass the structure around.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: pjt@google.com
Link: http://lkml.kernel.org/n/tip-5slqz0vhsdzewrfk9eza1aon@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/fair.c | 177 +++++++++++++++++++++++++++-------------------------
 1 file changed, 93 insertions(+), 84 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79e9e13c31ab..55b1f117419a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3135,13 +3135,25 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 #define LBF_HAD_BREAKS	0x0C	/* count HAD_BREAKs overflows into ABORT */
 #define LBF_ABORT	0x10
 
+struct lb_env {
+	struct sched_domain	*sd;
+
+	int			this_cpu;
+	struct rq		*this_rq;
+
+	struct rq		*busiest_rq;
+	struct cfs_rq		*busiest_cfs_rq;
+
+	enum cpu_idle_type	idle;
+	unsigned long		max_load_move;
+	unsigned int		flags;
+};
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
 static
-int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
-		     struct sched_domain *sd, enum cpu_idle_type idle,
-		     int *lb_flags)
+int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
 	int tsk_cache_hot = 0;
 	/*
@@ -3150,13 +3162,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
 	 */
-	if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
+	if (!cpumask_test_cpu(env->this_cpu, tsk_cpus_allowed(p))) {
 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
 		return 0;
 	}
-	*lb_flags &= ~LBF_ALL_PINNED;
+	env->flags &= ~LBF_ALL_PINNED;
 
-	if (task_running(rq, p)) {
+	if (task_running(env->busiest_rq, p)) {
 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
 		return 0;
 	}
@@ -3167,12 +3179,12 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) too many balance attempts have failed.
 	 */
 
-	tsk_cache_hot = task_hot(p, rq->clock_task, sd);
+	tsk_cache_hot = task_hot(p, env->busiest_rq->clock_task, env->sd);
 	if (!tsk_cache_hot ||
-		sd->nr_balance_failed > sd->cache_nice_tries) {
+		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
 		if (tsk_cache_hot) {
-			schedstat_inc(sd, lb_hot_gained[idle]);
+			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
 			schedstat_inc(p, se.statistics.nr_forced_migrations);
 		}
 #endif
@@ -3193,31 +3205,27 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
  *
  * Called with both runqueues locked.
  */
-static int
-move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-	      struct sched_domain *sd, enum cpu_idle_type idle)
+static int move_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
 	struct cfs_rq *cfs_rq;
-	int pinned = 0;
 
-	for_each_leaf_cfs_rq(busiest, cfs_rq) {
+	for_each_leaf_cfs_rq(env->busiest_rq, cfs_rq) {
 		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
 			if (throttled_lb_pair(task_group(p),
-					      busiest->cpu, this_cpu))
+					      env->busiest_rq->cpu, env->this_cpu))
 				break;
 
-			if (!can_migrate_task(p, busiest, this_cpu,
-						sd, idle, &pinned))
+			if (!can_migrate_task(p, env))
 				continue;
 
-			pull_task(busiest, p, this_rq, this_cpu);
+			pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu);
 			/*
 			 * Right now, this is only the second place pull_task()
 			 * is called, so we can safely collect pull_task()
 			 * stats here rather than inside pull_task().
 			 */
-			schedstat_inc(sd, lb_gained[idle]);
+			schedstat_inc(env->sd, lb_gained[env->idle]);
 			return 1;
 		}
 	}
@@ -3225,31 +3233,26 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return 0;
 }
 
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-	      unsigned long max_load_move, struct sched_domain *sd,
-	      enum cpu_idle_type idle, int *lb_flags,
-	      struct cfs_rq *busiest_cfs_rq)
+static unsigned long balance_tasks(struct lb_env *env)
 {
 	int loops = 0, pulled = 0;
-	long rem_load_move = max_load_move;
+	long rem_load_move = env->max_load_move;
 	struct task_struct *p, *n;
 
-	if (max_load_move == 0)
+	if (env->max_load_move == 0)
 		goto out;
 
-	list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
+	list_for_each_entry_safe(p, n, &env->busiest_cfs_rq->tasks, se.group_node) {
 		if (loops++ > sysctl_sched_nr_migrate) {
-			*lb_flags |= LBF_NEED_BREAK;
+			env->flags |= LBF_NEED_BREAK;
 			break;
 		}
 
 		if ((p->se.load.weight >> 1) > rem_load_move ||
-		    !can_migrate_task(p, busiest, this_cpu, sd, idle,
-				      lb_flags))
+		    !can_migrate_task(p, env))
 			continue;
 
-		pull_task(busiest, p, this_rq, this_cpu);
+		pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu);
 		pulled++;
 		rem_load_move -= p->se.load.weight;
 
@@ -3259,8 +3262,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		 * kernels will stop after the first task is pulled to minimize
 		 * the critical section.
 		 */
-		if (idle == CPU_NEWLY_IDLE) {
-			*lb_flags |= LBF_ABORT;
+		if (env->idle == CPU_NEWLY_IDLE) {
+			env->flags |= LBF_ABORT;
 			break;
 		}
 #endif
@@ -3278,9 +3281,9 @@ out:
 	 * so we can safely collect pull_task() stats here rather than
 	 * inside pull_task().
 	 */
-	schedstat_add(sd, lb_gained[idle], pulled);
+	schedstat_add(env->sd, lb_gained[env->idle], pulled);
 
-	return max_load_move - rem_load_move;
+	return env->max_load_move - rem_load_move;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3363,40 +3366,39 @@ static void update_h_load(long cpu)
 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		  unsigned long max_load_move,
-		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *lb_flags)
+static unsigned long load_balance_fair(struct lb_env *env)
 {
-	long rem_load_move = max_load_move;
-	struct cfs_rq *busiest_cfs_rq;
+	unsigned long max_load_move = env->max_load_move;
+	long rem_load_move = env->max_load_move;
 
 	rcu_read_lock();
-	update_h_load(cpu_of(busiest));
+	update_h_load(cpu_of(env->busiest_rq));
 
-	for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
-		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
-		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
+	for_each_leaf_cfs_rq(env->busiest_rq, env->busiest_cfs_rq) {
+		unsigned long busiest_h_load = env->busiest_cfs_rq->h_load;
+		unsigned long busiest_weight = env->busiest_cfs_rq->load.weight;
 		u64 rem_load, moved_load;
 
-		if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
+		if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
 			break;
 
 		/*
 		 * empty group or part of a throttled hierarchy
 		 */
-		if (!busiest_cfs_rq->task_weight ||
-		    throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
+		if (!env->busiest_cfs_rq->task_weight)
+			continue;
+
+		if (throttled_lb_pair(env->busiest_cfs_rq->tg,
+				      cpu_of(env->busiest_rq),
+				      env->this_cpu))
 			continue;
 
 		rem_load = (u64)rem_load_move * busiest_weight;
 		rem_load = div_u64(rem_load, busiest_h_load + 1);
 
-		moved_load = balance_tasks(this_rq, this_cpu, busiest,
-				rem_load, sd, idle, lb_flags,
-				busiest_cfs_rq);
+		env->max_load_move = rem_load;
 
+		moved_load = balance_tasks(env);
 		if (!moved_load)
 			continue;
 
@@ -3416,15 +3418,10 @@ static inline void update_shares(int cpu)
 {
 }
 
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		  unsigned long max_load_move,
-		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *lb_flags)
+static unsigned long load_balance_fair(struct lb_env *env)
 {
-	return balance_tasks(this_rq, this_cpu, busiest,
-			max_load_move, sd, idle, lb_flags,
-			&busiest->cfs);
+	env->busiest_cfs_rq = &env->busiest_rq->cfs;
+	return balance_tasks(env);
 }
 #endif
 
@@ -3435,21 +3432,17 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
  *
  * Called with both runqueues locked.
  */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		      unsigned long max_load_move,
-		      struct sched_domain *sd, enum cpu_idle_type idle,
-		      int *lb_flags)
+static int move_tasks(struct lb_env *env)
 {
+	unsigned long max_load_move = env->max_load_move;
 	unsigned long total_load_moved = 0, load_moved;
 
 	do {
-		load_moved = load_balance_fair(this_rq, this_cpu, busiest,
-				max_load_move - total_load_moved,
-				sd, idle, lb_flags);
-
+		env->max_load_move = max_load_move - total_load_moved;
+		load_moved = load_balance_fair(env);
 		total_load_moved += load_moved;
 
-		if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
+		if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
 			break;
 
 #ifdef CONFIG_PREEMPT
@@ -3458,8 +3451,8 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		 * kernels will stop after the first task is pulled to minimize
 		 * the critical section.
 		 */
-		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
-			*lb_flags |= LBF_ABORT;
+		if (env->idle == CPU_NEWLY_IDLE && env->this_rq->nr_running) {
+			env->flags |= LBF_ABORT;
 			break;
 		}
 #endif
@@ -4459,13 +4452,20 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *balance)
 {
-	int ld_moved, lb_flags = 0, active_balance = 0;
+	int ld_moved, active_balance = 0;
 	struct sched_group *group;
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
+	struct lb_env env = {
+		.sd		= sd,
+		.this_cpu	= this_cpu,
+		.this_rq	= this_rq,
+		.idle		= idle,
+	};
+
 	cpumask_copy(cpus, cpu_active_mask);
 
 	schedstat_inc(sd, lb_count[idle]);
@@ -4500,11 +4500,13 @@ redo:
 		 * still unbalanced. ld_moved simply stays zero, so it is
 		 * correctly treated as an imbalance.
 		 */
-		lb_flags |= LBF_ALL_PINNED;
+		env.flags |= LBF_ALL_PINNED;
+		env.max_load_move = imbalance;
+		env.busiest_rq = busiest;
+
 		local_irq_save(flags);
 		double_rq_lock(this_rq, busiest);
-		ld_moved = move_tasks(this_rq, this_cpu, busiest,
-				      imbalance, sd, idle, &lb_flags);
+		ld_moved = move_tasks(&env);
 		double_rq_unlock(this_rq, busiest);
 		local_irq_restore(flags);
 
@@ -4514,18 +4516,18 @@ redo:
 		if (ld_moved && this_cpu != smp_processor_id())
 			resched_cpu(this_cpu);
 
-		if (lb_flags & LBF_ABORT)
+		if (env.flags & LBF_ABORT)
 			goto out_balanced;
 
-		if (lb_flags & LBF_NEED_BREAK) {
-			lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
-			if (lb_flags & LBF_ABORT)
+		if (env.flags & LBF_NEED_BREAK) {
+			env.flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
+			if (env.flags & LBF_ABORT)
 				goto out_balanced;
 			goto redo;
 		}
 
 		/* All tasks on this runqueue were pinned by CPU affinity */
-		if (unlikely(lb_flags & LBF_ALL_PINNED)) {
+		if (unlikely(env.flags & LBF_ALL_PINNED)) {
 			cpumask_clear_cpu(cpu_of(busiest), cpus);
 			if (!cpumask_empty(cpus))
 				goto redo;
@@ -4555,7 +4557,7 @@ redo:
 					tsk_cpus_allowed(busiest->curr))) {
 				raw_spin_unlock_irqrestore(&busiest->lock,
 							    flags);
-				lb_flags |= LBF_ALL_PINNED;
+				env.flags |= LBF_ALL_PINNED;
 				goto out_one_pinned;
 			}
 
@@ -4608,7 +4610,7 @@ out_balanced:
 
 out_one_pinned:
 	/* tune up the balancing interval */
-	if (((lb_flags & LBF_ALL_PINNED) &&
+	if (((env.flags & LBF_ALL_PINNED) &&
 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
 			(sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
@@ -4718,10 +4720,17 @@ static int active_load_balance_cpu_stop(void *data)
 	}
 
 	if (likely(sd)) {
+		struct lb_env env = {
+			.sd		= sd,
+			.this_cpu	= target_cpu,
+			.this_rq	= target_rq,
+			.busiest_rq	= busiest_rq,
+			.idle		= CPU_IDLE,
+		};
+
 		schedstat_inc(sd, alb_count);
 
-		if (move_one_task(target_rq, target_cpu, busiest_rq,
-				  sd, CPU_IDLE))
+		if (move_one_task(&env))
 			schedstat_inc(sd, alb_pushed);
 		else
 			schedstat_inc(sd, alb_failed);
-- 
cgit v1.2.3


From ddcdf6e7d9919d139031fa2a6addd9544a9a833e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 22 Feb 2012 19:27:40 +0100
Subject: sched: Rename load-balancing fields

 s/env->this_/env->dst_/g
 s/env->busiest_/env->src_/g
 s/pull_task/move_task/g

Makes everything clearer.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: pjt@google.com
Link: http://lkml.kernel.org/n/tip-0yvgms8t8x962drpvl0fu0kk@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/fair.c | 118 ++++++++++++++++++++++++++--------------------------
 1 file changed, 60 insertions(+), 58 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 55b1f117419a..233d05171bf5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2918,7 +2918,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	/*
-	 * This is possible from callers such as pull_task(), in which we
+	 * This is possible from callers such as move_task(), in which we
 	 * unconditionally check_prempt_curr() after an enqueue (which may have
 	 * lead to a throttle).  This both saves work and prevents false
 	 * next-buddy nomination below.
@@ -3084,17 +3084,37 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
+#define LBF_ALL_PINNED	0x01
+#define LBF_NEED_BREAK	0x02	/* clears into HAD_BREAK */
+#define LBF_HAD_BREAK	0x04
+#define LBF_HAD_BREAKS	0x0C	/* count HAD_BREAKs overflows into ABORT */
+#define LBF_ABORT	0x10
+
+struct lb_env {
+	struct sched_domain	*sd;
+
+	int			src_cpu;
+	struct rq		*src_rq;
+	struct cfs_rq		*src_cfs_rq;
+
+	int			dst_cpu;
+	struct rq		*dst_rq;
+
+	enum cpu_idle_type	idle;
+	unsigned long		max_load_move;
+	unsigned int		flags;
+};
+
 /*
- * pull_task - move a task from a remote runqueue to the local runqueue.
+ * move_task - move a task from one runqueue to another runqueue.
  * Both runqueues must be locked.
  */
-static void pull_task(struct rq *src_rq, struct task_struct *p,
-		      struct rq *this_rq, int this_cpu)
+static void move_task(struct task_struct *p, struct lb_env *env)
 {
-	deactivate_task(src_rq, p, 0);
-	set_task_cpu(p, this_cpu);
-	activate_task(this_rq, p, 0);
-	check_preempt_curr(this_rq, p, 0);
+	deactivate_task(env->src_rq, p, 0);
+	set_task_cpu(p, env->dst_cpu);
+	activate_task(env->dst_rq, p, 0);
+	check_preempt_curr(env->dst_rq, p, 0);
 }
 
 /*
@@ -3129,26 +3149,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 	return delta < (s64)sysctl_sched_migration_cost;
 }
 
-#define LBF_ALL_PINNED	0x01
-#define LBF_NEED_BREAK	0x02	/* clears into HAD_BREAK */
-#define LBF_HAD_BREAK	0x04
-#define LBF_HAD_BREAKS	0x0C	/* count HAD_BREAKs overflows into ABORT */
-#define LBF_ABORT	0x10
-
-struct lb_env {
-	struct sched_domain	*sd;
-
-	int			this_cpu;
-	struct rq		*this_rq;
-
-	struct rq		*busiest_rq;
-	struct cfs_rq		*busiest_cfs_rq;
-
-	enum cpu_idle_type	idle;
-	unsigned long		max_load_move;
-	unsigned int		flags;
-};
-
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -3162,13 +3162,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
 	 */
-	if (!cpumask_test_cpu(env->this_cpu, tsk_cpus_allowed(p))) {
+	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
 		return 0;
 	}
 	env->flags &= ~LBF_ALL_PINNED;
 
-	if (task_running(env->busiest_rq, p)) {
+	if (task_running(env->src_rq, p)) {
 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
 		return 0;
 	}
@@ -3179,7 +3179,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	 * 2) too many balance attempts have failed.
 	 */
 
-	tsk_cache_hot = task_hot(p, env->busiest_rq->clock_task, env->sd);
+	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
 	if (!tsk_cache_hot ||
 		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
@@ -3210,20 +3210,20 @@ static int move_one_task(struct lb_env *env)
 	struct task_struct *p, *n;
 	struct cfs_rq *cfs_rq;
 
-	for_each_leaf_cfs_rq(env->busiest_rq, cfs_rq) {
+	for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
 		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
 			if (throttled_lb_pair(task_group(p),
-					      env->busiest_rq->cpu, env->this_cpu))
+					      env->src_cpu, env->dst_cpu))
 				break;
 
 			if (!can_migrate_task(p, env))
 				continue;
 
-			pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu);
+			move_task(p, env);
 			/*
-			 * Right now, this is only the second place pull_task()
-			 * is called, so we can safely collect pull_task()
-			 * stats here rather than inside pull_task().
+			 * Right now, this is only the second place move_task()
+			 * is called, so we can safely collect move_task()
+			 * stats here rather than inside move_task().
 			 */
 			schedstat_inc(env->sd, lb_gained[env->idle]);
 			return 1;
@@ -3242,7 +3242,7 @@ static unsigned long balance_tasks(struct lb_env *env)
 	if (env->max_load_move == 0)
 		goto out;
 
-	list_for_each_entry_safe(p, n, &env->busiest_cfs_rq->tasks, se.group_node) {
+	list_for_each_entry_safe(p, n, &env->src_cfs_rq->tasks, se.group_node) {
 		if (loops++ > sysctl_sched_nr_migrate) {
 			env->flags |= LBF_NEED_BREAK;
 			break;
@@ -3252,7 +3252,7 @@ static unsigned long balance_tasks(struct lb_env *env)
 		    !can_migrate_task(p, env))
 			continue;
 
-		pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu);
+		move_task(p, env);
 		pulled++;
 		rem_load_move -= p->se.load.weight;
 
@@ -3277,9 +3277,9 @@ static unsigned long balance_tasks(struct lb_env *env)
 	}
 out:
 	/*
-	 * Right now, this is one of only two places pull_task() is called,
-	 * so we can safely collect pull_task() stats here rather than
-	 * inside pull_task().
+	 * Right now, this is one of only two places move_task() is called,
+	 * so we can safely collect move_task() stats here rather than
+	 * inside move_task().
 	 */
 	schedstat_add(env->sd, lb_gained[env->idle], pulled);
 
@@ -3372,11 +3372,11 @@ static unsigned long load_balance_fair(struct lb_env *env)
 	long rem_load_move = env->max_load_move;
 
 	rcu_read_lock();
-	update_h_load(cpu_of(env->busiest_rq));
+	update_h_load(cpu_of(env->src_rq));
 
-	for_each_leaf_cfs_rq(env->busiest_rq, env->busiest_cfs_rq) {
-		unsigned long busiest_h_load = env->busiest_cfs_rq->h_load;
-		unsigned long busiest_weight = env->busiest_cfs_rq->load.weight;
+	for_each_leaf_cfs_rq(env->src_rq, env->src_cfs_rq) {
+		unsigned long busiest_h_load = env->src_cfs_rq->h_load;
+		unsigned long busiest_weight = env->src_cfs_rq->load.weight;
 		u64 rem_load, moved_load;
 
 		if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
@@ -3385,12 +3385,12 @@ static unsigned long load_balance_fair(struct lb_env *env)
 		/*
 		 * empty group or part of a throttled hierarchy
 		 */
-		if (!env->busiest_cfs_rq->task_weight)
+		if (!env->src_cfs_rq->task_weight)
 			continue;
 
-		if (throttled_lb_pair(env->busiest_cfs_rq->tg,
-				      cpu_of(env->busiest_rq),
-				      env->this_cpu))
+		if (throttled_lb_pair(env->src_cfs_rq->tg,
+				      cpu_of(env->src_rq),
+				      env->dst_cpu))
 			continue;
 
 		rem_load = (u64)rem_load_move * busiest_weight;
@@ -3420,7 +3420,7 @@ static inline void update_shares(int cpu)
 
 static unsigned long load_balance_fair(struct lb_env *env)
 {
-	env->busiest_cfs_rq = &env->busiest_rq->cfs;
+	env->src_cfs_rq = &env->src_rq->cfs;
 	return balance_tasks(env);
 }
 #endif
@@ -3451,7 +3451,7 @@ static int move_tasks(struct lb_env *env)
 		 * kernels will stop after the first task is pulled to minimize
 		 * the critical section.
 		 */
-		if (env->idle == CPU_NEWLY_IDLE && env->this_rq->nr_running) {
+		if (env->idle == CPU_NEWLY_IDLE && env->dst_rq->nr_running) {
 			env->flags |= LBF_ABORT;
 			break;
 		}
@@ -4461,8 +4461,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 	struct lb_env env = {
 		.sd		= sd,
-		.this_cpu	= this_cpu,
-		.this_rq	= this_rq,
+		.dst_cpu	= this_cpu,
+		.dst_rq		= this_rq,
 		.idle		= idle,
 	};
 
@@ -4502,7 +4502,8 @@ redo:
 		 */
 		env.flags |= LBF_ALL_PINNED;
 		env.max_load_move = imbalance;
-		env.busiest_rq = busiest;
+		env.src_cpu = busiest->cpu;
+		env.src_rq = busiest;
 
 		local_irq_save(flags);
 		double_rq_lock(this_rq, busiest);
@@ -4722,9 +4723,10 @@ static int active_load_balance_cpu_stop(void *data)
 	if (likely(sd)) {
 		struct lb_env env = {
 			.sd		= sd,
-			.this_cpu	= target_cpu,
-			.this_rq	= target_rq,
-			.busiest_rq	= busiest_rq,
+			.dst_cpu	= target_cpu,
+			.dst_rq		= target_rq,
+			.src_cpu	= busiest_rq->cpu,
+			.src_rq		= busiest_rq,
 			.idle		= CPU_IDLE,
 		};
 
-- 
cgit v1.2.3


From 367456c756a6b84f493ca9cc5b17b1f5d38ef466 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 20 Feb 2012 21:49:09 +0100
Subject: sched: Ditch per cgroup task lists for load-balancing

Per cgroup load-balance has numerous problems, chief amongst them that
there is no real sane order in them. So stop pretending it makes sense
and enqueue all tasks on a single list.

This also allows us to more easily fix the fwd progress issue
uncovered by the lock-break stuff. Rotate the list on failure to
migreate and limit the total iterations to nr_running (which with
releasing the lock isn't strictly accurate but close enough).

Also add a filter that skips very light tasks on the first attempt
around the list, this attempts to avoid shooting whole cgroups around
without affecting over balance.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: pjt@google.com
Link: http://lkml.kernel.org/n/tip-tx8yqydc7eimgq7i4rkc3a4g@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/core.c  |   3 +
 kernel/sched/fair.c  | 176 ++++++++++++++++++++++-----------------------------
 kernel/sched/sched.h |  10 +--
 3 files changed, 80 insertions(+), 109 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 25e06a5e048b..95545126be1c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6959,6 +6959,9 @@ void __init sched_init(void)
 		rq->online = 0;
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
+
+		INIT_LIST_HEAD(&rq->cfs_tasks);
+
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
 		rq->nohz_flags = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 233d05171bf5..a0424fc4cc54 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-	cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
-	if (entity_is_task(se)) {
-		add_cfs_task_weight(cfs_rq, se->load.weight);
-		list_add(&se->group_node, &cfs_rq->tasks);
-	}
+#ifdef CONFIG_SMP
+	if (entity_is_task(se))
+		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+#endif
 	cfs_rq->nr_running++;
 }
 
@@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-	if (entity_is_task(se)) {
-		add_cfs_task_weight(cfs_rq, -se->load.weight);
+	if (entity_is_task(se))
 		list_del_init(&se->group_node);
-	}
 	cfs_rq->nr_running--;
 }
 
@@ -3085,17 +3070,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED	0x01
-#define LBF_NEED_BREAK	0x02	/* clears into HAD_BREAK */
-#define LBF_HAD_BREAK	0x04
-#define LBF_HAD_BREAKS	0x0C	/* count HAD_BREAKs overflows into ABORT */
-#define LBF_ABORT	0x10
+#define LBF_NEED_BREAK	0x02
+#define LBF_ABORT	0x04
 
 struct lb_env {
 	struct sched_domain	*sd;
 
 	int			src_cpu;
 	struct rq		*src_rq;
-	struct cfs_rq		*src_cfs_rq;
 
 	int			dst_cpu;
 	struct rq		*dst_rq;
@@ -3103,6 +3085,10 @@ struct lb_env {
 	enum cpu_idle_type	idle;
 	unsigned long		max_load_move;
 	unsigned int		flags;
+
+	unsigned int		loop;
+	unsigned int		loop_break;
+	unsigned int		loop_max;
 };
 
 /*
@@ -3208,53 +3194,69 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 static int move_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
-	struct cfs_rq *cfs_rq;
 
-	for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
-		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
-			if (throttled_lb_pair(task_group(p),
-					      env->src_cpu, env->dst_cpu))
-				break;
+	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+		if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
+			continue;
 
-			if (!can_migrate_task(p, env))
-				continue;
+		if (!can_migrate_task(p, env))
+			continue;
 
-			move_task(p, env);
-			/*
-			 * Right now, this is only the second place move_task()
-			 * is called, so we can safely collect move_task()
-			 * stats here rather than inside move_task().
-			 */
-			schedstat_inc(env->sd, lb_gained[env->idle]);
-			return 1;
-		}
+		move_task(p, env);
+		/*
+		 * Right now, this is only the second place move_task()
+		 * is called, so we can safely collect move_task()
+		 * stats here rather than inside move_task().
+		 */
+		schedstat_inc(env->sd, lb_gained[env->idle]);
+		return 1;
 	}
-
 	return 0;
 }
 
+static unsigned long task_h_load(struct task_struct *p);
+
 static unsigned long balance_tasks(struct lb_env *env)
 {
-	int loops = 0, pulled = 0;
 	long rem_load_move = env->max_load_move;
 	struct task_struct *p, *n;
+	unsigned long load;
+	int pulled = 0;
 
 	if (env->max_load_move == 0)
 		goto out;
 
-	list_for_each_entry_safe(p, n, &env->src_cfs_rq->tasks, se.group_node) {
-		if (loops++ > sysctl_sched_nr_migrate) {
+	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+		env->loop++;
+		/* We've more or less seen every task there is, call it quits */
+		if (env->loop > env->loop_max) {
+			env->flags |= LBF_ABORT;
+			break;
+		}
+		/* take a beather every nr_migrate tasks */
+		if (env->loop > env->loop_break) {
+			env->loop_break += sysctl_sched_nr_migrate;
 			env->flags |= LBF_NEED_BREAK;
 			break;
 		}
 
-		if ((p->se.load.weight >> 1) > rem_load_move ||
-		    !can_migrate_task(p, env))
-			continue;
+		if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
+					env->dst_cpu))
+			goto next;
+
+		load = task_h_load(p);
+		if (load < 16 && !env->sd->nr_balance_failed)
+			goto next;
+
+		if ((load * 2) > rem_load_move)
+			goto next;
+
+		if (!can_migrate_task(p, env))
+			goto next;
 
 		move_task(p, env);
 		pulled++;
-		rem_load_move -= p->se.load.weight;
+		rem_load_move -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
@@ -3274,6 +3276,10 @@ static unsigned long balance_tasks(struct lb_env *env)
 		 */
 		if (rem_load_move <= 0)
 			break;
+
+		continue;
+next:
+		list_move_tail(&p->se.group_node, &env->src_rq->cfs_tasks);
 	}
 out:
 	/*
@@ -3363,65 +3369,33 @@ static int tg_load_down(struct task_group *tg, void *data)
 
 static void update_h_load(long cpu)
 {
+	rcu_read_lock();
 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
+	rcu_read_unlock();
 }
 
-static unsigned long load_balance_fair(struct lb_env *env)
+static unsigned long task_h_load(struct task_struct *p)
 {
-	unsigned long max_load_move = env->max_load_move;
-	long rem_load_move = env->max_load_move;
-
-	rcu_read_lock();
-	update_h_load(cpu_of(env->src_rq));
-
-	for_each_leaf_cfs_rq(env->src_rq, env->src_cfs_rq) {
-		unsigned long busiest_h_load = env->src_cfs_rq->h_load;
-		unsigned long busiest_weight = env->src_cfs_rq->load.weight;
-		u64 rem_load, moved_load;
-
-		if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
-			break;
-
-		/*
-		 * empty group or part of a throttled hierarchy
-		 */
-		if (!env->src_cfs_rq->task_weight)
-			continue;
-
-		if (throttled_lb_pair(env->src_cfs_rq->tg,
-				      cpu_of(env->src_rq),
-				      env->dst_cpu))
-			continue;
-
-		rem_load = (u64)rem_load_move * busiest_weight;
-		rem_load = div_u64(rem_load, busiest_h_load + 1);
-
-		env->max_load_move = rem_load;
-
-		moved_load = balance_tasks(env);
-		if (!moved_load)
-			continue;
-
-		moved_load *= busiest_h_load;
-		moved_load = div_u64(moved_load, busiest_weight + 1);
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	unsigned long load;
 
-		rem_load_move -= moved_load;
-		if (rem_load_move < 0)
-			break;
-	}
-	rcu_read_unlock();
+	load = p->se.load.weight;
+	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
 
-	return max_load_move - rem_load_move;
+	return load;
 }
 #else
 static inline void update_shares(int cpu)
 {
 }
 
-static unsigned long load_balance_fair(struct lb_env *env)
+static inline void update_h_load(long cpu)
 {
-	env->src_cfs_rq = &env->src_rq->cfs;
-	return balance_tasks(env);
+}
+
+static unsigned long task_h_load(struct task_struct *p)
+{
+	return p->se.load.weight;
 }
 #endif
 
@@ -3437,9 +3411,10 @@ static int move_tasks(struct lb_env *env)
 	unsigned long max_load_move = env->max_load_move;
 	unsigned long total_load_moved = 0, load_moved;
 
+	update_h_load(cpu_of(env->src_rq));
 	do {
 		env->max_load_move = max_load_move - total_load_moved;
-		load_moved = load_balance_fair(env);
+		load_moved = balance_tasks(env);
 		total_load_moved += load_moved;
 
 		if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
@@ -4464,6 +4439,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.dst_cpu	= this_cpu,
 		.dst_rq		= this_rq,
 		.idle		= idle,
+		.loop_break	= sysctl_sched_nr_migrate,
 	};
 
 	cpumask_copy(cpus, cpu_active_mask);
@@ -4504,6 +4480,7 @@ redo:
 		env.max_load_move = imbalance;
 		env.src_cpu = busiest->cpu;
 		env.src_rq = busiest;
+		env.loop_max = busiest->nr_running;
 
 		local_irq_save(flags);
 		double_rq_lock(this_rq, busiest);
@@ -4521,9 +4498,7 @@ redo:
 			goto out_balanced;
 
 		if (env.flags & LBF_NEED_BREAK) {
-			env.flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
-			if (env.flags & LBF_ABORT)
-				goto out_balanced;
+			env.flags &= ~LBF_NEED_BREAK;
 			goto redo;
 		}
 
@@ -5357,7 +5332,6 @@ static void set_curr_task_fair(struct rq *rq)
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT;
-	INIT_LIST_HEAD(&cfs_rq->tasks);
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 #ifndef CONFIG_64BIT
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c0660a1a0cd1..753bdd567416 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -212,9 +212,6 @@ struct cfs_rq {
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
 
-	struct list_head tasks;
-	struct list_head *balance_iterator;
-
 	/*
 	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
@@ -241,11 +238,6 @@ struct cfs_rq {
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
 #ifdef CONFIG_SMP
-	/*
-	 * the part of load.weight contributed by tasks
-	 */
-	unsigned long task_weight;
-
 	/*
 	 *   h_load = weight * f(tg)
 	 *
@@ -420,6 +412,8 @@ struct rq {
 	int cpu;
 	int online;
 
+	struct list_head cfs_tasks;
+
 	u64 rt_avg;
 	u64 age_stamp;
 	u64 idle_stamp;
-- 
cgit v1.2.3


From 2e5b5b3a1b7768c89fbfeca18e75f8ee377e924c Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Thu, 23 Feb 2012 17:41:27 +0900
Subject: sched: Clean up parameter passing of proc_sched_autogroup_set_nice()

Pass nice as a value to proc_sched_autogroup_set_nice().

No side effect is expected, and the variable err will be overwritten with
the return value.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/4F45FBB7.5090607@ct.jp.nec.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/auto_group.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e8a1f83ee0e7..0984a21076a3 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -195,20 +195,20 @@ __setup("noautogroup", setup_autogroup);
 
 #ifdef CONFIG_PROC_FS
 
-int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
+int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
 {
 	static unsigned long next = INITIAL_JIFFIES;
 	struct autogroup *ag;
 	int err;
 
-	if (*nice < -20 || *nice > 19)
+	if (nice < -20 || nice > 19)
 		return -EINVAL;
 
-	err = security_task_setnice(current, *nice);
+	err = security_task_setnice(current, nice);
 	if (err)
 		return err;
 
-	if (*nice < 0 && !can_nice(current, *nice))
+	if (nice < 0 && !can_nice(current, nice))
 		return -EPERM;
 
 	/* this is a heavy operation taking global locks.. */
@@ -219,9 +219,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
 	ag = autogroup_task_get(p);
 
 	down_write(&ag->lock);
-	err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
+	err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
 	if (!err)
-		ag->nice = *nice;
+		ag->nice = nice;
 	up_write(&ag->lock);
 
 	autogroup_kref_put(ag);
-- 
cgit v1.2.3


From 5d6523ebd2f67de9d23285aad7f3910e7b0aee83 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 10 Mar 2012 00:07:36 +0100
Subject: sched: Fix load-balance wreckage

Commit 367456c ("sched: Ditch per cgroup task lists for
load-balancing") completely wrecked load-balancing due to
a few silly mistakes.

Correct those and remove more pointless code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-zk04ihygwxn7qqrlpaf73b0r@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/fair.c | 110 +++++++++++++++++++---------------------------------
 1 file changed, 39 insertions(+), 71 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a0424fc4cc54..def17aa302d5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 #ifdef CONFIG_SMP
 	if (entity_is_task(se))
-		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+		list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
 #endif
 	cfs_rq->nr_running++;
 }
@@ -3071,7 +3071,6 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED	0x01
 #define LBF_NEED_BREAK	0x02
-#define LBF_ABORT	0x04
 
 struct lb_env {
 	struct sched_domain	*sd;
@@ -3083,7 +3082,7 @@ struct lb_env {
 	struct rq		*dst_rq;
 
 	enum cpu_idle_type	idle;
-	unsigned long		max_load_move;
+	long			load_move;
 	unsigned int		flags;
 
 	unsigned int		loop;
@@ -3216,39 +3215,47 @@ static int move_one_task(struct lb_env *env)
 
 static unsigned long task_h_load(struct task_struct *p);
 
-static unsigned long balance_tasks(struct lb_env *env)
+/*
+ * move_tasks tries to move up to load_move weighted load from busiest to
+ * this_rq, as part of a balancing operation within domain "sd".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct lb_env *env)
 {
-	long rem_load_move = env->max_load_move;
-	struct task_struct *p, *n;
+	struct list_head *tasks = &env->src_rq->cfs_tasks;
+	struct task_struct *p;
 	unsigned long load;
 	int pulled = 0;
 
-	if (env->max_load_move == 0)
-		goto out;
+	if (env->load_move <= 0)
+		return 0;
+
+	while (!list_empty(tasks)) {
+		p = list_first_entry(tasks, struct task_struct, se.group_node);
 
-	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
 		env->loop++;
 		/* We've more or less seen every task there is, call it quits */
-		if (env->loop > env->loop_max) {
-			env->flags |= LBF_ABORT;
+		if (env->loop > env->loop_max)
 			break;
-		}
-		/* take a beather every nr_migrate tasks */
+
+		/* take a breather every nr_migrate tasks */
 		if (env->loop > env->loop_break) {
 			env->loop_break += sysctl_sched_nr_migrate;
 			env->flags |= LBF_NEED_BREAK;
 			break;
 		}
 
-		if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
-					env->dst_cpu))
+		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 			goto next;
 
 		load = task_h_load(p);
+
 		if (load < 16 && !env->sd->nr_balance_failed)
 			goto next;
 
-		if ((load * 2) > rem_load_move)
+		if ((load / 2) > env->load_move)
 			goto next;
 
 		if (!can_migrate_task(p, env))
@@ -3256,7 +3263,7 @@ static unsigned long balance_tasks(struct lb_env *env)
 
 		move_task(p, env);
 		pulled++;
-		rem_load_move -= load;
+		env->load_move -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
@@ -3264,24 +3271,22 @@ static unsigned long balance_tasks(struct lb_env *env)
 		 * kernels will stop after the first task is pulled to minimize
 		 * the critical section.
 		 */
-		if (env->idle == CPU_NEWLY_IDLE) {
-			env->flags |= LBF_ABORT;
+		if (env->idle == CPU_NEWLY_IDLE)
 			break;
-		}
 #endif
 
 		/*
 		 * We only want to steal up to the prescribed amount of
 		 * weighted load.
 		 */
-		if (rem_load_move <= 0)
+		if (env->load_move <= 0)
 			break;
 
 		continue;
 next:
-		list_move_tail(&p->se.group_node, &env->src_rq->cfs_tasks);
+		list_move_tail(&p->se.group_node, tasks);
 	}
-out:
+
 	/*
 	 * Right now, this is one of only two places move_task() is called,
 	 * so we can safely collect move_task() stats here rather than
@@ -3289,7 +3294,7 @@ out:
 	 */
 	schedstat_add(env->sd, lb_gained[env->idle], pulled);
 
-	return env->max_load_move - rem_load_move;
+	return pulled;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3399,43 +3404,6 @@ static unsigned long task_h_load(struct task_struct *p)
 }
 #endif
 
-/*
- * move_tasks tries to move up to max_load_move weighted load from busiest t