From d01d4827858cdc2e1c437c87ab65ec0a00fd40f8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 21 Sep 2009 11:06:27 +0200 Subject: sched: Always show Cpus_allowed field in /proc//status The Cpus_allowed fields in /proc//status is currently only shown in case of CONFIG_CPUSETS. However their contents are also useful for the !CONFIG_CPUSETS case. So change the current behaviour and always show these fields. Signed-off-by: Heiko Carstens Cc: Andrew Morton Cc: Oleg Nesterov Cc: Peter Zijlstra LKML-Reference: <20090921090627.GD4649@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- fs/proc/array.c | 11 +++++++++++ kernel/cpuset.c | 8 +------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 725a650bbbb8..762aea9c9c71 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -321,6 +321,16 @@ static inline void task_context_switch_counts(struct seq_file *m, p->nivcsw); } +static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) +{ + seq_printf(m, "Cpus_allowed:\t"); + seq_cpumask(m, &task->cpus_allowed); + seq_printf(m, "\n"); + seq_printf(m, "Cpus_allowed_list:\t"); + seq_cpumask_list(m, &task->cpus_allowed); + seq_printf(m, "\n"); +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -335,6 +345,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, } task_sig(m, task); task_cap(m, task); + task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); #if defined(CONFIG_S390) task_show_regs(m, task); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7e75a41bd508..b81f7f096e1c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2499,15 +2499,9 @@ const struct file_operations proc_cpuset_operations = { }; #endif /* CONFIG_PROC_PID_CPUSET */ -/* Display task cpus_allowed, mems_allowed in /proc//status file. */ +/* Display task mems_allowed in /proc//status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { - seq_printf(m, "Cpus_allowed:\t"); - seq_cpumask(m, &task->cpus_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Cpus_allowed_list:\t"); - seq_cpumask_list(m, &task->cpus_allowed); - seq_printf(m, "\n"); seq_printf(m, "Mems_allowed:\t"); seq_nodemask(m, &task->mems_allowed); seq_printf(m, "\n"); -- cgit v1.2.3 From cf82ff7ea7695b0e82ba07bc5e9f1bd03a74e1aa Mon Sep 17 00:00:00 2001 From: "Jayson R. King" Date: Mon, 5 Oct 2009 05:21:26 -0500 Subject: sched: Remove obsolete comment in sched_init() Remove the comment about calling alloc_bootmem() as it is not called here since commit 36b7b6d465489c4754c4fd66fcec6086eba87896. Signed-off-by: Jayson R. King Cc: Peter Zijlstra Cc: Jiri Kosina LKML-Reference: <4AC9C8A6.6010209@jaysonking.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 830967e18285..a56446d7fda2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9322,10 +9322,6 @@ void __init sched_init(void) #ifdef CONFIG_CPUMASK_OFFSTACK alloc_size += num_possible_cpus() * cpumask_size(); #endif - /* - * As sched_init() is called before page_alloc is setup, - * we use alloc_bootmem(). - */ if (alloc_size) { ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); -- cgit v1.2.3 From ce0e7b28fb75cb003cfc8d0238613aaf1c55e797 Mon Sep 17 00:00:00 2001 From: Ryota Ozaki Date: Sat, 24 Oct 2009 01:20:10 +0900 Subject: sched, cpuacct: Fix niced guest time accounting CPU time of a guest is always accounted in 'user' time without concern for the nice value of its counterpart process although the guest is scheduled under the nice value. This patch fixes the defect and accounts cpu time of a niced guest in 'nice' time as same as a niced process. And also the patch adds 'guest_nice' to cpuacct. The value provides niced guest cpu time which is like 'nice' to 'user'. The original discussions can be found here: http://www.mail-archive.com/kvm@vger.kernel.org/msg23982.html http://www.mail-archive.com/kvm@vger.kernel.org/msg23860.html Signed-off-by: Ryota Ozaki Acked-by: Avi Kivity Cc: Peter Zijlstra LKML-Reference: <1256314810-7897-1-git-send-email-ozaki.ryota@gmail.com> Signed-off-by: Ingo Molnar --- Documentation/filesystems/proc.txt | 3 ++- fs/proc/stat.c | 19 +++++++++++++------ include/linux/kernel_stat.h | 1 + kernel/sched.c | 9 +++++++-- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 2c48f945546b..4af0018533f2 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1072,7 +1072,8 @@ second). The meanings of the columns are as follows, from left to right: - irq: servicing interrupts - softirq: servicing softirqs - steal: involuntary wait -- guest: running a guest +- guest: running a normal guest +- guest_nice: running a niced guest The "intr" line gives counts of interrupts serviced since boot time, for each of the possible system interrupts. The first column is the total of all diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 7cc726c6d70a..b9b7aad2003d 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -27,7 +27,7 @@ static int show_stat(struct seq_file *p, void *v) int i, j; unsigned long jif; cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; - cputime64_t guest; + cputime64_t guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; @@ -36,7 +36,7 @@ static int show_stat(struct seq_file *p, void *v) user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; - guest = cputime64_zero; + guest = guest_nice = cputime64_zero; getboottime(&boottime); jif = boottime.tv_sec; @@ -51,6 +51,8 @@ static int show_stat(struct seq_file *p, void *v) softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); + guest_nice = cputime64_add(guest_nice, + kstat_cpu(i).cpustat.guest_nice); for_each_irq_nr(j) { sum += kstat_irqs_cpu(j, i); } @@ -65,7 +67,8 @@ static int show_stat(struct seq_file *p, void *v) } sum += arch_irq_stat(); - seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu " + "%llu\n", (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), (unsigned long long)cputime64_to_clock_t(system), @@ -74,7 +77,8 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), (unsigned long long)cputime64_to_clock_t(steal), - (unsigned long long)cputime64_to_clock_t(guest)); + (unsigned long long)cputime64_to_clock_t(guest), + (unsigned long long)cputime64_to_clock_t(guest_nice)); for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ @@ -88,8 +92,10 @@ static int show_stat(struct seq_file *p, void *v) softirq = kstat_cpu(i).cpustat.softirq; steal = kstat_cpu(i).cpustat.steal; guest = kstat_cpu(i).cpustat.guest; + guest_nice = kstat_cpu(i).cpustat.guest_nice; seq_printf(p, - "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " + "%llu\n", i, (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), @@ -99,7 +105,8 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), (unsigned long long)cputime64_to_clock_t(steal), - (unsigned long long)cputime64_to_clock_t(guest)); + (unsigned long long)cputime64_to_clock_t(guest), + (unsigned long long)cputime64_to_clock_t(guest_nice)); } seq_printf(p, "intr %llu", (unsigned long long)sum); diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 348fa8874b52..c059044bc6dc 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -25,6 +25,7 @@ struct cpu_usage_stat { cputime64_t iowait; cputime64_t steal; cputime64_t guest; + cputime64_t guest_nice; }; struct kernel_stat { diff --git a/kernel/sched.c b/kernel/sched.c index e5205811c19e..67be4d0dddaa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5017,8 +5017,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, p->gtime = cputime_add(p->gtime, cputime); /* Add guest time to cpustat. */ - cpustat->user = cputime64_add(cpustat->user, tmp); - cpustat->guest = cputime64_add(cpustat->guest, tmp); + if (TASK_NICE(p) > 0) { + cpustat->nice = cputime64_add(cpustat->nice, tmp); + cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); + } else { + cpustat->user = cputime64_add(cpustat->user, tmp); + cpustat->guest = cputime64_add(cpustat->guest, tmp); + } } /* -- cgit v1.2.3 From 1477b6a7edd9ffa7bba4f9779ce9a76ce92761ed Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 4 Nov 2009 16:14:16 +0900 Subject: sched: Remove unused __schedule() declaration __schedule() had been removed. Signed-off-by: Hiroshi Shimamoto Cc: Peter Zijlstra LKML-Reference: <4AF129C8.3030008@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/kgdb.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 75e6e60bf583..f18102c4d0b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -349,7 +349,6 @@ extern signed long schedule_timeout(signed long timeout); extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); -asmlinkage void __schedule(void); asmlinkage void schedule(void); extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 9147a3190c9d..7d7014634022 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -870,7 +870,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) /* * All threads that don't have debuggerinfo should be - * in __schedule() sleeping, since all other CPUs + * in schedule() sleeping, since all other CPUs * are in kgdb_wait, and thus have debuggerinfo. */ if (local_debuggerinfo) { -- cgit v1.2.3 From 2a2bb3142d326bb28b03875cabfc49baaac9a14a Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 4 Nov 2009 16:16:10 +0900 Subject: sched: Remove unused time_sync_thresh declaration time_sync_thresh had been removed. Signed-off-by: Hiroshi Shimamoto Cc: Peter Zijlstra LKML-Reference: <4AF12A3A.5050200@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f18102c4d0b8..754b3deed02b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -171,8 +171,6 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) } #endif -extern unsigned long long time_sync_thresh; - /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). -- cgit v1.2.3 From 9824a2b728b63e7ff586b9fd9293c819be79f0f3 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 4 Nov 2009 16:16:54 +0900 Subject: sched: Remove unused cpu_nr_migrations() cpu_nr_migrations() is not used, remove it. Signed-off-by: Hiroshi Shimamoto Cc: Peter Zijlstra LKML-Reference: <4AF12A66.6020609@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/sched.c | 11 ----------- 2 files changed, 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 754b3deed02b..dfc21fb76bf1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -145,7 +145,6 @@ extern unsigned long this_cpu_load(void); extern void calc_global_load(void); -extern u64 cpu_nr_migrations(int cpu); extern unsigned long get_parent_ip(unsigned long addr); diff --git a/kernel/sched.c b/kernel/sched.c index 67be4d0dddaa..30fd0ba5f603 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -541,7 +541,6 @@ struct rq { struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; - u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -2049,7 +2048,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #endif if (old_cpu != new_cpu) { p->se.nr_migrations++; - new_rq->nr_migrations_in++; #ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); @@ -2988,15 +2986,6 @@ static void calc_load_account_active(struct rq *this_rq) } } -/* - * Externally visible per-cpu scheduler statistics: - * cpu_nr_migrations(cpu) - number of migrations into that cpu - */ -u64 cpu_nr_migrations(int cpu) -{ - return cpu_rq(cpu)->nr_migrations_in; -} - /* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). -- cgit v1.2.3 From 45a5c8bad827ebb9c9798becc15bce2e804d49e0 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Wed, 4 Nov 2009 03:15:44 +0530 Subject: sched: Add USER_SCHED to feature removal list Peter Zijlstra suggested that we remove USER_SCHED at: http://lkml.org/lkml/2009/3/21/67 Removing USER_SCHED removes a lot of code from the scheduler and simplifies the code. We already have the ability to do user based classification which is tightened using PAM in userspace. Schedule USER_SCHED for removal in 2.6.34 Signed-off-by: Dhaval Giani Acked-by: Peter Zijlstra Cc: Balbir Singh Cc: Bharata B Rao Cc: Serge E. Hallyn Cc: Srivatsa Vaddagiri LKML-Reference: <20091103214544.GI5495@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- Documentation/feature-removal-schedule.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index bc693fffabe0..f613df8ec7bf 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -6,6 +6,21 @@ be removed from this file. --------------------------- +What: USER_SCHED +When: 2.6.34 + +Why: USER_SCHED was implemented as a proof of concept for group scheduling. + The effect of USER_SCHED can already be achieved from userspace with + the help of libcgroup. The removal of USER_SCHED will also simplify + the scheduler code with the removal of one major ifdef. There are also + issues USER_SCHED has with USER_NS. A decision was taken not to fix + those and instead remove USER_SCHED. Also new group scheduling + features will not be implemented for USER_SCHED. + +Who: Dhaval Giani + +--------------------------- + What: PRISM54 When: 2.6.34 -- cgit v1.2.3 From e2c880630438f80b474378d5487b511b07665051 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 3 Nov 2009 14:53:15 +1030 Subject: cpumask: Simplify sched_rt.c find_lowest_rq() wants to call pick_optimal_cpu() on the intersection of sched_domain_span(sd) and lowest_mask. Rather than doing a cpus_and into a temporary, we can open-code it. This actually makes the code slightly clearer, IMHO. Signed-off-by: Rusty Russell Acked-by: Gregory Haskins Cc: Steven Rostedt LKML-Reference: <200911031453.15350.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 61 ++++++++++++++++++++++--------------------------------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index a4d790cddb19..5c5fef378415 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1153,29 +1153,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); -static inline int pick_optimal_cpu(int this_cpu, - const struct cpumask *mask) -{ - int first; - - /* "this_cpu" is cheaper to preempt than a remote processor */ - if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask)) - return this_cpu; - - first = cpumask_first(mask); - if (first < nr_cpu_ids) - return first; - - return -1; -} - static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); - cpumask_var_t domain_mask; if (task->rt.nr_cpus_allowed == 1) return -1; /* No other targets possible */ @@ -1198,28 +1181,26 @@ static int find_lowest_rq(struct task_struct *task) * Otherwise, we consult the sched_domains span maps to figure * out which cpu is logically closest to our hot cache data. */ - if (this_cpu == cpu) - this_cpu = -1; /* Skip this_cpu opt if the same */ - - if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) { - for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_AFFINE) { - int best_cpu; + if (!cpumask_test_cpu(this_cpu, lowest_mask)) + this_cpu = -1; /* Skip this_cpu opt if not among lowest */ - cpumask_and(domain_mask, - sched_domain_span(sd), - lowest_mask); + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; - best_cpu = pick_optimal_cpu(this_cpu, - domain_mask); - - if (best_cpu != -1) { - free_cpumask_var(domain_mask); - return best_cpu; - } - } + /* + * "this_cpu" is cheaper to preempt than a + * remote processor. + */ + if (this_cpu != -1 && + cpumask_test_cpu(this_cpu, sched_domain_span(sd))) + return this_cpu; + + best_cpu = cpumask_first_and(lowest_mask, + sched_domain_span(sd)); + if (best_cpu < nr_cpu_ids) + return best_cpu; } - free_cpumask_var(domain_mask); } /* @@ -1227,7 +1208,13 @@ static int find_lowest_rq(struct task_struct *task) * just give the caller *something* to work with from the compatible * locations. */ - return pick_optimal_cpu(this_cpu, lowest_mask); + if (this_cpu != -1) + return this_cpu; + + cpu = cpumask_any(lowest_mask); + if (cpu < nr_cpu_ids) + return cpu; + return -1; } /* Will lock the rq it finds */ -- cgit v1.2.3 From acc3f5d7cabbfd6cec71f0c1f9900621fa2d6ae7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 3 Nov 2009 14:53:40 +1030 Subject: cpumask: Partition_sched_domains takes array of cpumask_var_t Currently partition_sched_domains() takes a 'struct cpumask *doms_new' which is a kmalloc'ed array of cpumask_t. You can't have such an array if 'struct cpumask' is undefined, as we plan for CONFIG_CPUMASK_OFFSTACK=y. So, we make this an array of cpumask_var_t instead: this is the same for the CONFIG_CPUMASK_OFFSTACK=n case, but requires multiple allocations for the CONFIG_CPUMASK_OFFSTACK=y case. Hence we add alloc_sched_domains() and free_sched_domains() functions. Signed-off-by: Rusty Russell Cc: Peter Zijlstra LKML-Reference: <200911031453.40668.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++-- kernel/cpuset.c | 19 +++++++------- kernel/sched.c | 68 ++++++++++++++++++++++++++++++++++----------------- 3 files changed, 61 insertions(+), 34 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index dfc21fb76bf1..78ba664474f3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1009,9 +1009,13 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd) return to_cpumask(sd->span); } -extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, +extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new); +/* Allocate an array of sched domains, for partition_sched_domains(). */ +cpumask_var_t *alloc_sched_domains(unsigned int ndoms); +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); + /* Test a flag in parent sched domain */ static inline int test_sd_parent(struct sched_domain *sd, int flag) { @@ -1029,7 +1033,7 @@ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); struct sched_domain_attr; static inline void -partition_sched_domains(int ndoms_new, struct cpumask *doms_new, +partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d247381e7371..3cf2183b472d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ -/* FIXME: see the FIXME in partition_sched_domains() */ -static int generate_sched_domains(struct cpumask **domains, +static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { LIST_HEAD(q); /* queue of cpusets to be scanned */ @@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains, struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ - struct cpumask *doms; /* resulting partition; i.e. sched domains */ + cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains, /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { - doms = kmalloc(cpumask_size(), GFP_KERNEL); + ndoms = 1; + doms = alloc_sched_domains(ndoms); if (!doms) goto done; @@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - cpumask_copy(doms, top_cpuset.cpus_allowed); + cpumask_copy(doms[0], top_cpuset.cpus_allowed); - ndoms = 1; goto done; } @@ -636,7 +635,7 @@ restart: * Now we know how many domains to create. * Convert to and populate cpu masks. */ - doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); + doms = alloc_sched_domains(ndoms); if (!doms) goto done; @@ -656,7 +655,7 @@ restart: continue; } - dp = doms + nslot; + dp = doms[nslot]; if (nslot == ndoms) { static int warnings = 10; @@ -718,7 +717,7 @@ done: static void do_rebuild_sched_domains(struct work_struct *unused) { struct sched_domain_attr *attr; - struct cpumask *doms; + cpumask_var_t *doms; int ndoms; get_online_cpus(); @@ -2052,7 +2051,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { struct sched_domain_attr *attr; - struct cpumask *doms; + cpumask_var_t *doms; int ndoms; switch (phase) { diff --git a/kernel/sched.c b/kernel/sched.c index 30fd0ba5f603..ae026aad145b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8846,7 +8846,7 @@ static int build_sched_domains(const struct cpumask *cpu_map) return __build_sched_domains(cpu_map, NULL); } -static struct cpumask *doms_cur; /* current sched domains */ +static cpumask_var_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* attribues of custom domains in 'doms_cur' */ @@ -8868,6 +8868,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void) return 0; } +cpumask_var_t *alloc_sched_domains(unsigned int ndoms) +{ + int i; + cpumask_var_t *doms; + + doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); + if (!doms) + return NULL; + for (i = 0; i < ndoms; i++) { + if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { + free_sched_domains(doms, i); + return NULL; + } + } + return doms; +} + +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) +{ + unsigned int i; + for (i = 0; i < ndoms; i++) + free_cpumask_var(doms[i]); + kfree(doms); +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to @@ -8879,12 +8904,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) arch_update_cpu_topology(); ndoms_cur = 1; - doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); + doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) - doms_cur = fallback_doms; - cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); + doms_cur = &fallback_doms; + cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); dattr_cur = NULL; - err = build_sched_domains(doms_cur); + err = build_sched_domains(doms_cur[0]); register_sched_domain_sysctl(); return err; @@ -8934,19 +8959,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * - * 'doms_new' is an array of cpumask's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. * The masks don't intersect (don't overlap.) We should setup one * sched domain for each mask. CPUs not in any of the cpumasks will * not be load balanced. If the same cpumask appears both in the * current 'doms_cur' domains and in the new 'doms_new', we can leave * it as it is. * - * The passed in 'doms_new' should be kmalloc'd. This routine takes - * ownership of it and will kfree it when done with it. If the caller - * failed the kmalloc call, then it can pass in doms_new == NULL && - * ndoms_new == 1, and partition_sched_domains() will fallback to - * the single partition 'fallback_doms', it also forces the domains - * to be rebuilt. + * The passed in 'doms_new' should be allocated using + * alloc_sched_domains. This routine takes ownership of it and will + * free_sched_domains it when done with it. If the caller failed the + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms', it also forces the domains to be rebuilt. * * If doms_new == NULL it will be replaced with cpu_online_mask. * ndoms_new == 0 is a special case for destroying existing domains, @@ -8954,8 +8979,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * Call with hotplug lock held */ -/* FIXME: Change to struct cpumask *doms_new[] */ -void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { int i, j, n; @@ -8974,40 +8998,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(&doms_cur[i], &doms_new[j]) + if (cpumask_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } /* no match - a current sched domain not in new doms_new[] */ - detach_destroy_domains(doms_cur + i); + detach_destroy_domains(doms_cur[i]); match1: ; } if (doms_new == NULL) { ndoms_cur = 0; - doms_new = fallback_doms; - cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); + doms_new = &fallback_doms; + cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur && !new_topology; j++) { - if (cpumask_equal(&doms_new[i], &doms_cur[j]) + if (cpumask_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } /* no match - add a new doms_new */ - __build_sched_domains(doms_new + i, + __build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); match2: ; } /* Remember the new sched domains */ - if (doms_cur != fallback_doms) - kfree(doms_cur); + if (doms_cur != &fallback_doms) + free_sched_domains(doms_cur, ndoms_cur); kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; dattr_cur = dattr_new; -- cgit v1.2.3 From a1f84a3ab8e002159498814eaa7e48c33752b04b Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 27 Oct 2009 15:35:38 +0100 Subject: sched: Check for an idle shared cache in select_task_rq_fair() When waking affine, check for an idle shared cache, and if found, wake to that CPU/sibling instead of the waker's CPU. This improves pgsql+oltp ramp up by roughly 8%. Possibly more for other loads, depending on overlap. The trade-off is a roughly 1% peak downturn if tasks are truly synchronous. Signed-off-by: Mike Galbraith Cc: Arjan van de Ven Cc: Peter Zijlstra Cc: LKML-Reference: <1256654138.17752.7.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4e777b47eeda..da87385683cc 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1372,11 +1372,36 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag want_sd = 0; } - if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && - cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) { + int candidate = -1, i; - affine_sd = tmp; - want_affine = 0; + if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) + candidate = cpu; + + /* + * Check for an idle shared cache. + */ + if (tmp->flags & SD_PREFER_SIBLING) { + if (candidate == cpu) { + if (!cpu_rq(prev_cpu)->cfs.nr_running) + candidate = prev_cpu; + } + + if (candidate == -1 || candidate == cpu) { + for_each_cpu(i, sched_domain_span(tmp)) { + if (!cpu_rq(i)->cfs.nr_running) { + candidate = i; + break; + } + } + } + } + + if (candidate >= 0) { + affine_sd = tmp; + want_affine = 0; + cpu = candidate; + } } if (!want_sd && !want_affine) -- cgit v1.2.3 From 1b9508f6831e10d53256825de8904caa22d1ca2c Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 4 Nov 2009 17:53:50 +0100 Subject: sched: Rate-limit newidle Rate limit newidle to migration_cost. It's a win for all stages of sysbench oltp tests. Signed-off-by: Mike Galbraith Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 22 +++++++++++++++++++++- kernel/sched_debug.c | 4 ++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index ae026aad145b..f8492123b5d1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -589,6 +589,8 @@ struct rq { u64 rt_avg; u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; #endif /* calc_load related fields */ @@ -2353,6 +2355,17 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (rq != orig_rq) update_rq_clock(rq); + if (rq->idle_stamp) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } + WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -4389,6 +4402,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq) int pulled_task = 0; unsigned long next_balance = jiffies + HZ; + this_rq->idle_stamp = this_rq->clock; + + if (this_rq->avg_idle < sysctl_sched_migration_cost) + return; + for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4403,8 +4421,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) + if (pulled_task) { + this_rq->idle_stamp = 0; break; + } } if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index efb84409bc43..6988cf08f705 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -285,12 +285,16 @@ static void print_cpu(struct seq_file *m, int cpu) #ifdef CONFIG_SCHEDSTATS #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); +#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P(yld_count); P(sched_switch); P(sched_count); P(sched_goidle); +#ifdef CONFIG_SMP + P64(avg_idle); +#endif P(ttwu_count); P(ttwu_local); -- cgit v1.2.3 From fd210738f6601d0fb462df9a2fe5a41896ff6a8f Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 5 Nov 2009 10:57:46 +0100 Subject: sched: Fix affinity logic in select_task_rq_fair() Ingo Molnar reported: [ 26.804000] BUG: using smp_processor_id() in preemptible [00000000] code: events/1/10 [ 26.808000] caller is vmstat_update+0x26/0x70 [ 26.812000] Pid: 10, comm: events/1 Not tainted 2.6.32-rc5 #6887 [ 26.816000] Call Trace: [ 26.820000] [] ? printk+0x28/0x3c [ 26.824000] [] debug_smp_processor_id+0xf0/0x110 [ 26.824000] mount used greatest stack depth: 1464 bytes left [ 26.828000] [] vmstat_update+0x26/0x70 [ 26.832000] [] worker_thread+0x188/0x310 [ 26.836000] [] ? worker_thread+0x127/0x310 [ 26.840000] [] ? autoremove_wake_function+0x0/0x60 [ 26.844000] [] ? worker_thread+0x0/0x310 [ 26.848000] [] kthread+0x7c/0x90 [ 26.852000] [] ? kthread+0x0/0x90 [ 26.856000] [] kernel_thread_helper+0x7/0x10 [ 26.860000] BUG: using smp_processor_id() in preemptible [00000000] code: events/1/10 [ 26.864000] caller is vmstat_update+0x3c/0x70 Because this commit: a1f84a3: sched: Check for an idle shared cache in select_task_rq_fair() broke ->cpus_allowed. Signed-off-by: Mike Galbraith Cc: Peter Zijlstra Cc: arjan@infradead.org Cc: LKML-Reference: <1257415066.12867.1.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index da87385683cc..e4d4483fd617 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1389,6 +1389,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag if (candidate == -1 || candidate == cpu) { for_each_cpu(i, sched_domain_span(tmp)) { + if (!cpumask_test_cpu(i, &p->cpus_allowed)) + continue; if (!cpu_rq(i)->cfs.nr_running) { candidate = i; break; -- cgit v1.2.3 From d8c80ce091f6ead6710bc71b58f2c32e5bf855e4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 27 Oct 2009 15:45:23 +0800 Subject: sched, no_hz: Remove unused rq->last_tick_seen field In 15934a37324f32e0fda633dc7984a671ea81cd75, field last_tick_seen is added to struct rq. But it is unused now. Signed-off-by: Lai Jiangshan Cc: Guillaume Chazarain LKML-Reference: <4AE6A513.6010100@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index f8492123b5d1..23e353568d8e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -534,7 +534,6 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ - unsigned long last_tick_seen; unsigned char in_nohz_recently; #endif /* capture load from *all* tasks on this cpu: */ -- cgit v1.2.3 From eae0c9dfb534cb3449888b9601228efa6480fdb5 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 10 Nov 2009 03:50:02 +0100 Subject: sched: Fix and clean up rate-limit newidle code Commit 1b9508f, "Rate-limit newidle" has been confirmed to fix the netperf UDP loopback regression reported by Alex Shi. This is a cleanup and a fix: - moved to a more out of the way spot - fix to ensure that balancing doesn't try to balance runqueues which haven't gone online yet, which can mess up CPU enumeration during boot. Reported-by: Alex Shi Reported-by: Zhang, Yanmin Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra Cc: # .32.x: a1f84a3: sched: Check for an idle shared cache Cc: # .32.x: 1b9508f: sched: Rate-limit newidle Cc: # .32.x: fd21073: sched: Fix affinity logic Cc: # .32.x LKML-Reference: <1257821402.5648.17.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 23e353568d8e..ad37776cc39b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2354,17 +2354,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (rq != orig_rq) update_rq_clock(rq); - if (rq->idle_stamp) { - u64 delta = rq->clock - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; - - if (delta > max) - rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); - rq->idle_stamp = 0; - } - WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -2421,6 +2410,17 @@ out_running: #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } #endif out: task_rq_unlock(rq, &flags); @@ -4098,7 +4098,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_online_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4261,7 +4261,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int all_pinned = 0; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_online_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -9522,6 +9522,8 @@ void __init sched_init(void) rq->cpu = i; rq->online = 0; rq->migration_thread = NULL; + rq->idle_stamp = 0; + rq->avg_idle = 2*sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); #endif -- cgit v1.2.3 From ffd44db5f02af32bcc25a8eb5981bf02a141cdab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Nov 2009 20:12:01 +0100 Subject: sched: Make sure task has correct sched_class after policy change From the code in rt_mutex_setprio(), it is evident that the intention is that task's with a RT 'prio' value as a consequence of receiving a PI boost also have their 'sched_class' field set to '&rt_sched_class'. However, Peter noticed that the code in __setscheduler() could result in this intention being frustrated. Fix it. Reported-by: Peter Williams Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <1257880321.4108.457.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index ad37776cc39b..43e61fa04dc7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6159,22 +6159,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) BUG_ON(p->se.on_rq); p->policy = policy; - switch (p->policy) { - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - p->sched_class = &fair_sched_class; - break; - case SCHED_FIFO: - case SCHED_RR: - p->sched_class = &rt_sched_class; - break; - } - p->rt_priority = prio; p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); + if (rt_prio(p->prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; set_load_weight(p); } -- cgit v1.2.3 From 055a00865dcfc8e61f3cbefbb879c9577bd36ae5 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 12 Nov 2009 11:07:44 +0100 Subject: sched: Fix/add missing update_rq_clock() calls kthread_bind(), migrate_task() and sched_fork were missing updates, and try_to_wake_up() was updating after having already used the stale clock. Aside from preventing potential latency hits, there' a side benefit in that early boot printk time stamps become monotonic. Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra LKML-Reference: <1258020464.6491.2.camel@marge.simson.net> Signed-off-by: Ingo Molnar LKML-Reference: --- kernel/sched.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 3c11ae0a948d..701eca4958a2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2017,6 +2017,7 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) } spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); set_task_cpu(p, cpu); p->cpus_allowed = cpumask_of_cpu(cpu); p->rt.nr_cpus_allowed = 1; @@ -2115,6 +2116,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) * it is sufficient to simply update the task's cpu field. */ if (!p->se.on_rq && !task_running(rq, p)) { + update_rq_clock(rq); set_task_cpu(p, dest_cpu); return 0; } @@ -2376,14 +2378,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, task_rq_unlock(rq, &flags); cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) + if (cpu != orig_cpu) { + local_irq_save(flags); + rq = cpu_rq(cpu); + update_rq_clock(rq); set_task_cpu(p, cpu); - + local_irq_restore(flags); + } rq = task_rq_lock(p, &flags); - if (rq != orig_rq) - update_rq_clock(rq); - WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -2545,6 +2548,7 @@ static void __sched_fork(struct task_struct *p) void sched_fork(struct task_struct *p, int clone_flags) { int cpu = get_cpu(); + unsigned long flags; __sched_fork(p); @@ -2581,7 +2585,10 @@ void sched_fork(struct task_struct *p, int clone_flags) #ifdef CONFIG_SMP cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); #endif + local_irq_save(flags); + update_rq_clock(cpu_rq(cpu)); set_task_cpu(p, cpu); + local_irq_restore(flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) -- cgit v1.2.3 From 761b1d26df542fd5eb348837351e4d2f3bc7bffe Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 12 Nov 2009 13:33:45 +0900 Subject: sched: Fix granularity of task_u/stime() Originally task_s/utime() were designed to return clock_t but later changed to return cputime_t by following commit: commit efe567fc8281661524ffa75477a7c4ca9b466c63 Author: Christian Borntraeger Date: Thu Aug 23 15:18:02 2007 +0200 It only changed the type of return value, but not the implementation. As the result the granularity of task_s/utime() is still that of clock_t, not that of cputime_t. So using task_s/utime() in __exit_signal() makes values accumulated to the signal struct to be rounded and coarse grained. This patch removes casts to clock_t in task_u/stime(), to keep granularity of cputime_t over the calculation. v2: Use div_u64() to avoid error "undefined reference to `__udivdi3`" on some 32bit systems. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: xiyou.wangcong@gmail.com Cc: Spencer Candland Cc: Oleg Nesterov Cc: Stanislaw Gruszka LKML-Reference: <4AFB9029.9000208@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 43e61fa04dc7..ab9a034c4a17 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5156,41 +5156,45 @@ cputime_t task_stime(struct task_struct *p) return p->stime; } #else + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) \ + msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC)) +#endif + cputime_t task_utime(struct task_struct *p) { - clock_t utime = cputime_to_clock_t(p->utime), - total = utime + cputime_to_clock_t(p->stime); + cputime_t utime = p->utime, total = utime + p->stime; u64 temp; /* * Use CFS's precise accounting: */ - temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { temp *= utime; do_div(temp, total); } - utime = (clock_t)temp; + utime = (cputime_t)temp; - p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); + p->prev_utime = max(p->prev_utime, utime); return p->prev_utime; } cputime_t task_stime(struct task_struct *p) { - clock_t stime; + cputime_t stime; /* * Use CFS's precise accounting. (we subtract utime from * the total, to make sure the total observed by userspace * grows monotonically - apps rely on that): */ - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - - cputime_to_clock_t(task_utime(p)); + stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p); if (stime >= 0) - p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + p->prev_stime = max(p->prev_stime, stime); return p->prev_stime; } -- cgit v1.2.3 From a50bde5130f65733142b32975616427d0ea50856 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Nov 2009 15:55:28 +0100 Subject: sched: Cleanup select_task_rq_fair() Clean up the new affine to idle sibling bits while trying to grok them. Should not have any function differences. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091112145610.832503781@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 73 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e4d4483fd617..a32df1524746 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1318,6 +1318,41 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) return idlest; } +/* + * Try and locate an idle CPU in the sched_domain. + */ +static int +select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) +{ + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int i; + + /* + * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE + * test in select_task_rq_fair) and the prev_cpu is idle then that's + * always a better target than the current cpu. + */ + if (target == cpu) { + if (!cpu_rq(prev_cpu)->cfs.nr_running) + target = prev_cpu; + } + + /* + * Otherwise, iterate the domain and find an elegible idle cpu. + */ + if (target == -1 || target == cpu) { + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (!cpu_rq(i)->cfs.nr_running) { + target = i; + break; + } + } + } + + return target; +} + /* * sched_balance_self: balance the current task (running on cpu) in domains * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and @@ -1373,36 +1408,30 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) { - int candidate = -1, i; + int target = -1; + /* + * If both cpu and prev_cpu are part of this domain, + * cpu is a valid SD_WAKE_AFFINE target. + */ if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) - candidate = cpu; + target = cpu; /* - * Check for an idle shared cache. + * If there's an idle sibling in this domain, make that + * the wake_affine target instead of the current cpu. + * + * XXX: should we possibly do this outside of + * WAKE_AFFINE, in case the shared cache domain is + * smaller than the WAKE_AFFINE domain? */ - if (tmp->flags & SD_PREFER_SIBLING) { - if (candidate == cpu) { - if (!cpu_rq(prev_cpu)->cfs.nr_running) - candidate = prev_cpu; - } - - if (candidate == -1 || candidate == cpu) { - for_each_cpu(i, sched_domain_span(tmp)) { - if (!cpumask_test_cpu(i, &p->cpus_allowed)) - continue; - if (!cpu_rq(i)->cfs.nr_running) { - candidate = i; - break; - } - } - } - } + if (tmp->flags & SD_PREFER_SIBLING) + target = select_idle_sibling(p, tmp, target); - if (candidate >= 0) { + if (target >= 0) { affine_sd = tmp; want_affine = 0; - cpu = candidate; + cpu = target; } } -- cgit v1.2.3 From fe3bcfe1f6c1fc4ea7706ac2d05e579fd9092682 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Nov 2009 15:55:29 +0100 Subject: sched: More generic WAKE_AFFINE vs select_idle_sibling() Instead of only considering SD_WAKE_AFFINE | SD_PREFER_SIBLING domains also allow all SD_PREFER_SIBLING domains below a SD_WAKE_AFFINE domain to change the affinity target. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091112145610.909723612@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a32df1524746..f28a2671a1a6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1333,20 +1333,16 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) * test in select_task_rq_fair) and the prev_cpu is idle then that's * always a better target than the current cpu. */ - if (target == cpu) { - if (!cpu_rq(prev_cpu)->cfs.nr_running) - target = prev_cpu; - } + if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) + return prev_cpu; /* * Otherwise, iterate the domain and find an elegible idle cpu. */ - if (target == -1 || target == cpu) { - for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { - if (!cpu_rq(i)->cfs.nr_running) { - target = i; - break; - } + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (!cpu_rq(i)->cfs.nr_running) { + target = i; + break; } } @@ -1407,7 +1403,12 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag want_sd = 0; } - if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) { + /* + * While iterating the domains looking for a spanning + * WAKE_AFFINE domain, adjust the affine target to any idle cpu + * in cache sharing domains along the way. + */ + if (want_affine) { int target = -1; /* @@ -1420,17 +1421,15 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag /* * If there's an idle sibling in this domain, make that * the wake_affine target instead of the current cpu. - * - * XXX: should we possibly do this outside of - * WAKE_AFFINE, in case the shared cache domain is - * smaller than the WAKE_AFFINE domain? */ if (tmp->flags & SD_PREFER_SIBLING) target = select_idle_sibling(p, tmp, target); if (target >= 0) { - affine_sd = tmp; - want_affine = 0; + if (tmp->flags & SD_WAKE_AFFINE) { + affine_sd = tmp; + want_affine = 0; + } cpu = target; } } -- cgit v1.2.3 From 498657a478c60be092208422fefa9c7b248729c2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Nov 2009 18:33:53 +0900 Subject: sched, kvm: Fix race condition involving sched_in_preempt_notifers In finish_task_switch(), fire_sched_in_preempt_notifiers() is called after finish_lock_switch(). However, depending on architecture, preemption can be enabled after finish_lock_switch() which breaks the semantics of preempt notifiers. So move it before finish_arch_switch(). This also makes the in- notifiers symmetric to out- notifiers in terms of locking - now both are called under rq lock. Signed-off-by: Tejun Heo Acked-by: Avi Kivity Cc: Peter Zijlstra LKML-Reference: <4AFD2801.7020900@kernel.org> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 701eca4958a2..cea2beac7909 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2758,9 +2758,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; finish_arch_switch(prev); perf_event_task_sched_in(current, cpu_of(rq)); + fire_sched_in_preempt_notifiers(current); finish_lock_switch(rq, prev); - fire_sched_in_preempt_notifiers(current); if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { -- cgit v1.2.3 From 047106adcc85e3023da210143a6ab8a55df9e0fc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 16 Nov 2009 10:28:09 +0100 Subject: sched: Sched_rt_periodic_timer vs cpu hotplug Heiko reported a case where a timer interrupt managed to reference a root_domain structure that was already freed by a concurrent hot-un-plug operation. Solve this like the regular sched_domain stuff is also synchronized, by adding a synchronize_sched() stmt to the free path, this ensures that a root_domain stays present for any atomic section that could have observed it. Reported-by: Heiko Carstens Signed-off-by: Peter Zijlstra Acked-by: Heiko Carstens Cc: Gregory Haskins Cc: Siddha Suresh B Cc: Martin Schwidefsky LKML-Reference: <1258363873.26714.83.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index cea2beac7909..3c91f110fc62 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7912,6 +7912,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void free_rootdomain(struct root_domain *rd) { + synchronize_sched(); + cpupri_cleanup(&rd->cpupri); free_cpumask_var(rd->rto_mask); -- cgit v1.2.3 From 429947248f814e90f416ab4f68a871ab628000c3 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Fri, 20 Nov 2009 17:40:37 +0100 Subject: sched_feat_write(): Update ppos instead of file->f_pos sched_feat_write() should update ppos instead of file->f_pos. (This reduces some BKL dependencies of this code.) Signed-off-by: Jan Blunck Cc: jkacur@redhat.com Cc: Arnd Bergmann Cc: Frederic Weisbecker Cc: Jamie Lokier Cc: Peter Zijlstra Cc: Christoph Hellwig Cc: Alan Cox LKML-Reference: <1258735245-25826-8-git-send-email-jblunck@suse.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index ab9a034c4a17..93474a7935ae 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -771,7 +771,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, if (!sched_feat_names[i]) return -EINVAL; - filp->f_pos += cnt; + *ppos += cnt; return cnt; } -- cgit v1.2.3 From 36ace27e3e60d44ea69ce394b2e45386ae98d9d9 Mon Sep 17 00:00:00 2001 From: Tim Blechmann Date: Tue, 24 Nov 2009 11:55:45 +0100 Subject: sched: Optimize branch hint in pick_next_task_fair() Branch hint profiling on my nehalem machine showed 90% incorrect branch hints: 15728471 158903754 90 pick_next_task_fair sched_fair.c 1555 Signed-off-by: Tim Blechmann Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <4B0BBBB1.2050100@klingt.org> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f28a2671a1a6..24086e7e7593 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1704,7 +1704,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; - if (unlikely(!cfs_rq->nr_running)) + if (!cfs_rq->nr_running) return NULL; do { -- cgit v1.2.3 From 710390d90f143a9ebb87a475215140f426792efd Mon Sep 17 00:00:00 2001 From: Tim Blechmann Date: Tue, 24 Nov 2009 11:55:27 +0100 Subject: sched: Optimize branch hint in context_switch() Branch hint profiling on my nehalem machine showed over 90% incorrect branch hints: 10420275 170645395 94 context_switch sched.c 3043 10408421 171098521 94 context_switch sched.c 3050 Signed-off-by: Tim Blechmann Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <4B0BBB9F.6080304@klingt.org> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 93474a7935ae..010d5e16b4c5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2829,14 +2829,14 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); - if (unlikely(!mm)) { + if (likely(!mm)) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (unlikely(!prev->mm)) { + if (likely(!prev->mm)) { prev->active_mm = NULL; rq->prev_mm = oldmm; } -- cgit v1.2.3 From a3a1de0c34de6f5f8332cd6151c46af7813c0fcb Mon Sep 17 00:00:00 2001 From: Tim Blechmann Date: Tue, 24 Nov 2009 11:55:15 +0100 Subject: sched, x86: Optimize branch hint in __switch_to() Branch hint profiling on my nehalem machine showed 96% incorrect branch hints: 6548732 174664120 96 __switch_to process_64.c 406 6548745 174565593 96 __switch_to process_64.c 410 Signed-off-by: Tim Blechmann Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <4B0BBB93.3080307@klingt.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ad535b683170..d9db1049e86f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -406,11 +406,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * This won't pick up thread selector changes, but I guess that is ok. */ savesegment(es, prev->es); - if (unlikely(next->es | prev->es)) + if (next->es | prev->es) loadsegment(es, next->es); - savesegment(ds, prev->ds); - if (unlikely(next->ds | prev->ds)) + if (next->ds | prev->ds) loadsegment(ds, next->ds); -- cgit v1.2.3 From 93335a21557e80f6a99bc2812c634e488139043c Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Wed, 25 Nov 2009 15:23:41 +0200 Subject: sched.c: Call debug_show_all_locks() when dumping all tasks In commit v2.6.21-691-g39bc89f ("make SysRq-T show all tasks again") the interface of show_state_filter() was changed: zero valued 'state_filter' specifies "dump all tasks" (instead of -1). However, the condition for calling debug_show_all_locks() ("show locks if all tasks are dumped") was not updated accordingly. Signed-off-by: Shmulik Ladkani Cc: peterz@infradead.org LKML-Reference: <4b0d2fe4.0ab6660a.6437.3cfc@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 010d5e16b4c5..a57c6aee6d4a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6915,7 +6915,7 @@ void show_state_filter(unsigned long state_filter) /* * Only show locks if all tasks are dumped: */ - if (state_filter == -1) + if (!state_filter) debug_show_all_locks(); } -- cgit v1.2.3 From f6630114d9198aa959ac95c131334c020038f253 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 17 Nov 2009 18:22:15 -0600 Subject: sched: Limit the number of scheduler debug messages Remove the verbose scheduler debug messages unless kernel parameter "sched_debug" set. /proc/sched_debug unchanged. Signed-off-by: Mike Travis Cc: Heiko Carstens Cc: Roland Dreier Cc: Randy Dunlap Cc: Tejun Heo Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Yinghai Lu Cc: David Rientjes Cc: Steven Rostedt Cc: Rusty Russell Cc: Hidetoshi Seto Cc: Jack Steiner Cc: Frederic Weisbecker LKML-Reference: <20091118002221.489305000@alcatraz.americas.sgi.com> Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 2 ++ kernel/sched.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9107b387e91f..f2a9507b27b2 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2182,6 +2182,8 @@ and is between 256 and 4096 characters. It is defined in the file sbni= [NET] Granch SBNI12 leased line adapter + sched_debug [KNL] Enables verbose scheduler debug messages. + sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver Format: [,[,]] diff --git a/kernel/sched.c b/kernel/sched.c index a57c6aee6d4a..48ff66a6892d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7720,6 +7720,16 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG +static __read_m