From 0b1adaa031a55e44f5dd942f234bf09d28e8a0d6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Mar 2010 19:45:54 +0100 Subject: genirq: Prevent oneshot irq thread race Lars-Peter pointed out that the oneshot threaded interrupt handler code has the following race: CPU0 CPU1 hande_level_irq(irq X) mask_ack_irq(irq X) handle_IRQ_event(irq X) wake_up(thread_handler) thread handler(irq X) runs finalize_oneshot(irq X) does not unmask due to !(desc->status & IRQ_MASKED) return from irq does not unmask due to (desc->status & IRQ_ONESHOT) This leaves the interrupt line masked forever. The reason for this is the inconsistent handling of the IRQ_MASKED flag. Instead of setting it in the mask function the oneshot support sets the flag after waking up the irq thread. The solution for this is to set/clear the IRQ_MASKED status whenever we mask/unmask an interrupt line. That's the easy part, but that cleanup opens another race: CPU0 CPU1 hande_level_irq(irq) mask_ack_irq(irq) handle_IRQ_event(irq) wake_up(thread_handler) thread handler(irq) runs finalize_oneshot_irq(irq) unmask(irq) irq triggers again handle_level_irq(irq) mask_ack_irq(irq) return from irq due to IRQ_INPROGRESS return from irq does not unmask due to (desc->status & IRQ_ONESHOT) This requires that we synchronize finalize_oneshot_irq() with the primary handler. If IRQ_INPROGESS is set we wait until the primary handler on the other CPU has returned before unmasking the interrupt line again. We probably have never seen that problem because it does not happen on UP and on SMP the irqbalancer protects us by pinning the primary handler and the thread to the same CPU. Reported-by: Lars-Peter Clausen Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- kernel/irq/chip.c | 31 ++++++++++++++++++++++--------- kernel/irq/manage.c | 18 ++++++++++++++++++ 2 files changed, 40 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d70394f12ee9..71eba24a39a2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -359,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) if (desc->chip->ack) desc->chip->ack(irq); } + desc->status |= IRQ_MASKED; +} + +static inline void mask_irq(struct irq_desc *desc, int irq) +{ + if (desc->chip->mask) { + desc->chip->mask(irq); + desc->status |= IRQ_MASKED; + } +} + +static inline void unmask_irq(struct irq_desc *desc, int irq) +{ + if (desc->chip->unmask) { + desc->chip->unmask(irq); + desc->status &= ~IRQ_MASKED; + } } /* @@ -484,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; - if (unlikely(desc->status & IRQ_ONESHOT)) - desc->status |= IRQ_MASKED; - else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) - desc->chip->unmask(irq); + if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) + unmask_irq(desc, irq); out_unlock: raw_spin_unlock(&desc->lock); } @@ -524,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) { desc->status |= IRQ_PENDING; - if (desc->chip->mask) - desc->chip->mask(irq); + mask_irq(desc, irq); goto out; } @@ -593,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) irqreturn_t action_ret; if (unlikely(!action)) { - desc->chip->mask(irq); + mask_irq(desc, irq); goto out_unlock; } @@ -605,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == (IRQ_PENDING | IRQ_MASKED))) { - desc->chip->unmask(irq); - desc->status &= ~IRQ_MASKED; + unmask_irq(desc, irq); } desc->status &= ~IRQ_PENDING; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index eb6078ca60c7..69a3d7b9414c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -483,8 +483,26 @@ static int irq_wait_for_interrupt(struct irqaction *action) */ static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) { +again: chip_bus_lock(irq, desc); raw_spin_lock_irq(&desc->lock); + + /* + * Implausible though it may be we need to protect us against + * the following scenario: + * + * The thread is faster done than the hard interrupt handler + * on the other CPU. If we unmask the irq line then the + * interrupt can come in again and masks the line, leaves due + * to IRQ_INPROGRESS and the irq line is masked forever. + */ + if (unlikely(desc->status & IRQ_INPROGRESS)) { + raw_spin_unlock_irq(&desc->lock); + chip_bus_sync_unlock(irq, desc); + cpu_relax(); + goto again; + } + if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; desc->chip->unmask(irq); -- cgit v1.2.3 From 80a05b9ffa7dc13f6693902dd8999a2b61a3a0d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 Mar 2010 17:34:14 +0100 Subject: clockevents: Sanitize min_delta_ns adjustment and prevent overflows The current logic which handles clock events programming failures can increase min_delta_ns unlimited and even can cause overflows. Sanitize it by: - prevent zero increase when min_delta_ns == 1 - limiting min_delta_ns to a jiffie - bail out if the jiffie limit is hit - add retries stats for /proc/timer_list so we can gather data Reported-by: Uwe Kleine-Koenig Signed-off-by: Thomas Gleixner --- kernel/time/tick-oneshot.c | 52 +++++++++++++++++++++++++++++++++++----------- kernel/time/timer_list.c | 3 ++- 2 files changed, 42 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 0a8a213016f0..aada0e52680a 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -22,6 +22,29 @@ #include "tick-internal.h" +/* Limit min_delta to a jiffie */ +#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) + +static int tick_increase_min_delta(struct clock_event_device *dev) +{ + /* Nothing to do if we already reached the limit */ + if (dev->min_delta_ns >= MIN_DELTA_LIMIT) + return -ETIME; + + if (dev->min_delta_ns < 5000) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns += dev->min_delta_ns >> 1; + + if (dev->min_delta_ns > MIN_DELTA_LIMIT) + dev->min_delta_ns = MIN_DELTA_LIMIT; + + printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? dev->name : "?", + (unsigned long long) dev->min_delta_ns); + return 0; +} + /** * tick_program_event internal worker function */ @@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, if (!ret || !force) return ret; + dev->retries++; /* - * We tried 2 times to program the device with the given - * min_delta_ns. If that's not working then we double it + * We tried 3 times to program the device with the given + * min_delta_ns. If that's not working then we increase it * and emit a warning. */ if (++i > 2) { /* Increase the min. delta and try again */ - if (!dev->min_delta_ns) - dev->min_delta_ns = 5000; - else - dev->min_delta_ns += dev->min_delta_ns >> 1; - - printk(KERN_WARNING - "CE: %s increasing min_delta_ns to %llu nsec\n", - dev->name ? dev->name : "?", - (unsigned long long) dev->min_delta_ns << 1); - + if (tick_increase_min_delta(dev)) { + /* + * Get out of the loop if min_delta_ns + * hit the limit already. That's + * better than staying here forever. + * + * We clear next_event so we have a + * chance that the box survives. + */ + printk(KERN_WARNING + "CE: Reprogramming failure. Giving up\n"); + dev->next_event.tv64 = KTIME_MAX; + return -ETIME; + } i = 0; } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index bdfb8dd1050c..1a4a7dd78777 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -228,6 +228,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) SEQ_printf(m, " event_handler: "); print_name_offset(m, dev->event_handler); SEQ_printf(m, "\n"); + SEQ_printf(m, " retries: %lu\n", dev->retries); } static void timer_list_show_tickdevices(struct seq_file *m) @@ -257,7 +258,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.5\n"); + SEQ_printf(m, "Timer List Version: v0.6\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); -- cgit v1.2.3 From 829b6c1ef488856c6a46a2f705f5068062d5f34c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Mar 2010 14:04:30 -0800 Subject: timer stats: Fix del_timer_sync() and try_to_del_timer_sync() These functions forgot to run timer_stats_timer_clear_start_info(). It's unobvious what effect this has and whether it matters much - we won't be printing it out anyway if the timer's detached. Untested, just an Ingo trollpatch. [ Nevertheless correct - tglx ] Signed-off-by: Andrew Morton Cc: Ingo Molnar Cc: johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/timer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index c61a7949387f..fc965eae0e87 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -880,6 +880,7 @@ int try_to_del_timer_sync(struct timer_list *timer) if (base->running_timer == timer) goto out; + timer_stats_timer_clear_start_info(timer); ret = 0; if (timer_pending(timer)) { detach_timer(timer, 1); -- cgit v1.2.3 From 15365c108ea27598e265f8c13e7051d99ca5b0b9 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 11 Mar 2010 14:04:31 -0800 Subject: posix-cpu-timers: Reset expire cache when no timer is running When a process deletes cpu timer or a timer expires we do not clear the expiration cache sig->cputimer_expires. As a result the fastpath_timer_check() which prevents us to loop over all threads in case no timer is active is not working and we run the slow path needlessly on every tick. Zero sig->cputimer_expires in stop_process_timers(). Signed-off-by: Stanislaw Gruszka Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Hidetoshi Seto Cc: Spencer Candland Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-cpu-timers.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 438ff4523513..edec25a68e9e 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1060,9 +1060,9 @@ static void check_thread_timers(struct task_struct *tsk, } } -static void stop_process_timers(struct task_struct *tsk) +static void stop_process_timers(struct signal_struct *sig) { - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &sig->cputimer; unsigned long flags; if (!cputimer->running) @@ -1071,6 +1071,10 @@ static void stop_process_timers(struct task_struct *tsk) spin_lock_irqsave(&cputimer->lock, flags); cputimer->running = 0; spin_unlock_irqrestore(&cputimer->lock, flags); + + sig->cputime_expires.prof_exp = cputime_zero; + sig->cputime_expires.virt_exp = cputime_zero; + sig->cputime_expires.sched_exp = 0; } static u32 onecputick; @@ -1131,7 +1135,7 @@ static void check_process_timers(struct task_struct *tsk, list_empty(&timers[CPUCLOCK_VIRT]) && cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && list_empty(&timers[CPUCLOCK_SCHED])) { - stop_process_timers(tsk); + stop_process_timers(sig); return; } -- cgit v1.2.3 From cd3d8031eb4311e516329aee03c79a08333141f1 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 12 Mar 2010 16:15:36 +0900 Subject: sched: sched_getaffinity(): Allow less than NR_CPUS length [ Note, this commit changes the syscall ABI for > 1024 CPUs systems. ] Recently, some distro decided to use NR_CPUS=4096 for mysterious reasons. Unfortunately, glibc sched interface has the following definition: # define __CPU_SETSIZE 1024 # define __NCPUBITS (8 * sizeof (__cpu_mask)) typedef unsigned long int __cpu_mask; typedef struct { __cpu_mask __bits[__CPU_SETSIZE / __NCPUBITS]; } cpu_set_t; It mean, if NR_CPUS is bigger than 1024, cpu_set_t makes an ABI issue ... More recently, Sharyathi Nagesh reported following test program makes misterious syscall failure: ----------------------------------------------------------------------- #define _GNU_SOURCE #include #include #include int main() { cpu_set_t set; if (sched_getaffinity(0, sizeof(cpu_set_t), &set) < 0) printf("\n Call is failing with:%d", errno); } ----------------------------------------------------------------------- Because the kernel assumes len argument of sched_getaffinity() is bigger than NR_CPUS. But now it is not correct. Now we are faced with the following annoying dilemma, due to the limitations of the glibc interface built in years ago: (1) if we change glibc's __CPU_SETSIZE definition, we lost binary compatibility of _all_ application. (2) if we don't change it, we also lost binary compatibility of Sharyathi's use case. Then, I would propse to change the rule of the len argument of sched_getaffinity(). Old: len should be bigger than NR_CPUS New: len should be bigger than maximum possible cpu id This creates the following behavior: (A) In the real 4096 cpus machine, the above test program still return -EINVAL. (B) NR_CPUS=4096 but the machine have less than 1024 cpus (almost all machines in the world), the above can run successfully. Fortunatelly, BIG SGI machine is mainly used for HPC use case. It means they can rebuild their programs. IOW we hope they are not annoyed by this issue ... Reported-by: Sharyathi Nagesh Signed-off-by: KOSAKI Motohiro Acked-by: Ulrich Drepper Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Jack Steiner Cc: Russ Anderson Cc: Mike Travis LKML-Reference: <20100312161316.9520.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9ab3cd7858d3..6eaef3df2d19 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4902,7 +4902,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, int ret; cpumask_var_t mask; - if (len < cpumask_size()) + if (len < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(unsigned long)-1)) return -EINVAL; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) @@ -4910,10 +4912,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + int retlen = min(len, cpumask_size()); + + if (copy_to_user(user_mask_ptr, mask, retlen)) ret = -EFAULT; else - ret = cpumask_size(); + ret = retlen; } free_cpumask_var(mask); -- cgit v1.2.3 From e3818b8dce2a934cd1521dbc4827e5238d8f45d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Mar 2010 17:03:43 -0700 Subject: rcu: Make rcu_read_lock_bh_held() allow for disabled BH Disabling BH can stand in for rcu_read_lock_bh(), and this patch updates rcu_read_lock_bh_held() to allow for this. In order to avoid include-file hell, this function is moved out of line to kernel/rcupdate.c. This fixes a false positive RCU warning. Reported-by: Arnd Bergmann Reported-by: Eric Dumazet Signed-off-by: Paul E. McKenney Acked-by: Lai Jiangshan Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <20100316000343.GA25857@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f1125c1a6321..63fe25433980 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -45,6 +45,7 @@ #include #include #include +#include #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -66,6 +67,28 @@ EXPORT_SYMBOL_GPL(rcu_sched_lock_map); int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +/** + * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? + * + * Check for bottom half being disabled, which covers both the + * CONFIG_PROVE_RCU and not cases. Note that if someone uses + * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) + * will show the situation. + * + * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. + */ +int rcu_read_lock_bh_held(void) +{ + if (!debug_lockdep_rcu_enabled()) + return 1; + return in_softirq(); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); + +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + /* * This function is invoked towards the end of the scheduler's initialization * process. Before this is called, the idle task might contain -- cgit v1.2.3 From c890692bf37671b5b78a1870d55d6d87e1c8a509 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Mar 2010 14:08:43 -0800 Subject: kernel/sched.c: Suppress unused var warning On UP: kernel/sched.c: In function 'wake_up_new_task': kernel/sched.c:2631: warning: unused variable 'cpu' Signed-off-by: Andrew Morton Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6eaef3df2d19..82975b5b42f7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2650,7 +2650,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; - int cpu = get_cpu(); + int cpu __maybe_unused = get_cpu(); #ifdef CONFIG_SMP /* -- cgit v1.2.3 From 8bc037fb89bb3104b9ae290d18c877624cd7d9cc Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 17 Mar 2010 09:36:58 +0900 Subject: sched: Use proper type in sched_getaffinity() Using the proper type fixes the following compiler warning: kernel/sched.c:4850: warning: comparison of distinct pointer types lacks a cast Signed-off-by: KOSAKI Motohiro Cc: torvalds@linux-foundation.org Cc: travis@sgi.com Cc: peterz@infradead.org Cc: drepper@redhat.com Cc: rja@sgi.com Cc: sharyath@in.ibm.com Cc: steiner@sgi.com LKML-Reference: <20100317090046.4C79.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 82975b5b42f7..49d2fa7b687a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4912,7 +4912,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - int retlen = min(len, cpumask_size()); + size_t retlen = min_t(size_t, len, cpumask_size()); if (copy_to_user(user_mask_ptr, mask, retlen)) ret = -EFAULT; -- cgit v1.2.3 From 2271048d1b3b0aabf83d25b29c20646dcabedc05 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 18 Mar 2010 17:54:19 -0400 Subject: ring-buffer: Do 8 byte alignment for 64 bit that can not handle 4 byte align The ring buffer uses 4 byte alignment while recording events into the buffer, even on 64bit machines. This saves space when there are lots of events being recorded at 4 byte boundaries. The ring buffer has a zero copy method to write into the buffer, with the reserving of space and then committing it. This may cause problems when writing an 8 byte word into a 4 byte alignment (not 8). For x86 and PPC this is not an issue, but on some architectures this would cause an out-of-alignment exception. This patch uses CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS to determine if it is OK to use 4 byte alignments on 64 bit machines. If it is not, it forces the ring buffer event header to be 8 bytes and not 4, and will align the length of the data to be 8 byte aligned. This keeps the data payload at 8 byte alignments and will allow these machines to run without issue. The trick to this is that the header can be either 4 bytes or 8 bytes depending on the length of the data payload. The 4 byte header has a length field that supports up to 112 bytes. If the length of the data is more than 112, the length field is set to zero, and the actual length is stored in the next 4 bytes after the header. When CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is not set, the code forces zero in the 4 byte header forcing the length to be stored in the 4 byte array, even with a small data load. It also forces the length of the data load to be 8 byte aligned. The combination of these two guarantee that the data is always at 8 byte alignment. Tested-by: Frederic Weisbecker (on sparc64) Reported-by: Frederic Weisbecker Acked-by: David S. Miller Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 05a9f83b8819..d1187ef20caf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -207,6 +207,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ +#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -1547,7 +1555,7 @@ rb_update_event(struct ring_buffer_event *event, case 0: length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) event->array[0] = length; else event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); @@ -1722,11 +1730,11 @@ static unsigned rb_calculate_event_length(unsigned length) if (!length) length = 1; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ALIGNMENT); + length = ALIGN(length, RB_ARCH_ALIGNMENT); return length; } -- cgit v1.2.3 From 8c2eb4805d422bdbf60ba00ff233c794d23c3c00 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 19 Mar 2010 10:28:02 +0000 Subject: softlockup: Stop spurious softlockup messages due to overflow Ensure additions on touch_ts do not overflow. This can occur when the top 32 bits of the TSC reach 0xffffffff causing additions to touch_ts to overflow and this in turn generates spurious softlockup warnings. Signed-off-by: Colin Ian King Cc: Peter Zijlstra Cc: Eric Dumazet Cc: LKML-Reference: <1268994482.1798.6.camel@lenovo> Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 0d4c7898ab80..4b493f67dcb5 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -155,11 +155,11 @@ void softlockup_tick(void) * Wake up the high-prio watchdog task twice per * threshold timespan. */ - if (now > touch_ts + softlockup_thresh/2) + if (time_after(now - softlockup_thresh/2, touch_ts)) wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); /* Warn about unreasonable delays: */ - if (now <= (touch_ts + softlockup_thresh)) + if (time_before_eq(now - softlockup_thresh, touch_ts)) return; per_cpu(softlockup_print_ts, this_cpu) = touch_ts; -- cgit v1.2.3 From 830ec0458c390f29c6c99e1ff7feab9e36368d12 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 18 Mar 2010 14:47:30 -0700 Subject: time: Fix accumulation bug triggered by long delay. The logarithmic accumulation done in the timekeeping has some overflow protection that limits the max shift value. That means it will take more then shift loops to accumulate all of the cycles. This causes the shift decrement to underflow, which causes the loop to never exit. The simplest fix would be simply to do a: if (shift) shift--; However that is not optimal, as we know the cycle offset is larger then the interval << shift, the above would make shift drop to zero, then we would be spinning for quite awhile accumulating at interval chunks at a time. Instead, this patch only decreases shift if the offset is smaller then cycle_interval << shift. This makes sure we accumulate using the largest chunks possible without overflowing tick_length, and limits the number of iterations through the loop. This issue was found and reported by Sonic Zhang, who also tested the fix. Many thanks your explanation and testing! Reported-by: Sonic Zhang Signed-off-by: John Stultz Tested-by: Sonic Zhang LKML-Reference: <1268948850-5225-1-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 16736379a9ca..39f6177fafac 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -818,7 +818,8 @@ void update_wall_time(void) shift = min(shift, maxshift); while (offset >= timekeeper.cycle_interval) { offset = logarithmic_accumulation(offset, shift); - shift--; + if(offset < timekeeper.cycle_interval< Date: Thu, 11 Mar 2010 17:01:09 -0700 Subject: resources: add interfaces that return conflict information request_resource() and insert_resource() only return success or failure, which no information about what existing resource conflicted with the proposed new reservation. This patch adds request_resource_conflict() and insert_resource_conflict(), which return the conflicting resource. Callers may use this for better error messages or to adjust the new resource and retry the request. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- kernel/resource.c | 44 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 2d5be5d9bf5f..9c358e263534 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -219,19 +219,34 @@ void release_child_resources(struct resource *r) } /** - * request_resource - request and reserve an I/O or memory resource + * request_resource_conflict - request and reserve an I/O or memory resource * @root: root resource descriptor * @new: resource descriptor desired by caller * - * Returns 0 for success, negative error code on error. + * Returns 0 for success, conflict resource on error. */ -int request_resource(struct resource *root, struct resource *new) +struct resource *request_resource_conflict(struct resource *root, struct resource *new) { struct resource *conflict; write_lock(&resource_lock); conflict = __request_resource(root, new); write_unlock(&resource_lock); + return conflict; +} + +/** + * request_resource - request and reserve an I/O or memory resource + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns 0 for success, negative error code on error. + */ +int request_resource(struct resource *root, struct resource *new) +{ + struct resource *conflict; + + conflict = request_resource_conflict(root, new); return conflict ? -EBUSY : 0; } @@ -474,25 +489,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou } /** - * insert_resource - Inserts a resource in the resource tree + * insert_resource_conflict - Inserts resource in the resource tree * @parent: parent of the new resource * @new: new resource to insert * - * Returns 0 on success, -EBUSY if the resource can't be inserted. + * Returns 0 on success, conflict resource if the resource can't be inserted. * - * This function is equivalent to request_resource when no conflict + * This function is equivalent to request_resource_conflict when no conflict * happens. If a conflict happens, and the conflicting resources * entirely fit within the range of the new resource, then the new * resource is inserted and the conflicting resources become children of * the new resource. */ -int insert_resource(struct resource *parent, struct resource *new) +struct resource *insert_resource_conflict(struct resource *parent, struct resource *new) { struct resource *conflict; write_lock(&resource_lock); conflict = __insert_resource(parent, new); write_unlock(&resource_lock); + return conflict; +} + +/** + * insert_resource - Inserts a resource in the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Returns 0 on success, -EBUSY if the resource can't be inserted. + */ +int insert_resource(struct resource *parent, struct resource *new) +{ + struct resource *conflict; + + conflict = insert_resource_conflict(parent, new); return conflict ? -EBUSY : 0; } -- cgit v1.2.3 From cc8c3b78433222e5dbc1fdfcfdde29e1743f181a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 Mar 2010 22:40:53 +0100 Subject: genirq: Protect access to irq_desc->action in can_request_irq() can_request_irq() accesses and dereferences irq_desc->action w/o holding irq_desc->lock. So action can be freed on another CPU before it's dereferenced. Unlikely, but ... Protect it with desc->lock. Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 69a3d7b9414c..398fda155f6e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -382,6 +382,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; + unsigned long flags; if (!desc) return 0; @@ -389,11 +390,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) if (desc->status & IRQ_NOREQUEST) return 0; + raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; if (action) if (irqflags & action->flags & IRQF_SHARED) action = NULL; + raw_spin_unlock_irqrestore(&desc->lock, flags); + return !action; } -- cgit v1.2.3 From 860652bfb890bd861c999ec39fcffabe5b712f85 Mon Sep 17 00:00:00 2001 From: Henrik Kretzschmar Date: Wed, 24 Mar 2010 12:59:20 +0100 Subject: genirq: Move two IRQ functions from .init.text to .text Both functions should not be marked as __init, since they be called from modules after the init section is freed. Signed-off-by: Henrik Kretzschmar Cc: Yinghai Lu Cc: Peter Zijlstra Cc: Jiri Kosina LKML-Reference: <1269431961-5731-1-git-send-email-henne@nachtwindheim.de> Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 71eba24a39a2..3c2d6e7737f0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -729,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, __set_irq_handler(irq, handle, 0, name); } -void __init set_irq_noprobe(unsigned int irq) +void set_irq_noprobe(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -744,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); } -void __init set_irq_probe(unsigned int irq) +void set_irq_probe(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; -- cgit v1.2.3 From 9d34706f42f9b8c15185423d9af98d37ba21d011 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 23 Mar 2010 13:35:12 -0700 Subject: cgroups: remove duplicate include commit e6a1105b ("cgroups: subsystem module loading interface") and commit c50cc752 ("sched, cgroups: Fix module export") result in duplicate including of module.h Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ef909a329750..e2769e13980c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -27,7 +27,6 @@ */ #include -#include #include #include #include -- cgit v1.2.3 From 5ab116c9349ef52d6fbd2e2917a53f13194b048e Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 23 Mar 2010 13:35:34 -0700 Subject: cpuset: fix the problem that cpuset_mem_spread_node() returns an offline node cpuset_mem_spread_node() returns an offline node, and causes an oops. This patch fixes it by initializing task->mems_allowed to node_states[N_HIGH_MEMORY], and updating task->mems_allowed when doing memory hotplug. Signed-off-by: Miao Xie Acked-by: David Rientjes Reported-by: Nick Piggin Tested-by: Nick Piggin Cc: Paul Menage Cc: Li Zefan Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 20 ++++++++++++-------- kernel/kthread.c | 2 +- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba401fab459f..5d38bd74483c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * call to guarantee_online_mems(), as we know no one is changing * our task's cpuset. * - * Hold callback_mutex around the two modifications of our tasks - * mems_allowed to synchronize with cpuset_mems_allowed(). - * * While the mm_struct we are migrating is typically from some * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that @@ -1391,11 +1388,10 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); - to = node_possible_map; } else { guarantee_online_cpus(cs, cpus_attach); - guarantee_online_mems(cs, &to); } + guarantee_online_mems(cs, &to); /* do per-task migration stuff possibly for each in the threadgroup */ cpuset_attach_task(tsk, &to, cs); @@ -2090,15 +2086,23 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { + nodemask_t oldmems; + cgroup_lock(); switch (action) { case MEM_ONLINE: - case MEM_OFFLINE: + oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - if (action == MEM_OFFLINE) - scan_for_empty_cpusets(&top_cpuset); + update_tasks_nodemask(&top_cpuset, &oldmems, NULL); + break; + case MEM_OFFLINE: + /* + * needn't update top_cpuset.mems_allowed explicitly because + * scan_for_empty_cpusets() will update it. + */ + scan_for_empty_cpusets(&top_cpuset); break; default: break; diff --git a/kernel/kthread.c b/kernel/kthread.c index 82ed0ea15194..83911c780175 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -219,7 +219,7 @@ int kthreadd(void *unused) set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, cpu_all_mask); - set_mems_allowed(node_possible_map); + set_mems_allowed(node_states[N_HIGH_MEMORY]); current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; -- cgit v1.2.3 From 53feb29767c29c877f9d47dcfe14211b5b0f7ebd Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 23 Mar 2010 13:35:35 -0700 Subject: cpuset: alloc nodemask_t on the heap rather than the stack Signed-off-by: Miao Xie Acked-by: David Rientjes Cc: Nick Piggin Cc: Paul Menage Cc: Li Zefan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 94 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5d38bd74483c..d10946748ec2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -970,15 +970,20 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - nodemask_t newmems; + NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); + + if (!newmems) + return; cs = cgroup_cs(scan->cg); - guarantee_online_mems(cs, &newmems); + guarantee_online_mems(cs, newmems); task_lock(p); - cpuset_change_task_nodemask(p, &newmems); + cpuset_change_task_nodemask(p, newmems); task_unlock(p); + NODEMASK_FREE(newmems); + mm = get_task_mm(p); if (!mm) return; @@ -1048,16 +1053,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { - nodemask_t oldmem; + NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL); int retval; struct ptr_heap heap; + if (!oldmem) + return -ENOMEM; + /* * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; * it's read-only */ - if (cs == &top_cpuset) - return -EACCES; + if (cs == &top_cpuset) { + retval = -EACCES; + goto done; + } /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. @@ -1073,11 +1083,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, goto done; if (!nodes_subset(trialcs->mems_allowed, - node_states[N_HIGH_MEMORY])) - return -EINVAL; + node_states[N_HIGH_MEMORY])) { + retval = -EINVAL; + goto done; + } } - oldmem = cs->mems_allowed; - if (nodes_equal(oldmem, trialcs->mems_allowed)) { + *oldmem = cs->mems_allowed; + if (nodes_equal(*oldmem, trialcs->mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } @@ -1093,10 +1105,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask(cs, &oldmem, &heap); + update_tasks_nodemask(cs, oldmem, &heap); heap_free(&heap); done: + NODEMASK_FREE(oldmem); return retval; } @@ -1381,39 +1394,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *oldcont, struct task_struct *tsk, bool threadgroup) { - nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); + NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); + NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); + + if (from == NULL || to == NULL) + goto alloc_fail; if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); } else { guarantee_online_cpus(cs, cpus_attach); } - guarantee_online_mems(cs, &to); + guarantee_online_mems(cs, to); /* do per-task migration stuff possibly for each in the threadgroup */ - cpuset_attach_task(tsk, &to, cs); + cpuset_attach_task(tsk, to, cs); if (threadgroup) { struct task_struct *c; rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - cpuset_attach_task(c, &to, cs); + cpuset_attach_task(c, to, cs); } rcu_read_unlock(); } /* change mm; only needs to be done once even if threadgroup */ - from = oldcs->mems_allowed; - to = cs->mems_allowed; + *from = oldcs->mems_allowed; + *to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { - mpol_rebind_mm(mm, &to); + mpol_rebind_mm(mm, to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &from, &to); + cpuset_migrate_mm(mm, from, to); mmput(mm); } + +alloc_fail: + NODEMASK_FREE(from); + NODEMASK_FREE(to); } /* The various types of files and directories in a cpuset file system */ @@ -1558,13 +1579,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { - nodemask_t mask; + NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); + int retval; + + if (mask == NULL) + return -ENOMEM; mutex_lock(&callback_mutex); - mask = cs->mems_allowed; + *mask = cs->mems_allowed; mutex_unlock(&callback_mutex); - return nodelist_scnprintf(page, PAGE_SIZE, mask); + retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); + + NODEMASK_FREE(mask); + + return retval; } static ssize_t cpuset_common_file_read(struct cgroup *cont, @@ -1993,7 +2022,10 @@ static void scan_for_empty_cpusets(struct cpuset *root) struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ struct cgroup *cont; - nodemask_t oldmems; + NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); + + if (oldmems == NULL) + return; list_add_tail((struct list_head *)&root->stack_list, &queue); @@ -2010,7 +2042,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; - oldmems = cp->mems_allowed; + *oldmems = cp->mems_allowed; /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); @@ -2026,9 +2058,10 @@ static void scan_for_empty_cpusets(struct cpuset *root) remove_tasks_in_empty_cpuset(cp); else { update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, &oldmems, NULL); + update_tasks_nodemask(cp, oldmems, NULL); } } + NODEMASK_FREE(oldmems); } /* @@ -2086,16 +2119,19 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - nodemask_t oldmems; + NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); + + if (oldmems == NULL) + return NOTIFY_DONE; cgroup_lock(); switch (action) { case MEM_ONLINE: - oldmems = top_cpuset.mems_allowed; + *oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, &oldmems, NULL); + update_tasks_nodemask(&top_cpuset, oldmems, NULL); break; case MEM_OFFLINE: /* @@ -2108,6 +2144,8 @@ static int cpuset_track_online_nodes(struct notifier_block *self, break; } cgroup_unlock(); + + NODEMASK_FREE(oldmems); return NOTIFY_OK; } #endif -- cgit v1.2.3 From 4f598458ea4450f53e8ed929ee4e66b3404a7286 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Wed, 10 Mar 2010 22:59:13 +0100 Subject: Freezer: Only show the state of tasks refusing to freeze show_state will dump all tasks state, so if freezer failed to freeze any task, kernel will dump all tasks state and flood the dmesg log. This patch makes freezer only show state of tasks refusing to freeze. Signed-off-by: Xiaotian Feng Acked-by: Pavel Machek Acked-by: David Rientjes Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 5ade1bdcf366..a0480cd4daaf 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -88,12 +88,11 @@ static int try_to_freeze_tasks(bool sig_only) printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " "(%d tasks refusing to freeze):\n", elapsed_csecs / 100, elapsed_csecs % 100, todo); - show_state(); read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); if (freezing(p) && !freezer_should_skip(p)) - printk(KERN_ERR " %s\n", p->comm); + sched_show_task(p); cancel_freezing(p); task_unlock(p); } while_each_thread(g, p); -- cgit v1.2.3 From 5a7aadfe2fcb0f69e2acc1fbefe22a096e792fc9 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Fri, 26 Mar 2010 23:51:44 +0100 Subject: Freezer: Fix buggy resume test for tasks frozen with cgroup freezer When the cgroup freezer is used to freeze tasks we do not want to thaw those tasks during resume. Currently we test the cgroup freezer state of the resuming tasks to see if the cgroup is FROZEN. If so then we don't thaw the task. However, the FREEZING state also indicates that the task should remain frozen. This also avoids a problem pointed out by Oren Ladaan: the freezer state transition from FREEZING to FROZEN is updated lazily when userspace reads or writes the freezer.state file in the cgroup filesystem. This means that resume will thaw tasks in cgroups which should be in the FROZEN state if there is no read/write of the freezer.state file to trigger this transition before suspend. NOTE: Another "simple" solution would be to always update the cgroup freezer state during resume. However it's a bad choice for several reasons: Updating the cgroup freezer state is somewhat expensive because it requires walking all the tasks in the cgroup and checking if they are each frozen. Worse, this could easily make resume run in N^2 time where N is the number of tasks in the cgroup. Finally, updating the freezer state from this code path requires trickier locking because of the way locks must be ordered. Instead of updating the freezer state we rely on the fact that lazy updates only manage the transition from FREEZING to FROZEN. We know that a cgroup with the FREEZING state may actually be FROZEN so test for that state too. This makes sense in the resume path even for partially-frozen cgroups -- those that really are FREEZING but not FROZEN. Reported-by: Oren Ladaan Signed-off-by: Matt Helsley Cc: stable@kernel.org Signed-off-by: Rafael J. Wysocki --- kernel/cgroup_freezer.c | 9 ++++++--- kernel/power/process.c | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 59e9ef6aab40..eb3f34d57419 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -47,17 +47,20 @@ static inline struct freezer *task_freezer(struct task_struct *task) struct freezer, css); } -int cgroup_frozen(struct task_struct *task) +int cgroup_freezing_or_frozen(struct task_struct *task) { struct freezer *freezer; enum freezer_state state; task_lock(task); freezer = task_freezer(task); - state = freezer->state; + if (!freezer->css.cgroup->parent) + state = CGROUP_THAWED; /* root cgroup can't be frozen */ + else + state = freezer->state; task_unlock(task); - return state == CGROUP_FROZEN; + return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); } /* diff --git a/kernel/power/process.c b/kernel/power/process.c index a0480cd4daaf..71ae29052ab6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -144,7 +144,7 @@ static void thaw_tasks(bool nosig_only) if (nosig_only && should_send_signal(p)) continue; - if (cgroup_frozen(p)) + if (cgroup_freezing_or_frozen(p)) continue; thaw_process(p); -- cgit v1.2.3 From 259354deaaf03d49a02dbb9975d6ec2a54675672 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Mar 2010 18:56:10 +0900 Subject: module: encapsulate percpu handling better and record percpu_size Better encapsulate module static percpu area handling so that code outsidef of CONFIG_SMP ifdef doesn't deal with mod->percpu directly and add mod->percpu_size and record percpu_size in it. Both percpu fields are compiled out on UP. While at it, mark mod->percpu w/ __percpu. This is to prepare for is_module_percpu_address(). Signed-off-by: Tejun Heo Acked-by: Rusty Russell --- kernel/module.c | 66 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index c968d3606dca..e7a6e53fc73e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -370,27 +370,33 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -static void *percpu_modalloc(unsigned long size, unsigned long align, - const char *name) +static inline void __percpu *mod_percpu(struct module *mod) { - void *ptr; + return mod->percpu; +} +static int percpu_modalloc(struct module *mod, + unsigned long size, unsigned long align) +{ if (align > PAGE_SIZE) { printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", - name, align, PAGE_SIZE); + mod->name, align, PAGE_SIZE); align = PAGE_SIZE; } - ptr = __alloc_reserved_percpu(size, align); - if (!ptr) + mod->percpu = __alloc_reserved_percpu(size, align); + if (!mod->percpu) { printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", size); - return ptr; + return -ENOMEM; + } + mod->percpu_size = size; + return 0; } -static void percpu_modfree(void *freeme) +static void percpu_modfree(struct module *mod) { - free_percpu(freeme); + free_percpu(mod->percpu); } static unsigned int find_pcpusec(Elf_Ehdr *hdr, @@ -400,24 +406,28 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr, return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); } -static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) +static void percpu_modcopy(struct module *mod, + const void *from, unsigned long size) { int cpu; for_each_possible_cpu(cpu) - memcpy(pcpudest + per_cpu_offset(cpu), from, size); + memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); } #else /* ... !CONFIG_SMP */ -static inline void *percpu_modalloc(unsigned long size, unsigned long align, - const char *name) +static inline void __percpu *mod_percpu(struct module *mod) { return NULL; } -static inline void percpu_modfree(void *pcpuptr) +static inline int percpu_modalloc(struct module *mod, + unsigned long size, unsigned long align) +{ + return -ENOMEM; +} +static inline void percpu_modfree(struct module *mod) { - BUG(); } static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, @@ -425,8 +435,8 @@ static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, { return 0; } -static inline void percpu_modcopy(void *pcpudst, const void *src, - unsigned long size) +static inline void percpu_modcopy(struct module *mod, + const void *from, unsigned long size) { /* pcpusec should be 0, and size of that section should be 0. */ BUG_ON(size != 0); @@ -1400,8 +1410,7 @@ static void free_module(struct module *mod) /* This may be NULL, but that's OK */ module_free(mod, mod->module_init); kfree(mod->args); - if (mod->percpu) - percpu_modfree(mod->percpu); + percpu_modfree(mod); #if defined(CONFIG_MODULE_UNLOAD) if (mod->refptr) free_percpu(mod->refptr); @@ -1520,7 +1529,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, default: /* Divert to percpu allocation if a percpu var. */ if (sym[i].st_shndx == pcpuindex) - secbase = (unsigned long)mod->percpu; + secbase = (unsigned long)mod_percpu(mod); else secbase = sechdrs[sym[i].st_shndx].sh_addr; sym[i].st_value += secbase; @@ -1954,7 +1963,7 @@ static noinline struct module *load_module(void __user *umod, unsigned int modindex, versindex, infoindex, pcpuindex; struct module *mod; long err = 0; - void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ + void *ptr = NULL; /* Stops spurious gcc warning */ unsigned long symoffs, stroffs, *strmap; mm_segment_t old_fs; @@ -2094,15 +2103,11 @@ static noinline struct module *load_module(void __user *umod, if (pcpuindex) { /* We have a special allocation for this section. */ - percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, - sechdrs[pcpuindex].sh_addralign, - mod->name); - if (!percpu) { - err = -ENOMEM; + err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, + sechdrs[pcpuindex].sh_addralign); + if (err) goto free_mod; - } sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; - mod->percpu = percpu; } /* Determine total sizes, and put offsets in sh_entsize. For now @@ -2317,7 +2322,7 @@ static noinline struct module *load_module(void __user *umod, sort_extable(mod->extable, mod->extable + mod->num_exentries); /* Finally, copy percpu area over. */ - percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, + percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr, sechdrs[pcpuindex].sh_size); add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, @@ -2409,8 +2414,7 @@ static noinline struct module *load_module(void __user *umod, module_free(mod, mod->module_core); /* mod will be freed with core. Don't access it beyond this line! */ free_percpu: - if (percpu) - percpu_modfree(percpu); + percpu_modfree(mod); free_mod: kfree(args); kfree(strmap); -- cgit v1.2.3 From 10fad5e46f6c7bdfb01b1a012380a38e3c6ab346 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Mar 2010 18:57:54 +0900 Subject: percpu, module: implement and use is_kernel/module_percpu_address() lockdep has custom code to check whether a pointer belongs to static percpu area which is somewhat broken. Implement proper is_kernel/module_percpu_address() and replace the custom code. On UP, percpu variables are regular static variables and can't be distinguished from them. Always return %false on UP. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Rusty Russell Cc: Ingo Molnar --- kernel/lockdep.c | 21 +++++---------------- kernel/module.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c927a549db2c..9bbb9c841e48 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -582,9 +582,6 @@ static int static_obj(void *obj) unsigned long start = (unsigned long) &_stext, end = (unsigned long) &_end, addr = (unsigned long) obj; -#ifdef CONFIG_SMP - int i; -#endif /* * static variable? @@ -595,24 +592,16 @@ static int static_obj(void *obj) if (arch_is_kernel_data(addr)) return 1; -#ifdef CONFIG_SMP /* - * percpu var? + * in-kernel percpu var? */ - for_each_possible_cpu(i) { - start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); - end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM - + per_cpu_offset(i); - - if ((addr >= start) && (addr < end)) - return 1; - } -#endif + if (is_kernel_percpu_address(addr)) + return 1; /* - * module var? + * module static or percpu var? */ - return is_module_address(addr); + return is_module_address(addr) || is_module_percpu_address(addr); } /* diff --git a/kernel/module.c b/kernel/module.c index e7a6e53fc73e..9f8d23d8b3a8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -415,6 +415,40 @@ static void percpu_modcopy(struct module *mod, memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); } +/** + * is_module_percpu_address - test whether address is from module static percpu + * @addr: address to test + * + * Test whether @addr belongs to module static percpu area. + * + * RETURNS: + * %true if @addr is from module static percpu area + */ +bool is_module_percpu_address(unsigned long addr) +{ + struct module *mod; + unsigned int cpu; + + preempt_disable(); + + list_for_each_entry_rcu(mod, &modules, list) { + if (!mod->percpu_size) + continue; + for_each_possible_cpu(cpu) { + void *start = per_cpu_ptr(mod->percpu, cpu); + + if ((void *)addr >= start && + (void *)addr < start + mod->percpu_size) { + preempt_enable(); + return true; + } + } + } + + preempt_enable(); + return false; +} + #else /* ... !CONFIG_SMP */ static inline void __percpu *mod_percpu(struct module *mod) @@ -441,6 +475,10 @@ static inline void percpu_modcopy(struct module *mod, /* pcpusec should be 0, and size of that section should be 0. */ BUG_ON(size != 0); } +bool is_module_percpu_address(unsigned long addr) +{ + return false; +} #endif /* CONFIG_SMP */ -- cgit v1.2.3 From 88be12c440cfa2fa3f5be83507360aac9ea1c54e Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 29 Mar 2010 12:01:50 +0100 Subject: slow-work: use get_ref wrapper instead of directly calling get_ref Otherwise we can get an oops if the user has no get_ref/put_ref requirement. Signed-off-by: Dave Airlie Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/slow-work.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 7494bbf5a270..7d3f4fa9ef4f 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -637,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, goto cancelled; /* the timer holds a reference whilst it is pending */ - ret = work->ops->get_ref(work); + ret = slow_work_get_ref(work); if (ret < 0) goto cant_get_ref; -- cgit v1.2.3 From a53f4f9efaeb1d87cfae066346979d4d70e1abe9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 29 Mar 2010 13:08:52 +0100 Subject: SLOW_WORK: CONFIG_SLOW_WORK_PROC should be CONFIG_SLOW_WORK_DEBUG CONFIG_SLOW_WORK_PROC was changed to CONFIG_SLOW_WORK_DEBUG, but not in all instances. Change the remaining instances. This makes the debugfs file display the time mark and the owner's description again. Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/slow-work.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/slow-work.h b/kernel/slow-work.h index 321f3c59d732..a29ebd1ef41d 100644 --- a/kernel/slow-work.h +++ b/kernel/slow-work.h @@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); */ static inline void slow_work_set_thread_pid(int id, pid_t pid) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG slow_work_pids[id] = pid; #endif } static inline void slow_work_mark_time(struct slow_work *work) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG work->mark = CURRENT_TIME; #endif } static inline void slow_work_begin_exec(int id, struct slow_work *work) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG slow_work_execs[id] = work; #endif } static inline void slow_work_end_exec(int id, struct slow_work *work) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG write_lock(&slow_work_execs_lock); slow_work_execs[id] = NULL; write_unlock(&slow_work_execs_lock); -- cgit v1.2.3 From eed63519e3e74d515d2007ecd895338d0ba2a85c Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Sun, 28 Mar 2010 19:42:56 -0700 Subject: x86: Do not free zero sized per cpu areas This avoids an infinite loop in free_early_partial(). Add a warning to free_early_partial() to catch future problems. -v5: put back start > end back into WARN_ONCE() -v6: use one line for warning, suggested by Linus -v7: more tests -v8: remove the function name as suggested by Johannes WARN_ONCE() will print out that function name. Signed-off-by: Ian Campbell Signed-off-by: Yinghai Lu Tested-by: Konrad Rzeszutek Wilk Tested-by: Joel Becker Tested-by: Stanislaw Gruszka Acked-by: Johannes Weiner Cc: Peter Zijlstra Cc: David Miller Cc: Benjamin Herrenschmidt Cc: Linus Torvalds LKML-Reference: <1269830604-26214-4-git-send-email-yinghai@kernel.org> Signed-off-by: Ingo Molnar --- kernel/early_res.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/early_res.c b/kernel/early_res.c index 3cb2c661bb78..31aa9332ef3f 100644 --- a/kernel/early_res.c +++ b/kernel/early_res.c @@ -333,6 +333,12 @@ void __init free_early_partial(u64 start, u64 end) struct early_res *r; int i; + if (start == end) + return; + + if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end)) + return; + try_next: i = find_overlapped_early(start, end); if (i >= max_early_res) -- cgit v1.2.3 From e36673ec5126f15a8cddf6049aede7bdcf484c26 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Mar 2010 10:57:37 +0800 Subject: tracing: Fix lockdep warning in global_clock() # echo 1 > events/enable # echo global > trace_clock ------------[ cut here ]------------ WARNING: at kernel/lockdep.c:3162 check_flags+0xb2/0x190() ... ---[ end trace 3f86734a89416623 ]--- possible reason: unannotated irqs-on. ... There's no reason to use the raw_local_irq_save() in trace_clock_global. The local_irq_save() version is fine, and does not cause the bug in lockdep. Acked-by: Peter Zijlstra Signed-off-by: Li Zefan LKML-Reference: <4BA97FA1.7030606@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 6fbfb8f417b9..9d589d8dcd1a 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -84,7 +84,7 @@ u64 notrace trace_clock_global(void) int this_cpu; u64 now; - raw_local_irq_save(flags); + local_irq_save(