From e162b39a368f0401e41b558f430c354d12a85b37 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 15 Jan 2009 11:08:40 -0800 Subject: softlockup: decouple hung tasks check from softlockup detection Decoupling allows: * hung tasks check to happen at very low priority * hung tasks check and softlockup to be enabled/disabled independently at compile and/or run-time * individual panic settings to be enabled disabled independently at compile and/or run-time * softlockup threshold to be reduced without increasing hung tasks poll frequency (hung task check is expensive relative to softlock watchdog) * hung task check to be zero over-head when disabled at run-time Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 + kernel/hung_task.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/softlockup.c | 100 -------------------------- kernel/sysctl.c | 15 +++- 4 files changed, 213 insertions(+), 101 deletions(-) create mode 100644 kernel/hung_task.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 2aebc4cd7878..979745f1b4bc 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -74,6 +74,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o +obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/hung_task.c b/kernel/hung_task.c new file mode 100644 index 000000000000..ba5a77cad3bb --- /dev/null +++ b/kernel/hung_task.c @@ -0,0 +1,198 @@ +/* + * Detect Hung Task + * + * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Have a reasonable limit on the number of tasks checked: + */ +unsigned long __read_mostly sysctl_hung_task_check_count = 1024; + +/* + * Zero means infinite timeout - no checking done: + */ +unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; +static unsigned long __read_mostly hung_task_poll_jiffies; + +unsigned long __read_mostly sysctl_hung_task_warnings = 10; + +static int __read_mostly did_panic; + +static struct task_struct *watchdog_task; + +/* + * Should we panic (and reboot, if panic_timeout= is set) when a + * hung task is detected: + */ +unsigned int __read_mostly sysctl_hung_task_panic = + CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; + +static int __init hung_task_panic_setup(char *str) +{ + sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("hung_task_panic=", hung_task_panic_setup); + +static int +hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = hung_task_panic, +}; + +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. + */ +static unsigned long get_timestamp(void) +{ + int this_cpu = raw_smp_processor_id(); + + return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ +} + +static void check_hung_task(struct task_struct *t, unsigned long now) +{ + unsigned long switch_count = t->nvcsw + t->nivcsw; + + if (t->flags & PF_FROZEN) + return; + + if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { + t->last_switch_count = switch_count; + t->last_switch_timestamp = now; + return; + } + if ((long)(now - t->last_switch_timestamp) < + sysctl_hung_task_timeout_secs) + return; + if (!sysctl_hung_task_warnings) + return; + sysctl_hung_task_warnings--; + + /* + * Ok, the task did not get scheduled for more than 2 minutes, + * complain: + */ + printk(KERN_ERR "INFO: task %s:%d blocked for more than " + "%ld seconds.\n", t->comm, t->pid, + sysctl_hung_task_timeout_secs); + printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" + " disables this message.\n"); + sched_show_task(t); + __debug_show_held_locks(t); + + t->last_switch_timestamp = now; + touch_nmi_watchdog(); + + if (sysctl_hung_task_panic) + panic("hung_task: blocked tasks"); +} + +/* + * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for + * a really long time (120 seconds). If that happens, print out + * a warning. + */ +static void check_hung_uninterruptible_tasks(void) +{ + int max_count = sysctl_hung_task_check_count; + unsigned long now = get_timestamp(); + struct task_struct *g, *t; + + /* + * If the system crashed already then all bets are off, + * do not report extra hung tasks: + */ + if (test_taint(TAINT_DIE) || did_panic) + return; + + read_lock(&tasklist_lock); + do_each_thread(g, t) { + if (!--max_count) + goto unlock; + /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ + if (t->state == TASK_UNINTERRUPTIBLE) + check_hung_task(t, now); + } while_each_thread(g, t); + unlock: + read_unlock(&tasklist_lock); +} + +static void update_poll_jiffies(void) +{ + /* timeout of 0 will disable the watchdog */ + if (sysctl_hung_task_timeout_secs == 0) + hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT; + else + hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2; +} + +/* + * Process updating of timeout sysctl + */ +int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + + if (ret || !write) + goto out; + + update_poll_jiffies(); + + wake_up_process(watchdog_task); + + out: + return ret; +} + +/* + * kthread which checks for tasks stuck in D state + */ +static int watchdog(void *dummy) +{ + set_user_nice(current, 0); + update_poll_jiffies(); + + for ( ; ; ) { + while (schedule_timeout_interruptible(hung_task_poll_jiffies)); + check_hung_uninterruptible_tasks(); + } + + return 0; +} + +static int __init hung_task_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); + + return 0; +} + +module_init(hung_task_init); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 85d5a2455103..88796c330838 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -165,98 +165,12 @@ void softlockup_tick(void) panic("softlockup: hung tasks"); } -/* - * Have a reasonable limit on the number of tasks checked: - */ -unsigned long __read_mostly sysctl_hung_task_check_count = 1024; - -/* - * Zero means infinite timeout - no checking done: - */ -unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480; - -unsigned long __read_mostly sysctl_hung_task_warnings = 10; - -/* - * Only do the hung-tasks check on one CPU: - */ -static int check_cpu __read_mostly = -1; - -static void check_hung_task(struct task_struct *t, unsigned long now) -{ - unsigned long switch_count = t->nvcsw + t->nivcsw; - - if (t->flags & PF_FROZEN) - return; - - if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { - t->last_switch_count = switch_count; - t->last_switch_timestamp = now; - return; - } - if ((long)(now - t->last_switch_timestamp) < - sysctl_hung_task_timeout_secs) - return; - if (!sysctl_hung_task_warnings) - return; - sysctl_hung_task_warnings--; - - /* - * Ok, the task did not get scheduled for more than 2 minutes, - * complain: - */ - printk(KERN_ERR "INFO: task %s:%d blocked for more than " - "%ld seconds.\n", t->comm, t->pid, - sysctl_hung_task_timeout_secs); - printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" - " disables this message.\n"); - sched_show_task(t); - __debug_show_held_locks(t); - - t->last_switch_timestamp = now; - touch_nmi_watchdog(); - - if (softlockup_panic) - panic("softlockup: blocked tasks"); -} - -/* - * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for - * a really long time (120 seconds). If that happens, print out - * a warning. - */ -static void check_hung_uninterruptible_tasks(int this_cpu) -{ - int max_count = sysctl_hung_task_check_count; - unsigned long now = get_timestamp(this_cpu); - struct task_struct *g, *t; - - /* - * If the system crashed already then all bets are off, - * do not report extra hung tasks: - */ - if (test_taint(TAINT_DIE) || did_panic) - return; - - read_lock(&tasklist_lock); - do_each_thread(g, t) { - if (!--max_count) - goto unlock; - /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ - if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now); - } while_each_thread(g, t); - unlock: - read_unlock(&tasklist_lock); -} - /* * The watchdog thread - runs every second and touches the timestamp. */ static int watchdog(void *__bind_cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - int this_cpu = (long)__bind_cpu; sched_setscheduler(current, SCHED_FIFO, ¶m); @@ -276,11 +190,6 @@ static int watchdog(void *__bind_cpu) if (kthread_should_stop()) break; - if (this_cpu == check_cpu) { - if (sysctl_hung_task_timeout_secs) - check_hung_uninterruptible_tasks(this_cpu); - } - set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); @@ -312,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - check_cpu = cpumask_any(cpu_online_mask); wake_up_process(per_cpu(watchdog_task, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - if (hotcpu == check_cpu) { - /* Pick any other online cpu. */ - check_cpu = cpumask_any_but(cpu_online_mask, hotcpu); - } - break; - case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: if (!per_cpu(watchdog_task, hotcpu)) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 596dc31a7116..2481ed30d2b5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -805,6 +805,19 @@ static struct ctl_table kern_table[] = { .extra1 = &neg_one, .extra2 = &sixty, }, +#endif +#ifdef CONFIG_DETECT_HUNG_TASK + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_check_count", @@ -820,7 +833,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_hung_task_timeout_secs, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_dohung_task_timeout_secs, .strategy = &sysctl_intvec, }, { -- cgit v1.2.3 From af432eb1cc3178ec7109aca2283aafb1c12ccac1 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 16 Jan 2009 09:09:38 -0800 Subject: softlockup: fix to allow compiling with !DETECT_HUNG_TASK Fixes the following compile error: kernel/fork.c:1049: error: 'struct task_struct' has no member named 'last_switch_count' kernel/fork.c:1050: error: 'struct task_struct' has no member named 'last_switch_timestamp' Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1d68f1255dd8..fb9444282836 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1041,7 +1041,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->default_timer_slack_ns = current->timer_slack_ns; -#ifdef CONFIG_DETECT_SOFTLOCKUP +#ifdef CONFIG_DETECT_HUNG_TASK p->last_switch_count = 0; p->last_switch_timestamp = 0; #endif -- cgit v1.2.3 From 603a148f434742fff08273207ffa44176cad13a1 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Sat, 17 Jan 2009 10:31:48 -0800 Subject: softlockup: fix potential race in hung_task when resetting timeout Impact: fix potential false panic A potential race exists if sysctl_hung_task_timeout_secs is reset to 0 while inside check_hung_uniterruptible_tasks(). If check_task() is entered, a comparison with 0 will result in a false hung_task being detected. If sysctl_hung_task_panic is set, the system will panic. Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ba5a77cad3bb..ba8ccd432963 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -72,7 +72,8 @@ static unsigned long get_timestamp(void) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -static void check_hung_task(struct task_struct *t, unsigned long now) +static void check_hung_task(struct task_struct *t, unsigned long now, + unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; @@ -84,8 +85,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now) t->last_switch_timestamp = now; return; } - if ((long)(now - t->last_switch_timestamp) < - sysctl_hung_task_timeout_secs) + if ((long)(now - t->last_switch_timestamp) < timeout) return; if (!sysctl_hung_task_warnings) return; @@ -96,8 +96,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now) * complain: */ printk(KERN_ERR "INFO: task %s:%d blocked for more than " - "%ld seconds.\n", t->comm, t->pid, - sysctl_hung_task_timeout_secs); + "%ld seconds.\n", t->comm, t->pid, timeout); printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); @@ -115,7 +114,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now) * a really long time (120 seconds). If that happens, print out * a warning. */ -static void check_hung_uninterruptible_tasks(void) +static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; unsigned long now = get_timestamp(); @@ -134,7 +133,7 @@ static void check_hung_uninterruptible_tasks(void) goto unlock; /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now); + check_hung_task(t, now, timeout); } while_each_thread(g, t); unlock: read_unlock(&tasklist_lock); @@ -180,8 +179,17 @@ static int watchdog(void *dummy) update_poll_jiffies(); for ( ; ; ) { + unsigned long timeout; + while (schedule_timeout_interruptible(hung_task_poll_jiffies)); - check_hung_uninterruptible_tasks(); + + /* + * Need to cache timeout here to avoid timeout being set + * to 0 via sysctl while inside check_hung_*_tasks(). + */ + timeout = sysctl_hung_task_timeout_secs; + if (timeout) + check_hung_uninterruptible_tasks(timeout); } return 0; -- cgit v1.2.3 From ce9dbe244bf2063c41792e40dae7745957b118e0 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Wed, 4 Feb 2009 20:35:48 -0800 Subject: softlockup: check all tasks in hung_task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: extend the scope of hung-task checks Changed the default value of hung_task_check_count to PID_MAX_LIMIT. hung_task_batch_count added to put an upper bound on the critical section. Every hung_task_batch_count checks, the rcu lock is never held for a too long time. Keeping the critical section small minimizes time preemption is disabled and keeps rcu grace periods small. To prevent following a stale pointer, get_task_struct is called on g and t. To verify that g and t have not been unhashed while outside the critical section, the task states are checked. The design was proposed by Frédéric Weisbecker. Signed-off-by: Mandeep Singh Baines Suggested-by: Frédéric Weisbecker Acked-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ba8ccd432963..481ca8b5c2bc 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -17,9 +17,18 @@ #include /* - * Have a reasonable limit on the number of tasks checked: + * The number of tasks checked: */ -unsigned long __read_mostly sysctl_hung_task_check_count = 1024; +unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; + +/* + * Limit number of tasks checked in a batch. + * + * This value controls the preemptibility of khungtaskd since preemption + * is disabled during the critical section. It also controls the size of + * the RCU grace period. So it needs to be upper-bound. + */ +#define HUNG_TASK_BATCHING 1024 /* * Zero means infinite timeout - no checking done: @@ -109,6 +118,24 @@ static void check_hung_task(struct task_struct *t, unsigned long now, panic("hung_task: blocked tasks"); } +/* + * To avoid extending the RCU grace period for an unbounded amount of time, + * periodically exit the critical section and enter a new one. + * + * For preemptible RCU it is sufficient to call rcu_read_unlock in order + * exit the grace period. For classic RCU, a reschedule is required. + */ +static void rcu_lock_break(struct task_struct *g, struct task_struct *t) +{ + get_task_struct(g); + get_task_struct(t); + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + put_task_struct(t); + put_task_struct(g); +} + /* * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for * a really long time (120 seconds). If that happens, print out @@ -117,6 +144,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now, static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; + int batch_count = HUNG_TASK_BATCHING; unsigned long now = get_timestamp(); struct task_struct *g, *t; @@ -131,6 +159,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) do_each_thread(g, t) { if (!--max_count) goto unlock; + if (!--batch_count) { + batch_count = HUNG_TASK_BATCHING; + rcu_lock_break(g, t); + /* Exit if t or g was unhashed during refresh. */ + if (t->state == TASK_DEAD || g->state == TASK_DEAD) + goto unlock; + } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, now, timeout); -- cgit v1.2.3 From 94be52dc075a32af4aa73d7e10f68734d62d6af2 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 5 Feb 2009 09:56:08 -0800 Subject: softlockup: convert read_lock in hung_task to rcu_read_lock Since the tasklist is protected by rcu list operations, it is safe to convert the read_lock()s to rcu_read_lock(). Suggested-by: Peter Zijlstra Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 481ca8b5c2bc..3951a80e7cbe 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -155,7 +155,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; - read_lock(&tasklist_lock); + rcu_read_lock(); do_each_thread(g, t) { if (!--max_count) goto unlock; @@ -171,7 +171,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) check_hung_task(t, now, timeout); } while_each_thread(g, t); unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); } static void update_poll_jiffies(void) -- cgit v1.2.3 From 17406b82d621930cca8ccc1272cdac9a7dae8e40 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 6 Feb 2009 15:37:47 -0800 Subject: softlockup: remove timestamp checking from hung_task Impact: saves sizeof(long) bytes per task_struct By guaranteeing that sysctl_hung_task_timeout_secs have elapsed between tasklist scans we can avoid using timestamps. Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/fork.c | 8 +++----- kernel/hung_task.c | 48 +++++++++--------------------------------------- 2 files changed, 12 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index fb9444282836..bf582f75014b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -639,6 +639,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; +#ifdef CONFIG_DETECT_HUNG_TASK + tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; +#endif tsk->mm = NULL; tsk->active_mm = NULL; @@ -1041,11 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->default_timer_slack_ns = current->timer_slack_ns; -#ifdef CONFIG_DETECT_HUNG_TASK - p->last_switch_count = 0; - p->last_switch_timestamp = 0; -#endif - task_io_accounting_init(&p->ioac); acct_clear_integrals(p); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 3951a80e7cbe..0c924de58cb2 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -34,7 +34,6 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; -static unsigned long __read_mostly hung_task_poll_jiffies; unsigned long __read_mostly sysctl_hung_task_warnings = 10; @@ -69,33 +68,17 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; -/* - * Returns seconds, approximately. We don't need nanosecond - * resolution, and we don't need to waste time with a big divide when - * 2^30ns == 1.074s. - */ -static unsigned long get_timestamp(void) -{ - int this_cpu = raw_smp_processor_id(); - - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ -} - -static void check_hung_task(struct task_struct *t, unsigned long now, - unsigned long timeout) +static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; if (t->flags & PF_FROZEN) return; - if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { + if (switch_count != t->last_switch_count) { t->last_switch_count = switch_count; - t->last_switch_timestamp = now; return; } - if ((long)(now - t->last_switch_timestamp) < timeout) - return; if (!sysctl_hung_task_warnings) return; sysctl_hung_task_warnings--; @@ -111,7 +94,6 @@ static void check_hung_task(struct task_struct *t, unsigned long now, sched_show_task(t); __debug_show_held_locks(t); - t->last_switch_timestamp = now; touch_nmi_watchdog(); if (sysctl_hung_task_panic) @@ -145,7 +127,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; int batch_count = HUNG_TASK_BATCHING; - unsigned long now = get_timestamp(); struct task_struct *g, *t; /* @@ -168,19 +149,16 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now, timeout); + check_hung_task(t, timeout); } while_each_thread(g, t); unlock: rcu_read_unlock(); } -static void update_poll_jiffies(void) +static unsigned long timeout_jiffies(unsigned long timeout) { /* timeout of 0 will disable the watchdog */ - if (sysctl_hung_task_timeout_secs == 0) - hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT; - else - hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2; + return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT; } /* @@ -197,8 +175,6 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, if (ret || !write) goto out; - update_poll_jiffies(); - wake_up_process(watchdog_task); out: @@ -211,20 +187,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, static int watchdog(void *dummy) { set_user_nice(current, 0); - update_poll_jiffies(); for ( ; ; ) { - unsigned long timeout; + unsigned long timeout = sysctl_hung_task_timeout_secs; - while (schedule_timeout_interruptible(hung_task_poll_jiffies)); + while (schedule_timeout_interruptible(timeout_jiffies(timeout))) + timeout = sysctl_hung_task_timeout_secs; - /* - * Need to cache timeout here to avoid timeout being set - * to 0 via sysctl while inside check_hung_*_tasks(). - */ - timeout = sysctl_hung_task_timeout_secs; - if (timeout) - check_hung_uninterruptible_tasks(timeout); + check_hung_uninterruptible_tasks(timeout); } return 0; -- cgit v1.2.3 From cf2592f59c0e8ed4308adbdb2e0a88655379d579 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Feb 2009 16:52:37 +0100 Subject: softlockup: ensure the task has been switched out once When we check if a task has been switched out since the last scan, we might have a race condition on the following scenario: - the task is freshly created and scheduled - it puts its state to TASK_UNINTERRUPTIBLE and is not yet switched out - check_hung_task() scans this task and will report a false positive because t->nvcsw + t->nivcsw == t->last_switch_count == 0 Add a check for such cases. Signed-off-by: Frederic Weisbecker Acked-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0c924de58cb2..022a4927b785 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -72,7 +72,13 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; - if (t->flags & PF_FROZEN) + /* + * Ensure the task is not frozen. + * Also, when a freshly created task is scheduled once, changes + * its state to TASK_UNINTERRUPTIBLE without having ever been + * switched out once, it musn't be checked. + */ + if (unlikely(t->flags & PF_FROZEN || !switch_count)) return; if (switch_count != t->last_switch_count) { -- cgit v1.2.3 From 9f8d979f082a3ee1b27f32b7e0811b51c3ad1d15 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Feb 2009 13:10:17 +0100 Subject: softlockup: move 'one' to the softlockup section in sysctl.c CONFIG_SOFTLOCKUP=y || CONFIG_DETECT_HUNG_TASKS=y is now the only user of the 'one' constant in kernel/sysctl.c. Move it to the softlockup block of constants. This fixes a GCC warning. Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3b6b54c8ac0d..6d2aeff92b3d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -90,6 +90,9 @@ extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ /* Constants used for minimum and maximum */ +#if defined(CONFIG_DETECT_HUNG_TASK) || defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) +static int one = 1; +#endif #ifdef CONFIG_DETECT_SOFTLOCKUP static int sixty = 60; static int neg_one = -1; @@ -100,7 +103,6 @@ static int two = 2; #endif static int zero; -static int one = 1; static unsigned long one_ul = 1; static int one_hundred = 100; -- cgit v1.2.3 From 3aa551c9b4c40018f0e261a178e3d25478dc04a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 23 Mar 2009 18:28:15 +0100 Subject: genirq: add threaded interrupt handler support Add support for threaded interrupt handlers: A device driver can request that its main interrupt handler runs in a thread. To achive this the device driver requests the interrupt with request_threaded_irq() and provides additionally to the handler a thread function. The handler function is called in hard interrupt context and needs to check whether the interrupt originated from the device. If the interrupt originated from the device then the handler can either return IRQ_HANDLED or IRQ_WAKE_THREAD. IRQ_HANDLED is returned when no further action is required. IRQ_WAKE_THREAD causes the genirq code to invoke the threaded (main) handler. When IRQ_WAKE_THREAD is returned handler must have disabled the interrupt on the device level. This is mandatory for shared interrupt handlers, but we need to do it as well for obscure x86 hardware where disabling an interrupt on the IO_APIC level redirects the interrupt to the legacy PIC interrupt lines. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- kernel/exit.c | 2 + kernel/irq/handle.c | 31 ++++++++- kernel/irq/manage.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 211 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 167e1e3ad7c6..ca0b3488c4a9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1037,6 +1037,8 @@ NORET_TYPE void do_exit(long code) schedule(); } + exit_irq_thread(); + exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 9ebf77968871..fe8f45374e86 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -357,8 +357,37 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) do { ret = action->handler(irq, action->dev_id); - if (ret == IRQ_HANDLED) + + switch (ret) { + case IRQ_WAKE_THREAD: + /* + * Wake up the handler thread for this + * action. In case the thread crashed and was + * killed we just pretend that we handled the + * interrupt. The hardirq handler above has + * disabled the device interrupt, so no irq + * storm is lurking. + */ + if (likely(!test_bit(IRQTF_DIED, + &action->thread_flags))) { + set_bit(IRQTF_RUNTHREAD, &action->thread_flags); + wake_up_process(action->thread); + } + + /* + * Set it to handled so the spurious check + * does not trigger. + */ + ret = IRQ_HANDLED; + /* Fall through to add to randomness */ + case IRQ_HANDLED: status |= action->flags; + break; + + default: + break; + } + retval |= ret; action = action->next; } while (action); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6458e99984c0..a4c1ab86cd25 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -8,16 +8,15 @@ */ #include +#include #include #include #include #include +#include #include "internals.h" -#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) -cpumask_var_t irq_default_affinity; - /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -53,9 +52,18 @@ void synchronize_irq(unsigned int irq) /* Oops, that failed? */ } while (status & IRQ_INPROGRESS); + + /* + * We made sure that no hardirq handler is running. Now verify + * that no threaded handlers are active. + */ + wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); } EXPORT_SYMBOL(synchronize_irq); +#ifdef CONFIG_SMP +cpumask_var_t irq_default_affinity; + /** * irq_can_set_affinity - Check if the affinity of a given irq can be set * @irq: Interrupt to check @@ -72,6 +80,18 @@ int irq_can_set_affinity(unsigned int irq) return 1; } +static void +irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) +{ + struct irqaction *action = desc->action; + + while (action) { + if (action->thread) + set_cpus_allowed_ptr(action->thread, cpumask); + action = action->next; + } +} + /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity @@ -100,6 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_copy(desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); #endif + irq_set_thread_affinity(desc, cpumask); desc->status |= IRQ_AFFINITY_SET; spin_unlock_irqrestore(&desc->lock, flags); return 0; @@ -150,6 +171,8 @@ int irq_select_affinity_usr(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc); + if (!ret) + irq_set_thread_affinity(desc, desc->affinity); spin_unlock_irqrestore(&desc->lock, flags); return ret; @@ -384,6 +407,93 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return ret; } +static inline int irq_thread_should_run(struct irqaction *action) +{ + return test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags); +} + +static int irq_wait_for_interrupt(struct irqaction *action) +{ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (irq_thread_should_run(action)) { + __set_current_state(TASK_RUNNING); + return 0; + } else + schedule(); + } + return -1; +} + +/* + * Interrupt handler thread + */ +static int irq_thread(void *data) +{ + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; + struct irqaction *action = data; + struct irq_desc *desc = irq_to_desc(action->irq); + int wake; + + sched_setscheduler(current, SCHED_FIFO, ¶m); + current->irqaction = action; + + while (!irq_wait_for_interrupt(action)) { + + atomic_inc(&desc->threads_active); + + spin_lock_irq(&desc->lock); + if (unlikely(desc->status & IRQ_DISABLED)) { + /* + * CHECKME: We might need a dedicated + * IRQ_THREAD_PENDING flag here, which + * retriggers the thread in check_irq_resend() + * but AFAICT IRQ_PENDING should be fine as it + * retriggers the interrupt itself --- tglx + */ + desc->status |= IRQ_PENDING; + spin_unlock_irq(&desc->lock); + } else { + spin_unlock_irq(&desc->lock); + + action->thread_fn(action->irq, action->dev_id); + } + + wake = atomic_dec_and_test(&desc->threads_active); + + if (wake && waitqueue_active(&desc->wait_for_threads)) + wake_up(&desc->wait_for_threads); + } + + /* + * Clear irqaction. Otherwise exit_irq_thread() would make + * fuzz about an active irq thread going into nirvana. + */ + current->irqaction = NULL; + return 0; +} + +/* + * Called from do_exit() + */ +void exit_irq_thread(void) +{ + struct task_struct *tsk = current; + + if (!tsk->irqaction) + return; + + printk(KERN_ERR + "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", + tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); + + /* + * Set the THREAD DIED flag to prevent further wakeups of the + * soon to be gone threaded handler. + */ + set_bit(IRQTF_DIED, &tsk->irqaction->flags); +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -419,6 +529,26 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) rand_initialize_irq(irq); } + /* + * Threaded handler ? + */ + if (new->thread_fn) { + struct task_struct *t; + + t = kthread_create(irq_thread, new, "irq/%d-%s", irq, + new->name); + if (IS_ERR(t)) + return PTR_ERR(t); + /* + * We keep the reference to the task struct even if + * the thread dies to avoid that the interrupt code + * references an already freed task_struct. + */ + get_task_struct(t); + new->thread = t; + wake_up_process(t); + } + /* * The following block of code has to be executed atomically */ @@ -456,15 +586,15 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!shared) { irq_chip_set_defaults(desc->chip); + init_waitqueue_head(&desc->wait_for_threads); + /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { ret = __irq_set_trigger(desc, irq, new->flags & IRQF_TRIGGER_MASK); - if (ret) { - spin_unlock_irqrestore(&desc->lock, flags); - return ret; - } + if (ret) + goto out_thread; } else compat_irq_chip_set_default_handler(desc); #if defined(CONFIG_IRQ_PER_CPU) @@ -532,8 +662,19 @@ mismatch: dump_stack(); } #endif + ret = -EBUSY; + +out_thread: spin_unlock_irqrestore(&desc->lock, flags); - return -EBUSY; + if (new->thread) { + struct task_struct *t = new->thread; + + new->thread = NULL; + if (likely(!test_bit(IRQTF_DIED, &new->thread_flags))) + kthread_stop(t); + put_task_struct(t); + } + return ret; } /** @@ -559,6 +700,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; + struct task_struct *irqthread; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -605,6 +747,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) else desc->chip->disable(irq); } + + irqthread = action->thread; + action->thread = NULL; + spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -612,6 +758,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Make sure it's not being used on another CPU: */ synchronize_irq(irq); + if (irqthread) { + if (!test_bit(IRQTF_DIED, &action->thread_flags)) + kthread_stop(irqthread); + put_task_struct(irqthread); + } + #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ @@ -664,9 +816,12 @@ void free_irq(unsigned int irq, void *dev_id) EXPORT_SYMBOL(free_irq); /** - * request_irq - allocate an interrupt line + * request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs + * @handler: Function to be called when the IRQ occurs. + * Primary handler for threaded interrupts + * @thread_fn: Function called from the irq handler thread + * If NULL, no irq thread is created * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function @@ -678,6 +833,15 @@ EXPORT_SYMBOL(free_irq); * raises, you must take care both to initialise your hardware * and to set up the interrupt handler in the right order. * + * If you want to set up a threaded irq handler for your device + * then you need to supply @handler and @thread_fn. @handler ist + * still called in hard interrupt context and has to check + * whether the interrupt originates from the device. If yes it + * needs to disable the interrupt on the device and return + * IRQ_THREAD_WAKE which will wake up the handler thread and run + * @thread_fn. This split handler design is necessary to support + * shared interrupts. + * * Dev_id must be globally unique. Normally the address of the * device data structure is used as the cookie. Since the handler * receives this value it makes sense to use it. @@ -693,8 +857,9 @@ EXPORT_SYMBOL(free_irq); * IRQF_TRIGGER_* Specify active edge(s) or level * */ -int request_irq(unsigned int irq, irq_handler_t handler, - unsigned long irqflags, const char *devname, void *dev_id) +int request_threaded_irq(unsigned int irq, irq_handler_t handler, + irq_handler_t thread_fn, unsigned long irqflags, + const char *devname, void *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -742,6 +907,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, return -ENOMEM; action->handler = handler; + action->thread_fn = thread_fn; action->flags = irqflags; action->name = devname; action->dev_id = dev_id; @@ -771,4 +937,4 @@ int request_irq(unsigned int irq, irq_handler_t handler, #endif return retval; } -EXPORT_SYMBOL(request_irq); +EXPORT_SYMBOL(request_threaded_irq); -- cgit v1.2.3 From 935bd5b971f0df7c06d214d022cf8392e2f37952 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 23 Mar 2009 18:28:16 +0100 Subject: genirq: add support for threaded interrupts to devres Some devices use devres_request_irq() for to install their interrupt handler. Add support for threaded interrupts to devres as well. [tglx - simplified and adapted to latest threadirq version] Signed-off-by: Arjan van de Ven Signed-off-by: Thomas Gleixner --- kernel/irq/devres.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 38a25b8d8bff..d06df9c41cba 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -26,10 +26,12 @@ static int devm_irq_match(struct device *dev, void *res, void *data) } /** - * devm_request_irq - allocate an interrupt line for a managed device + * devm_request_threaded_irq - allocate an interrupt line for a managed device * @dev: device to request interrupt for * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs + * @thread_fn: function to be called in a threaded interrupt context. NULL + * for devices which handle everything in @handler * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function @@ -42,9 +44,10 @@ static int devm_irq_match(struct device *dev, void *res, void *data) * If an IRQ allocated with this function needs to be freed * separately, dev_free_irq() must be used. */ -int devm_request_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, unsigned long irqflags, - const char *devname, void *dev_id) +int devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + unsigned long irqflags, const char *devname, + void *dev_id) { struct irq_devres *dr; int rc; @@ -54,7 +57,8 @@ int devm_request_irq(struct device *dev, unsigned int irq, if (!dr) return -ENOMEM; - rc = request_irq(irq, handler, irqflags, devname, dev_id); + rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname, + dev_id); if (rc) { devres_free(dr); return rc; @@ -66,7 +70,7 @@ int devm_request_irq(struct device *dev, unsigned int irq, return 0; } -EXPORT_SYMBOL(devm_request_irq); +EXPORT_SYMBOL(devm_request_threaded_irq); /** * devm_free_irq - free an interrupt -- cgit v1.2.3 From f48fe81e5b032914183e9a17052313720c2cac56 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Mar 2009 11:46:22 +0100 Subject: genirq: threaded irq handlers review fixups Delta patch to address the review comments. - Implement warning when IRQ_WAKE_THREAD is requested and no thread handler installed - coding style fixes Pointed-out-by: Christoph Hellwig Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 29 ++++++++++++++++++++++++----- kernel/irq/manage.c | 17 +++++++---------- 2 files changed, 31 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index fe8f45374e86..38b49a9e508a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -338,6 +338,15 @@ irqreturn_t no_action(int cpl, void *dev_id) return IRQ_NONE; } +static void warn_no_thread(unsigned int irq, struct irqaction *action) +{ + if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags)) + return; + + printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD " + "but no thread function available.", irq, action->name); +} + /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number @@ -360,6 +369,21 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) switch (ret) { case IRQ_WAKE_THREAD: + /* + * Set result to handled so the spurious check + * does not trigger. + */ + ret = IRQ_HANDLED; + + /* + * Catch drivers which return WAKE_THREAD but + * did not set up a thread function + */ + if (unlikely(!action->thread_fn)) { + warn_no_thread(irq, action); + break; + } + /* * Wake up the handler thread for this * action. In case the thread crashed and was @@ -374,11 +398,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) wake_up_process(action->thread); } - /* - * Set it to handled so the spurious check - * does not trigger. - */ - ret = IRQ_HANDLED; /* Fall through to add to randomness */ case IRQ_HANDLED: status |= action->flags; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a4c1ab86cd25..a3eb7baf1e46 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -407,20 +407,17 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return ret; } -static inline int irq_thread_should_run(struct irqaction *action) -{ - return test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags); -} - static int irq_wait_for_interrupt(struct irqaction *action) { while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); - if (irq_thread_should_run(action)) { + + if (test_and_clear_bit(IRQTF_RUNTHREAD, + &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; - } else - schedule(); + } + schedule(); } return -1; } @@ -820,8 +817,8 @@ EXPORT_SYMBOL(free_irq); * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Primary handler for threaded interrupts - * @thread_fn: Function called from the irq handler thread - * If NULL, no irq thread is created + * @thread_fn: Function called from the irq handler thread + * If NULL, no irq thread is created * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function -- cgit v1.2.3 From a18b83b7ef3c98cd8b4bb885e4a649a8f30fb7b0 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 23 Mar 2009 10:02:53 +0530 Subject: cpuacct: make cpuacct hierarchy walk in cpuacct_charge() safe when rcupreempt is used -v2 Impact: fix cgroups race under rcu-preempt cpuacct_charge() obtains task's ca and does a hierarchy walk upwards. This can race with the task's movement between cgroups. This race can cause an access to freed ca pointer in cpuacct_charge() or access to invalid cgroups pointer of the task. This will not happen with rcu or tree rcu as cpuacct_charge() is called with preemption disabled. However if rcupreempt is used, the race is seen. Thanks to Li Zefan for explaining this. Fix this race by explicitly protecting ca and the hierarchy walk with rcu_read_lock(). Changes for v2: - Update patch descrition (as per Li Zefan's review comments). - Remove comments in cpuacct_charge() which explained why rcu_read_lock() was needed (as per Peter Zijlstra's review comments). Signed-off-by: Bharata B Rao Cc: Dhaval Giani Cc: Li Zefan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Acked-by: Peter Zijlstra Acked-by: Balbir Singh Tested-by: Balbir Singh Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 186c6fd08acf..cc397aae5eae 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9654,12 +9654,17 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) return; cpu = task_cpu(tsk); + + rcu_read_lock(); + ca = task_ca(tsk); for (; ca; ca = ca->parent) { u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } + + rcu_read_unlock(); } struct cgroup_subsys cpuacct_subsys = { -- cgit v1.2.3 From 13b8bd0a5713bdf05659019badd7c0407984ece1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 25 Mar 2009 15:01:22 +1030 Subject: sched_rt: don't allocate cpumask in fastpath Impact: cleanup As pointed out by Steven Rostedt. Since the arg in question is unused, we simply change cpupri_find() to accept NULL. Reported-by: Steven Rostedt Signed-off-by: Rusty Russell LKML-Reference: <200903251501.22664.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- kernel/sched_cpupri.c | 5 +++-- kernel/sched_rt.c | 15 ++++----------- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 1e00bfacf9b8..cdd3c89574cd 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -55,7 +55,7 @@ static int convert_prio(int prio) * cpupri_find - find the best (lowest-pri) CPU in the system * @cp: The cpupri context * @p: The task - * @lowest_mask: A mask to fill in with selected CPUs + * @lowest_mask: A mask to fill in with selected CPUs (or NULL) * * Note: This function returns the recommended CPUs as calculated during the * current invokation. By the time the call returns, the CPUs may have in @@ -81,7 +81,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + if (lowest_mask) + cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); return 1; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bac1061cea2f..fbec5a58ff10 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -805,20 +805,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync) static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - cpumask_var_t mask; - if (rq->curr->rt.nr_cpus_allowed == 1) return; - if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) - return; - if (p->rt.nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, mask)) - goto free; + && cpupri_find(&rq->rd->cpupri, p, NULL)) + return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask)) - goto free; + if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + return; /* * There appears to be other cpus that can accept @@ -827,8 +822,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) */ requeue_task_rt(rq, p, 1); resched_task(rq->curr); -free: - free_cpumask_var(mask); } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From c5f8d99585d7b5b7e857fabf8aefd0174903a98c Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 31 Mar 2009 16:56:03 +0900 Subject: posixtimers, sched: Fix posix clock monotonicity Impact: Regression fix (against clock_gettime() backwarding bug) This patch re-introduces a couple of functions, task_sched_runtime and thread_group_sched_runtime, which was once removed at the time of 2.6.28-rc1. These functions protect the sampling of thread/process clock with rq lock. This rq lock is required not to update rq->clock during the sampling. i.e. The clock_gettime() may return ((accounted runtime before update) + (delta after update)) that is less than what it should be. v2 -> v3: - Rename static helper function __task_delta_exec() to do_task_delta_exec() since -tip tree already has a __task_delta_exec() of different version. v1 -> v2: - Revises comments of function and patch description. - Add note about accuracy of thread group's runtime. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: stable@kernel.org [2.6.28.x][2.6.29.x] LKML-Reference: <49D1CC93.4080401@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 7 ++--- kernel/sched.c | 65 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 61 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index fa07da94d7be..4318c3085788 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, cpu->cpu = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p); + cpu->sched = task_sched_runtime(p); break; } return 0; @@ -240,18 +240,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock, { struct task_cputime cputime; - thread_group_cputime(p, &cputime); switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: + thread_group_cputime(p, &cputime); cpu->cpu = cputime_add(cputime.utime, cputime.stime); break; case CPUCLOCK_VIRT: + thread_group_cputime(p, &cputime); cpu->cpu = cputime.utime; break; case CPUCLOCK_SCHED: - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + cpu->sched = thread_group_sched_runtime(p); break; } return 0; diff --git a/kernel/sched.c b/kernel/sched.c index cc397aae5eae..c8d7f17bd036 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4139,9 +4139,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * Return any ns on the sched_clock that have not yet been banked in + * Return any ns on the sched_clock that have not yet been accounted in * @p in case that task is currently running. + * + * Called with task_rq_lock() held on @rq. */ +static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) +{ + u64 ns = 0; + + if (task_current(rq, p)) { + update_rq_clock(rq); + ns = rq->clock - p->se.exec_start; + if ((s64)ns < 0) + ns = 0; + } + + return ns; +} + unsigned long long task_delta_exec(struct task_struct *p) { unsigned long flags; @@ -4149,16 +4165,49 @@ unsigned long long task_delta_exec(struct task_struct *p) u64 ns = 0; rq = task_rq_lock(p, &flags); + ns = do_task_delta_exec(p, rq); + task_rq_unlock(rq, &flags); - if (task_current(rq, p)) { - u64 delta_exec; + return ns; +} - update_rq_clock(rq); - delta_exec = rq->clock - p->se.exec_start; - if ((s64)delta_exec > 0) - ns = delta_exec; - } +/* + * Return accounted runtime for the task. + * In case the task is currently running, return the runtime plus current's + * pending runtime that have not been accounted yet. + */ +unsigned long long task_sched_runtime(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + u64 ns = 0; + + rq = task_rq_lock(p, &flags); + ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); + task_rq_unlock(rq, &flags); + + return ns; +} +/* + * Return sum_exec_runtime for the thread group. + * In case the task is currently running, return the sum plus current's + * pending runtime that have not been accounted yet. + * + * Note that the thread group might have other running tasks as well, + * so the return value not includes other pending runtime that other + * running tasks might have. + */ +unsigned long long thread_group_sched_runtime(struct task_struct *p) +{ + struct task_cputime totals; + unsigned long flags; + struct rq *rq; + u64 ns; + + rq = task_rq_lock(p, &flags); + thread_group_cputime(p, &totals); + ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); task_rq_unlock(rq, &flags); return ns; -- cgit v1.2.3 From ef12fefabf94b6a902ad3abd3eb124b00560c445 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Tue, 31 Mar 2009 10:02:22 +0530 Subject: cpuacct: add per-cgroup utime/stime statistics Add per-cgroup cpuacct controller statistics like the system and user time consumed by the group of tasks. Changelog: v7 - Changed the name of the statistic from utime to user and from stime to system so that in future we could easily add other statistics like irq, softirq, steal times etc easily. v6 - Fixed a bug in the error path of cpuacct_create() (pointed by Li Zefan). v5 - In cpuacct_stats_show(), use cputime64_to_clock_t() since we are operating on a 64bit variable here. v4 - Remove comments in cpuacct_update_stats() which explained why rcu_read_lock() was needed (as per Peter Zijlstra's review comments). - Don't say that percpu_counter_read() is broken in Documentation/cpuacct.txt as per KAMEZAWA Hiroyuki's review comments. v3 - Fix a small race in the cpuacct hierarchy walk. v2 - stime and utime now exported in clock_t units instead of msecs. - Addressed the code review comments from Balbir and Li Zefan. - Moved to -tip tree. v1 - Moved the stime/utime accounting to cpuacct controller. Earlier versions - http://lkml.org/lkml/2009/2/25/129 Signed-off-by: Bharata B Rao Signed-off-by: Balaji Rao Cc: Dhaval Giani Cc: Paul Menage Cc: Andrew Morton Cc: KAMEZAWA Hiroyuki Reviewed-by: Li Zefan Acked-by: Peter Zijlstra Acked-by: Balbir Singh Tested-by: Balbir Singh LKML-Reference: <20090331043222.GA4093@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 81 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c8d7f17bd036..8d1bdbe8aafc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1393,10 +1393,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, struct rq_iterator *iterator); #endif +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + #ifdef CONFIG_CGROUP_CPUACCT static void cpuacct_charge(struct task_struct *tsk, u64 cputime); +static void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val); #else static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} +static inline void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val) {} #endif static inline void inc_cpu_load(struct rq *rq, unsigned long load) @@ -4236,6 +4248,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime, cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); + + cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); /* Account for user time used */ acct_update_integrals(p); } @@ -4297,6 +4311,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset, else cpustat->system = cputime64_add(cpustat->system, tmp); + cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); + /* Account for system time used */ acct_update_integrals(p); } @@ -9539,6 +9555,7 @@ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ u64 *cpuusage; + struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; struct cpuacct *parent; }; @@ -9563,20 +9580,32 @@ static struct cgroup_subsys_state *cpuacct_create( struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + int i; if (!ca) - return ERR_PTR(-ENOMEM); + goto out; ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) { - kfree(ca); - return ERR_PTR(-ENOMEM); - } + if (!ca->cpuusage) + goto out_free_ca; + + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + if (percpu_counter_init(&ca->cpustat[i], 0)) + goto out_free_counters; if (cgrp->parent) ca->parent = cgroup_ca(cgrp->parent); return &ca->css; + +out_free_counters: + while (--i >= 0) + percpu_counter_destroy(&ca->cpustat[i]); + free_percpu(ca->cpuusage); +out_free_ca: + kfree(ca); +out: + return ERR_PTR(-ENOMEM); } /* destroy an existing cpu accounting group */ @@ -9584,7 +9613,10 @@ static void cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); + int i; + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + percpu_counter_destroy(&ca->cpustat[i]); free_percpu(ca->cpuusage); kfree(ca); } @@ -9671,6 +9703,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, return 0; } +static const char *cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int i; + + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { + s64 val = percpu_counter_read(&ca->cpustat[i]); + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[i], val); + } + return 0; +} + static struct cftype files[] = { { .name = "usage", @@ -9681,7 +9732,10 @@ static struct cftype files[] = { .name = "usage_percpu", .read_seq_string = cpuacct_percpu_seq_read, }, - + { + .name = "stat", + .read_map = cpuacct_stats_show, + }, }; static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) @@ -9716,6 +9770,27 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_unlock(); } +/* + * Charge the system/user time to the task's accounting group. + */ +static void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val) +{ + struct cpuacct *ca; + + if (unlikely(!cpuacct_subsys.active)) + return; + + rcu_read_lock(); + ca = task_ca(tsk); + + do { + percpu_counter_add(&ca->cpustat[idx], val); + ca = ca->parent; + } while (ca); + rcu_read_unlock(); +} + struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", .create = cpuacct_create, -- cgit v1.2.3 From 46e0bb9c12f4bab539736f1714cbf16600f681ec Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Mon, 30 Mar 2009 10:25:20 +0530 Subject: sched: Print sched_group::__cpu_power in sched_domain_debug Impact: extend debug info /proc/sched_debug If the user changes the value of the sched_mc/smt_power_savings sysfs tunable, it'll trigger a rebuilding of the whole sched_domain tree, with the SD_POWERSAVINGS_BALANCE flag set at certain levels. As a result, there would be a change in the __cpu_power of sched_groups in the sched_domain hierarchy. Print the __cpu_power values for each sched_group in sched_domain_debug to help verify this change and correlate it with the change in the load-balancing behavior. Signed-off-by: Gautham R Shenoy Cc: Peter Zijlstra LKML-Reference: <20090330045520.2869.24777.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8d1bdbe8aafc..6234d10c6a79 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6963,7 +6963,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_or(groupmask, groupmask, sched_group_cpus(group)); cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); - printk(KERN_CONT " %s", str); + printk(KERN_CONT " %s (__cpu_power = %d)", str, + group->__cpu_power); group = group->next; } while (group != sd->groups); -- cgit v1.2.3 From 633fe795b80693a8198e7d82f66538a72d2bbba2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 1 Apr 2009 17:47:23 -0700 Subject: timers: add missing kernel-doc Add missing kernel-doc parameter notation and change function name to its new name: Warning(kernel/timer.c:543): No description found for parameter 'name' Warning(kernel/timer.c:543): No description found for parameter 'key' Signed-off-by: Randy Dunlap Cc: akpm Cc: Johannes Berg LKML-Reference: <20090401174723.f0bea0eb.randy.dunla