From f685ceacab07d3f6c236f04803e2f2f0dbcc5afb Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Oct 2009 23:09:22 +0200 Subject: sched: Strengthen buddies and mitigate buddy induced latencies This patch restores the effectiveness of LAST_BUDDY in preventing pgsql+oltp from collapsing due to wakeup preemption. It also switches LAST_BUDDY to exclusively do what it does best, namely mitigate the effects of aggressive wakeup preemption, which improves vmark throughput markedly, and restores mysql+oltp scalability. Since buddies are about scalability, enable them beginning at the point where we begin expanding sched_latency, namely sched_nr_latency. Previously, buddies were cleared aggressively, which seriously reduced their effectiveness. Not clearing aggressively however, produces a small drop in mysql+oltp throughput immediately after peak, indicating that LAST_BUDDY is actually doing some harm. This is right at the point where X on the desktop in competition with another load wants low latency service. Ergo, do not enable until we need to scale. To mitigate latency induced by buddies, or by a task just missing wakeup preemption, check latency at tick time. Last hunk prevents buddies from stymieing BALANCE_NEWIDLE via CACHE_HOT_BUDDY. Supporting performance tests: tip = v2.6.32-rc5-1497-ga525b32 tipx = NO_GENTLE_FAIR_SLEEPERS NEXT_BUDDY granularity knobs = 31 knobs + 31 buddies tip+x = NO_GENTLE_FAIR_SLEEPERS granularity knobs = 31 knobs (Three run averages except where noted.) vmark: ------ tip 108466 messages per second tip+ 125307 messages per second tip+x 125335 messages per second tipx 117781 messages per second 2.6.31.3 122729 messages per second mysql+oltp: ----------- clients 1 2 4 8 16 32 64 128 256 .......................................................................................... tip 9949.89 18690.20 34801.24 34460.04 32682.88 30765.97 28305.27 25059.64 19548.08 tip+ 10013.90 18526.84 34900.38 34420.14 33069.83 32083.40 30578.30 28010.71 25605.47 tipx 9698.71 18002.70 34477.56 33420.01 32634.30 31657.27 29932.67 26827.52 21487.18 2.6.31.3 8243.11 18784.20 34404.83 33148.38 31900.32 31161.90 29663.81 25995.94 18058.86 pgsql+oltp: ----------- clients 1 2 4 8 16 32 64 128 256 .......................................................................................... tip 13686.37 26609.25 51934.28 51347.81 49479.51 45312.65 36691.91 26851.57 24145.35 tip+ (1x) 13907.85 27135.87 52951.98 52514.04 51742.52 50705.43 49947.97 48374.19 46227.94 tip+x 13906.78 27065.81 52951.19 52542.59 52176.11 51815.94 50838.90 49439.46 46891.00 tipx 13742.46 26769.81 52351.99 51891.73 51320.79 50938.98 50248.65 48908.70 46553.84 2.6.31.3 13815.35 26906.46 52683.34 52061.31 51937.10 51376.80 50474.28 49394.47 47003.25 Signed-off-by: Mike Galbraith Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 73 ++++++++++++++++++++++++++++++++++------------------- 2 files changed, 48 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 789001da0a94..cae6700bedb3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2008,7 +2008,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && + if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c32c3e643daa..37087a7fac22 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -822,6 +822,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * re-elected due to buddy favours. */ clear_buddies(cfs_rq, curr); + return; + } + + /* + * Ensure that a task that missed wakeup preemption by a + * narrow margin doesn't have to wait for a full slice. + * This also mitigates buddy induced latencies under load. + */ + if (!sched_feat(WAKEUP_PREEMPT)) + return; + + if (delta_exec < sysctl_sched_min_granularity) + return; + + if (cfs_rq->nr_running > 1) { + struct sched_entity *se = __pick_next_entity(cfs_rq); + s64 delta = curr->vruntime - se->vruntime; + + if (delta > ideal_runtime) + resched_task(rq_of(cfs_rq)->curr); } } @@ -861,21 +881,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); - struct sched_entity *buddy; + struct sched_entity *left = se; - if (cfs_rq->next) { - buddy = cfs_rq->next; - cfs_rq->next = NULL; - if (wakeup_preempt_entity(buddy, se) < 1) - return buddy; - } + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; - if (cfs_rq->last) { - buddy = cfs_rq->last; - cfs_rq->last = NULL; - if (wakeup_preempt_entity(buddy, se) < 1) - return buddy; - } + /* + * Prefer last buddy, try to return the CPU to a preempted task. + */ + if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) + se = cfs_rq->last; + + clear_buddies(cfs_rq, se); return se; } @@ -1577,6 +1594,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int sync = wake_flags & WF_SYNC; + int scale = cfs_rq->nr_running >= sched_nr_latency; update_curr(cfs_rq); @@ -1591,18 +1609,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(se == pse)) return; - /* - * Only set the backward buddy when the current task is still on the - * rq. This can happen when a wakeup gets interleaved with schedule on - * the ->pre_schedule() or idle_balance() point, either of which can - * drop the rq lock. - * - * Also, during early boot the idle thread is in the fair class, for - * obvious reasons its a bad idea to schedule back to the idle thread. - */ - if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) - set_last_buddy(se); - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) set_next_buddy(pse); /* @@ -1648,8 +1655,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) + if (wakeup_preempt_entity(se, pse) == 1) { resched_task(curr); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved + * with schedule on the ->pre_schedule() or idle_balance() + * point, either of which can * drop the rq lock. + * + * Also, during early boot the idle thread is in the fair class, + * for obvious reasons its a bad idea to schedule back to it. + */ + if (unlikely(!se->on_rq || curr == rq->idle)) + return; + if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + set_last_buddy(se); + } } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v1.2.3 From 49557e620339cb134127b5bfbcfecc06b77d0232 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 2 Nov 2009 20:37:20 +1030 Subject: sched: Fix boot crash by zalloc()ing most of the cpu masks I got a boot crash when forcing cpumasks offstack on 32 bit, because find_new_ilb() returned 3 on my UP system (nohz.cpu_mask wasn't zeroed). AFAICT the others need to be zeroed too: only nohz.ilb_grp_nohz_mask is initialized before use. Signed-off-by: Rusty Russell Cc: Peter Zijlstra LKML-Reference: <200911022037.21282.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cae6700bedb3..bf21adb6c9fc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9535,13 +9535,13 @@ void __init sched_init(void) current->sched_class = &fair_sched_class; /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ perf_event_init(); -- cgit v1.2.3 From b00bc0b237055b4c45816325ee14f0bd83e6f590 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 Nov 2009 13:01:56 +0100 Subject: uids: Prevent tear down race Ingo triggered the following warning: WARNING: at lib/debugobjects.c:255 debug_print_object+0x42/0x50() Hardware name: System Product Name ODEBUG: init active object type: timer_list Modules linked in: Pid: 2619, comm: dmesg Tainted: G W 2.6.32-rc5-tip+ #5298 Call Trace: [<81035443>] warn_slowpath_common+0x6a/0x81 [<8120e483>] ? debug_print_object+0x42/0x50 [<81035498>] warn_slowpath_fmt+0x29/0x2c [<8120e483>] debug_print_object+0x42/0x50 [<8120ec2a>] __debug_object_init+0x279/0x2d7 [<8120ecb3>] debug_object_init+0x13/0x18 [<810409d2>] init_timer_key+0x17/0x6f [<81041526>] free_uid+0x50/0x6c [<8104ed2d>] put_cred_rcu+0x61/0x72 [<81067fac>] rcu_do_batch+0x70/0x121 debugobjects warns about an enqueued timer being initialized. If CONFIG_USER_SCHED=y the user management code uses delayed work to remove the user from the hash table and tear down the sysfs objects. free_uid is called from RCU and initializes/schedules delayed work if the usage count of the user_struct is 0. The init/schedule happens outside of the uidhash_lock protected region which allows a concurrent caller of find_user() to reference the about to be destroyed user_struct w/o preventing the work from being scheduled. If the next free_uid call happens before the work timer expired then the active timer is initialized and the work scheduled again. The race was introduced in commit 5cb350ba (sched: group scheduling, sysfs tunables) and made more prominent by commit 3959214f (sched: delayed cleanup of user_struct) Move the init/schedule_delayed_work inside of the uidhash_lock protected region to prevent the race. Signed-off-by: Thomas Gleixner Acked-by: Dhaval Giani Cc: Paul E. McKenney Cc: Kay Sievers Cc: stable@kernel.org --- kernel/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 2c000e7132ac..46d0165ca70c 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -330,9 +330,9 @@ done: */ static void free_user(struct user_struct *up, unsigned long flags) { - spin_unlock_irqrestore(&uidhash_lock, flags); INIT_DELAYED_WORK(&up->work, cleanup_user_struct); schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); + spin_unlock_irqrestore(&uidhash_lock, flags); } #else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ -- cgit v1.2.3 From 83f5b01ffbbaea6f97c9a79d21e240dbfb69f2f1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Oct 2009 08:14:49 -0700 Subject: rcu: Fix long-grace-period race between forcing and initialization Very long RCU read-side critical sections (50 milliseconds or so) can cause a race between force_quiescent_state() and rcu_start_gp() as follows on kernel builds with multi-level rcu_node hierarchies: 1. CPU 0 calls force_quiescent_state(), sees that there is a grace period in progress, and acquires ->fsqlock. 2. CPU 1 detects the end of the grace period, and so cpu_quiet_msk_finish() sets rsp->completed to rsp->gpnum. This operation is carried out under the root rnp->lock, but CPU 0 has not yet acquired that lock. Note that rsp->signaled is still RCU_SAVE_DYNTICK from the last grace period. 3. CPU 1 calls rcu_start_gp(), but no one wants a new grace period, so it drops the root rnp->lock and returns. 4. CPU 0 acquires the root rnp->lock and picks up rsp->completed and rsp->signaled, then drops rnp->lock. It then enters the RCU_SAVE_DYNTICK leg of the switch statement. 5. CPU 2 invokes call_rcu(), and now needs a new grace period. It calls rcu_start_gp(), which acquires the root rnp->lock, sets rsp->signaled to RCU_GP_INIT (too bad that CPU 0 is already in the RCU_SAVE_DYNTICK leg of the switch statement!) and starts initializing the rcu_node hierarchy. If there are multiple levels to the hierarchy, it will drop the root rnp->lock and initialize the lower levels of the hierarchy. 6. CPU 0 notes that rsp->completed has not changed, which permits both CPU 2 and CPU 0 to try updating it concurrently. If CPU 0's update prevails, later calls to force_quiescent_state() can count old quiescent states against the new grace period, which can in turn result in premature ending of grace periods. Not good. This patch adds an RCU_GP_IDLE state for rsp->signaled that is set initially at boot time and any time a grace period ends. This prevents CPU 0 from getting into the workings of force_quiescent_state() in step 4. Additional locking and checks prevent the concurrent update of rsp->signaled in step 6. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1256742889199-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 16 +++++++++++----- kernel/rcutree.h | 7 ++++--- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0536125b0497..f3077c0ab181 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -59,7 +59,7 @@ NUM_RCU_LVL_2, \ NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ }, \ - .signaled = RCU_SIGNAL_INIT, \ + .signaled = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ @@ -657,14 +657,17 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) * irqs disabled. */ rcu_for_each_node_breadth_first(rsp, rnp) { - spin_lock(&rnp->lock); /* irqs already disabled. */ + spin_lock(&rnp->lock); /* irqs already disabled. */ rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; - spin_unlock(&rnp->lock); /* irqs already disabled. */ + spin_unlock(&rnp->lock); /* irqs remain disabled. */ } + rnp = rcu_get_root(rsp); + spin_lock(&rnp->lock); /* irqs already disabled. */ rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ + spin_unlock(&rnp->lock); /* irqs remain disabled. */ spin_unlock_irqrestore(&rsp->onofflock, flags); } @@ -706,6 +709,7 @@ static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); rsp->completed = rsp->gpnum; + rsp->signaled = RCU_GP_IDLE; rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ } @@ -1162,9 +1166,10 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) } spin_unlock(&rnp->lock); switch (signaled) { + case RCU_GP_IDLE: case RCU_GP_INIT: - break; /* grace period still initializing, ignore. */ + break; /* grace period idle or initializing, ignore. */ case RCU_SAVE_DYNTICK: @@ -1178,7 +1183,8 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) /* Update state, record completion counter. */ spin_lock(&rnp->lock); - if (lastcomp == rsp->completed) { + if (lastcomp == rsp->completed && + rsp->signaled == RCU_SAVE_DYNTICK) { rsp->signaled = RCU_FORCE_QS; dyntick_record_completed(rsp, lastcomp); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1823c6e20609..1899023b0962 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -201,9 +201,10 @@ struct rcu_data { }; /* Values for signaled field in struct rcu_state. */ -#define RCU_GP_INIT 0 /* Grace period being initialized. */ -#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ -#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ +#define RCU_GP_IDLE 0 /* No grace period in progress. */ +#define RCU_GP_INIT 1 /* Grace period being initialized. */ +#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ +#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ #ifdef CONFIG_NO_HZ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK #else /* #ifdef CONFIG_NO_HZ */ -- cgit v1.2.3 From b84ff7d6f1b7f8a43414e74d972ec4c8f3361db4 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 29 Oct 2009 11:48:30 +0100 Subject: sched: Fix kthread_bind() by moving the body of kthread_bind() to sched.c Eric Paris reported that commit f685ceacab07d3f6c236f04803e2f2f0dbcc5afb causes boot time PREEMPT_DEBUG complaints. [ 4.590699] BUG: using smp_processor_id() in preemptible [00000000] code: rmmod/1314 [ 4.593043] caller is task_hot+0x86/0xd0 Since kthread_bind() messes with scheduler internals, move the body to sched.c, and lock the runqueue. Reported-by: Eric Paris Signed-off-by: Mike Galbraith Tested-by: Eric Paris Cc: Peter Zijlstra LKML-Reference: <1256813310.7574.3.camel@marge.simson.net> [ v2: fix !SMP build and clean up ] Signed-off-by: Ingo Molnar --- kernel/kthread.c | 23 ----------------------- kernel/sched.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 5fe709982caa..ab7ae57773e1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -149,29 +149,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create); -/** - * kthread_bind - bind a just-created kthread to a cpu. - * @k: thread created by kthread_create(). - * @cpu: cpu (might not be online, must be possible) for @k to run on. - * - * Description: This function is equivalent to set_cpus_allowed(), - * except that @cpu doesn't need to be online, and the thread must be - * stopped (i.e., just returned from kthread_create()). - */ -void kthread_bind(struct task_struct *k, unsigned int cpu) -{ - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - set_task_cpu(k, cpu); - k->cpus_allowed = cpumask_of_cpu(cpu); - k->rt.nr_cpus_allowed = 1; - k->flags |= PF_THREAD_BOUND; -} -EXPORT_SYMBOL(kthread_bind); - /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). diff --git a/kernel/sched.c b/kernel/sched.c index bf21adb6c9fc..5cb7d637e33a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1996,6 +1996,38 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio, running); } +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @k: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create()). + * + * Function lives here instead of kthread.c because it messes with + * scheduler internals which require locking. + */ +void kthread_bind(struct task_struct *p, unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { + WARN_ON(1); + return; + } + + spin_lock_irqsave(&rq->lock, flags); + set_task_cpu(p, cpu); + p->cpus_allowed = cpumask_of_cpu(cpu); + p->rt.nr_cpus_allowed = 1; + p->flags |= PF_THREAD_BOUND; + spin_unlock_irqrestore(&rq->lock, flags); +} +EXPORT_SYMBOL(kthread_bind); + #ifdef CONFIG_SMP /* * Is this task likely cache-hot: -- cgit v1.2.3 From f7112949f6a4cd6883d66c882d568c2197321de6 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 3 Nov 2009 19:42:45 +0800 Subject: ring-buffer: Synchronize resizing buffer with reader lock We got a sudden panic when we reduced the size of the ringbuffer. We can reproduce the panic by the following steps: echo 1 > events/sched/enable cat trace_pipe > /dev/null & while ((1)) do echo 12000 > buffer_size_kb echo 512 > buffer_size_kb done (not more than 5 seconds, panic ...) Reported-by: KOSAKI Motohiro Signed-off-by: Lai Jiangshan LKML-Reference: <4AF01735.9060409@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3ffa502fb243..5dd017fea6f5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1193,6 +1193,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); + spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1207,6 +1208,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) return; rb_reset_cpu(cpu_buffer); + spin_unlock_irq(&cpu_buffer->reader_lock); rb_check_pages(cpu_buffer); -- cgit v1.2.3 From ed146b25942b428f8e8056587b7638ce76573c2f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 3 Nov 2009 08:55:38 +0800 Subject: ftrace: Fix unmatched locking in ftrace_regex_write() When a command is passed to the set_ftrace_filter, then the ftrace_regex_lock is still held going back to user space. # echo 'do_open : foo' > set_ftrace_filter (still holding ftrace_regex_lock when returning to user space!) Signed-off-by: Li Zefan LKML-Reference: <4AEF7F8A.3080300@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9c451a1930b6..6dc4e5ef7a01 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2222,15 +2222,15 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, ret = ftrace_process_regex(parser->buffer, parser->idx, enable); if (ret) - goto out; + goto out_unlock; trace_parser_clear(parser); } ret = read; - +out_unlock: mutex_unlock(&ftrace_regex_lock); -out: + return ret; } -- cgit v1.2.3 From e7e7e0c084ef862d5754701108d4a038514d6314 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sat, 7 Nov 2009 11:16:13 +0800 Subject: genirq: try_one_irq() must be called with irq disabled Prarit reported: ================================= [ INFO: inconsistent lock state ] 2.6.32-rc5 #1 --------------------------------- inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. swapper/0 [HC0[0]:SC1[1]:HE1:SE0] takes: (&irq_desc_lock_class){?.-...}, at: [] try_one_irq+0x32/0x138 {IN-HARDIRQ-W} state was registered at: [] __lock_acquire+0x2fc/0xd5d [] lock_acquire+0xf3/0x12d [] _spin_lock+0x40/0x89 [] handle_level_irq+0x30/0x105 [] handle_irq+0x95/0xb7 [] do_IRQ+0x6a/0xe0 [] ret_from_intr+0x0/0x16 irq event stamp: 195096 hardirqs last enabled at (195096): [] _spin_unlock_irq+0x3a/0x5c hardirqs last disabled at (195095): [] _spin_lock_irq+0x29/0x95 softirqs last enabled at (195088): [] __do_softirq+0x1c1/0x1ef softirqs last disabled at (195093): [] call_softirq+0x1c/0x30 other info that might help us debug this: 1 lock held by swapper/0: #0: (kernel/irq/spurious.c:21){+.-...}, at: [] run_timer_softirq+0x1a9/0x315 stack backtrace: Pid: 0, comm: swapper Not tainted 2.6.32-rc5 #1 Call Trace: [] valid_state+0x187/0x1ae [] mark_lock+0x129/0x253 [] __lock_acquire+0x370/0xd5d [] lock_acquire+0xf3/0x12d [] _spin_lock+0x40/0x89 [] try_one_irq+0x32/0x138 [] poll_all_shared_irqs+0x41/0x6d [] poll_spurious_irqs+0x1c/0x49 [] run_timer_softirq+0x239/0x315 [] __do_softirq+0x102/0x1ef [] call_softirq+0x1c/0x30 [] do_softirq+0x59/0xca [] irq_exit+0x58/0xae [] smp_apic_timer_interrupt+0x94/0xba [] apic_timer_interrupt+0x13/0x20 The reason is that try_one_irq() is called from hardirq context with interrupts disabled and from softirq context (poll_all_shared_irqs()) with interrupts enabled. Disable interrupts before calling it from poll_all_shared_irqs(). Reported-and-tested-by: Prarit Bhargava Signed-off-by: Yong Zhang LKML-Reference: <1257563773-4620-1-git-send-email-yong.zhang0@gmail.com> Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 114e704760fe..bd7273e6282e 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -121,7 +121,9 @@ static void poll_all_shared_irqs(void) if (!(status & IRQ_SPURIOUS_DISABLED)) continue; + local_irq_disable(); try_one_irq(i, desc); + local_irq_enable(); } } -- cgit v1.2.3 From 968c86458a5975efa7a95f832a4ec9fb21471137 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 6 Nov 2009 15:31:08 -0800 Subject: sched: Fix kernel-doc function parameter name Fix variable name in sched.c kernel-doc notation. Fixes this DocBook warning: Warning(kernel/sched.c:2008): No description found for parameter 'p' Warning(kernel/sched.c:2008): Excess function parameter 'k' description in 'kthread_bind' Signed-off-by: Randy Dunlap LKML-Reference: <4AF4B1BC.8020604@oracle.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 28dd4f490bfc..7d7d5fcca4cb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1994,7 +1994,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, /** * kthread_bind - bind a just-created kthread to a cpu. - * @k: thread created by kthread_create(). + * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. * * Description: This function is equivalent to set_cpus_allowed(), -- cgit v1.2.3 From e9036b36eed4d3cdb33fa9cbcdd9888ae516889f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 26 Oct 2009 22:24:14 +0300 Subject: sched: Use root_task_group_empty only with FAIR_GROUP_SCHED root_task_group_empty is used only with FAIR_GROUP_SCHED so if we use other scheduler options we get: kernel/sched.c:314: warning: 'root_task_group_empty' defined but not used So move CONFIG_FAIR_GROUP_SCHED up that it covers root_task_group_empty(). Signed-off-by: Cyrill Gorcunov Cc: Peter Zijlstra LKML-Reference: <20091026192414.GB5321@lenovo> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7d7d5fcca4cb..3c11ae0a948d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); */ static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_FAIR_GROUP_SCHED + #ifdef CONFIG_SMP static int root_task_group_empty(void) { @@ -316,7 +318,6 @@ static int root_task_group_empty(void) } #endif -#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) #else /* !CONFIG_USER_SCHED */ -- cgit v1.2.3 From 9398180097e359646d46083c3e079a54e20bee82 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Nov 2009 14:06:20 -0800 Subject: workqueue: fix race condition in schedule_on_each_cpu() Commit 65a64464349883891e21e74af16c05d6e1eeb4e9 ("HWPOISON: Allow schedule_on_each_cpu() from keventd") which allows schedule_on_each_cpu() to be called from keventd added a race condition. schedule_on_each_cpu() may race with cpu hotplug and end up executing the function twice on a cpu. Fix it by moving direct execution into the section protected with get/put_online_cpus(). While at it, update code such that direct execution is done after works have been scheduled for all other cpus and drop unnecessary cpu != orig test from flush loop. Signed-off-by: Tejun Heo Cc: Andi Kleen Acked-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 12328147132c..67e526b6ae81 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -692,31 +692,29 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; + get_online_cpus(); + /* - * when running in keventd don't schedule a work item on itself. - * Can just call directly because the work queue is already bound. - * This also is faster. - * Make this a generic parameter for other workqueues? + * When running in keventd don't schedule a work item on + * itself. Can just call directly because the work queue is + * already bound. This also is faster. */ - if (current_is_keventd()) { + if (current_is_keventd()) orig = raw_smp_processor_id(); - INIT_WORK(per_cpu_ptr(works, orig), func); - func(per_cpu_ptr(works, orig)); - } - get_online_cpus(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); - if (cpu == orig) - continue; INIT_WORK(work, func); - schedule_work_on(cpu, work); - } - for_each_online_cpu(cpu) { if (cpu != orig) - flush_work(per_cpu_ptr(works, cpu)); + schedule_work_on(cpu, work); } + if (orig >= 0) + func(per_cpu_ptr(works, orig)); + + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(works, cpu)); + put_online_cpus(); free_percpu(works); return 0; -- cgit v1.2.3 From 3d7a641e544e428191667e8b1f83f96fa46dbd65 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:10:23 +0000 Subject: SLOW_WORK: Wait for outstanding work items belonging to a module to clear Wait for outstanding slow work items belonging to a module to clear when unregistering that module as a user of the facility. This prevents the put_ref code of a work item from being taken away before it returns. Signed-off-by: David Howells --- kernel/slow-work.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 126 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 0d31135efbf4..dd08f376e406 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -22,6 +22,8 @@ #define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after * OOM */ +#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */ + static void slow_work_cull_timeout(unsigned long); static void slow_work_oom_timeout(unsigned long); @@ -46,7 +48,7 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process #ifdef CONFIG_SYSCTL static const int slow_work_min_min_threads = 2; -static int slow_work_max_max_threads = 255; +static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT; static const int slow_work_min_vslow = 1; static const int slow_work_max_vslow = 99; @@ -97,6 +99,23 @@ static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0); static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); static struct slow_work slow_work_new_thread; /* new thread starter */ +/* + * slow work ID allocation (use slow_work_queue_lock) + */ +static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT); + +/* + * Unregistration tracking to prevent put_ref() from disappearing during module + * unload + */ +#ifdef CONFIG_MODULES +static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT]; +static struct module *slow_work_unreg_module; +static struct slow_work *slow_work_unreg_work_item; +static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq); +static DEFINE_MUTEX(slow_work_unreg_sync_lock); +#endif + /* * The queues of work items and the lock governing access to them. These are * shared between all the CPUs. It doesn't make sense to have per-CPU queues @@ -149,8 +168,11 @@ static unsigned slow_work_calc_vsmax(void) * Attempt to execute stuff queued on a slow thread. Return true if we managed * it, false if there was nothing to do. */ -static bool slow_work_execute(void) +static bool slow_work_execute(int id) { +#ifdef CONFIG_MODULES + struct module *module; +#endif struct slow_work *work = NULL; unsigned vsmax; bool very_slow; @@ -186,6 +208,12 @@ static bool slow_work_execute(void) } else { very_slow = false; /* avoid the compiler warning */ } + +#ifdef CONFIG_MODULES + if (work) + slow_work_thread_processing[id] = work->owner; +#endif + spin_unlock_irq(&slow_work_queue_lock); if (!work) @@ -219,7 +247,18 @@ static bool slow_work_execute(void) spin_unlock_irq(&slow_work_queue_lock); } + /* sort out the race between module unloading and put_ref() */ work->ops->put_ref(work); + +#ifdef CONFIG_MODULES + module = slow_work_thread_processing[id]; + slow_work_thread_processing[id] = NULL; + smp_mb(); + if (slow_work_unreg_work_item == work || + slow_work_unreg_module == module) + wake_up_all(&slow_work_unreg_wq); +#endif + return true; auto_requeue: @@ -232,6 +271,7 @@ auto_requeue: else list_add_tail(&work->link, &slow_work_queue); spin_unlock_irq(&slow_work_queue_lock); + slow_work_thread_processing[id] = NULL; return true; } @@ -368,13 +408,22 @@ static inline bool slow_work_available(int vsmax) */ static int slow_work_thread(void *_data) { - int vsmax; + int vsmax, id; DEFINE_WAIT(wait); set_freezable(); set_user_nice(current, -5); + /* allocate ourselves an ID */ + spin_lock_irq(&slow_work_queue_lock); + id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT); + BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT); + __set_bit(id, slow_work_ids); + spin_unlock_irq(&slow_work_queue_lock); + + sprintf(current->comm, "kslowd%03u", id); + for (;;) { vsmax = vslow_work_proportion; vsmax *= atomic_read(&slow_work_thread_count); @@ -395,7 +444,7 @@ static int slow_work_thread(void *_data) vsmax *= atomic_read(&slow_work_thread_count); vsmax /= 100; - if (slow_work_available(vsmax) && slow_work_execute()) { + if (slow_work_available(vsmax) && slow_work_execute(id)) { cond_resched(); if (list_empty(&slow_work_queue) && list_empty(&vslow_work_queue) && @@ -412,6 +461,10 @@ static int slow_work_thread(void *_data) break; } + spin_lock_irq(&slow_work_queue_lock); + __clear_bit(id, slow_work_ids); + spin_unlock_irq(&slow_work_queue_lock); + if (atomic_dec_and_test(&slow_work_thread_count)) complete_and_exit(&slow_work_last_thread_exited, 0); return 0; @@ -475,6 +528,7 @@ static void slow_work_new_thread_execute(struct slow_work *work) } static const struct slow_work_ops slow_work_new_thread_ops = { + .owner = THIS_MODULE, .get_ref = slow_work_new_thread_get_ref, .put_ref = slow_work_new_thread_put_ref, .execute = slow_work_new_thread_execute, @@ -546,12 +600,13 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, /** * slow_work_register_user - Register a user of the facility + * @module: The module about to make use of the facility * * Register a user of the facility, starting up the initial threads if there * aren't any other users at this point. This will return 0 if successful, or * an error if not. */ -int slow_work_register_user(void) +int slow_work_register_user(struct module *module) { struct task_struct *p; int loop; @@ -598,14 +653,79 @@ error: } EXPORT_SYMBOL(slow_work_register_user); +/* + * wait for all outstanding items from the calling module to complete + * - note that more items may be queued whilst we're waiting + */ +static void slow_work_wait_for_items(struct module *module) +{ + DECLARE_WAITQUEUE(myself, current); + struct slow_work *work; + int loop; + + mutex_lock(&slow_work_unreg_sync_lock); + add_wait_queue(&slow_work_unreg_wq, &myself); + + for (;;) { + spin_lock_irq(&slow_work_queue_lock); + + /* first of all, we wait for the last queued item in each list + * to be processed */ + list_for_each_entry_reverse(work, &vslow_work_queue, link) { + if (work->owner == module) { + set_current_state(TASK_UNINTERRUPTIBLE); + slow_work_unreg_work_item = work; + goto do_wait; + } + } + list_for_each_entry_reverse(work, &slow_work_queue, link) { + if (work->owner == module) { + set_current_state(TASK_UNINTERRUPTIBLE); + slow_work_unreg_work_item = work; + goto do_wait; + } + } + + /* then we wait for the items being processed to finish */ + slow_work_unreg_module = module; + smp_mb(); + for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) { + if (slow_work_thread_processing[loop] == module) + goto do_wait; + } + spin_unlock_irq(&slow_work_queue_lock); + break; /* okay, we're done */ + + do_wait: + spin_unlock_irq(&slow_work_queue_lock); + schedule(); + slow_work_unreg_work_item = NULL; + slow_work_unreg_module = NULL; + } + + remove_wait_queue(&slow_work_unreg_wq, &myself); + mutex_unlock(&slow_work_unreg_sync_lock); +} + /** * slow_work_unregister_user - Unregister a user of the facility + * @module: The module whose items should be cleared * * Unregister a user of the facility, killing all the threads if this was the * last one. + * + * This waits for all the work items belonging to the nominated module to go + * away before proceeding. */ -void slow_work_unregister_user(void) +void slow_work_unregister_user(struct module *module) { + /* first of all, wait for all outstanding items from the calling module + * to complete */ + if (module) + slow_work_wait_for_items(module); + + /* then we can actually go about shutting down the facility if need + * be */ mutex_lock(&slow_work_user_lock); BUG_ON(slow_work_user_count <= 0); -- cgit v1.2.3 From 4d8bb2cbccf6dccaada509aafeb01c6205c9d8c4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Nov 2009 18:10:39 +0000 Subject: SLOW_WORK: Make slow_work_ops ->get_ref/->put_ref optional Make the ability for the slow-work facility to take references on a work item optional as not everyone requires this. Even the internal slow-work stubs them out, so those can be got rid of too. Signed-off-by: Jens Axboe Signed-off-by: David Howells --- kernel/slow-work.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/slow-work.c b/kernel/slow-work.c index dd08f376e406..fccf421eb5c1 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -145,6 +145,20 @@ static DECLARE_COMPLETION(slow_work_last_thread_exited); static int slow_work_user_count; static DEFINE_MUTEX(slow_work_user_lock); +static inline int slow_work_get_ref(struct slow_work *work) +{ + if (work->ops->get_ref) + return work->ops->get_ref(work); + + return 0; +} + +static inline void slow_work_put_ref(struct slow_work *work) +{ + if (work->ops->put_ref) + work->ops->put_ref(work); +} + /* * Calculate the maximum number of active threads in the pool that are * permitted to process very slow work items. @@ -248,7 +262,7 @@ static bool slow_work_execute(int id) } /* sort out the race between module unloading and put_ref() */ - work->ops->put_ref(work); + slow_work_put_ref(work); #ifdef CONFIG_MODULES module = slow_work_thread_processing[id]; @@ -309,7 +323,6 @@ int slow_work_enqueue(struct slow_work *work) BUG_ON(slow_work_user_count <= 0); BUG_ON(!work); BUG_ON(!work->ops); - BUG_ON(!work->ops->get_ref); /* when honouring an enqueue request, we only promise that we will run * the work function in the future; we do not promise to run it once @@ -339,7 +352,7 @@ int slow_work_enqueue(struct slow_work *work) if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); } else { - if (work->ops->get_ref(work) < 0) + if (slow_work_get_ref(work) < 0) goto cant_get_ref; if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) list_add_tail(&work->link, &vslow_work_queue); @@ -479,21 +492,6 @@ static void slow_work_cull_timeout(unsigned long data) wake_up(&slow_work_thread_wq); } -/* - * Get a reference on slow work thread starter - */ -static int slow_work_new_thread_get_ref(struct slow_work *work) -{ - return 0; -} - -/* - * Drop a reference on slow work thread starter - */ -static void slow_work_new_thread_put_ref(struct slow_work *work) -{ -} - /* * Start a new slow work thread */ @@ -529,8 +527,6 @@ static void slow_work_new_thread_execute(struct slow_work *work) static const struct slow_work_ops slow_work_new_thread_ops = { .owner = THIS_MODULE, - .get_ref = slow_work_new_thread_get_ref, - .put_ref = slow_work_new_thread_put_ref, .execute = slow_work_new_thread_execute, }; -- cgit v1.2.3 From 0160950297c08f8233c89b9f9e7dd59cfb080809 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Nov 2009 18:10:43 +0000 Subject: SLOW_WORK: Add support for cancellation of slow work Add support for cancellation of queued slow work and delayed slow work items. The cancellation functions will wait for items that are pending or undergoing execution to be discarded by the slow work facility. Attempting to enqueue work that is in the process of being cancelled will result in ECANCELED. Signed-off-by: Jens Axboe Signed-off-by: David Howells --- kernel/slow-work.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/slow-work.c b/kernel/slow-work.c index fccf421eb5c1..671cc434532a 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -236,12 +236,17 @@ static bool slow_work_execute(int id) if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) BUG(); - work->ops->execute(work); + /* don't execute if the work is in the process of being cancelled */ + if (!test_bit(SLOW_WORK_CANCELLING, &work->flags)) + work->ops->execute(work); if (very_slow) atomic_dec(&vslow_work_executing_count); clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); + /* wake up anyone waiting for this work to be complete */ + wake_up_bit(&work->flags, SLOW_WORK_EXECUTING); + /* if someone tried to enqueue the item whilst we were executing it, * then it'll be left unenqueued to avoid multiple threads trying to * execute it simultaneously @@ -314,11 +319,16 @@ auto_requeue: * allowed to pick items to execute. This ensures that very slow items won't * overly block ones that are just ordinarily slow. * - * Returns 0 if successful, -EAGAIN if not. + * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is + * attempted queued) */ int slow_work_enqueue(struct slow_work *work) { unsigned long flags; + int ret; + + if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) + return -ECANCELED; BUG_ON(slow_work_user_count <= 0); BUG_ON(!work); @@ -335,6 +345,9 @@ int slow_work_enqueue(struct slow_work *work) if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { spin_lock_irqsave(&slow_work_queue_lock, flags); + if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags))) + goto cancelled; + /* we promise that we will not attempt to execute the work * function in more than one thread simultaneously * @@ -352,8 +365,9 @@ int slow_work_enqueue(struct slow_work *work) if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); } else { - if (slow_work_get_ref(work) < 0) - goto cant_get_ref; + ret = slow_work_get_ref(work); + if (ret < 0) + goto failed; if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) list_add_tail(&work->link, &vslow_work_queue); else @@ -365,12 +379,67 @@ int slow_work_enqueue(struct slow_work *work) } return 0; -cant_get_ref: +cancelled: + ret = -ECANCELED; +failed: spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return -EAGAIN; + return ret; } EXPORT_SYMBOL(slow_work_enqueue); +static int slow_work_wait(void *word) +{ + schedule(); + return 0; +} + +/** + * slow_work_cancel - Cancel a slow work item + * @work: The work item to cancel + * + * This function will cancel a previously enqueued work item. If we cannot + * cancel the work item, it is guarenteed to have run when this function + * returns. + */ +void slow_work_cancel(struct slow_work *work) +{ + bool wait = true, put = false; + + set_bit(SLOW_WORK_CANCELLING, &work->flags); + + spin_lock_irq(&slow_work_queue_lock); + + if (test_bit(SLOW_WORK_PENDING, &work->flags) && + !list_empty(&work->link)) { + /* the link in the pending queue holds a reference on the item + * that we will need to release */ + list_del_init(&work->link); + wait = false; + put = true; + clear_bit(SLOW_WORK_PENDING, &work->flags); + + } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) { + /* the executor is holding our only reference on the item, so + * we merely need to wait for it to finish executing */ + clear_bit(SLOW_WORK_PENDING, &work->flags); + } + + spin_unlock_irq(&slow_work_queue_lock); + + /* the EXECUTING flag is set by the executor whilst the spinlock is set + * and before the item is dequeued - so assuming the above doesn't + * actually dequeue it, simply waiting for the EXECUTING flag to be + * released here should be sufficient */ + if (wait) + wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait, + TASK_UNINTERRUPTIBLE); + + clear_bit(SLOW_WORK_CANCELLING, &work->flags); + if (put) + slow_work_put_ref(work); +} +EXPORT_SYMBOL(slow_work_cancel); + /* * Schedule a cull of the thread pool at some time in the near future */ -- cgit v1.2.3 From 6b8268b17a1ffc942bc72d7d00274e433d6b6719 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Nov 2009 18:10:47 +0000 Subject: SLOW_WORK: Add delayed_slow_work support This adds support for starting slow work with a delay, similar to the functionality we have for workqueues. Signed-off-by: Jens Axboe Signed-off-by: David Howells --- kernel/slow-work.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 127 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 671cc434532a..f67e1daae93d 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -406,11 +406,40 @@ void slow_work_cancel(struct slow_work *work) bool wait = true, put = false; set_bit(SLOW_WORK_CANCELLING, &work->flags); + smp_mb(); + + /* if the work item is a delayed work item with an active timer, we + * need to wait for the timer to finish _before_ getting the spinlock, + * lest we deadlock against the timer routine + * + * the timer routine will leave DELAYED set if it notices the + * CANCELLING flag in time + */ + if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { + struct delayed_slow_work *dwork = + container_of(work, struct delayed_slow_work, work); + del_timer_sync(&dwork->timer); + } spin_lock_irq(&slow_work_queue_lock); - if (test_bit(SLOW_WORK_PENDING, &work->flags) && - !list_empty(&work->link)) { + if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { + /* the timer routine aborted or never happened, so we are left + * holding the timer's reference on the item and should just + * drop the pending flag and wait for any ongoing execution to + * finish */ + struct delayed_slow_work *dwork = + container_of(work, struct delayed_slow_work, work); + + BUG_ON(timer_pending(&dwork->timer)); + BUG_ON(!list_empty(&work->link)); + + clear_bit(SLOW_WORK_DELAYED, &work->flags); + put = true; + clear_bit(SLOW_WORK_PENDING, &work->flags); + + } else if (test_bit(SLOW_WORK_PENDING, &work->flags) && + !list_empty(&work->link)) { /* the link in the pending queue holds a reference on the item * that we will need to release */ list_del_init(&work->link); @@ -440,6 +469,102 @@ void slow_work_cancel(struct slow_work *work) } EXPORT_SYMBOL(slow_work_cancel); +/* + * Handle expiry of the delay timer, indicating that a delayed slow work item + * should now be queued if not cancelled + */ +static void delayed_slow_work_timer(unsigned long data) +{ + struct slow_work *work = (struct slow_work *) data; + unsigned long flags; + bool queued = false, put = false; + + spin_lock_irqsave(&slow_work_queue_lock, flags); + if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) { + clear_bit(SLOW_WORK_DELAYED, &work->flags); + + if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { + /* we discard the reference the timer was holding in + * favour of the one the executor holds */ + set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); + put = true; + } else { + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) + list_add_tail(&work->link, &vslow_work_queue); + else + list_add_tail(&work->link, &slow_work_queue); + queued = true; + } + } + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + if (put) + slow_work_put_ref(work); + if (queued) + wake_up(&slow_work_thread_wq); +} + +/** + * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing + * @dwork: The delayed work item to queue + * @delay: When to start executing the work, in jiffies from now + * + * This is similar to slow_work_enqueue(), but it adds a delay before the work + * is actually queued for processing. + * + * The item can have delayed processing requested on it whilst it is being + * executed. The delay will begin immediately, and if it expires before the + * item finishes executing, the item will be placed back on the queue when it + * has done executing. + */ +int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, + unsigned long delay) +{ + struct slow_work *work = &dwork->work; + unsigned long flags; + int ret; + + if (delay == 0) + return slow_work_enqueue(&dwork->work); + + BUG_ON(slow_work_user_count <= 0); + BUG_ON(!work); + BUG_ON(!work->ops); + + if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) + return -ECANCELED; + + if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { + spin_lock_irqsave(&slow_work_queue_lock, flags); + + if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) + goto cancelled; + + /* the timer holds a reference whilst it is pending */ + ret = work->ops->get_ref(work); + if (ret < 0) + goto cant_get_ref; + + if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags)) + BUG(); + dwork->timer.expires = jiffies + delay; + dwork->timer.data = (unsigned long) work; + dwork->timer.function = delayed_slow_work_timer; + add_timer(&dwork->timer); + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + } + + return 0; + +cancelled: + ret = -ECANCELED; +cant_get_ref: + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + return ret; +} +EXPORT_SYMBOL(delayed_slow_work_enqueue); + /* * Schedule a cull of the thread pool at some time in the near future */ -- cgit v1.2.3 From 8fba10a42d191de612e60e7009c8f0313f90a9b3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:10:51 +0000 Subject: SLOW_WORK: Allow the work items to be viewed through a /proc file Allow the executing and queued work items to be viewed through a /proc file for debugging purposes. The contents look something like the following: THR PID ITEM ADDR FL MARK DESC === ===== ================ == ===== ========== 0 3005 ffff880023f52348 a 952ms FSC: OBJ17d3: LOOK 1 3006 ffff880024e33668 2 160ms FSC: OBJ17e5 OP60d3b: Write1/Store fl=2 2 3165 ffff8800296dd180 a 424ms FSC: OBJ17e4: LOOK 3 4089 ffff8800262c8d78 a 212ms FSC: OBJ17ea: CRTN 4 4090 ffff88002792bed8 2 388ms FSC: OBJ17e8 OP60d36: Write1/Store fl=2 5 4092 ffff88002a0ef308 2 388ms FSC: OBJ17e7 OP60d2e: Write1/Store fl=2 6 4094 ffff88002abaf4b8 2 132ms FSC: OBJ17e2 OP60d4e: Write1/Store fl=2 7 4095 ffff88002bb188e0 a 388ms FSC: OBJ17e9: CRTN vsq - ffff880023d99668 1 308ms FSC: OBJ17e0 OP60f91: Write1/EnQ fl=2 vsq - ffff8800295d1740 1 212ms FSC: OBJ16be OP4d4b6: Write1/EnQ fl=2 vsq - ffff880025ba3308 1 160ms FSC: OBJ179a OP58dec: Write1/EnQ fl=2 vsq - ffff880024ec83e0 1 160ms FSC: OBJ17ae OP599f2: Write1/EnQ fl=2 vsq - ffff880026618e00 1 160ms FSC: OBJ17e6 OP60d33: Write1/EnQ fl=2 vsq - ffff880025a2a4b8 1 132ms FSC: OBJ16a2 OP4d583: Write1/EnQ fl=2 vsq - ffff880023cbe6d8 9 212ms FSC: OBJ17eb: LOOK vsq - ffff880024d37590 9 212ms FSC: OBJ17ec: LOOK vsq - ffff880027746cb0 9 212ms FSC: OBJ17ed: LOOK vsq - ffff880024d37ae8 9 212ms FSC: OBJ17ee: LOOK vsq - ffff880024d37cb0 9 212ms FSC: OBJ17ef: LOOK vsq - ffff880025036550 9 212ms FSC: OBJ17f0: LOOK vsq - ffff8800250368e0 9 212ms FSC: OBJ17f1: LOOK vsq - ffff880025036aa8 9 212ms FSC: OBJ17f2: LOOK In the 'THR' column, executing items show the thread they're occupying and queued threads indicate which queue they're on. 'PID' shows the process ID of a slow-work thread that's executing something. 'FL' shows the work item flags. 'MARK' indicates how long since an item was queued or began executing. Lastly, the 'DESC' column permits the owner of an item to give some information. Signed-off-by: David Howells --- kernel/Makefile | 1 + kernel/slow-work-proc.c | 227 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/slow-work.c | 44 +++++++--- kernel/slow-work.h | 72 +++++++++++++++ 4 files changed, 333 insertions(+), 11 deletions(-) create mode 100644 kernel/slow-work-proc.c create mode 100644 kernel/slow-work.h (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index b8d4cd8ac0b9..776ffed1556d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -94,6 +94,7 @@ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o +obj-$(CONFIG_SLOW_WORK_PROC) += slow-work-proc.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) diff --git a/kernel/slow-work-proc.c b/kernel/slow-work-proc.c new file mode 100644 index 000000000000..3988032571f5 --- /dev/null +++ b/kernel/slow-work-proc.c @@ -0,0 +1,227 @@ +/* Slow work debugging + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include "slow-work.h" + +#define ITERATOR_SHIFT (BITS_PER_LONG - 4) +#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) +#define ITERATOR_COUNTER (~ITERATOR_SELECTOR) + +void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) +{ + seq_puts(m, "Slow-work: New thread"); +} + +/* + * Render the time mark field on a work item into a 5-char time with units plus + * a space + */ +static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) +{ + struct timespec now, diff; + + now = CURRENT_TIME; + diff = timespec_sub(now, work->mark); + + if (diff.tv_sec < 0) + seq_puts(m, " -ve "); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) + seq_printf(m, "%3luns ", diff.tv_nsec); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) + seq_printf(m, "%3luus ", diff.tv_nsec / 1000); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) + seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); + else if (diff.tv_sec <= 1) + seq_puts(m, " 1s "); + else if (diff.tv_sec < 60) + seq_printf(m, "%4lus ", diff.tv_sec); + else if (diff.tv_sec < 60 * 60) + seq_printf(m, "%4lum ", diff.tv_sec / 60); + else if (diff.tv_sec < 60 * 60 * 24) + seq_printf(m, "%4luh ", diff.tv_sec / 3600); + else + seq_puts(m, "exces "); +} + +/* + * Describe a slow work item for /proc + */ +static int slow_work_runqueue_show(struct seq_file *m, void *v) +{ + struct slow_work *work; + struct list_head *p = v; + unsigned long id; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); + return 0; + case 2: + seq_puts(m, "=== ===== ================ == ===== ==========\n"); + return 0; + + case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: + id = (unsigned long) v - 3; + + read_lock(&slow_work_execs_lock); + work = slow_work_execs[id]; + if (work) { + smp_read_barrier_depends(); + + seq_printf(m, "%3lu %5d %16p %2lx ", + id, slow_work_pids[id], work, work->flags); + slow_work_print_mark(m, work); + + if (work->ops->desc) + work->ops->desc(work, m); + seq_putc(m, '\n'); + } + read_unlock(&slow_work_execs_lock); + return 0; + + default: + work = list_entry(p, struct slow_work, link); + seq_printf(m, "%3s - %16p %2lx ", + work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq", + work, work->flags); + slow_work_print_mark(m, work); + + if (work->ops->desc) + work->ops->desc(work, m); + seq_putc(m, '\n'); + return 0; + } +} + +/* + * map the iterator to a work item + */ +static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) +{ + struct list_head *p; + unsigned long count, id; + + switch (*_pos >> ITERATOR_SHIFT) { + case 0x0: + if (*_pos == 0) + *_pos = 1; + if (*_pos < 3) + return (void *)(unsigned long) *_pos; + if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) + for (id = *_pos - 3; + id < SLOW_WORK_THREAD_LIMIT; + id++, (*_pos)++) + if (slow_work_execs[id]) + return (void *)(unsigned long) *_pos; + *_pos = 0x1UL << ITERATOR_SHIFT; + + case 0x1: + count = *_pos & ITERATOR_COUNTER; + list_for_each(p, &slow_work_queue) { + if (count == 0) + return p; + count--; + } + *_pos = 0x2UL << ITERATOR_SHIFT; + + case 0x2: + count = *_pos & ITERATOR_COUNTER; + list_for_each(p, &vslow_work_queue) { + if (count == 0) + return p; + count--; + } + *_pos = 0x3UL << ITERATOR_SHIFT; + + default: + return NULL; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) +{ + spin_lock_irq(&slow_work_queue_lock); + return slow_work_runqueue_index(m, _pos); +} + +/* + * move to the next line + */ +static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) +{ + struct list_head *p = v; + unsigned long selector = *_pos >> ITERATOR_SHIFT; + + (*_pos)++; + switch (selector) { + case 0x0: + return slow_work_runqueue_index(m, _pos); + + case 0x1: + if (*_pos >> ITERATOR_SHIFT == 0x1) { + p = p->next; + if (p != &slow_work_queue) + return p; + } + *_pos = 0x2UL << ITERATOR_SHIFT; + p = &vslow_work_queue; + + case 0x2: + if (*_pos >> ITERATOR_SHIFT == 0x2) { + p = p->next; + if (p != &vslow_work_queue) + return p; + } + *_pos = 0x3UL << ITERATOR_SHIFT; + + default: + return NULL; + } +} + +/* + * clean up after reading + */ +static void slow_work_runqueue_stop(struct seq_file *m, void *v) +{ + spin_unlock_irq(&slow_work_queue_lock); +} + +static const struct seq_operations slow_work_runqueue_ops = { + .start = slow_work_runqueue_start, + .stop = slow_work_runqueue_stop, + .next = slow_work_runqueue_next, + .show = slow_work_runqueue_show, +}; + +/* + * open "/proc/slow_work_rq" to list queue contents + */ +static int slow_work_runqueue_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slow_work_runqueue_ops); +} + +const struct file_operations slow_work_runqueue_fops = { + .owner = THIS_MODULE, + .open = slow_work_runqueue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/kernel/slow-work.c b/kernel/slow-work.c index f67e1daae93d..b763bc2d2670 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -16,13 +16,8 @@ #include #include #include - -#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of - * things to do */ -#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after - * OOM */ - -#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */ +#include +#include "slow-work.h" static void slow_work_cull_timeout(unsigned long); static void slow_work_oom_timeout(unsigned long); @@ -116,6 +111,15 @@ static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq); static DEFINE_MUTEX(slow_work_unreg_sync_lock); #endif +/* + * Data for tracking currently executing items for indication through /proc + */ +#ifdef CONFIG_SLOW_WORK_PROC +struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT]; +pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT]; +DEFINE_RWLOCK(slow_work_execs_lock); +#endif + /* * The queues of work items and the lock governing access to them. These are * shared between all the CPUs. It doesn't make sense to have per-CPU queues @@ -124,9 +128,9 @@ static DEFINE_MUTEX(slow_work_unreg_sync_lock); * There are two queues of work items: one for slow work items, and one for * very slow work items. */ -static LIST_HEAD(slow_work_queue); -static LIST_HEAD(vslow_work_queue); -static DEFINE_SPINLOCK(slow_work_queue_lock); +LIST_HEAD(slow_work_queue); +LIST_HEAD(vslow_work_queue); +DEFINE_SPINLOCK(slow_work_queue_lock); /* * The thread controls. A variable used to signal to the threads that they @@ -182,7 +186,7 @@ static unsigned slow_work_calc_vsmax(void) * Attempt to execute stuff queued on a slow thread. Return true if we managed * it, false if there was nothing to do. */ -static bool slow_work_execute(int id) +static noinline bool slow_work_execute(int id) { #ifdef CONFIG_MODULES struct module *module; @@ -227,6 +231,10 @@ static bool slow_work_execute(int id) if (work) slow_work_thread_processing[id] = work->owner; #endif + if (work) { + slow_work_mark_time(work); + slow_work_begin_exec(id, work); + } spin_unlock_irq(&slow_work_queue_lock); @@ -247,6 +255,8 @@ static bool slow_work_execute(int id) /* wake up anyone waiting for this work to be complete */ wake_up_bit(&work->flags, SLOW_WORK_EXECUTING); + slow_work_end_exec(id, work); + /* if someone tried to enqueue the item whilst we were executing it, * then it'll be left unenqueued to avoid multiple threads trying to * execute it s