From c9dccacfccc72c32692eedff4a27a4b0833a2afd Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Thu, 11 Jul 2019 16:29:37 +0200 Subject: printk: Do not lose last line in kmsg buffer dump kmsg_dump_get_buffer() is supposed to select all the youngest log messages which fit into the provided buffer. It determines the correct start index by using msg_print_text() with a NULL buffer to calculate the size of each entry. However, when performing the actual writes, msg_print_text() only writes the entry to the buffer if the written len is lesser than the size of the buffer. So if the lengths of the selected youngest log messages happen to precisely fill up the provided buffer, the last log message is not included. We don't want to modify msg_print_text() to fill up the buffer and start returning a length which is equal to the size of the buffer, since callers of its other users, such as kmsg_dump_get_line(), depend upon the current behaviour. Instead, fix kmsg_dump_get_buffer() to compensate for this. For example, with the following two final prints: [ 6.427502] AAAAAAAAAAAAA [ 6.427769] BBBBBBBB12345 A dump of a 64-byte buffer filled by kmsg_dump_get_buffer(), before this patch: 00000000: 3c 30 3e 5b 20 20 20 20 36 2e 35 32 32 31 39 37 <0>[ 6.522197 00000010: 5d 20 41 41 41 41 41 41 41 41 41 41 41 41 41 0a ] AAAAAAAAAAAAA. 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ After this patch: 00000000: 3c 30 3e 5b 20 20 20 20 36 2e 34 35 36 36 37 38 <0>[ 6.456678 00000010: 5d 20 42 42 42 42 42 42 42 42 31 32 33 34 35 0a ] BBBBBBBB12345. 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Link: http://lkml.kernel.org/r/20190711142937.4083-1-vincent.whitchurch@axis.com Fixes: e2ae715d66bf4bec ("kmsg - kmsg_dump() use iterator to receive log buffer content") To: rostedt@goodmis.org Cc: linux-kernel@vger.kernel.org Cc: # v3.5+ Signed-off-by: Vincent Whitchurch Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1888f6a3b694..424abf802f02 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3274,7 +3274,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; - while (l > size && seq < dumper->next_seq) { + while (l >= size && seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); l -= msg_print_text(msg, true, time, NULL, 0); -- cgit v1.2.3 From f240652b6032b48ad7fa35c5e701cc4c8d697c0b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 5 Jul 2019 10:53:21 -0700 Subject: x86/mpx: Remove MPX APIs MPX is being removed from the kernel due to a lack of support in the toolchain going forward (gcc). The first step is to remove the userspace-visible ABIs so that applications will stop using it. The most visible one are the enable/disable prctl()s. Remove them first. This is the most minimal and least invasive change needed to ensure that apps stop using MPX with new kernels. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190705175321.DB42F0AD@viggo.jf.intel.com --- kernel/sys.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 2969304c29fe..384b000b7865 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -103,12 +103,6 @@ #ifndef SET_TSC_CTL # define SET_TSC_CTL(a) (-EINVAL) #endif -#ifndef MPX_ENABLE_MANAGEMENT -# define MPX_ENABLE_MANAGEMENT() (-EINVAL) -#endif -#ifndef MPX_DISABLE_MANAGEMENT -# define MPX_DISABLE_MANAGEMENT() (-EINVAL) -#endif #ifndef GET_FP_MODE # define GET_FP_MODE(a) (-EINVAL) #endif @@ -2456,15 +2450,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, up_write(&me->mm->mmap_sem); break; case PR_MPX_ENABLE_MANAGEMENT: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - error = MPX_ENABLE_MANAGEMENT(); - break; case PR_MPX_DISABLE_MANAGEMENT: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - error = MPX_DISABLE_MANAGEMENT(); - break; + /* No longer implemented: */ + return -EINVAL; case PR_SET_FP_MODE: error = SET_FP_MODE(me, arg2); break; -- cgit v1.2.3 From 3a79bc63d90750f737ab9d7219bd3091d2fd6d84 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Jul 2019 13:03:20 +0200 Subject: PCI: irq: Introduce rearm_wake_irq() Introduce a new function, rearm_wake_irq(), allowing a wakeup IRQ to be armed for systen wakeup detection again without running any action handlers associated with it after it has been armed for wakeup detection and triggered. That is useful for IRQs, like ACPI SCI, that may deliver wakeup as well as non-wakeup interrupts when armed for systen wakeup detection. In those cases, it may be possible to determine whether or not the delivered interrupt is a systen wakeup one without running the entire action handler (or handlers, if the IRQ is shared) for the IRQ, and if the interrupt turns out to be a non-wakeup one, the IRQ can be rearmed with the help of the new function. Signed-off-by: Rafael J. Wysocki Acked-by: Thomas Gleixner --- kernel/irq/pm.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index d6961d3c6f9e..8f557fa1f4fe 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -176,6 +176,26 @@ static void resume_irqs(bool want_early) } } +/** + * rearm_wake_irq - rearm a wakeup interrupt line after signaling wakeup + * @irq: Interrupt to rearm + */ +void rearm_wake_irq(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + + if (!desc || !(desc->istate & IRQS_SUSPENDED) || + !irqd_is_wakeup_set(&desc->irq_data)) + return; + + desc->istate &= ~IRQS_SUSPENDED; + irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); + __enable_irq(desc); + + irq_put_desc_busunlock(desc, flags); +} + /** * irq_pm_syscore_ops - enable interrupt lines early * -- cgit v1.2.3 From 56b991849009f5def0443bfb2f48c8321d888e15 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Jul 2019 23:52:03 +0200 Subject: PM: sleep: Simplify suspend-to-idle control flow After commit 33e4f80ee69b ("ACPI / PM: Ignore spurious SCI wakeups from suspend-to-idle") the "noirq" phases of device suspend and resume may run for multiple times during suspend-to-idle, if there are spurious system wakeup events while suspended. However, this is complicated and fragile and actually unnecessary. The main reason for doing this is that on some systems the EC may signal system wakeup events (power button events, for example) as well as events that should not cause the system to resume (spurious system wakeup events). Thus, in order to determine whether or not a given event signaled by the EC while suspended is a proper system wakeup one, the EC GPE needs to be dispatched and to start with that was achieved by allowing the ACPI SCI action handler to run, which was only possible after calling resume_device_irqs(). However, dispatching the EC GPE this way turned out to take too much time in some cases and some EC events might be missed due to that, so commit 68e22011856f ("ACPI: EC: Dispatch the EC GPE directly on s2idle wake") started to dispatch the EC GPE right after a wakeup event has been detected, so in fact the full ACPI SCI action handler doesn't need to run any more to deal with the wakeups coming from the EC. Use this observation to simplify the suspend-to-idle control flow so that the "noirq" phases of device suspend and resume are each run only once in every suspend-to-idle cycle, which is reported to significantly reduce power drawn by some systems when suspended to idle (by allowing them to reach a deep platform-wide low-power state through the suspend-to-idle flow). [What appears to happen is that the "noirq" resume of devices after a spurious EC wakeup brings some devices into a state in which they prevent the platform from reaching the deep low-power state going forward, even after a subsequent "noirq" suspend phase, and on some systems the EC triggers such wakeups already when the "noirq" suspend of devices is running for the first time in the given suspend/resume cycle, so the platform cannot reach the deep low-power state at all.] First, make acpi_s2idle_wake() use the acpi_ec_dispatch_gpe() return value to determine whether or not the wakeup may have been triggered by the EC (in which case the system wakeup is canceled and ACPI events are processed in order to determine whether or not the event is a proper system wakeup one) and use rearm_wake_irq() (introduced by a previous change) in it to rearm the ACPI SCI for system wakeup detection in case the system will remain suspended. Second, drop acpi_s2idle_sync(), which is not needed any more, and the corresponding global platform suspend-to-idle callback. Next, drop the pm_wakeup_pending() check (which is an optimization only) from __device_suspend_noirq() to prevent it from returning errors on system wakeups occurring before the "noirq" phase of device suspend is complete (as in the case of suspend-to-idle it is not known whether or not these wakeups are suprious at that point), in order to avoid having to carry out a "noirq" resume of devices on a spurious system wakeup. Finally, change the code flow in s2idle_loop() to (1) run the "noirq" suspend of devices once before starting the loop, (2) check for spurious EC wakeups (via the platform ->wake callback) for the first time before calling s2idle_enter(), and (3) run the "noirq" resume of devices once after leaving the loop. Signed-off-by: Rafael J. Wysocki Acked-by: Thomas Gleixner --- kernel/power/suspend.c | 53 ++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c874a7026e24..907b2be0372f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -119,48 +119,41 @@ static void s2idle_enter(void) static void s2idle_loop(void) { + int error; + + dpm_noirq_begin(); + error = dpm_noirq_suspend_devices(PMSG_SUSPEND); + if (error) + goto resume; + pm_pr_dbg("suspend-to-idle\n"); + /* + * Suspend-to-idle equals: + * frozen processes + suspended devices + idle processors. + * Thus s2idle_enter() should be called right after all devices have + * been suspended. + * + * Wakeups during the noirq suspend of devices may be spurious, so try + * to avoid them upfront. + */ for (;;) { - int error; - - dpm_noirq_begin(); - - /* - * Suspend-to-idle equals - * frozen processes + suspended devices + idle processors. - * Thus s2idle_enter() should be called right after - * all devices have been suspended. - * - * Wakeups during the noirq suspend of devices may be spurious, - * so prevent them from terminating the loop right away. - */ - error = dpm_noirq_suspend_devices(PMSG_SUSPEND); - if (!error) - s2idle_enter(); - else if (error == -EBUSY && pm_wakeup_pending()) - error = 0; - - if (!error && s2idle_ops && s2idle_ops->wake) + if (s2idle_ops && s2idle_ops->wake) s2idle_ops->wake(); - dpm_noirq_resume_devices(PMSG_RESUME); - - dpm_noirq_end(); - - if (error) - break; - - if (s2idle_ops && s2idle_ops->sync) - s2idle_ops->sync(); - if (pm_wakeup_pending()) break; pm_wakeup_clear(false); + + s2idle_enter(); } pm_pr_dbg("resume from suspend-to-idle\n"); + +resume: + dpm_noirq_resume_devices(PMSG_RESUME); + dpm_noirq_end(); } void s2idle_wake(void) -- cgit v1.2.3 From 8eb0fd3b55f084320ae511cd5a64d356cf497c83 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Jul 2019 23:52:12 +0200 Subject: PM: sleep: Integrate suspend-to-idle with generig suspend flow After previous changes the suspend-to-idle code flow can be integrated more tightly with the generic system suspend code flow by making suspend_enter() call s2idle_loop() later and removing the direct invocations of dpm_noirq_begin(), dpm_noirq_suspend_devices(), dpm_noirq_end(), and dpm_noirq_resume_devices() from the latter, so do that. This change is not expected to alter functionality. Signed-off-by: Rafael J. Wysocki Acked-by: Thomas Gleixner --- kernel/power/suspend.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 907b2be0372f..2b6057853b33 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -119,13 +119,6 @@ static void s2idle_enter(void) static void s2idle_loop(void) { - int error; - - dpm_noirq_begin(); - error = dpm_noirq_suspend_devices(PMSG_SUSPEND); - if (error) - goto resume; - pm_pr_dbg("suspend-to-idle\n"); /* @@ -150,10 +143,6 @@ static void s2idle_loop(void) } pm_pr_dbg("resume from suspend-to-idle\n"); - -resume: - dpm_noirq_resume_devices(PMSG_RESUME); - dpm_noirq_end(); } void s2idle_wake(void) @@ -408,11 +397,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (error) goto Devices_early_resume; - if (state == PM_SUSPEND_TO_IDLE && pm_test_level != TEST_PLATFORM) { - s2idle_loop(); - goto Platform_early_resume; - } - error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { pr_err("noirq suspend of devices failed\n"); @@ -425,6 +409,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (suspend_test(TEST_PLATFORM)) goto Platform_wake; + if (state == PM_SUSPEND_TO_IDLE) { + s2idle_loop(); + goto Platform_wake; + } + error = suspend_disable_secondary_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; -- cgit v1.2.3 From 85db0023376f529c477c6110043e069ccee16d9c Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 2 Jul 2019 19:26:59 +0200 Subject: cgroup: Replace a seq_printf() call by seq_puts() in cgroup_print_ss_mask() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A string which did not contain a data format specification should be put into a sequence. Thus use the corresponding function “seq_puts”. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 753afbca549f..3c3d92d993e8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2894,7 +2894,7 @@ static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) do_each_subsys_mask(ss, ssid, ss_mask) { if (printed) seq_putc(seq, ' '); - seq_printf(seq, "%s", ss->name); + seq_puts(seq, ss->name); printed = true; } while_each_subsys_mask(); if (printed) -- cgit v1.2.3 From a581563f1bef035e4c8d634a1df26dae9140b115 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Wed, 3 Jul 2019 10:07:49 +0800 Subject: cgroup: minor tweak for logic to get cgroup css We could only handle the case that css exists and css_try_get_online() fails. Signed-off-by: Peng Wang Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3c3d92d993e8..21ddf3053235 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -488,7 +488,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, rcu_read_lock(); css = cgroup_css(cgrp, ss); - if (!css || !css_tryget_online(css)) + if (css && !css_tryget_online(css)) css = NULL; rcu_read_unlock(); -- cgit v1.2.3 From 364f6afc4f5537b79cf454eb35cae92920676075 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jul 2019 11:24:40 -0700 Subject: locking/lockdep: Make it clear that what lock_class::key points at is not modified This patch does not change the behavior of the lockdep code. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Link: https://lkml.kernel.org/r/20190722182443.216015-2-bvanassche@acm.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 +- kernel/locking/lockdep_internals.h | 3 ++- kernel/locking/lockdep_proc.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4861cf8e274b..af6627866191 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -511,7 +511,7 @@ static const char *usage_str[] = }; #endif -const char * __get_key_name(struct lockdep_subclass_key *key, char *str) +const char *__get_key_name(const struct lockdep_subclass_key *key, char *str) { return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); } diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index cc83568d5012..2e518369add4 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -116,7 +116,8 @@ extern struct lock_chain lock_chains[]; extern void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]); -extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); +extern const char *__get_key_name(const struct lockdep_subclass_key *key, + char *str); struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index bda006f8a88b..ed9842425cac 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -399,7 +399,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) static void seq_stats(struct seq_file *m, struct lock_stat_data *data) { - struct lockdep_subclass_key *ckey; + const struct lockdep_subclass_key *ckey; struct lock_class_stats *stats; struct lock_class *class; const char *cname; -- cgit v1.2.3 From a2970421640bd9b6a78f2685d7750a791abdfd4e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jul 2019 11:24:41 -0700 Subject: stacktrace: Constify 'entries' arguments Make it clear to humans and to the compiler that the stack trace ('entries') arguments are not modified. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Link: https://lkml.kernel.org/r/20190722182443.216015-3-bvanassche@acm.org Signed-off-by: Ingo Molnar --- kernel/stacktrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index f5440abb7532..6d1f68b7e528 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -20,7 +20,7 @@ * @nr_entries: Number of entries in the storage array * @spaces: Number of leading spaces to print */ -void stack_trace_print(unsigned long *entries, unsigned int nr_entries, +void stack_trace_print(const unsigned long *entries, unsigned int nr_entries, int spaces) { unsigned int i; @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(stack_trace_print); * * Return: Number of bytes printed. */ -int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, +int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries, unsigned int nr_entries, int spaces) { unsigned int generated, i, total = 0; -- cgit v1.2.3 From 12593b7467f9130b64a6d4b6a26ed4ec217b6784 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jul 2019 11:24:42 -0700 Subject: locking/lockdep: Reduce space occupied by stack traces Although commit 669de8bda87b ("kernel/workqueue: Use dynamic lockdep keys for workqueues") unregisters dynamic lockdep keys when a workqueue is destroyed, a side effect of that commit is that all stack traces associated with the lockdep key are leaked when a workqueue is destroyed. Fix this by storing each unique stack trace once. Other changes in this patch are: - Use NULL instead of { .nr_entries = 0 } to represent 'no trace'. - Store a pointer to a stack trace in struct lock_class and struct lock_list instead of storing 'nr_entries' and 'offset'. This patch avoids that the following program triggers the "BUG: MAX_STACK_TRACE_ENTRIES too low!" complaint: #include #include int main() { for (;;) { int fd = open("/dev/infiniband/rdma_cm", O_RDWR); close(fd); } } Suggested-by: Peter Zijlstra Reported-by: Eric Biggers Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: Yuyang Du Link: https://lkml.kernel.org/r/20190722182443.216015-4-bvanassche@acm.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 128 ++++++++++++++++++++++++++----------- kernel/locking/lockdep_internals.h | 2 + 2 files changed, 92 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index af6627866191..1a96869cb2f0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -449,33 +449,72 @@ static void print_lockdep_off(const char *bug_msg) unsigned long nr_stack_trace_entries; #ifdef CONFIG_PROVE_LOCKING +/** + * struct lock_trace - single stack backtrace + * @hash_entry: Entry in a stack_trace_hash[] list. + * @hash: jhash() of @entries. + * @nr_entries: Number of entries in @entries. + * @entries: Actual stack backtrace. + */ +struct lock_trace { + struct hlist_node hash_entry; + u32 hash; + u32 nr_entries; + unsigned long entries[0] __aligned(sizeof(unsigned long)); +}; +#define LOCK_TRACE_SIZE_IN_LONGS \ + (sizeof(struct lock_trace) / sizeof(unsigned long)) /* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the graph_lock. + * Stack-trace: sequence of lock_trace structures. Protected by the graph_lock. */ static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; +static struct hlist_head stack_trace_hash[STACK_TRACE_HASH_SIZE]; + +static bool traces_identical(struct lock_trace *t1, struct lock_trace *t2) +{ + return t1->hash == t2->hash && t1->nr_entries == t2->nr_entries && + memcmp(t1->entries, t2->entries, + t1->nr_entries * sizeof(t1->entries[0])) == 0; +} -static int save_trace(struct lock_trace *trace) +static struct lock_trace *save_trace(void) { - unsigned long *entries = stack_trace + nr_stack_trace_entries; + struct lock_trace *trace, *t2; + struct hlist_head *hash_head; + u32 hash; unsigned int max_entries; - trace->offset = nr_stack_trace_entries; - max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - trace->nr_entries = stack_trace_save(entries, max_entries, 3); - nr_stack_trace_entries += trace->nr_entries; + BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE); + BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES); + + trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries); + max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries - + LOCK_TRACE_SIZE_IN_LONGS; + trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES - + LOCK_TRACE_SIZE_IN_LONGS - 1) { if (!debug_locks_off_graph_unlock()) - return 0; + return NULL; print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); dump_stack(); - return 0; + return NULL; } - return 1; + hash = jhash(trace->entries, trace->nr_entries * + sizeof(trace->entries[0]), 0); + trace->hash = hash; + hash_head = stack_trace_hash + (hash & (STACK_TRACE_HASH_SIZE - 1)); + hlist_for_each_entry(t2, hash_head, hash_entry) { + if (traces_identical(trace, t2)) + return t2; + } + nr_stack_trace_entries += LOCK_TRACE_SIZE_IN_LONGS + trace->nr_entries; + hlist_add_head(&trace->hash_entry, hash_head); + + return trace; } #endif @@ -1235,7 +1274,7 @@ static struct lock_list *alloc_list_entry(void) static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, unsigned long ip, int distance, - struct lock_trace *trace) + const struct lock_trace *trace) { struct lock_list *entry; /* @@ -1249,7 +1288,7 @@ static int add_lock_to_list(struct lock_class *this, entry->class = this; entry->links_to = links_to; entry->distance = distance; - entry->trace = *trace; + entry->trace = trace; /* * Both allocation and removal are done under the graph lock; but * iteration is under RCU-sched; see look_up_lock_class() and @@ -1470,11 +1509,10 @@ static inline int __bfs_backwards(struct lock_list *src_entry, } -static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +static void print_lock_trace(const struct lock_trace *trace, + unsigned int spaces) { - unsigned long *entries = stack_trace + trace->offset; - - stack_trace_print(entries, trace->nr_entries, spaces); + stack_trace_print(trace->entries, trace->nr_entries, spaces); } /* @@ -1489,7 +1527,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) printk("\n-> #%u", depth); print_lock_name(target->class); printk(KERN_CONT ":\n"); - print_lock_trace(&target->trace, 6); + print_lock_trace(target->trace, 6); } static void @@ -1592,7 +1630,8 @@ static noinline void print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; - if (!save_trace(&this->trace)) + this->trace = save_trace(); + if (!this->trace) return; depth = get_lock_depth(target); @@ -1715,7 +1754,7 @@ check_path(struct lock_class *target, struct lock_list *src_entry, */ static noinline int check_noncircular(struct held_lock *src, struct held_lock *target, - struct lock_trace *trace) + struct lock_trace **const trace) { int ret; struct lock_list *uninitialized_var(target_entry); @@ -1729,13 +1768,13 @@ check_noncircular(struct held_lock *src, struct held_lock *target, ret = check_path(hlock_class(target), &src_entry, &target_entry); if (unlikely(!ret)) { - if (!trace->nr_entries) { + if (!*trace) { /* * If save_trace fails here, the printing might * trigger a WARN but because of the !nr_entries it * should not do bad things. */ - save_trace(trace); + *trace = save_trace(); } print_circular_bug(&src_entry, target_entry, src, target); @@ -1859,7 +1898,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) len += printk("%*s %s", depth, "", usage_str[bit]); len += printk(KERN_CONT " at:\n"); - print_lock_trace(class->usage_traces + bit, len); + print_lock_trace(class->usage_traces[bit], len); } } printk("%*s }\n", depth, ""); @@ -1884,7 +1923,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, do { print_lock_class_header(entry->class, depth); printk("%*s ... acquired at:\n", depth, ""); - print_lock_trace(&entry->trace, 2); + print_lock_trace(entry->trace, 2); printk("\n"); if (depth == 0 && (entry != root)) { @@ -1995,14 +2034,14 @@ print_bad_irq_dependency(struct task_struct *curr, print_lock_name(backwards_entry->class); pr_warn("\n... which became %s-irq-safe at:\n", irqclass); - print_lock_trace(backwards_entry->class->usage_traces + bit1, 1); + print_lock_trace(backwards_entry->class->usage_traces[bit1], 1); pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_entry->class); pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); pr_warn("..."); - print_lock_trace(forwards_entry->class->usage_traces + bit2, 1); + print_lock_trace(forwards_entry->class->usage_traces[bit2], 1); pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, @@ -2011,13 +2050,15 @@ print_bad_irq_dependency(struct task_struct *curr, lockdep_print_held_locks(curr); pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); - if (!save_trace(&prev_root->trace)) + prev_root->trace = save_trace(); + if (!prev_root->trace) return; print_shortest_lock_dependencies(backwards_entry, prev_root); pr_warn("\nthe dependencies between the lock to be acquired"); pr_warn(" and %s-irq-unsafe lock:\n", irqclass); - if (!save_trace(&next_root->trace)) + next_root->trace = save_trace(); + if (!next_root->trace) return; print_shortest_lock_dependencies(forwards_entry, next_root); @@ -2369,7 +2410,8 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, struct lock_trace *trace) + struct held_lock *next, int distance, + struct lock_trace **const trace) { struct lock_list *entry; int ret; @@ -2444,8 +2486,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return ret; #endif - if (!trace->nr_entries && !save_trace(trace)) - return 0; + if (!*trace) { + *trace = save_trace(); + if (!*trace) + return 0; + } /* * Ok, all validations passed, add the new lock @@ -2453,14 +2498,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(prev)->locks_after, - next->acquire_ip, distance, trace); + next->acquire_ip, distance, *trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(next)->locks_before, - next->acquire_ip, distance, trace); + next->acquire_ip, distance, *trace); if (!ret) return 0; @@ -2476,7 +2521,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { - struct lock_trace trace = { .nr_entries = 0 }; + struct lock_trace *trace = NULL; int depth = curr->lockdep_depth; struct held_lock *hlock; @@ -3015,7 +3060,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, print_lock(this); pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); - print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1); + print_lock_trace(hlock_class(this)->usage_traces[prev_bit], 1); print_irqtrace_events(curr); pr_warn("\nother info that might help us debug this:\n"); @@ -3096,7 +3141,8 @@ print_irq_inversion_bug(struct task_struct *curr, lockdep_print_held_locks(curr); pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); - if (!save_trace(&root->trace)) + root->trace = save_trace(); + if (!root->trace) return; print_shortest_lock_dependencies(other, root); @@ -3580,7 +3626,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, hlock_class(this)->usage_mask |= new_mask; - if (!save_trace(hlock_class(this)->usage_traces + new_bit)) + if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) return 0; switch (new_bit) { @@ -5157,6 +5203,12 @@ void __init lockdep_init(void) ) / 1024 ); +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + printk(" memory used for stack traces: %zu kB\n", + (sizeof(stack_trace) + sizeof(stack_trace_hash)) / 1024 + ); +#endif + printk(" per task-struct memory footprint: %zu bytes\n", sizeof(((struct task_struct *)NULL)->held_locks)); } diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 2e518369add4..93a008bf77db 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -92,6 +92,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define MAX_LOCKDEP_ENTRIES 16384UL #define MAX_LOCKDEP_CHAINS_BITS 15 #define MAX_STACK_TRACE_ENTRIES 262144UL +#define STACK_TRACE_HASH_SIZE 8192 #else #define MAX_LOCKDEP_ENTRIES 32768UL @@ -102,6 +103,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = * addresses. Protected by the hash_lock. */ #define MAX_STACK_TRACE_ENTRIES 524288UL +#define STACK_TRACE_HASH_SIZE 16384 #endif #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) -- cgit v1.2.3 From 8c779229d0f4fe83ead90bdcbbf08b02989aa200 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jul 2019 11:24:43 -0700 Subject: locking/lockdep: Report more stack trace statistics Report the number of stack traces and the number of stack trace hash chains. These two numbers are useful because these allow to estimate the number of stack trace hash collisions. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Link: https://lkml.kernel.org/r/20190722182443.216015-5-bvanassche@acm.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 29 +++++++++++++++++++++++++++++ kernel/locking/lockdep_internals.h | 4 ++++ kernel/locking/lockdep_proc.c | 6 ++++++ 3 files changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 1a96869cb2f0..3c3902c40a0e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -516,6 +516,35 @@ static struct lock_trace *save_trace(void) return trace; } + +/* Return the number of stack traces in the stack_trace[] array. */ +u64 lockdep_stack_trace_count(void) +{ + struct lock_trace *trace; + u64 c = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) { + hlist_for_each_entry(trace, &stack_trace_hash[i], hash_entry) { + c++; + } + } + + return c; +} + +/* Return the number of stack hash chains that have at least one stack trace. */ +u64 lockdep_stack_hash_count(void) +{ + u64 c = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) + if (!hlist_empty(&stack_trace_hash[i])) + c++; + + return c; +} #endif unsigned int nr_hardirq_chains; diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 93a008bf77db..18d85aebbb57 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -140,6 +140,10 @@ extern unsigned int max_bfs_queue_depth; #ifdef CONFIG_PROVE_LOCKING extern unsigned long lockdep_count_forward_deps(struct lock_class *); extern unsigned long lockdep_count_backward_deps(struct lock_class *); +#ifdef CONFIG_TRACE_IRQFLAGS +u64 lockdep_stack_trace_count(void); +u64 lockdep_stack_hash_count(void); +#endif #else static inline unsigned long lockdep_count_forward_deps(struct lock_class *class) diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index ed9842425cac..dadb7b7fba37 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -285,6 +285,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_process_chains); seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n", nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + seq_printf(m, " number of stack traces: %llu\n", + lockdep_stack_trace_count()); + seq_printf(m, " number of stack hash chains: %llu\n", + lockdep_stack_hash_count()); +#endif seq_printf(m, " combined max dependencies: %11u\n", (nr_hardirq_chains + 1) * (nr_softirq_chains + 1) * -- cgit v1.2.3 From e797bda3fd29137f6c151dfa10ea6a61c17895ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Jul 2019 20:47:16 +0200 Subject: smp/hotplug: Track booted once CPUs in a cpumask The booted once information which is required to deal with the MCE broadcast issue on X86 correctly is stored in the per cpu hotplug state, which is perfectly fine for the intended purpose. X86 needs that information for supporting NMI broadcasting via shortcuts, but retrieving it from per cpu data is cumbersome. Move it to a cpumask so the information can be checked against the cpu_present_mask quickly. No functional change intended. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190722105219.818822855@linutronix.de --- kernel/cpu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index e84c0873559e..05778e32674a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -62,7 +62,6 @@ struct cpuhp_cpu_state { bool rollback; bool single; bool bringup; - bool booted_once; struct hlist_node *node; struct hlist_node *last; enum cpuhp_state cb_state; @@ -76,6 +75,10 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { .fail = CPUHP_INVALID, }; +#ifdef CONFIG_SMP +cpumask_t cpus_booted_once_mask; +#endif + #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) static struct lockdep_map cpuhp_state_up_map = STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); @@ -433,7 +436,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any * core will shutdown the machine. */ - return !per_cpu(cpuhp_state, cpu).booted_once; + return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); } #else static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } @@ -1066,7 +1069,7 @@ void notify_cpu_starting(unsigned int cpu) int ret; rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ - st->booted_once = true; + cpumask_set_cpu(cpu, &cpus_booted_once_mask); while (st->state < target) { st->state++; ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); @@ -2334,7 +2337,7 @@ void __init boot_cpu_init(void) void __init boot_cpu_hotplug_init(void) { #ifdef CONFIG_SMP - this_cpu_write(cpuhp_state.booted_once, true); + cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask); #endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } -- cgit v1.2.3 From 0c09ab96fc820109d63097a2adcbbd20836b655f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Jul 2019 16:23:40 +0200 Subject: cpu/hotplug: Cache number of online CPUs Re-evaluating the bitmap wheight of the online cpus bitmap in every invocation of num_online_cpus() over and over is a pretty useless exercise. Especially when num_online_cpus() is used in code paths like the IPI delivery of x86 or the membarrier code. Cache the number of online CPUs in the core and just return the cached variable. The accessor function provides only a snapshot when used without protection against concurrent CPU hotplug. The storage needs to use an atomic_t because the kexec and reboot code (ab)use set_cpu_online() in their 'shutdown' handlers without any form of serialization as pointed out by Mathieu. Regular CPU hotplug usage is properly serialized. Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1907091622590.1634@nanos.tec.linutronix.de --- kernel/cpu.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 05778e32674a..e1967e9eddc2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2298,6 +2298,9 @@ EXPORT_SYMBOL(__cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); +atomic_t __num_online_cpus __read_mostly; +EXPORT_SYMBOL(__num_online_cpus); + void init_cpu_present(const struct cpumask *src) { cpumask_copy(&__cpu_present_mask, src); @@ -2313,6 +2316,27 @@ void init_cpu_online(const struct cpumask *src) cpumask_copy(&__cpu_online_mask, src); } +void set_cpu_online(unsigned int cpu, bool online) +{ + /* + * atomic_inc/dec() is required to handle the horrid abuse of this + * function by the reboot and kexec code which invoke it from + * IPI/NMI broadcasts when shutting down CPUs. Invocation from + * regular CPU hotplug is properly serialized. + * + * Note, that the fact that __num_online_cpus is of type atomic_t + * does not protect readers which are not serialized against + * concurrent hotplug operations. + */ + if (online) { + if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask)) + atomic_inc(&__num_online_cpus); + } else { + if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask)) + atomic_dec(&__num_online_cpus); + } +} + /* * Activate the first processor. */ -- cgit v1.2.3 From d35927a144641700c8328d707d1c89d305b4ecb8 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 15 Jul 2019 11:25:06 +0100 Subject: sched/fair: Move init_numa_balancing() below task_numa_work() To reference task_numa_work() from within init_numa_balancing(), we need the former to be declared before the latter. Do just that. This is a pure code movement. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mgorman@suse.de Cc: riel@surriel.com Link: https://lkml.kernel.org/r/20190715102508.32434-2-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 82 ++++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc9cfeaac8bd..f0c488015649 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1188,47 +1188,6 @@ static unsigned int task_scan_max(struct task_struct *p) return max(smin, smax); } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) -{ - int mm_users = 0; - struct mm_struct *mm = p->mm; - - if (mm) { - mm_users = atomic_read(&mm->mm_users); - if (mm_users == 1) { - mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); - mm->numa_scan_seq = 0; - } - } - p->node_stamp = 0; - p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_delay; - p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - RCU_INIT_POINTER(p->numa_group, NULL); - p->last_task_numa_placement = 0; - p->last_sum_exec_runtime = 0; - - /* New address space, reset the preferred nid */ - if (!(clone_flags & CLONE_VM)) { - p->numa_preferred_nid = NUMA_NO_NODE; - return; - } - - /* - * New thread, keep existing numa_preferred_nid which should be copied - * already by arch_dup_task_struct but stagger when scans start. - */ - if (mm) { - unsigned int delay; - - delay = min_t(unsigned int, task_scan_max(current), - current->numa_scan_period * mm_users * NSEC_PER_MSEC); - delay += 2 * TICK_NSEC; - p->node_stamp = delay; - } -} - static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); @@ -2665,6 +2624,47 @@ out: } } +void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ + int mm_users = 0; + struct mm_struct *mm = p->mm; + + if (mm) { + mm_users = atomic_read(&mm->mm_users); + if (mm_users == 1) { + mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + mm->numa_scan_seq = 0; + } + } + p->node_stamp = 0; + p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; + RCU_INIT_POINTER(p->numa_group, NULL); + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; + + /* New address space, reset the preferred nid */ + if (!(clone_flags & CLONE_VM)) { + p->numa_preferred_nid = NUMA_NO_NODE; + return; + } + + /* + * New thread, keep existing numa_preferred_nid which should be copied + * already by arch_dup_task_struct but stagger when scans start. + */ + if (mm) { + unsigned int delay; + + delay = min_t(unsigned int, task_scan_max(current), + current->numa_scan_period * mm_users * NSEC_PER_MSEC); + delay += 2 * TICK_NSEC; + p->node_stamp = delay; + } +} + /* * Drive the periodic memory faults.. */ -- cgit v1.2.3 From b34920d4ce6e6fc9424c20a4be98676eb543122f Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 15 Jul 2019 11:25:07 +0100 Subject: sched/fair: Move task_numa_work() init to init_numa_balancing() We only need to set the callback_head worker function once, do it during sched_fork(). While at it, move the comment regarding double task_work addition to init_numa_balancing(), since the double add sentinel is first set there. Suggested-by: Peter Zijlstra Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Thomas Gleixner Cc: mgorman@suse.de Cc: riel@surriel.com Link: https://lkml.kernel.org/r/20190715102508.32434-3-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f0c488015649..fd391fc00ed8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2495,7 +2495,7 @@ void task_numa_work(struct callback_head *work) SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); - work->next = work; /* protect against double add */ + work->next = work; /* * Who cares about NUMA placement when they're dying. * @@ -2639,12 +2639,15 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) p->node_stamp = 0; p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; + /* Protect against double add, see task_tick_numa and task_numa_work */ p->numa_work.next = &p->numa_work; p->numa_faults = NULL; RCU_INIT_POINTER(p->numa_group, NULL); p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; + init_task_work(&p->numa_work, task_numa_work); + /* New address space, reset the preferred nid */ if (!(clone_flags & CLONE_VM)) { p->numa_preferred_nid = NUMA_NO_NODE; @@ -2693,10 +2696,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) curr->numa_scan_period = task_scan_start(curr); curr->node_stamp += period; - if (!time_before(jiffies, curr->mm->numa_next_scan)) { - init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ + if (!time_before(jiffies, curr->mm->numa_next_scan)) task_work_add(curr, work, true); - } } } -- cgit v1.2.3 From 9434f9f5d117302cc7ddf038e7879f6871dc7a81 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 15 Jul 2019 11:25:08 +0100 Subject: sched/fair: Change task_numa_work() storage to static There are no callers outside of fair.c. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mgorman@suse.de Cc: riel@surriel.com Link: https://lkml.kernel.org/r/20190715102508.32434-4-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fd391fc00ed8..b5546a15206c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2482,7 +2482,7 @@ static void reset_ptenuma_scan(struct task_struct *p) * The expensive part of numa migration is done from task_work context. * Triggered from task_tick_numa(). */ -void task_numa_work(struct callback_head *work) +static void task_numa_work(struct callback_head *work) { unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; -- cgit v1.2.3 From f6cad8df6b30a5d2bbbd2e698f74b4cafb9fb82b Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 1 Jul 2019 17:47:02 +0200 Subject: sched/fair: Fix imbalance due to CPU affinity The load_balance() has a dedicated mecanism to detect when an imbalance is due to CPU affinity and must be handled at parent level. In this case, the imbalance field of the parent's sched_group is set. The description of sg_imbalanced() gives a typical example of two groups of 4 CPUs each and 4 tasks each with a cpumask covering 1 CPU of the first group and 3 CPUs of the second group. Something like: { 0 1 2 3 } { 4 5 6 7 } * * * * But the load_balance fails to fix this UC on my octo cores system made of 2 clusters of quad cores. Whereas the load_balance is able to detect that the imbalanced is due to CPU affinity, it fails to fix it because the imbalance field is cleared before letting parent level a chance to run. In fact, when the imbalance is detected, the load_balance reruns without the CPU with pinned tasks. But there is no other running tasks in the situation described above and everything looks balanced this time so the imbalance field is immediately cleared. The imbalance field should not be cleared if there is no other task to move when the imbalance is detected. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/1561996022-28829-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b5546a15206c..9be36ffb5689 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9048,9 +9048,10 @@ more_balance: out_balanced: /* * We reach balance although we may have faced some affinity - * constraints. Clear the imbalance flag if it was set. + * constraints. Clear the imbalance flag only if other tasks got + * a chance to move and fix the imbalance. */ - if (sd_parent) { + if (sd_parent && !(env.flags & LBF_ALL_PINNED)) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; if (*group_imbalance) -- cgit v1.2.3 From 84ec3a0787086fcd25f284f59b3aa01fd6fc0a5d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Jun 2019 09:52:38 -0700 Subject: time/tick-broadcast: Fix tick_broadcast_offline() lockdep complaint time/tick-broadcast: Fix tick_broadcast_offline() lockdep complaint The TASKS03 and TREE04 rcutorture scenarios produce the following lockdep complaint: WARNING: inconsistent lock state 5.2.0-rc1+ #513 Not tainted -------------------------------- inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. migration/1/14 [HC0[0]:SC0[0]:HE1:SE1] takes: (____ptrval____) (tick_broadcast_lock){?...}, at: tick_broadcast_offline+0xf/0x70 {IN-HARDIRQ-W} state was registered at: lock_acquire+0xb0/0x1c0 _raw_spin_lock_irqsave+0x3c/0x50 tick_broadcast_switch_to_oneshot+0xd/0x40 tick_switch_to_oneshot+0x4f/0xd0 hrtimer_run_queues+0xf3/0x130 run_local_timers+0x1c/0x50 update_process_times+0x1c/0x50 tick_periodic+0x26/0xc0 tick_handle_periodic+0x1a/0x60 smp_apic_timer_interrupt+0x80/0x2a0 apic_timer_interrupt+0xf/0x20 _raw_spin_unlock_irqrestore+0x4e/0x60 rcu_nocb_gp_kthread+0x15d/0x590 kthread+0xf3/0x130 ret_from_fork+0x3a/0x50 irq event stamp: 171 hardirqs last enabled at (171): [] trace_hardirqs_on_thunk+0x1a/0x1c hardirqs last disabled at (170): [] trace_hardirqs_off_thunk+0x1a/0x1c softirqs last enabled at (0): [] copy_process.part.56+0x650/0x1cb0 softirqs last disabled at (0): [<0000000000000000>] 0x0 [...] To reproduce, run the following rcutorture test: $ tools/testing/selftests/rcutorture/bin/kvm.sh --duration 5 --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" --configs "TASKS03 TREE04" It turns out that tick_broadcast_offline() was an innocent bystander. After all, interrupts are supposed to be disabled throughout take_cpu_down(), and therefore should have been disabled upon entry to tick_offline_cpu() and thus to tick_broadcast_offline(). This suggests that one of the CPU-hotplug notifiers was incorrectly enabling interrupts, and leaving them enabled on return. Some debugging code showed that the culprit was sched_cpu_dying(). It had irqs enabled after return from sched_tick_stop(). Which in turn had irqs enabled after return from cancel_delayed_work_sync(). Which is a wrapper around __cancel_work_timer(). Which can sleep in the case where something else is concurrently trying to cancel the same delayed work, and as Thomas Gleixner pointed out on IRC, sleeping is a decidedly bad idea when you are invoked from take_cpu_down(), regardless of the state you leave interrupts in upon return. Code inspection located no reason why the delayed work absolutely needed to be canceled from sched_tick_stop(): The work is not bound to the outgoing CPU by design, given that the whole point is to collect statistics without disturbing the outgoing CPU. This commit therefore simply drops the cancel_delayed_work_sync() from sched_tick_stop(). Instead, a new ->state field is added to the tick_work structure so that the delayed-work handler function sched_tick_remote() can avoid reposting itself. A cpu_is_offline() check is also added to sched_tick_remote() to avoid mucking with the state of an offlined CPU (though it does appear safe to do so). The sched_tick_start() and sched_tick_stop() functions also update ->state, and sched_tick_start() also schedules the delayed work if ->state indicates that it is not already in flight. Signed-off-by: Paul E. McKenney [ paulmck: Apply Peter Zijlstra and Frederic Weisbecker atomics feedback. ] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190625165238.GJ26519@linux.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 57 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..0b22e55cebe8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3486,8 +3486,36 @@ void scheduler_tick(void) struct tick_work { int cpu; + atomic_t state; struct delayed_work work; }; +/* Values for ->state, see diagram below. */ +#define TICK_SCHED_REMOTE_OFFLINE 0 +#define TICK_SCHED_REMOTE_OFFLINING 1 +#define TICK_SCHED_REMOTE_RUNNING 2 + +/* + * State diagram for ->state: + * + * + * TICK_SCHED_REMOTE_OFFLINE + * | ^ + * | | + * | | sched_tick_remote() + * | | + * | | + * +--TICK_SCHED_REMOTE_OFFLINING + * | ^ + * | | + * sched_tick_start() | | sched_tick_stop() + * | | + * V | + * TICK_SCHED_REMOTE_RUNNING + * + * + * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() + * and sched_tick_start() are happy to leave the state in RUNNING. + */ static struct tick_work __percpu *tick_work_cpu; @@ -3500,6 +3528,7 @@ static void sched_tick_remote(struct work_struct *work) struct task_struct *curr; struct rq_flags rf; u64 delta; + int os; /* * Handle the tick only if it appears the remote CPU is running in full @@ -3513,7 +3542,7 @@ static void sched_tick_remote(struct work_struct *work) rq_lock_irq(rq, &rf); curr = rq->curr; - if (is_idle_task(curr)) + if (is_idle_task(curr) || cpu_is_offline(cpu)) goto out_unlock; update_rq_clock(rq); @@ -3533,13 +3562,18 @@ out_requeue: /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough - * to keep scheduler internal stats reasonably up to date. + * to keep scheduler internal stats reasonably up to date. But + * first update state to reflect hotplug activity if required. */ - queue_delayed_work(system_unbound_wq, dwork, HZ); + os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); + WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); + if (os == TICK_SCHED_REMOTE_RUNNING) + queue_delayed_work(system_unbound_wq, dwork, HZ); } static void sched_tick_start(int cpu) { + int os; struct tick_work *twork; if (housekeeping_cpu(cpu, HK_FLAG_TICK)) @@ -3548,15 +3582,20 @@ static void sched_tick_start(int cpu) WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); - twork->cpu = cpu; - INIT_DELAYED_WORK(&twork->work, sched_tick_remote); - queue_delayed_work(system_unbound_wq, &twork->work, HZ); + os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); + WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); + if (os == TICK_SCHED_REMOTE_OFFLINE) { + twork->cpu = cpu; + INIT_DELAYED_WORK(&twork->work, sched_tick_remote); + queue_delayed_work(system_unbound_wq, &twork->work, HZ); + } } #ifdef CONFIG_HOTPLUG_CPU static void sched_tick_stop(int cpu) { struct tick_work *twork; + int os; if (housekeeping_cpu(cpu, HK_FLAG_TICK)) return; @@ -3564,7 +3603,10 @@ static void sched_tick_stop(int cpu) WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); - cancel_delayed_work_sync(&twork->work); + /* There cannot be competing actions, but don't rely on stop-machine. */ + os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); + WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); + /* Don't cancel, as this would mess up the state machine. */ } #endif /* CONFIG_HOTPLUG_CPU */ @@ -3572,7 +3614,6 @@ int __init sched_tick_offload_init(void) { tick_work_cpu = alloc_percpu(struct tick_work); BUG_ON(!tick_work_cpu); - return 0; } -- cgit v1.2.3 From 43e9f7f231e40e4534fc3a735da152911a085c16 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 26 Jun 2019 10:36:29 +0530 Subject: sched/fair: Start tracking SCHED_IDLE tasks count in cfs_rq Track how many tasks are present with SCHED_IDLE policy in each cfs_rq. This will be used by later commits. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Daniel Lezcano Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: chris.redpath@arm.com Cc: quentin.perret@linaro.org Cc: songliubraving@fb.com Cc: steven.sistare@oracle.com Cc: subhra.mazumdar@oracle.com Cc: tkjos@google.com Link: https://lkml.kernel.org/r/0d3cdc427fc68808ad5bccc40e86ed0bf9da8bb4.1561523542.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 ++++++++++++-- kernel/sched/sched.h | 3 ++- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9be36ffb5689..9ed5ab53872f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4555,7 +4555,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, dequeue = 1; + long task_delta, idle_task_delta, dequeue = 1; bool empty; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -4566,6 +4566,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rcu_read_unlock(); task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ @@ -4575,6 +4576,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (dequeue) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; if (qcfs_rq->load.weight) dequeue = 0; @@ -4614,7 +4616,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; int enqueue = 1; - long task_delta; + long task_delta, idle_task_delta; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4634,6 +4636,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) return; task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { if (se->on_rq) enqueue = 0; @@ -4642,6 +4645,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (enqueue) enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; if (cfs_rq_throttled(cfs_rq)) break; @@ -5255,6 +5259,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int idle_h_nr_running = task_has_idle_policy(p); /* * The code below (indirectly) updates schedutil which looks at @@ -5287,6 +5292,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; flags = ENQUEUE_WAKEUP; } @@ -5294,6 +5300,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; if (cfs_rq_throttled(cfs_rq)) break; @@ -5355,6 +5362,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + int idle_h_nr_running = task_has_idle_policy(p); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -5369,6 +5377,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; /* Don't dequeue parent if it has other entities besides