From a2775bbc1d58ce630517dfe86090c166f27d719f Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Tue, 12 Mar 2019 21:21:26 +0100 Subject: kernel/workqueue: Use __printf markup to silence compiler in function 'alloc_workqueue' Silence warnings (triggered at W=1) by adding relevant __printf attributes. kernel/workqueue.c:4249:2: warning: function 'alloc_workqueue' might be a candidate for 'gnu_printf' format attribute [-Wsuggest-attribute=format] Signed-off-by: Mathieu Malaterre Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4026d1871407..56b7cf898f10 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4208,6 +4208,7 @@ static int init_rescuer(struct workqueue_struct *wq) return 0; } +__printf(1, 4) struct workqueue_struct *alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...) -- cgit v1.2.3 From 95e0b46fcebd7dbf6850dee96046e4c4ddc7f69c Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 7 Mar 2019 09:16:24 +0800 Subject: audit: fix a memleak caused by auditing load module module.name will be allocated unconditionally when auditing load module, and audit_log_start() can fail with other reasons, or audit_log_exit maybe not called, caused module.name is not freed so free module.name in audit_free_context and __audit_syscall_exit unreferenced object 0xffff88af90837d20 (size 8): comm "modprobe", pid 1036, jiffies 4294704867 (age 3069.138s) hex dump (first 8 bytes): 69 78 67 62 65 00 ff ff ixgbe... backtrace: [<0000000008da28fe>] __audit_log_kern_module+0x33/0x80 [<00000000c1491e61>] load_module+0x64f/0x3850 [<000000007fc9ae3f>] __do_sys_init_module+0x218/0x250 [<0000000000d4a478>] do_syscall_64+0x117/0x400 [<000000004924ded8>] entry_SYSCALL_64_after_hwframe+0x49/0xbe [<000000007dc331dd>] 0xffffffffffffffff Fixes: ca86cad7380e3 ("audit: log module name on init_module") Signed-off-by: Zhang Yu Signed-off-by: Li RongQing [PM: manual merge fixup in __audit_syscall_exit()] Signed-off-by: Paul Moore --- kernel/auditsc.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d1eab1d4a930..fa7b8047aab8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -840,6 +840,13 @@ static inline void audit_proctitle_free(struct audit_context *context) context->proctitle.len = 0; } +static inline void audit_free_module(struct audit_context *context) +{ + if (context->type == AUDIT_KERN_MODULE) { + kfree(context->module.name); + context->module.name = NULL; + } +} static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; @@ -923,6 +930,7 @@ int audit_alloc(struct task_struct *tsk) static inline void audit_free_context(struct audit_context *context) { + audit_free_module(context); audit_free_names(context); unroll_tree_refs(context, NULL, 0); free_tree_refs(context); @@ -1266,7 +1274,6 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, "name="); if (context->module.name) { audit_log_untrustedstring(ab, context->module.name); - kfree(context->module.name); } else audit_log_format(ab, "(null)"); @@ -1697,6 +1704,7 @@ void __audit_syscall_exit(int success, long return_code) context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; + audit_free_module(context); audit_free_names(context); unroll_tree_refs(context, NULL, 0); audit_free_aux(context); -- cgit v1.2.3 From 8194fe94ab08f56ea55653df924647d23873d18c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 19 Mar 2019 10:45:09 -0700 Subject: kernel/workqueue: Document wq_worker_last_func() argument This patch avoids that the following warning is reported when building with W=1: kernel/workqueue.c:938: warning: Function parameter or member 'task' not described in 'wq_worker_last_func' Signed-off-by: Bart Van Assche Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 56b7cf898f10..21721faa923c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -913,6 +913,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) /** * wq_worker_last_func - retrieve worker's last work function + * @task: Task to retrieve last work function of. * * Determine the last function a worker executed. This is called from * the scheduler to get a worker's last known identity. -- cgit v1.2.3 From 73e65b88feb919f95bdb77c4ed35f69588cf27ee Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 19 Mar 2019 15:23:29 -0400 Subject: audit: connect LOGIN record to its syscall record Currently the AUDIT_LOGIN event is a standalone record that isn't connected to any other records that may be part of its syscall event. To avoid the confusion of generating two events, connect the records by using its syscall context. Please see the github issue https://github.com/linux-audit/audit-kernel/issues/110 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index c89ea48c70a6..b96bf69183f4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2220,7 +2220,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; -- cgit v1.2.3 From 2efa48fec0c344a6ca1bba66b15d63d38cf20199 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 20 Mar 2019 21:59:22 +0800 Subject: audit: Make audit_log_cap and audit_copy_inode static Fix sparse warning: kernel/auditsc.c:1150:6: warning: symbol 'audit_log_cap' was not declared. Should it be static? kernel/auditsc.c:1908:6: warning: symbol 'audit_copy_inode' was not declared. Should it be static? Signed-off-by: YueHaibing Acked-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fa7b8047aab8..17b0007fafc2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1147,7 +1147,8 @@ out: kfree(buf_head); } -void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +static void audit_log_cap(struct audit_buffer *ab, char *prefix, + kernel_cap_t *cap) { int i; @@ -1905,8 +1906,9 @@ static inline int audit_copy_fcaps(struct audit_names *name, } /* Copy inode data into an audit_names. */ -void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - struct inode *inode, unsigned int flags) +static void audit_copy_inode(struct audit_names *name, + const struct dentry *dentry, + struct inode *inode, unsigned int flags) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; -- cgit v1.2.3 From 16add411645cff83360086e102daa67b25f1e39a Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:30:18 +0300 Subject: syscall_get_arch: add "struct task_struct *" argument This argument is required to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request: syscall_get_arch() is going to be called from ptrace_request() along with syscall_get_nr(), syscall_get_arguments(), syscall_get_error(), and syscall_get_return_value() functions with a tracee as their argument. The primary intent is that the triple (audit_arch, syscall_nr, arg1..arg6) should describe what system call is being called and what its arguments are. Reverts: 5e937a9ae913 ("syscall_get_arch: remove useless function arguments") Reverts: 1002d94d3076 ("syscall.h: fix doc text for syscall_get_arch()") Reviewed-by: Andy Lutomirski # for x86 Reviewed-by: Palmer Dabbelt Acked-by: Paul Moore Acked-by: Paul Burton # MIPS parts Acked-by: Michael Ellerman (powerpc) Acked-by: Kees Cook # seccomp parts Acked-by: Mark Salter # for the c6x bit Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Oleg Nesterov Cc: x86@kernel.org Cc: linux-alpha@vger.kernel.org Cc: linux-snps-arc@lists.infradead.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: uclinux-h8-devel@lists.sourceforge.jp Cc: linux-hexagon@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-mips@vger.kernel.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-riscv@lists.infradead.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: sparclinux@vger.kernel.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linux-arch@vger.kernel.org Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. Levin Signed-off-by: Paul Moore --- kernel/auditsc.c | 4 ++-- kernel/seccomp.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 17b0007fafc2..98a98e6dca05 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1636,7 +1636,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, return; } - context->arch = syscall_get_arch(); + context->arch = syscall_get_arch(current); context->major = major; context->argv[0] = a1; context->argv[1] = a2; @@ -2590,7 +2590,7 @@ void audit_seccomp(unsigned long syscall, long signr, int code) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", - signr, syscall_get_arch(), syscall, + signr, syscall_get_arch(current), syscall, in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54a0347ca812..36f36ab00f48 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -148,7 +148,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) unsigned long args[6]; sd->nr = syscall_get_nr(task, regs); - sd->arch = syscall_get_arch(); + sd->arch = syscall_get_arch(task); syscall_get_arguments(task, regs, 0, 6, args); sd->args[0] = args[0]; sd->args[1] = args[1]; @@ -591,7 +591,7 @@ static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason info->si_code = SYS_SECCOMP; info->si_call_addr = (void __user *)KSTK_EIP(current); info->si_errno = reason; - info->si_arch = syscall_get_arch(); + info->si_arch = syscall_get_arch(current); info->si_syscall = syscall; } -- cgit v1.2.3 From 0f3adc288df8ba2ac2fea0a8e4890f9fb4cd075d Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:53:59 +0800 Subject: bpf: track references based on is_acquire_func So far, the verifier only acquires reference tracking state for RET_PTR_TO_SOCKET_OR_NULL. Instead of extending this for every new return type which desires these semantics, acquire reference tracking state iff the called helper is an acquire function. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 86f9cd5d1c4e..868a82ad5597 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3147,19 +3147,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - if (is_acquire_function(func_id)) { - int id = acquire_reference_state(env, insn_idx); - - if (id < 0) - return id; - /* For mark_ptr_or_null_reg() */ - regs[BPF_REG_0].id = id; - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = id; - } else { - /* For mark_ptr_or_null_reg() */ - regs[BPF_REG_0].id = ++env->id_gen; - } + regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; @@ -3170,9 +3158,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } - if (is_ptr_cast_function(func_id)) + if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + } else if (is_acquire_function(func_id)) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = id; + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = id; + } do_refine_retval_range(regs, fn->ret_type, func_id, &meta); -- cgit v1.2.3 From 85a51f8c28b9812642d76db6889f3f39dc3fbab3 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:00 +0800 Subject: bpf: allow helpers to return PTR_TO_SOCK_COMMON It's currently not possible to access timewait or request sockets from eBPF, since there is no way to return a PTR_TO_SOCK_COMMON from a helper. Introduce RET_PTR_TO_SOCK_COMMON to enable this behaviour. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 868a82ad5597..a476e13201d6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3148,6 +3148,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; -- cgit v1.2.3 From edbf8c01de5a104a71ed6df2bf6421ceb2836a8e Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:01 +0800 Subject: bpf: add skc_lookup_tcp helper Allow looking up a sock_common. This gives eBPF programs access to timewait and request sockets. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a476e13201d6..dffeec3706ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -369,7 +369,8 @@ static bool is_release_function(enum bpf_func_id func_id) static bool is_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || - func_id == BPF_FUNC_sk_lookup_udp; + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp; } static bool is_ptr_cast_function(enum bpf_func_id func_id) -- cgit v1.2.3 From d7dcf26ff0ffd7b56fe2b09ed7f1867589f3cdf1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Mar 2019 23:48:21 +0100 Subject: softirq: Remove tasklet_hrtimer There are no more users of this interface. Remove it. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: David S. Miller Cc: netdev@vger.kernel.org Link: https://lkml.kernel.org/r/20190301224821.29843-4-bigeasy@linutronix.de --- kernel/softirq.c | 51 --------------------------------------------------- 1 file changed, 51 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 10277429ed84..2c3382378d94 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -573,57 +573,6 @@ void tasklet_kill(struct tasklet_struct *t) } EXPORT_SYMBOL(tasklet_kill); -/* - * tasklet_hrtimer - */ - -/* - * The trampoline is called when the hrtimer expires. It schedules a tasklet - * to run __tasklet_hrtimer_trampoline() which in turn will call the intended - * hrtimer callback, but from softirq context. - */ -static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) -{ - struct tasklet_hrtimer *ttimer = - container_of(timer, struct tasklet_hrtimer, timer); - - tasklet_hi_schedule(&ttimer->tasklet); - return HRTIMER_NORESTART; -} - -/* - * Helper function which calls the hrtimer callback from - * tasklet/softirq context - */ -static void __tasklet_hrtimer_trampoline(unsigned long data) -{ - struct tasklet_hrtimer *ttimer = (void *)data; - enum hrtimer_restart restart; - - restart = ttimer->function(&ttimer->timer); - if (restart != HRTIMER_NORESTART) - hrtimer_restart(&ttimer->timer); -} - -/** - * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks - * @ttimer: tasklet_hrtimer which is initialized - * @function: hrtimer callback function which gets called from softirq context - * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) - * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) - */ -void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, - enum hrtimer_restart (*function)(struct hrtimer *), - clockid_t which_clock, enum hrtimer_mode mode) -{ - hrtimer_init(&ttimer->timer, which_clock, mode); - ttimer->timer.function = __hrtimer_tasklet_trampoline; - tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, - (unsigned long)ttimer); - ttimer->function = function; -} -EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); - void __init softirq_init(void) { int cpu; -- cgit v1.2.3 From 3b0f31f2b8c9fb348e4530b88f6b64f9621f83d6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 21 Mar 2019 22:51:02 +0100 Subject: genetlink: make policy common to family Since maxattr is common, the policy can't really differ sanely, so make it common as well. The only user that did in fact manage to make a non-common policy is taskstats, which has to be really careful about it (since it's still using a common maxattr!). This is no longer supported, but we can fake it using pre_doit. This reduces the size of e.g. nl80211.o (which has lots of commands): text data bss dec hex filename 398745 14323 2240 415308 6564c net/wireless/nl80211.o (before) 397913 14331 2240 414484 65314 net/wireless/nl80211.o (after) -------------------------------- -832 +8 0 -824 Which is obviously just 8 bytes for each command, and an added 8 bytes for the new policy pointer. I'm not sure why the ops list is counted as .text though. Most of the code transformations were done using the following spatch: @ops@ identifier OPS; expression POLICY; @@ struct genl_ops OPS[] = { ..., { - .policy = POLICY, }, ... }; @@ identifier ops.OPS; expression ops.POLICY; identifier fam; expression M; @@ struct genl_family fam = { .ops = OPS, .maxattr = M, + .policy = POLICY, ... }; This also gets rid of devlink_nl_cmd_region_read_dumpit() accessing the cb->data as ops, which we want to change in a later genl patch. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- kernel/taskstats.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4e62a4a8fa91..1b942a7caf26 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -650,16 +650,37 @@ static const struct genl_ops taskstats_ops[] = { { .cmd = TASKSTATS_CMD_GET, .doit = taskstats_user_cmd, - .policy = taskstats_cmd_get_policy, - .flags = GENL_ADMIN_PERM, + /* policy enforced later */ + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_HASPOL, }, { .cmd = CGROUPSTATS_CMD_GET, .doit = cgroupstats_user_cmd, - .policy = cgroupstats_cmd_get_policy, + /* policy enforced later */ + .flags = GENL_CMD_CAP_HASPOL, }, }; +static int taskstats_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + const struct nla_policy *policy = NULL; + + switch (ops->cmd) { + case TASKSTATS_CMD_GET: + policy = taskstats_cmd_get_policy; + break; + case CGROUPSTATS_CMD_GET: + policy = cgroupstats_cmd_get_policy; + break; + default: + return -EINVAL; + } + + return nlmsg_validate(info->nlhdr, GENL_HDRLEN, TASKSTATS_CMD_ATTR_MAX, + policy, info->extack); +} + static struct genl_family family __ro_after_init = { .name = TASKSTATS_GENL_NAME, .version = TASKSTATS_GENL_VERSION, @@ -667,6 +688,7 @@ static struct genl_family family __ro_after_init = { .module = THIS_MODULE, .ops = taskstats_ops, .n_ops = ARRAY_SIZE(taskstats_ops), + .pre_doit = taskstats_pre_doit, }; /* Needed early in initialization */ -- cgit v1.2.3 From e1e41b6ce5f9c1a80bf4f2404ec5ab11c6c5a2ad Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 18 Mar 2019 20:55:56 +0100 Subject: timekeeping: Consistently use unsigned int for seqcount snapshot The timekeeping code uses a random mix of "unsigned long" and "unsigned int" for the seqcount snapshots (ratio 14:12). Since the seqlock.h API is entirely based on unsigned int, use that throughout. Signed-off-by: Rasmus Villemoes Signed-off-by: Thomas Gleixner Cc: Frederic Weisbecker Cc: John Stultz Cc: Stephen Boyd Link: https://lkml.kernel.org/r/20190318195557.20773-1-linux@rasmusvillemoes.dk --- kernel/time/jiffies.c | 2 +- kernel/time/sched_clock.c | 4 ++-- kernel/time/tick-common.c | 2 +- kernel/time/tick-sched.c | 3 ++- kernel/time/timekeeping.c | 18 +++++++++--------- 5 files changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index dc1b6f1929f9..95f8f3304c19 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -63,7 +63,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { - unsigned long seq; + unsigned int seq; u64 ret; do { diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 094b82ca95e5..16b80c2b4fe8 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -94,7 +94,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) unsigned long long notrace sched_clock(void) { u64 cyc, res; - unsigned long seq; + unsigned int seq; struct clock_read_data *rd; do { @@ -267,7 +267,7 @@ void __init generic_sched_clock_init(void) */ static u64 notrace suspended_sched_clock_read(void) { - unsigned long seq = raw_read_seqcount(&cd.seq); + unsigned int seq = raw_read_seqcount(&cd.seq); return cd.read_data[seq & 1].epoch_cyc; } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 529143b4c8d2..561641b2153f 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -149,7 +149,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) !tick_broadcast_oneshot_active()) { clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); } else { - unsigned long seq; + unsigned int seq; ktime_t next; do { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6fa52cd6df0b..b50f6f22c88e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -645,7 +645,8 @@ static inline bool local_timer_softirq_pending(void) static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; - unsigned long seq, basejiff; + unsigned long basejiff; + unsigned int seq; /* Read jiffies and the time when jiffies were updated last */ do { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f986e1918d12..540145da33da 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -720,7 +720,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) void ktime_get_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -829,7 +829,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { ktime_t *offset = offsets[offs]; - unsigned long seq; + unsigned int seq; ktime_t tconv; do { @@ -960,7 +960,7 @@ time64_t __ktime_get_real_seconds(void) void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; ktime_t base_raw; ktime_t base_real; u64 nsec_raw; @@ -1122,7 +1122,7 @@ int get_device_system_crosststamp(int (*get_time_fn) ktime_t base_real, base_raw; u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; - unsigned long seq; + unsigned int seq; bool do_interp; int ret; @@ -1409,7 +1409,7 @@ int timekeeping_notify(struct clocksource *clock) void ktime_get_raw_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; u64 nsecs; do { @@ -1431,7 +1431,7 @@ EXPORT_SYMBOL(ktime_get_raw_ts64); int timekeeping_valid_for_hres(void) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; int ret; do { @@ -1450,7 +1450,7 @@ int timekeeping_valid_for_hres(void) u64 timekeeping_max_deferment(void) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; u64 ret; do { @@ -2150,7 +2150,7 @@ EXPORT_SYMBOL_GPL(getboottime64); void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); @@ -2164,7 +2164,7 @@ void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; - unsigned long seq; + unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); -- cgit v1.2.3 From 1b72d43237980eab9b6ae6bb8181e51c840377e6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Mar 2019 16:39:20 +0100 Subject: tick: Remove outgoing CPU from broadcast masks Valentin reported that unplugging a CPU occasionally results in a warning in the tick broadcast code which is triggered when an offline CPU is in the broadcast mask. This happens because the outgoing CPU is not removing itself from the broadcast masks, especially not from the broadcast_force_mask. The removal happens on the control CPU after the outgoing CPU is dead. It's a long standing issue, but the warning is harmless. Rework the hotplug mechanism so that the outgoing CPU removes itself from the broadcast masks after disabling interrupts and removing itself from the online mask. Reported-by: Valentin Schneider Signed-off-by: Thomas Gleixner Tested-by: Valentin Schneider Cc: Frederic Weisbecker Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1903211540180.1784@nanos.tec.linutronix.de --- kernel/cpu.c | 2 ++ kernel/time/clockevents.c | 18 ++++++++++++++++-- kernel/time/tick-broadcast.c | 40 +++++++++++++++++++--------------------- kernel/time/tick-internal.h | 10 ++++++---- 4 files changed, 43 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 025f419d16f6..f69ba38573c2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -844,6 +844,8 @@ static int take_cpu_down(void *_param) /* Give up timekeeping duties */ tick_handover_do_timer(); + /* Remove CPU from timer broadcasting */ + tick_offline_cpu(cpu); /* Park the stopper thread */ stop_machine_park(cpu); return 0; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5e77662dd2d9..f5490222e134 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -611,6 +611,22 @@ void clockevents_resume(void) } #ifdef CONFIG_HOTPLUG_CPU + +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +/** + * tick_offline_cpu - Take CPU out of the broadcast mechanism + * @cpu: The outgoing CPU + * + * Called on the outgoing CPU after it took itself offline. + */ +void tick_offline_cpu(unsigned int cpu) +{ + raw_spin_lock(&clockevents_lock); + tick_broadcast_offline(cpu); + raw_spin_unlock(&clockevents_lock); +} +# endif + /** * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu */ @@ -621,8 +637,6 @@ void tick_cleanup_dead_cpu(int cpu) raw_spin_lock_irqsave(&clockevents_lock, flags); - tick_shutdown_broadcast_oneshot(cpu); - tick_shutdown_broadcast(cpu); tick_shutdown(cpu); /* * Unregister the clock event devices which were diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index ee834d4fb814..0283523de045 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -36,10 +36,12 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); +static void tick_broadcast_oneshot_offline(unsigned int cpu); #else static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_clear_oneshot(int cpu) { } static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } +static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { } #endif /* @@ -433,27 +435,29 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) } #ifdef CONFIG_HOTPLUG_CPU -/* - * Remove a CPU from broadcasting - */ -void tick_shutdown_broadcast(unsigned int cpu) +static void tick_shutdown_broadcast(void) { - struct clock_event_device *bc; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - - bc = tick_broadcast_device.evtdev; - cpumask_clear_cpu(cpu, tick_broadcast_mask); - cpumask_clear_cpu(cpu, tick_broadcast_on); + struct clock_event_device *bc = tick_broadcast_device.evtdev; if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpumask_empty(tick_broadcast_mask)) clockevents_shutdown(bc); } +} - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +/* + * Remove a CPU from broadcasting + */ +void tick_broadcast_offline(unsigned int cpu) +{ + raw_spin_lock(&tick_broadcast_lock); + cpumask_clear_cpu(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_broadcast_on); + tick_broadcast_oneshot_offline(cpu); + tick_shutdown_broadcast(); + raw_spin_unlock(&tick_broadcast_lock); } + #endif void tick_suspend_broadcast(void) @@ -950,14 +954,10 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) } /* - * Remove a dead CPU from broadcasting + * Remove a dying CPU from broadcasting */ -void tick_shutdown_broadcast_oneshot(unsigned int cpu) +static void tick_broadcast_oneshot_offline(unsigned int cpu) { - unsigned long flags; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - /* * Clear the broadcast masks for the dead cpu, but do not stop * the broadcast device! @@ -965,8 +965,6 @@ void tick_shutdown_broadcast_oneshot(unsigned int cpu) cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); - - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } #endif diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e277284c2831..7b2496136729 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -64,7 +64,6 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); extern void tick_install_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); -extern void tick_shutdown_broadcast(unsigned int cpu); extern void tick_suspend_broadcast(void); extern void tick_resume_broadcast(void); extern bool tick_resume_check_broadcast(void); @@ -78,7 +77,6 @@ static inline void tick_install_broadcast_device(struct clock_event_device *dev) static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } -static inline void tick_shutdown_broadcast(unsigned int cpu) { } static inline void tick_suspend_broadcast(void) { } static inline void tick_resume_broadcast(void) { } static inline bool tick_resume_check_broadcast(void) { return false; } @@ -128,19 +126,23 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } /* Functions related to oneshot broadcasting */ #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) extern void tick_broadcast_switch_to_oneshot(void); -extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); extern int tick_broadcast_oneshot_active(void); extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); extern struct cpumask *tick_get_broadcast_oneshot_mask(void); #else /* !(BROADCAST && ONESHOT): */ static inline void tick_broadcast_switch_to_oneshot(void) { } -static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } #endif /* !(BROADCAST && ONESHOT) */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) +extern void tick_broadcast_offline(unsigned int cpu); +#else +static inline void tick_broadcast_offline(unsigned int cpu) { } +#endif + /* NO_HZ_FULL internal */ #ifdef CONFIG_NO_HZ_FULL extern void tick_nohz_init(void); -- cgit v1.2.3 From d6b87eaf10bd061914f6d277d7428b3285d8850e Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Mar 2019 13:09:18 +0100 Subject: tick/sched: Update tick_sched struct documentation Adapt the documentation order of struct members to the effective order of struct members and add missing descriptions. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: fweisbec@gmail.com Cc: peterz@infradead.org Link: https://lkml.kernel.org/r/20190321120921.16463-2-anna-maria@linutronix.de --- kernel/time/tick-sched.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 6de959a854b2..4fb06527cf64 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -24,12 +24,19 @@ enum tick_nohz_mode { * struct tick_sched - sched tick emulation and no idle tick control/stats * @sched_timer: hrtimer to schedule the periodic tick in high * resolution mode + * @check_clocks: Notification mechanism about clocksource changes + * @nohz_mode: Mode - one state of tick_nohz_mode + * @inidle: Indicator that the CPU is in the tick idle mode + * @tick_stopped: Indicator that the idle tick has been stopped + * @idle_active: Indicator that the CPU is actively in the tick idle mode; + * it is resetted during irq handling phases. + * @do_timer_lst: CPU was the last one doing do_timer before going idle + * @got_idle_tick: Tick timer function has run with @inidle set * @last_tick: Store the last tick expiry time when the tick * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. * @next_tick: Next tick to be fired when in dynticks mode. - * @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls * @idle_sleeps: Number of idle calls, where the sched tick was stopped @@ -40,8 +47,8 @@ enum tick_nohz_mode { * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) * @timer_expires_base: Base time clock monotonic for @timer_expires - * @do_timer_lst: CPU was the last one doing do_timer before going idle - * @got_idle_tick: Tick timer function has run with @inidle set + * @next_timer: Expiry time of next expiring timer for debugging purpose only + * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick */ struct tick_sched { struct hrtimer sched_timer; -- cgit v1.2.3 From dc1e7dc5ac6254ba0502323381a7ec847e408f1d Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Mar 2019 13:09:19 +0100 Subject: timer: Move trace point to get proper index When placing the timer_start trace point before the timer wheel bucket index is calculated, the index information in the trace point is useless. It is not possible to simply move the debug_activate() call after the index calculation, because debug_object_activate() needs to be called before touching the object. Therefore split debug_activate() and move the trace point into enqueue_timer() after the new index has been calculated. The debug_object_activate() call remains at the original place. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: fweisbec@gmail.com Cc: peterz@infradead.org Cc: Steven Rostedt Link: https://lkml.kernel.org/r/20190321120921.16463-3-anna-maria@linutronix.de --- kernel/time/timer.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2fce056f8a49..8d7918ae4d0c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -536,6 +536,8 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); + + trace_timer_start(timer, timer->expires, timer->flags); } static void @@ -757,13 +759,6 @@ static inline void debug_init(struct timer_list *timer) trace_timer_init(timer); } -static inline void -debug_activate(struct timer_list *timer, unsigned long expires) -{ - debug_timer_activate(timer); - trace_timer_start(timer, expires, timer->flags); -} - static inline void debug_deactivate(struct timer_list *timer) { debug_timer_deactivate(timer); @@ -1037,7 +1032,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } } - debug_activate(timer, expires); + debug_timer_activate(timer); timer->expires = expires; /* @@ -1171,7 +1166,7 @@ void add_timer_on(struct timer_list *timer, int cpu) } forward_timer_base(base); - debug_activate(timer, timer->expires); + debug_timer_activate(timer); internal_add_timer(base, timer); raw_spin_unlock_irqrestore(&base->lock, flags); } -- cgit v1.2.3 From f28d3d5346e97e60c81f933ac89ccf015430e5cf Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Mar 2019 13:09:21 +0100 Subject: timer/trace: Improve timer tracing Timers are added to the timer wheel off by one. This is required in case a timer is queued directly before incrementing jiffies to prevent early timer expiry. When reading a timer trace and relying only on the expiry time of the timer in the timer_start trace point and on the now in the timer_expiry_entry trace point, it seems that the timer fires late. With the current timer_expiry_entry trace point information only now=jiffies is printed but not the value of base->clk. This makes it impossible to draw a conclusion to the index of base->clk and makes it impossible to examine timer problems without additional trace points. Therefore add the base->clk value to the timer_expire_entry trace point, to be able to calculate the index the timer base is located at during collecting expired timers. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: fweisbec@gmail.com Cc: peterz@infradead.org Cc: Steven Rostedt Link: https://lkml.kernel.org/r/20190321120921.16463-5-anna-maria@linutronix.de --- kernel/time/timer.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8d7918ae4d0c..a9b1bbc2d88d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1293,7 +1293,9 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *)) +static void call_timer_fn(struct timer_list *timer, + void (*fn)(struct timer_list *), + unsigned long baseclk) { int count = preempt_count(); @@ -1316,7 +1318,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list */ lock_map_acquire(&lockdep_map); - trace_timer_expire_entry(timer); + trace_timer_expire_entry(timer, baseclk); fn(timer); trace_timer_expire_exit(timer); @@ -1337,6 +1339,13 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list static void expire_timers(struct timer_base *base, struct hlist_head *head) { + /* + * This value is required only for tracing. base->clk was + * incremented directly before expire_timers was called. But expiry + * is related to the old base->clk value. + */ + unsigned long baseclk = base->clk - 1; + while (!hlist_empty(head)) { struct timer_list *timer; void (*fn)(struct timer_list *); @@ -1350,11 +1359,11 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); - call_timer_fn(timer, fn); + call_timer_fn(timer, fn, baseclk); raw_spin_lock(&base->lock); } else { raw_spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn); + call_timer_fn(timer, fn, baseclk); raw_spin_lock_irq(&base->lock); } } -- cgit v1.2.3 From 59c39840f5abf4a71e1810a8da71aaccd6c17d26 Mon Sep 17 00:00:00 2001 From: Prasad Sodagudi Date: Sun, 24 Mar 2019 07:57:04 -0700 Subject: genirq: Prevent use-after-free and work list corruption When irq_set_affinity_notifier() replaces the notifier, then the reference count on the old notifier is dropped which causes it to be freed. But nothing ensures that the old notifier is not longer queued in the work list. If it is queued this results in a use after free and possibly in work list corruption. Ensure that the work is canceled before the reference is dropped. Signed-off-by: Prasad Sodagudi Signed-off-by: Thomas Gleixner Cc: marc.zyngier@arm.com Link: https://lkml.kernel.org/r/1553439424-6529-1-git-send-email-psodagud@codeaurora.org --- kernel/irq/manage.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1401afa0d58a..53a081392115 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -357,8 +357,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) desc->affinity_notify = notify; raw_spin_unlock_irqrestore(&desc->lock, flags); - if (old_notify) + if (old_notify) { + cancel_work_sync(&old_notify->work); kref_put(&old_notify->kref, old_notify->release); + } return 0; } -- cgit v1.2.3 From e85e6a21b2b5f31148cc3f2e785262b37c3e1ec7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Jan 2019 15:30:15 -0800 Subject: rcu: Unconditionally expedite during suspend/hibernate The rcu_pm_notify() function refuses to switch to/from expedited grace periods on systems with more than 256 CPUs due to the serialized initialization of expedited grace periods. However, expedited grace periods are now initialized in parallel, removing this concern. This commit therefore removes the checks from rcu_pm_notify(), so that expedited grace periods are used unconditionally during suspend/resume and hibernate/wake operations. As always, real-time workloads wishing to completely avoid expedited grace periods can use the rcupdate.rcu_normal= kernel parameter. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..95e3250b7b6e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3559,13 +3559,11 @@ static int rcu_pm_notify(struct notifier_block *self, switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: - if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_expedite_gp(); + rcu_expedite_gp(); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: - if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_unexpedite_gp(); + rcu_unexpedite_gp(); break; default: break; -- cgit v1.2.3 From 671a63517cf983ad8eaa324167165cef245ab744 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 19 Jan 2019 11:14:18 -0500 Subject: rcu: Avoid unnecessary softirq when system is idle When there are no callbacks pending on an idle system, I noticed that RCU softirq is continuously firing. During this the cpu_no_qs is set to false, and core_needs_qs is set to true indefinitely. This causes rcu_process_callbacks to be repeatedly called, even though the node corresponding to the CPU has that CPU's mask bit cleared and the system is idle. I believe the race is when such mask clearing is done during idle CPU scan of the quiescent state forcing stage in the kthread instead of the softirq. Since the rnp mask is cleared, but the flags on the CPU's rdp are not cleared, the CPU thinks it still needs to report to core RCU. Cure this by clearing the core_needs_qs flag when the CPU detects that its node is already updated which will avoid the unwanted softirq raises to the benefit of real-time systems. Test: Ran rcutorture for various tree RCU configs. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 95e3250b7b6e..2f78a115d34c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2296,6 +2296,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) } mask = rdp->grpmask; if ((rnp->qsmask & mask) == 0) { + rdp->core_needs_qs = false; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { rdp->core_needs_qs = false; -- cgit v1.2.3 From 18d7e40679ef574e428f893101be1c0035e95ee3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 24 Jan 2019 21:14:37 +0300 Subject: rcu: rcu_qs -- Use raise_softirq_irqoff to not save irqs twice The rcu_qs is disabling IRQs by self so no need to do the same in raise_softirq but instead we can save some cycles using raise_softirq_irqoff directly. CC: Paul E. McKenney Signed-off-by: Cyrill Gorcunov Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 911bd9076d43..477b4eb44af5 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -52,7 +52,7 @@ void rcu_qs(void) local_irq_save(flags); if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; - raise_softirq(RCU_SOFTIRQ); + raise_softirq_irqoff(RCU_SOFTIRQ); } local_irq_restore(flags); } -- cgit v1.2.3 From 884157cef0acf05648fe921d80c680afababb428 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Feb 2019 07:21:29 -0800 Subject: rcu: Make exit_rcu() handle non-preempted RCU readers The purpose of exit_rcu() is to handle cases where buggy code causes a task to exit within an RCU read-side critical section. It currently does that in the case where said RCU read-side critical section was preempted at least once, but fails to handle cases where preemption did not occur. This case needs to be handled because otherwise the final context switch away from the exiting task will incorrectly behave as if task exit were instead a preemption of an RCU read-side critical section, and will therefore queue the exiting task. The exiting task will have exited, and thus won't ever execute rcu_read_unlock(), which means that it will remain queued forever, blocking all subsequent grace periods, and eventually resulting in OOM. Although this is arguably better than letting grace periods proceed and having a later rcu_read_unlock() access the now-freed task structure that once belonged to the exiting tasks, it would obviously be better to correctly handle this case. This commit therefore sets ->rcu_read_lock_nesting to 1 in that case, so that the subsequence call to __rcu_read_unlock() causes the exiting task to exit its dangling RCU read-side critical section. Note that deferred quiescent states need not be considered. The reason is that removing the task from the ->blkd_tasks[] list in the call to rcu_preempt_deferred_qs() handles the per-task component of any deferred quiescent state, and all other components of any deferred quiescent state are associated with the CPU, which isn't going anywhere until some later CPU-hotplug operation, which will report any remaining deferred quiescent states from within the rcu_report_dead() function. Note also that negative values of ->rcu_read_lock_nesting need not be considered. First, these won't show up in exit_rcu() unless there is a serious bug in RCU, and second, setting ->rcu_read_lock_nesting sets the state so that the RCU read-side critical section will be exited normally. Again, this code has no effect unless there has been some prior bug that prevents a task from leaving an RCU read-side critical section before exiting. Furthermore, there have been no reports of the bug fixed by this commit appearing in production. This commit is therefore absolutely -not- recommended for backporting to -stable. Reported-by: ABHISHEK DUBEY Reported-by: BHARATH Y MOURYA Reported-by: Aravinda Prasad Signed-off-by: Paul E. McKenney Tested-by: ABHISHEK DUBEY --- kernel/rcu/tree_plugin.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 97dba50f6fb2..d408661d5fb7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -804,19 +804,25 @@ static void rcu_flavor_sched_clock_irq(int user) /* * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. + * critical section, clean up if so. No need to issue warnings, as + * debug_check_no_locks_held() already does this if lockdep is enabled. + * Besides, if this function does anything other than just immediately + * return, there was a bug of some sort. Spewing warnings from this + * function is like as not to simply obscure important prior warnings. */ void exit_rcu(void) { struct task_struct *t = current; - if (likely(list_empty(¤t->rcu_node_entry))) + if (unlikely(!list_empty(¤t->rcu_node_entry))) { + t->rcu_read_lock_nesting = 1; + barrier(); + t->rcu_read_unlock_special.b.blocked = true; + } else if (unlikely(t->rcu_read_lock_nesting)) { + t->rcu_read_lock_nesting = 1; + } else { return; - t->rcu_read_lock_nesting = 1; - barrier(); - t->rcu_read_unlock_special.b.blocked = true; + } __rcu_read_unlock(); rcu_preempt_deferred_qs(current); } -- cgit v1.2.3 From 3ffe3d1adc0b6cfb9b24db9995a537bb0aa30a8b Mon Sep 17 00:00:00 2001 From: Liu Song Date: Thu, 21 Feb 2019 22:13:27 +0800 Subject: rcu: Set rcutree.kthread_prio sysfs access to read-only The rcutree.kthread_prio kernel-boot parameter is used to set the priority for boost (rcub), per-CPU (rcuc), and grace-period (rcu_preempt or rcu_sched) kthreads. It is also used by rcutorture to check whether it is possible to meaningfully test RCU priority boosting. However, all of these cases will either ignore or be confused by any post-boot changes to rcutree.kthread_prio. Note that the user really can change the priorities of all of these kthreads using chrt, given sufficient privileges. Therefore, the read-write nature of sysfs access to rcutree.kthread_prio is thus at best an attractive nuisance. This commit therefore changes sysfs access to rcutree.kthread_prio to be read-only. Signed-off-by: Liu Song Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2f78a115d34c..296131450414 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -149,7 +149,7 @@ static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; -module_param(kthread_prio, int, 0644); +module_param(kthread_prio, int, 0444); /* Delay in jiffies for grace-period initialization delays, debug only. */ -- cgit v1.2.3 From b2eb85b49a576515fb845cb12568b173c2bedffc Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Sat, 2 Mar 2019 17:25:19 +0900 Subject: rcu: Move common code out of if-else block As the result of recent addition of "rdp->core_needs_qs = false;" in the "if" block, now both branches of the if-else have the same assignment. Factor it out and reduce line count. Signed-off-by: Akira Yokosawa Cc: Joel Fernandes Signed-off-by: Paul E. McKenney Acked-by: Joel Fernandes (Google) --- kernel/rcu/tree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 296131450414..5aefd36ac648 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2295,12 +2295,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { - rdp->core_needs_qs = false; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { - rdp->core_needs_qs = false; - /* * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. -- cgit v1.2.3 From da8739f23fadf05809c6c37c327367b229467045 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Mar 2019 15:28:19 -0800 Subject: rcu: Allow rcu_nocbs= to specify all CPUs Currently, the rcu_nocbs= kernel boot parameter requires that a specific list of CPUs be specified, and has no way to say "all of them". As noted by user RavFX in a comment to Phoronix topic 1002538, this is an inconvenient side effect of the removal of the RCU_NOCB_CPU_ALL Kconfig option. This commit therefore enables the rcu_nocbs= kernel boot parameter to be given the string "all", as in "rcu_nocbs=all" to specify that all CPUs on the system are to have their RCU callbacks offloaded. Another approach would be to make cpulist_parse() check for "all", but there are uses of cpulist_parse() that do other checking, which could conflict with an "all". This commit therefore focuses on the specific use of cpulist_parse() in rcu_nocb_setup(). Just a note to other people who would like changes to Linux-kernel RCU: If you send your requests to me directly, they might get fixed somewhat faster. RavFX's comment was posted on January 22, 2018 and I first saw it on March 5, 2019. And the only reason that I found it -at- -all- was that I was looking for projects using RCU, and my search engine showed me that Phoronix comment quite by accident. Your choice, though! ;-) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d408661d5fb7..ed4a6dabf31d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1776,7 +1776,10 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp) static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - cpulist_parse(str, rcu_nocb_mask); + if (!strcasecmp(str, "all")) + cpumask_setall(rcu_nocb_mask); + else + cpulist_parse(str, rcu_nocb_mask); return 1; } __setup("rcu_nocbs=", rcu_nocb_setup); -- cgit v1.2.3 From 497e42600b69aca1b799c840d2cfc7ad60bb8017 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Mar 2019 14:47:56 -0800 Subject: rcu: Report error for bad rcu_nocbs= parameter values This commit prints a console message when cpulist_parse() reports a bad list of CPUs, and sets all CPUs' bits in that case. The reason for setting all CPUs' bits is that this is the safe(r) choice for real-time workloads, which would normally be the ones using the rcu_nocbs= kernel boot parameter. Either way, later RCU console log messages list the actual set of CPUs whose RCU callbacks will be offloaded. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ed4a6dabf31d..f0aeb7416dcc 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1772,14 +1772,22 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp) */ -/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ +/* + * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. + * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a + * comma-separated list of CPUs and/or CPU ranges. If an invalid list is + * given, a warning is emitted and all CPUs are offloaded. + */ static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); if (!strcasecmp(str, "all")) cpumask_setall(rcu_nocb_mask); else - cpulist_parse(str, rcu_nocb_mask); + if (cpulist_parse(str, rcu_nocb_mask)) { + pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); + cpumask_setall(rcu_nocb_mask); + } return 1; } __setup("rcu_nocbs=", rcu_nocb_setup); -- cgit v1.2.3 From 0f58d2ac2c87d27006c2b610668ebc4ff1f7c3ba Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Fri, 8 Mar 2019 15:16:18 +0530 Subject: rcu: Fix self-wakeups for grace-period kthread The current rcu_gp_kthread_wake() function uses in_interrupt() and thus does a self-wakeup from all interrupt contexts, including the pointless case where the GP kthread happens to be running with bottom halves disabled, along with the impossible case where the GP kthread is running within an NMI handler (you are not supposed to invoke rcu_gp_kthread_wake() from within an NMI handler. This commit therefore replaces the in_interrupt() with in_irq(), so that the self-wakeups happen only from handlers for hardware interrupts and softirqs. This also makes the code match the comment. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt (VMware) --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5aefd36ac648..139fa1f5c537 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1585,7 +1585,7 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) static void rcu_gp_kthread_wake(void) { if ((current == rcu_state.gp_kthread && - !in_interrupt() && !in_serving_softirq()) || + !in_irq() && !in_serving_softirq()) || !READ_ONCE(rcu_state.gp_flags) || !rcu_state.gp_kthread) return; -- cgit v1.2.3 From 6973032a602ee678c98644a30d57ebf9c72dd6d3 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 11 Mar 2019 15:16:11 +0530 Subject: rcu: Default jiffies_to_sched_qs to jiffies_till_sched_qs The current code only calls adjust_jiffies_till_sched_qs() if jiffies_till_sched_qs is left at its default value, so when the jiffies_till_sched_qs kernel-boot parameter actually is specified, jiffies_to_sched_qs will be left with the value zero, which will result in useless slowdowns of cond_resched(). This commit therefore changes rcu_init_geometry() to unconditionally invoke adjust_jiffies_till_sched_qs(), which ensures that jiffies_to_sched_qs will be initialized in all cases, thus maintaining good cond_resched() performance. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 139fa1f5c537..466299c3d2da 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3739,8 +3739,7 @@ static void __init rcu_init_geometry(void) jiffies_till_first_fqs = d; if (jiffies_till_next_fqs == ULONG_MAX) jiffies_till_next_fqs = d; - if (jiffies_till_sched_qs == ULONG_MAX) - adjust_jiffies_till_sched_qs(); + adjust_jiffies_till_sched_qs(); /* If the compile-time values are accurate, just leave. */ if (rcu_fanout_leaf == RCU_FANOUT_LEAF && -- cgit v1.2.3 From 85f2b60c4321b088ba08ec9a05b8a7b68e4ada2a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Mar 2019 15:45:13 -0700 Subject: rcu: Update jiffies_to_sched_qs and adjust_jiffies_till_sched_qs() comments This commit better documents the jiffies_to_sched_qs default-value strategy used by adjust_jiffies_till_sched_qs() Reported-by: Joel Fernandes Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 466299c3d2da..e117732bcd5d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -406,7 +406,7 @@ static bool rcu_kick_kthreads; */ static ulong jiffies_till_sched_qs = ULONG_MAX; module_param(jiffies_till_sched_qs, ulong, 0444); -static ulong jiffies_to_sched_qs; /* Adj