From 5a17f040fa332e71a45ca9ff02d6979d9176a423 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 26 Oct 2022 16:31:11 -0700 Subject: cred: Do not default to init_cred in prepare_kernel_cred() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A common exploit pattern for ROP attacks is to abuse prepare_kernel_cred() in order to construct escalated privileges[1]. Instead of providing a short-hand argument (NULL) to the "daemon" argument to indicate using init_cred as the base cred, require that "daemon" is always set to an actual task. Replace all existing callers that were passing NULL with &init_task. Future attacks will need to have sufficiently powerful read/write primitives to have found an appropriately privileged task and written it to the ROP stack as an argument to succeed, which is similarly difficult to the prior effort needed to escalate privileges before struct cred existed: locate the current cred and overwrite the uid member. This has the added benefit of meaning that prepare_kernel_cred() can no longer exceed the privileges of the init task, which may have changed from the original init_cred (e.g. dropping capabilities from the bounding set). [1] https://google.com/search?q=commit_creds(prepare_kernel_cred(0)) Cc: "Eric W. Biederman" Cc: David Howells Cc: "Rafael J. Wysocki" Cc: Steve French Cc: Ronnie Sahlberg Cc: Shyam Prasad N Cc: Tom Talpey Cc: Namjae Jeon Cc: Trond Myklebust Cc: Anna Schumaker Cc: Chuck Lever Cc: Jeff Layton Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: "Michal Koutný" Cc: Peter Zijlstra Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Cc: linux-nfs@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Acked-by: Luis Chamberlain Reviewed-by: Sergey Senozhatsky Acked-by: Russ Weight Acked-by: Greg Kroah-Hartman Acked-by: Paulo Alcantara (SUSE) Link: https://lore.kernel.org/r/20221026232943.never.775-kees@kernel.org --- kernel/cred.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index e10c15f51c1f..811ad654abd1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -701,9 +701,9 @@ void __init cred_init(void) * override a task's own credentials so that work can be done on behalf of that * task that requires a different subjective context. * - * @daemon is used to provide a base for the security record, but can be NULL. - * If @daemon is supplied, then the security data will be derived from that; - * otherwise they'll be set to 0 and no groups, full capabilities and no keys. + * @daemon is used to provide a base cred, with the security data derived from + * that; if this is "&init_task", they'll be set to 0, no groups, full + * capabilities, and no keys. * * The caller may change these controls afterwards if desired. * @@ -714,17 +714,16 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) const struct cred *old; struct cred *new; + if (WARN_ON_ONCE(!daemon)) + return NULL; + new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_kernel_cred() alloc %p", new); - if (daemon) - old = get_task_cred(daemon); - else - old = get_cred(&init_cred); - + old = get_task_cred(daemon); validate_creds(old); *new = *old; -- cgit v1.2.3 From 9360d035a579d95d1e76c471061b9065b18a0eb1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 15:43:21 -0800 Subject: panic: Separate sysctl logic from CONFIG_SMP In preparation for adding more sysctls directly in kernel/panic.c, split CONFIG_SMP from the logic that adds sysctls. Cc: Petr Mladek Cc: Andrew Morton Cc: tangmeng Cc: "Guilherme G. Piccoli" Cc: Tiezhu Yang Cc: Sebastian Andrzej Siewior Reviewed-by: Luis Chamberlain Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221117234328.594699-1-keescook@chromium.org --- kernel/panic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index da323209f583..d843d036651e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -75,8 +75,9 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); -#if defined(CONFIG_SMP) && defined(CONFIG_SYSCTL) +#ifdef CONFIG_SYSCTL static struct ctl_table kern_panic_table[] = { +#ifdef CONFIG_SMP { .procname = "oops_all_cpu_backtrace", .data = &sysctl_oops_all_cpu_backtrace, @@ -86,6 +87,7 @@ static struct ctl_table kern_panic_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, +#endif { } }; -- cgit v1.2.3 From d4ccd54d28d3c8598e2354acc13e28c060961dbb Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 17 Nov 2022 15:43:22 -0800 Subject: exit: Put an upper limit on how often we can oops Many Linux systems are configured to not panic on oops; but allowing an attacker to oops the system **really** often can make even bugs that look completely unexploitable exploitable (like NULL dereferences and such) if each crash elevates a refcount by one or a lock is taken in read mode, and this causes a counter to eventually overflow. The most interesting counters for this are 32 bits wide (like open-coded refcounts that don't use refcount_t). (The ldsem reader count on 32-bit platforms is just 16 bits, but probably nobody cares about 32-bit platforms that much nowadays.) So let's panic the system if the kernel is constantly oopsing. The speed of oopsing 2^32 times probably depends on several factors, like how long the stack trace is and which unwinder you're using; an empirically important one is whether your console is showing a graphical environment or a text console that oopses will be printed to. In a quick single-threaded benchmark, it looks like oopsing in a vfork() child with a very short stack trace only takes ~510 microseconds per run when a graphical console is active; but switching to a text console that oopses are printed to slows it down around 87x, to ~45 milliseconds per run. (Adding more threads makes this faster, but the actual oops printing happens under &die_lock on x86, so you can maybe speed this up by a factor of around 2 and then any further improvement gets eaten up by lock contention.) It looks like it would take around 8-12 days to overflow a 32-bit counter with repeated oopsing on a multi-core X86 system running a graphical environment; both me (in an X86 VM) and Seth (with a distro kernel on normal hardware in a standard configuration) got numbers in that ballpark. 12 days aren't *that* short on a desktop system, and you'd likely need much longer on a typical server system (assuming that people don't run graphical desktop environments on their servers), and this is a *very* noisy and violent approach to exploiting the kernel; and it also seems to take orders of magnitude longer on some machines, probably because stuff like EFI pstore will slow it down a ton if that's active. Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20221107201317.324457-1-jannh@google.com Reviewed-by: Luis Chamberlain Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221117234328.594699-2-keescook@chromium.org --- kernel/exit.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 35e0a31a0315..2ab3ead62118 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,6 +72,33 @@ #include #include +/* + * The default value should be high enough to not crash a system that randomly + * crashes its kernel from time to time, but low enough to at least not permit + * overflowing 32-bit refcounts or the ldsem writer count. + */ +static unsigned int oops_limit = 10000; + +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_exit_table[] = { + { + .procname = "oops_limit", + .data = &oops_limit, + .maxlen = sizeof(oops_limit), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + { } +}; + +static __init int kernel_exit_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_exit_table); + return 0; +} +late_initcall(kernel_exit_sysctls_init); +#endif + static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; @@ -874,6 +901,8 @@ void __noreturn do_exit(long code) void __noreturn make_task_dead(int signr) { + static atomic_t oops_count = ATOMIC_INIT(0); + /* * Take the task off the cpu after something catastrophic has * happened. @@ -897,6 +926,19 @@ void __noreturn make_task_dead(int signr) preempt_count_set(PREEMPT_ENABLED); } + /* + * Every time the system oopses, if the oops happens while a reference + * to an object was held, the reference leaks. + * If the oops doesn't also leak memory, repeated oopsing can cause + * reference counters to wrap around (if they're not using refcount_t). + * This means that repeated oopsing can make unexploitable-looking bugs + * exploitable through repeated oopsing. + * To make sure this can't happen, place an upper bound on how often the + * kernel may oops without panic(). + */ + if (atomic_inc_return(&oops_count) >= READ_ONCE(oops_limit)) + panic("Oopsed too often (kernel.oops_limit is %d)", oops_limit); + /* * We're taking recursive faults here in make_task_dead. Safest is to just * leave this task alone and wait for reboot. -- cgit v1.2.3 From 9db89b41117024f80b38b15954017fb293133364 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 15:43:23 -0800 Subject: exit: Expose "oops_count" to sysfs Since Oops count is now tracked and is a fairly interesting signal, add the entry /sys/kernel/oops_count to expose it to userspace. Cc: "Eric W. Biederman" Cc: Jann Horn Cc: Arnd Bergmann Reviewed-by: Luis Chamberlain Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221117234328.594699-3-keescook@chromium.org --- kernel/exit.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2ab3ead62118..dc1a32149f94 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include @@ -99,6 +100,25 @@ static __init int kernel_exit_sysctls_init(void) late_initcall(kernel_exit_sysctls_init); #endif +static atomic_t oops_count = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); +} + +static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); + +static __init int kernel_exit_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); + return 0; +} +late_initcall(kernel_exit_sysfs_init); +#endif + static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; @@ -901,8 +921,6 @@ void __noreturn do_exit(long code) void __noreturn make_task_dead(int signr) { - static atomic_t oops_count = ATOMIC_INIT(0); - /* * Take the task off the cpu after something catastrophic has * happened. -- cgit v1.2.3 From de92f65719cd672f4b48397540b9f9eff67eca40 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Dec 2022 12:59:11 -0800 Subject: exit: Allow oops_limit to be disabled In preparation for keeping oops_limit logic in sync with warn_limit, have oops_limit == 0 disable checking the Oops counter. Cc: Jann Horn Cc: Jonathan Corbet Cc: Andrew Morton Cc: Baolin Wang Cc: "Jason A. Donenfeld" Cc: Eric Biggers Cc: Huang Ying Cc: "Eric W. Biederman" Cc: Arnd Bergmann Cc: linux-doc@vger.kernel.org Signed-off-by: Kees Cook --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index dc1a32149f94..deffb8e4b1b2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -954,7 +954,7 @@ void __noreturn make_task_dead(int signr) * To make sure this can't happen, place an upper bound on how often the * kernel may oops without panic(). */ - if (atomic_inc_return(&oops_count) >= READ_ONCE(oops_limit)) + if (atomic_inc_return(&oops_count) >= READ_ONCE(oops_limit) && oops_limit) panic("Oopsed too often (kernel.oops_limit is %d)", oops_limit); /* -- cgit v1.2.3 From 79cc1ba7badf9e7a12af99695a557e9ce27ee967 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 15:43:24 -0800 Subject: panic: Consolidate open-coded panic_on_warn checks Several run-time checkers (KASAN, UBSAN, KFENCE, KCSAN, sched) roll their own warnings, and each check "panic_on_warn". Consolidate this into a single function so that future instrumentation can be added in a single location. Cc: Marco Elver Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Daniel Bristot de Oliveira Cc: Valentin Schneider Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Vincenzo Frascino Cc: Andrew Morton Cc: David Gow Cc: tangmeng Cc: Jann Horn Cc: Shuah Khan Cc: Petr Mladek Cc: "Paul E. McKenney" Cc: Sebastian Andrzej Siewior Cc: "Guilherme G. Piccoli" Cc: Tiezhu Yang Cc: kasan-dev@googlegroups.com Cc: linux-mm@kvack.org Reviewed-by: Luis Chamberlain Signed-off-by: Kees Cook Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Link: https://lore.kernel.org/r/20221117234328.594699-4-keescook@chromium.org --- kernel/kcsan/report.c | 3 +-- kernel/panic.c | 9 +++++++-- kernel/sched/core.c | 3 +-- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 67794404042a..e95ce7d7a76e 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -492,8 +492,7 @@ static void print_report(enum kcsan_value_change value_change, dump_stack_print_info(KERN_DEFAULT); pr_err("==================================================================\n"); - if (panic_on_warn) - panic("panic_on_warn set ...\n"); + check_panic_on_warn("KCSAN"); } static void release_report(unsigned long *flags, struct other_info *other_info) diff --git a/kernel/panic.c b/kernel/panic.c index d843d036651e..cfa354322d5f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -201,6 +201,12 @@ static void panic_print_sys_info(bool console_flush) ftrace_dump(DUMP_ALL); } +void check_panic_on_warn(const char *origin) +{ + if (panic_on_warn) + panic("%s: panic_on_warn set ...\n", origin); +} + /** * panic - halt the system * @fmt: The text string to print @@ -619,8 +625,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, if (regs) show_regs(regs); - if (panic_on_warn) - panic("panic_on_warn set ...\n"); + check_panic_on_warn("kernel"); if (!regs) dump_stack(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5800b0623ff3..285ef8821b4f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5729,8 +5729,7 @@ static noinline void __schedule_bug(struct task_struct *prev) pr_err("Preemption disabled at:"); print_ip_sym(KERN_ERR, preempt_disable_ip); } - if (panic_on_warn) - panic("scheduling while atomic\n"); + check_panic_on_warn("scheduling while atomic"); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -- cgit v1.2.3 From 9fc9e278a5c0b708eeffaf47d6eb0c82aa74ed78 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 15:43:25 -0800 Subject: panic: Introduce warn_limit Like oops_limit, add warn_limit for limiting the number of warnings when panic_on_warn is not set. Cc: Jonathan Corbet Cc: Andrew Morton Cc: Baolin Wang Cc: "Jason A. Donenfeld" Cc: Eric Biggers Cc: Huang Ying Cc: Petr Mladek Cc: tangmeng Cc: "Guilherme G. Piccoli" Cc: Tiezhu Yang Cc: Sebastian Andrzej Siewior Cc: linux-doc@vger.kernel.org Reviewed-by: Luis Chamberlain Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221117234328.594699-5-keescook@chromium.org --- kernel/panic.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index cfa354322d5f..f4403fc14f67 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -58,6 +58,7 @@ bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; unsigned long panic_on_taint; bool panic_on_taint_nousertaint = false; +static unsigned int warn_limit __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -88,6 +89,13 @@ static struct ctl_table kern_panic_table[] = { .extra2 = SYSCTL_ONE, }, #endif + { + .procname = "warn_limit", + .data = &warn_limit, + .maxlen = sizeof(warn_limit), + .mode = 0644, + .proc_handler = proc_douintvec, + }, { } }; @@ -203,8 +211,14 @@ static void panic_print_sys_info(bool console_flush) void check_panic_on_warn(const char *origin) { + static atomic_t warn_count = ATOMIC_INIT(0); + if (panic_on_warn) panic("%s: panic_on_warn set ...\n", origin); + + if (atomic_inc_return(&warn_count) >= READ_ONCE(warn_limit) && warn_limit) + panic("%s: system warned too often (kernel.warn_limit is %d)", + origin, warn_limit); } /** -- cgit v1.2.3 From 8b05aa26336113c4cea25f1c333ee8cd4fc212a6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 15:43:26 -0800 Subject: panic: Expose "warn_count" to sysfs Since Warn count is now tracked and is a fairly interesting signal, add the entry /sys/kernel/warn_count to expose it to userspace. Cc: Petr Mladek Cc: Andrew Morton Cc: tangmeng Cc: "Guilherme G. Piccoli" Cc: Sebastian Andrzej Siewior Cc: Tiezhu Yang Reviewed-by: Luis Chamberlain Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221117234328.594699-6-keescook@chromium.org --- kernel/panic.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index f4403fc14f67..54deb743b2d5 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -107,6 +108,25 @@ static __init int kernel_panic_sysctls_init(void) late_initcall(kernel_panic_sysctls_init); #endif +static atomic_t warn_count = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%d\n", atomic_read(&warn_count)); +} + +static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count); + +static __init int kernel_panic_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL); + return 0; +} +late_initcall(kernel_panic_sysfs_init); +#endif + static long no_blink(int state) { return 0; @@ -211,8 +231,6 @@ static void panic_print_sys_info(bool console_flush) void check_panic_on_warn(const char *origin) { - static atomic_t warn_count = ATOMIC_INIT(0); - if (panic_on_warn) panic("%s: panic_on_warn set ...\n", origin); -- cgit v1.2.3 From 3a017d6355f24de42f2ad688df9fa19e0cb128f2 Mon Sep 17 00:00:00 2001 From: "haifeng.xu" Date: Mon, 28 Nov 2022 06:56:06 +0000 Subject: signal: Initialize the info in ksignal When handing the SIGNAL_GROUP_EXIT flag, the info in ksignal isn't cleared. However, the info acquired by dequeue_synchronous_signal/dequeue_signal is initialized and can be safely used. Fortunately, the fatal signal process just uses the si_signo and doesn't use any other member. Even so, the initialization before use is more safer. Signed-off-by: haifeng.xu Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221128065606.19570-1-haifeng.xu@shopee.com --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d140672185a4..b9b0c8c620e7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2693,6 +2693,7 @@ relock: /* Has this task already been marked for death? */ if ((signal->flags & SIGNAL_GROUP_EXIT) || signal->group_exec_task) { + clear_siginfo(&ksig->info); ksig->info.si_signo = signr = SIGKILL; sigdelset(¤t->pending.signal, SIGKILL); trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, -- cgit v1.2.3