diff options
Diffstat (limited to 'kernel/module')
| -rw-r--r-- | kernel/module/Kconfig | 100 | ||||
| -rw-r--r-- | kernel/module/Makefile | 6 | ||||
| -rw-r--r-- | kernel/module/decompress.c | 6 | ||||
| -rw-r--r-- | kernel/module/dups.c | 246 | ||||
| -rw-r--r-- | kernel/module/internal.h | 140 | ||||
| -rw-r--r-- | kernel/module/kallsyms.c | 78 | ||||
| -rw-r--r-- | kernel/module/kdb.c | 17 | ||||
| -rw-r--r-- | kernel/module/kmod.c | 180 | ||||
| -rw-r--r-- | kernel/module/main.c | 1091 | ||||
| -rw-r--r-- | kernel/module/procfs.c | 16 | ||||
| -rw-r--r-- | kernel/module/stats.c | 430 | ||||
| -rw-r--r-- | kernel/module/strict_rwx.c | 99 | ||||
| -rw-r--r-- | kernel/module/tracking.c | 7 | ||||
| -rw-r--r-- | kernel/module/tree_lookup.c | 39 |
14 files changed, 1786 insertions, 669 deletions
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 424b3bc58f3f..33a2e991f608 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -22,6 +22,104 @@ menuconfig MODULES if MODULES +config MODULE_DEBUGFS + bool + +config MODULE_DEBUG + bool "Module debugging" + depends on DEBUG_FS + help + Allows you to enable / disable features which can help you debug + modules. You don't need these options on production systems. + +if MODULE_DEBUG + +config MODULE_STATS + bool "Module statistics" + depends on DEBUG_FS + select MODULE_DEBUGFS + help + This option allows you to maintain a record of module statistics. + For example, size of all modules, average size, text size, a list + of failed modules and the size for each of those. For failed + modules we keep track of modules which failed due to either the + existing module taking too long to load or that module was already + loaded. + + You should enable this if you are debugging production loads + and want to see if userspace or the kernel is doing stupid things + with loading modules when it shouldn't or if you want to help + optimize userspace / kernel space module autoloading schemes. + You might want to do this because failed modules tend to use + up significant amount of memory, and so you'd be doing everyone a + favor in avoiding these failures proactively. + + This functionality is also useful for those experimenting with + module .text ELF section optimization. + + If unsure, say N. + +config MODULE_DEBUG_AUTOLOAD_DUPS + bool "Debug duplicate modules with auto-loading" + help + Module autoloading allows in-kernel code to request modules through + the *request_module*() API calls. This in turn just calls userspace + modprobe. Although modprobe checks to see if a module is already + loaded before trying to load a module there is a small time window in + which multiple duplicate requests can end up in userspace and multiple + modprobe calls race calling finit_module() around the same time for + duplicate modules. The finit_module() system call can consume in the + worst case more than twice the respective module size in virtual + memory for each duplicate module requests. Although duplicate module + requests are non-fatal virtual memory is a limited resource and each + duplicate module request ends up just unnecessarily straining virtual + memory. + + This debugging facility will create pr_warn() splats for duplicate + module requests to help identify if module auto-loading may be the + culprit to your early boot virtual memory pressure. Since virtual + memory abuse caused by duplicate module requests could render a + system unusable this functionality will also converge races in + requests for the same module to a single request. You can boot with + the module.enable_dups_trace=1 kernel parameter to use WARN_ON() + instead of the pr_warn(). + + If the first module request used request_module_nowait() we cannot + use that as the anchor to wait for duplicate module requests, since + users of request_module() do want a proper return value. If a call + for the same module happened earlier with request_module() though, + then a duplicate request_module_nowait() would be detected. The + non-wait request_module() call is synchronous and waits until modprobe + completes. Subsequent auto-loading requests for the same module do + not trigger a new finit_module() calls and do not strain virtual + memory, and so as soon as modprobe successfully completes we remove + tracking for duplicates for that module. + + Enable this functionality to try to debug virtual memory abuse during + boot on systems which are failing to boot or if you suspect you may be + straining virtual memory during boot, and you want to identify if the + abuse was due to module auto-loading. These issues are currently only + known to occur on systems with many CPUs (over 400) and is likely the + result of udev issuing duplicate module requests for each CPU, and so + module auto-loading is not the culprit. There may very well still be + many duplicate module auto-loading requests which could be optimized + for and this debugging facility can be used to help identify them. + + Only enable this for debugging system functionality, never have it + enabled on real systems. + +config MODULE_DEBUG_AUTOLOAD_DUPS_TRACE + bool "Force full stack trace when duplicates are found" + depends on MODULE_DEBUG_AUTOLOAD_DUPS + help + Enabling this will force a full stack trace for duplicate module + auto-loading requests using WARN_ON() instead of pr_warn(). You + should keep this disabled at all times unless you are a developer + and are doing a manual inspection and want to debug exactly why + these duplicates occur. + +endif # MODULE_DEBUG + config MODULE_FORCE_LOAD bool "Forced module loading" default n @@ -51,7 +149,7 @@ config MODULE_FORCE_UNLOAD config MODULE_UNLOAD_TAINT_TRACKING bool "Tainted module unload tracking" depends on MODULE_UNLOAD - default n + select MODULE_DEBUGFS help This option allows you to maintain a record of each unloaded module that tainted the kernel. In addition to displaying a diff --git a/kernel/module/Makefile b/kernel/module/Makefile index 948efea81e85..a10b2b9a6fdf 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -7,7 +7,10 @@ # and produce insane amounts of uninteresting coverage. KCOV_INSTRUMENT_module.o := n -obj-y += main.o strict_rwx.o +obj-y += main.o +obj-y += strict_rwx.o +obj-y += kmod.o +obj-$(CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS) += dups.o obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o obj-$(CONFIG_MODULE_SIG) += signing.o obj-$(CONFIG_LIVEPATCH) += livepatch.o @@ -19,3 +22,4 @@ obj-$(CONFIG_SYSFS) += sysfs.o obj-$(CONFIG_KGDB_KDB) += kdb.o obj-$(CONFIG_MODVERSIONS) += version.o obj-$(CONFIG_MODULE_UNLOAD_TAINT_TRACKING) += tracking.o +obj-$(CONFIG_MODULE_STATS) += stats.o diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c index bb79ac1a6d8f..e97232b125eb 100644 --- a/kernel/module/decompress.c +++ b/kernel/module/decompress.c @@ -267,7 +267,7 @@ static ssize_t module_zstd_decompress(struct load_info *info, zstd_dec.size = PAGE_SIZE; ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf); - kunmap(page); + kunmap_local(zstd_dec.dst); retval = zstd_get_error_code(ret); if (retval) break; @@ -297,6 +297,10 @@ int module_decompress(struct load_info *info, const void *buf, size_t size) ssize_t data_size; int error; +#if defined(CONFIG_MODULE_STATS) + info->compressed_len = size; +#endif + /* * Start with number of pages twice as big as needed for * compressed data. diff --git a/kernel/module/dups.c b/kernel/module/dups.c new file mode 100644 index 000000000000..aa8e1361fdb5 --- /dev/null +++ b/kernel/module/dups.c @@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * kmod dups - the kernel module autoloader duplicate suppressor + * + * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> + */ + +#define pr_fmt(fmt) "module: " fmt + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/binfmts.h> +#include <linux/syscalls.h> +#include <linux/unistd.h> +#include <linux/kmod.h> +#include <linux/slab.h> +#include <linux/completion.h> +#include <linux/cred.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/workqueue.h> +#include <linux/security.h> +#include <linux/mount.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/resource.h> +#include <linux/notifier.h> +#include <linux/suspend.h> +#include <linux/rwsem.h> +#include <linux/ptrace.h> +#include <linux/async.h> +#include <linux/uaccess.h> + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "module." +static bool enable_dups_trace = IS_ENABLED(CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS_TRACE); +module_param(enable_dups_trace, bool_enable_only, 0644); + +/* + * Protects dup_kmod_reqs list, adds / removals with RCU. + */ +static DEFINE_MUTEX(kmod_dup_mutex); +static LIST_HEAD(dup_kmod_reqs); + +struct kmod_dup_req { + struct list_head list; + char name[MODULE_NAME_LEN]; + struct completion first_req_done; + struct work_struct complete_work; + struct delayed_work delete_work; + int dup_ret; +}; + +static struct kmod_dup_req *kmod_dup_request_lookup(char *module_name) +{ + struct kmod_dup_req *kmod_req; + + list_for_each_entry_rcu(kmod_req, &dup_kmod_reqs, list, + lockdep_is_held(&kmod_dup_mutex)) { + if (strlen(kmod_req->name) == strlen(module_name) && + !memcmp(kmod_req->name, module_name, strlen(module_name))) { + return kmod_req; + } + } + + return NULL; +} + +static void kmod_dup_request_delete(struct work_struct *work) +{ + struct kmod_dup_req *kmod_req; + kmod_req = container_of(to_delayed_work(work), struct kmod_dup_req, delete_work); + + /* + * The typical situation is a module successully loaded. In that + * situation the module will be present already in userspace. If + * new requests come in after that, userspace will already know the + * module is loaded so will just return 0 right away. There is still + * a small chance right after we delete this entry new request_module() + * calls may happen after that, they can happen. These heuristics + * are to protect finit_module() abuse for auto-loading, if modules + * are still tryign to auto-load even if a module is already loaded, + * that's on them, and those inneficiencies should not be fixed by + * kmod. The inneficies there are a call to modprobe and modprobe + * just returning 0. + */ + mutex_lock(&kmod_dup_mutex); + list_del_rcu(&kmod_req->list); + synchronize_rcu(); + mutex_unlock(&kmod_dup_mutex); + kfree(kmod_req); +} + +static void kmod_dup_request_complete(struct work_struct *work) +{ + struct kmod_dup_req *kmod_req; + + kmod_req = container_of(work, struct kmod_dup_req, complete_work); + + /* + * This will ensure that the kernel will let all the waiters get + * informed its time to check the return value. It's time to + * go home. + */ + complete_all(&kmod_req->first_req_done); + + /* + * Now that we have allowed prior request_module() calls to go on + * with life, let's schedule deleting this entry. We don't have + * to do it right away, but we *eventually* want to do it so to not + * let this linger forever as this is just a boot optimization for + * possible abuses of vmalloc() incurred by finit_module() thrashing. + */ + queue_delayed_work(system_wq, &kmod_req->delete_work, 60 * HZ); +} + +bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret) +{ + struct kmod_dup_req *kmod_req, *new_kmod_req; + int ret; + + /* + * Pre-allocate the entry in case we have to use it later + * to avoid contention with the mutex. + */ + new_kmod_req = kzalloc(sizeof(*new_kmod_req), GFP_KERNEL); + if (!new_kmod_req) + return false; + + memcpy(new_kmod_req->name, module_name, strlen(module_name)); + INIT_WORK(&new_kmod_req->complete_work, kmod_dup_request_complete); + INIT_DELAYED_WORK(&new_kmod_req->delete_work, kmod_dup_request_delete); + init_completion(&new_kmod_req->first_req_done); + + mutex_lock(&kmod_dup_mutex); + + kmod_req = kmod_dup_request_lookup(module_name); + if (!kmod_req) { + /* + * If the first request that came through for a module + * was with request_module_nowait() we cannot wait for it + * and share its return value with other users which may + * have used request_module() and need a proper return value + * so just skip using them as an anchor. + * + * If a prior request to this one came through with + * request_module() though, then a request_module_nowait() + * would benefit from duplicate detection. + */ + if (!wait) { + kfree(new_kmod_req); + pr_debug("New request_module_nowait() for %s -- cannot track duplicates for this request\n", module_name); + mutex_unlock(&kmod_dup_mutex); + return false; + } + + /* + * There was no duplicate, just add the request so we can + * keep tab on duplicates later. + */ + pr_debug("New request_module() for %s\n", module_name); + list_add_rcu(&new_kmod_req->list, &dup_kmod_reqs); + mutex_unlock(&kmod_dup_mutex); + return false; + } + mutex_unlock(&kmod_dup_mutex); + + /* We are dealing with a duplicate request now */ + kfree(new_kmod_req); + + /* + * To fix these try to use try_then_request_module() instead as that + * will check if the component you are looking for is present or not. + * You could also just queue a single request to load the module once, + * instead of having each and everything you need try to request for + * the module. + * + * Duplicate request_module() calls can cause quite a bit of wasted + * vmalloc() space when racing with userspace. + */ + if (enable_dups_trace) + WARN(1, "module-autoload: duplicate request for module %s\n", module_name); + else + pr_warn("module-autoload: duplicate request for module %s\n", module_name); + + if (!wait) { + /* + * If request_module_nowait() was used then the user just + * wanted to issue the request and if another module request + * was already its way with the same name we don't care for + * the return value either. Let duplicate request_module_nowait() + * calls bail out right away. + */ + *dup_ret = 0; + return true; + } + + /* + * If a duplicate request_module() was used they *may* care for + * the return value, so we have no other option but to wait for + * the first caller to complete. If the first caller used + * the request_module_nowait() call, subsquent callers will + * deal with the comprmise of getting a successful call with this + * optimization enabled ... + */ + ret = wait_for_completion_state(&kmod_req->first_req_done, + TASK_UNINTERRUPTIBLE | TASK_KILLABLE); + if (ret) { + *dup_ret = ret; + return true; + } + + /* Now the duplicate request has the same exact return value as the first request */ + *dup_ret = kmod_req->dup_ret; + + return true; +} + +void kmod_dup_request_announce(char *module_name, int ret) +{ + struct kmod_dup_req *kmod_req; + + mutex_lock(&kmod_dup_mutex); + + kmod_req = kmod_dup_request_lookup(module_name); + if (!kmod_req) + goto out; + + kmod_req->dup_ret = ret; + + /* + * If we complete() here we may allow duplicate threads + * to continue before the first one that submitted the + * request. We're in no rush also, given that each and + * every bounce back to userspace is slow we avoid that + * with a slight delay here. So queueue up the completion + * and let duplicates suffer, just wait a tad bit longer. + * There is no rush. But we also don't want to hold the + * caller up forever or introduce any boot delays. + */ + queue_work(system_wq, &kmod_req->complete_work); + +out: + mutex_unlock(&kmod_dup_mutex); +} diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 1c877561a7d2..dc7b0160c480 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -3,6 +3,7 @@ * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) + * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> */ #include <linux/elf.h> @@ -17,27 +18,19 @@ #define ARCH_SHF_SMALL 0 #endif -/* If this is set, the section belongs in the init part of the module */ -#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG - 1)) -/* Maximum number of characters written by module_flags() */ -#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) - -#ifndef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC -#define data_layout core_layout -#endif - /* - * Modules' sections will be aligned on page boundaries - * to ensure complete separation of code and data, but - * only when CONFIG_STRICT_MODULE_RWX=y + * Use highest 4 bits of sh_entsize to store the mod_mem_type of this + * section. This leaves 28 bits for offset on 32-bit systems, which is + * about 256 MiB (WARN_ON_ONCE if we exceed that). */ -static inline unsigned int strict_align(unsigned int size) -{ - if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) - return PAGE_ALIGN(size); - else - return size; -} + +#define SH_ENTSIZE_TYPE_BITS 4 +#define SH_ENTSIZE_TYPE_SHIFT (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS) +#define SH_ENTSIZE_TYPE_MASK ((1UL << SH_ENTSIZE_TYPE_BITS) - 1) +#define SH_ENTSIZE_OFFSET_MASK ((1UL << (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)) - 1) + +/* Maximum number of characters written by module_flags() */ +#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) extern struct mutex module_mutex; extern struct list_head modules; @@ -53,7 +46,6 @@ extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const s32 __start___kcrctab[]; extern const s32 __start___kcrctab_gpl[]; -#include <linux/dynamic_debug.h> struct load_info { const char *name; /* pointer to module in temporary copy, freed at end of load_module() */ @@ -63,12 +55,14 @@ struct load_info { Elf_Shdr *sechdrs; char *secstrings, *strtab; unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; - struct _ddebug_info dyndbg; bool sig_ok; #ifdef CONFIG_KALLSYMS unsigned long mod_kallsyms_init_off; #endif #ifdef CONFIG_MODULE_DECOMPRESS +#ifdef CONFIG_MODULE_STATS + unsigned long compressed_len; +#endif struct page **pages; unsigned int max_pages; unsigned int used_pages; @@ -101,11 +95,16 @@ int try_to_force_load(struct module *mod, const char *reason); bool find_symbol(struct find_symbol_arg *fsa); struct module *find_module_all(const char *name, size_t len, bool even_unformed); int cmp_name(const void *name, const void *sym); -long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr, - unsigned int section); +long module_get_offset_and_type(struct module *mod, enum mod_mem_type type, + Elf_Shdr *sechdr, unsigned int section); char *module_flags(struct module *mod, char *buf, bool show_state); size_t module_flags_taint(unsigned long taints, char *buf); +char *module_next_tag_pair(char *string, unsigned long *secsize); + +#define for_each_modinfo_entry(entry, info, name) \ + for (entry = get_modinfo(info, name); entry; entry = get_next_modinfo(info, name, entry)) + static inline void module_assert_mutex_or_preempt(void) { #ifdef CONFIG_LOCKDEP @@ -148,6 +147,95 @@ static inline bool set_livepatch_module(struct module *mod) #endif } +/** + * enum fail_dup_mod_reason - state at which a duplicate module was detected + * + * @FAIL_DUP_MOD_BECOMING: the module is read properly, passes all checks but + * we've determined that another module with the same name is already loaded + * or being processed on our &modules list. This happens on early_mod_check() + * right before layout_and_allocate(). The kernel would have already + * vmalloc()'d space for the entire module through finit_module(). If + * decompression was used two vmap() spaces were used. These failures can + * happen when userspace has not seen the module present on the kernel and + * tries to load the module multiple times at same time. + * @FAIL_DUP_MOD_LOAD: the module has been read properly, passes all validation + * checks and the kernel determines that the module was unique and because + * of this allocated yet another private kernel copy of the module space in + * layout_and_allocate() but after this determined in add_unformed_module() + * that another module with the same name is already loaded or being processed. + * These failures should be mitigated as much as possible and are indicative + * of really fast races in loading modules. Without module decompression + * they waste twice as much vmap space. With module decompression three + * times the module's size vmap space is wasted. + */ +enum fail_dup_mod_reason { + FAIL_DUP_MOD_BECOMING = 0, + FAIL_DUP_MOD_LOAD, +}; + +#ifdef CONFIG_MODULE_DEBUGFS +extern struct dentry *mod_debugfs_root; +#endif + +#ifdef CONFIG_MODULE_STATS + +#define mod_stat_add_long(count, var) atomic_long_add(count, var) +#define mod_stat_inc(name) atomic_inc(name) + +extern atomic_long_t total_mod_size; +extern atomic_long_t total_text_size; +extern atomic_long_t invalid_kread_bytes; +extern atomic_long_t invalid_decompress_bytes; + +extern atomic_t modcount; +extern atomic_t failed_kreads; +extern atomic_t failed_decompress; +struct mod_fail_load { + struct list_head list; + char name[MODULE_NAME_LEN]; + atomic_long_t count; + unsigned long dup_fail_mask; +}; + +int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason); +void mod_stat_bump_invalid(struct load_info *info, int flags); +void mod_stat_bump_becoming(struct load_info *info, int flags); + +#else + +#define mod_stat_add_long(name, var) +#define mod_stat_inc(name) + +static inline int try_add_failed_module(const char *name, + enum fail_dup_mod_reason reason) +{ + return 0; +} + +static inline void mod_stat_bump_invalid(struct load_info *info, int flags) +{ +} + +static inline void mod_stat_bump_becoming(struct load_info *info, int flags) +{ +} + +#endif /* CONFIG_MODULE_STATS */ + +#ifdef CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS +bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret); +void kmod_dup_request_announce(char *module_name, int ret); +#else +static inline bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret) +{ + return false; +} + +static inline void kmod_dup_request_announce(char *module_name, int ret) +{ +} +#endif + #ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING struct mod_unload_taint { struct list_head list; @@ -190,10 +278,13 @@ struct mod_tree_root { #endif unsigned long addr_min; unsigned long addr_max; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + unsigned long data_addr_min; + unsigned long data_addr_max; +#endif }; extern struct mod_tree_root mod_tree; -extern struct mod_tree_root mod_data_tree; #ifdef CONFIG_MODULES_TREE_LOOKUP void mod_tree_insert(struct module *mod); @@ -224,7 +315,6 @@ void module_enable_nx(const struct module *mod); void module_enable_x(const struct module *mod); int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod); -bool module_check_misalignment(const struct module *mod); #ifdef CONFIG_MODULE_SIG int module_sig_check(struct load_info *info, int flags); diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index bdc911dbcde5..c550d7d45f2f 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -6,6 +6,7 @@ */ #include <linux/module.h> +#include <linux/module_symbol.h> #include <linux/kallsyms.h> #include <linux/buildid.h> #include <linux/bsearch.h> @@ -78,6 +79,7 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, unsigned int shnum, unsigned int pcpundx) { const Elf_Shdr *sec; + enum mod_mem_type type; if (src->st_shndx == SHN_UNDEF || src->st_shndx >= shnum || @@ -90,11 +92,12 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, #endif sec = sechdrs + src->st_shndx; + type = sec->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; if (!(sec->sh_flags & SHF_ALLOC) #ifndef CONFIG_KALLSYMS_ALL || !(sec->sh_flags & SHF_EXECINSTR) #endif - || (sec->sh_entsize & INIT_OFFSET_MASK)) + || mod_mem_type_is_init(type)) return false; return true; @@ -113,11 +116,13 @@ void layout_symtab(struct module *mod, struct load_info *info) Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; unsigned int i, nsrc, ndst, strtab_size = 0; + struct module_memory *mod_mem_data = &mod->mem[MOD_DATA]; + struct module_memory *mod_mem_init_data = &mod->mem[MOD_INIT_DATA]; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; - symsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, symsect, - info->index.sym) | INIT_OFFSET_MASK; + symsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA, + symsect, info->index.sym); pr_debug("\t%s\n", info->secstrings + symsect->sh_name); src = (void *)info->hdr + symsect->sh_offset; @@ -134,28 +139,27 @@ void layout_symtab(struct module *mod, struct load_info *info) } /* Append room for core symbols at end of core part. */ - info->symoffs = ALIGN(mod->data_layout.size, symsect->sh_addralign ?: 1); - info->stroffs = mod->data_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); - mod->data_layout.size += strtab_size; + info->symoffs = ALIGN(mod_mem_data->size, symsect->sh_addralign ?: 1); + info->stroffs = mod_mem_data->size = info->symoffs + ndst * sizeof(Elf_Sym); + mod_mem_data->size += strtab_size; /* Note add_kallsyms() computes strtab_size as core_typeoffs - stroffs */ - info->core_typeoffs = mod->data_layout.size; - mod->data_layout.size += ndst * sizeof(char); - mod->data_layout.size = strict_align(mod->data_layout.size); + info->core_typeoffs = mod_mem_data->size; + mod_mem_data->size += ndst * sizeof(char); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; - strsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, strsect, - info->index.str) | INIT_OFFSET_MASK; + strsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA, + strsect, info->index.str); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); /* We'll tack temporary mod_kallsyms on the end. */ - mod->init_layout.size = ALIGN(mod->init_layout.size, - __alignof__(struct mod_kallsyms)); - info->mod_kallsyms_init_off = mod->init_layout.size; - mod->init_layout.size += sizeof(struct mod_kallsyms); - info->init_typeoffs = mod->init_layout.size; - mod->init_layout.size += nsrc * sizeof(char); - mod->init_layout.size = strict_align(mod->init_layout.size); + mod_mem_init_data->size = ALIGN(mod_mem_init_data->size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod_mem_init_data->size; + + mod_mem_init_data->size += sizeof(struct mod_kallsyms); + info->init_typeoffs = mod_mem_init_data->size; + mod_mem_init_data->size += nsrc * sizeof(char); } /* @@ -171,9 +175,11 @@ void add_kallsyms(struct module *mod, const struct load_info *info) char *s; Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; unsigned long strtab_size; + void *data_base = mod->mem[MOD_DATA].base; + void *init_data_base = mod->mem[MOD_INIT_DATA].base; /* Set up to point into init section. */ - mod->kallsyms = (void __rcu *)mod->init_layout.base + + mod->kallsyms = (void __rcu *)init_data_base + info->mod_kallsyms_init_off; rcu_read_lock(); @@ -183,15 +189,15 @@ void add_kallsyms(struct module *mod, const struct load_info *info) /* Make sure we get permanent strtab: don't use info->strtab. */ rcu_dereference(mod->kallsyms)->strtab = (void *)info->sechdrs[info->index.str].sh_addr; - rcu_dereference(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs; + rcu_dereference(mod->kallsyms)->typetab = init_data_base + info->init_typeoffs; /* * Now populate the cut down core kallsyms for after init * and set types up while we still have access to sections. */ - mod->core_kallsyms.symtab = dst = mod->data_layout.base + info->symoffs; - mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs; - mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs; + mod->core_kallsyms.symtab = dst = data_base + info->symoffs; + mod->core_kallsyms.strtab = s = data_base + info->stroffs; + mod->core_kallsyms.typetab = data_base + info->core_typeoffs; strtab_size = info->core_typeoffs - info->stroffs; src = rcu_dereference(mod->kallsyms)->symtab; for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) { @@ -238,18 +244,6 @@ void init_build_id(struct module *mod, const struct load_info *info) } #endif -/* - * This ignores the intensely annoying "mapping symbols" found - * in ARM ELF files: $a, $t and $d. - */ -static inline int is_arm_mapping_symbol(const char *str) -{ - if (str[0] == '.' && str[1] == 'L') - return true; - return str[0] == '$' && strchr("axtd", str[1]) && - (str[2] == '\0' || str[2] == '.'); -} - static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum) { return kallsyms->strtab + kallsyms->symtab[symnum].st_name; @@ -267,12 +261,15 @@ static const char *find_kallsyms_symbol(struct module *mod, unsigned int i, best = 0; unsigned long nextval, bestval; struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + struct module_memory *mod_mem; /* At worse, next value is at end of module */ if (within_module_init(addr, mod)) - nextval = (unsigned long)mod->init_layout.base + mod->init_layout.text_size; + mod_mem = &mod->mem[MOD_INIT_TEXT]; else - nextval = (unsigned long)mod->core_layout.base + mod->core_layout.text_size; + mod_mem = &mod->mem[MOD_TEXT]; + + nextval = (unsigned long)mod_mem->base + mod_mem->size; bestval = kallsyms_symbol_value(&kallsyms->symtab[best]); @@ -292,7 +289,7 @@ static const char *find_kallsyms_symbol(struct module *mod, * and inserted at a whim. */ if (*kallsyms_symbol_name(kallsyms, i) == '\0' || - is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) + is_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) continue; if (thisval <= addr && thisval > bestval) { @@ -505,8 +502,7 @@ unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) } int module_kallsyms_on_each_symbol(const char *modname, - int (*fn)(void *, const char *, - struct module *, unsigned long), + int (*fn)(void *, const char *, unsigned long), void *data) { struct module *mod; @@ -535,7 +531,7 @@ int module_kallsyms_on_each_symbol(const char *modname, continue; ret = fn(data, kallsyms_symbol_name(kallsyms, i), - mod, kallsyms_symbol_value(sym)); + kallsyms_symbol_value(sym)); if (ret != 0) goto out; } diff --git a/kernel/module/kdb.c b/kernel/module/kdb.c index f4317f92e189..995c32d3698f 100644 --- a/kernel/module/kdb.c +++ b/kernel/module/kdb.c @@ -26,10 +26,11 @@ int kdb_lsmod(int argc, const char **argv) if (mod->state == MODULE_STATE_UNFORMED) continue; - kdb_printf("%-20s%8u", mod->name, mod->core_layout.size); -#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC - kdb_printf("/%8u", mod->data_layout.size); -#endif + kdb_printf("%-20s%8u", mod->name, mod->mem[MOD_TEXT].size); + kdb_printf("/%8u", mod->mem[MOD_RODATA].size); + kdb_printf("/%8u", mod->mem[MOD_RO_AFTER_INIT].size); + kdb_printf("/%8u", mod->mem[MOD_DATA].size); + kdb_printf(" 0x%px ", (void *)mod); #ifdef CONFIG_MODULE_UNLOAD kdb_printf("%4d ", module_refcount(mod)); @@ -40,10 +41,10 @@ int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); - kdb_printf(" 0x%px", mod->core_layout.base); -#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC - kdb_printf("/0x%px", mod->data_layout.base); -#endif + kdb_printf(" 0x%px", mod->mem[MOD_TEXT].base); + kdb_printf("/0x%px", mod->mem[MOD_RODATA].base); + kdb_printf("/0x%px", mod->mem[MOD_RO_AFTER_INIT].base); + kdb_printf("/0x%px", mod->mem[MOD_DATA].base); #ifdef CONFIG_MODULE_UNLOAD { diff --git a/kernel/module/kmod.c b/kernel/module/kmod.c new file mode 100644 index 000000000000..0800d9891692 --- /dev/null +++ b/kernel/module/kmod.c @@ -0,0 +1,180 @@ +/* + * kmod - the kernel module loader + * + * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/binfmts.h> +#include <linux/syscalls.h> +#include <linux/unistd.h> +#include <linux/kmod.h> +#include <linux/slab.h> +#include <linux/completion.h> +#include <linux/cred.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/workqueue.h> +#include <linux/security.h> +#include <linux/mount.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/resource.h> +#include <linux/notifier.h> +#include <linux/suspend.h> +#include <linux/rwsem.h> +#include <linux/ptrace.h> +#include <linux/async.h> +#include <linux/uaccess.h> + +#include <trace/events/module.h> +#include "internal.h" + +/* |
