diff options
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/Kconfig.debug | 25 | ||||
| -rw-r--r-- | lib/Kconfig.kcsan | 2 | ||||
| -rw-r--r-- | lib/Makefile | 4 | ||||
| -rw-r--r-- | lib/bug.c | 15 | ||||
| -rw-r--r-- | lib/cpumask.c | 52 | ||||
| -rw-r--r-- | lib/crypto/blake2s-selftest.c | 25 | ||||
| -rw-r--r-- | lib/dec_and_lock.c | 31 | ||||
| -rw-r--r-- | lib/find_bit.c | 9 | ||||
| -rw-r--r-- | lib/group_cpus.c | 428 | ||||
| -rw-r--r-- | lib/iov_iter.c | 299 | ||||
| -rw-r--r-- | lib/kunit/assert.c | 40 | ||||
| -rw-r--r-- | lib/kunit/string-stream.c | 4 | ||||
| -rw-r--r-- | lib/kunit/test.c | 1 | ||||
| -rw-r--r-- | lib/lockref.c | 1 | ||||
| -rw-r--r-- | lib/maple_tree.c | 22 | ||||
| -rw-r--r-- | lib/memcpy_kunit.c | 2 | ||||
| -rw-r--r-- | lib/mpi/mpicoder.c | 3 | ||||
| -rw-r--r-- | lib/nlattr.c | 3 | ||||
| -rw-r--r-- | lib/parser.c | 39 | ||||
| -rw-r--r-- | lib/sbitmap.c | 102 | ||||
| -rw-r--r-- | lib/scatterlist.c | 25 | ||||
| -rw-r--r-- | lib/string.c | 10 | ||||
| -rw-r--r-- | lib/test_maple_tree.c | 89 | ||||
| -rw-r--r-- | lib/ubsan.c | 73 | ||||
| -rw-r--r-- | lib/ubsan.h | 32 | ||||
| -rw-r--r-- | lib/usercopy.c | 7 | ||||
| -rw-r--r-- | lib/win_minmax.c | 2 |
27 files changed, 1174 insertions, 171 deletions
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index dad99197d269..5a69b3805b1c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -389,6 +389,15 @@ config PAHOLE_HAS_BTF_TAG btf_decl_tag) or not. Currently only clang compiler implements these attributes, so make the config depend on CC_IS_CLANG. +config PAHOLE_HAS_LANG_EXCLUDE + def_bool PAHOLE_VERSION >= 124 + help + Support for the --lang_exclude flag which makes pahole exclude + compilation units from the supplied language. Used in Kbuild to + omit Rust CUs which are not supported in version 1.24 of pahole, + otherwise it would emit malformed kernel and module binaries when + using DEBUG_INFO_BTF_MODULES. + config DEBUG_INFO_BTF_MODULES def_bool y depends on DEBUG_INFO_BTF && MODULES && PAHOLE_HAS_SPLIT_BTF @@ -754,6 +763,7 @@ config DEBUG_KMEMLEAK select KALLSYMS select CRC32 select STACKDEPOT + select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF help Say Y here if you want to enable the memory leak detector. The memory allocation/freeing is traced in a way @@ -1207,7 +1217,7 @@ config SCHED_DEBUG depends on DEBUG_KERNEL && PROC_FS default y help - If you say Y here, the /proc/sched_debug file will be provided + If you say Y here, the /sys/kernel/debug/sched file will be provided that can help debug the scheduler. The runtime overhead of this option is minimal. @@ -1928,7 +1938,7 @@ config FUNCTION_ERROR_INJECTION help Add fault injections into various functions that are annotated with ALLOW_ERROR_INJECTION() in the kernel. BPF may also modify the return - value of theses functions. This is useful to test error paths of code. + value of these functions. This is useful to test error paths of code. If unsure, say N @@ -2577,6 +2587,15 @@ config MEMCPY_KUNIT_TEST If unsure, say N. +config MEMCPY_SLOW_KUNIT_TEST + bool "Include exhaustive memcpy tests" + depends on MEMCPY_KUNIT_TEST + default y + help + Some memcpy tests are quite exhaustive in checking for overlaps + and bit ranges. These can be very slow, so they are split out + as a separate config, in case they need to be disabled. + config IS_SIGNED_TYPE_KUNIT_TEST tristate "Test is_signed_type() macro" if !KUNIT_ALL_TESTS depends on KUNIT @@ -2883,6 +2902,4 @@ config RUST_BUILD_ASSERT_ALLOW endmenu # "Rust" -source "Documentation/Kconfig" - endmenu # Kernel hacking diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan index 375575a5a0e3..4dedd61e5192 100644 --- a/lib/Kconfig.kcsan +++ b/lib/Kconfig.kcsan @@ -194,7 +194,7 @@ config KCSAN_WEAK_MEMORY Enable support for modeling a subset of weak memory, which allows detecting a subset of data races due to missing memory barriers. - Depends on KCSAN_STRICT, because the options strenghtening certain + Depends on KCSAN_STRICT, because the options strengthening certain plain accesses by default (depending on !KCSAN_STRICT) reduce the ability to detect any data races invoving reordered accesses, in particular reordered writes. diff --git a/lib/Makefile b/lib/Makefile index 4d9461bfea42..36938c564a2a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -340,9 +340,7 @@ quiet_cmd_build_OID_registry = GEN $@ clean-files += oid_registry_data.c obj-$(CONFIG_UCS2_STRING) += ucs2_string.o -ifneq ($(CONFIG_UBSAN_TRAP),y) obj-$(CONFIG_UBSAN) += ubsan.o -endif UBSAN_SANITIZE_ubsan.o := n KASAN_SANITIZE_ubsan.o := n @@ -353,6 +351,8 @@ obj-$(CONFIG_SBITMAP) += sbitmap.o obj-$(CONFIG_PARMAN) += parman.o +obj-y += group_cpus.o + # GCC library routines obj-$(CONFIG_GENERIC_LIB_ASHLDI3) += ashldi3.o obj-$(CONFIG_GENERIC_LIB_ASHRDI3) += ashrdi3.o diff --git a/lib/bug.c b/lib/bug.c index c223a2575b72..e0ff21989990 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -47,6 +47,7 @@ #include <linux/sched.h> #include <linux/rculist.h> #include <linux/ftrace.h> +#include <linux/context_tracking.h> extern struct bug_entry __start___bug_table[], __stop___bug_table[]; @@ -153,7 +154,7 @@ struct bug_entry *find_bug(unsigned long bugaddr) return module_find_bug(bugaddr); } -enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) +static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *regs) { struct bug_entry *bug; const char *file; @@ -209,6 +210,18 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) return BUG_TRAP_TYPE_BUG; } +enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) +{ + enum bug_trap_type ret; + bool rcu = false; + + rcu = warn_rcu_enter(); + ret = __report_bug(bugaddr, regs); + warn_rcu_exit(rcu); + + return ret; +} + static void clear_once_table(struct bug_entry *start, struct bug_entry *end) { struct bug_entry *bug; diff --git a/lib/cpumask.c b/lib/cpumask.c index c7c392514fd3..e7258836b60b 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -110,15 +110,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask) #endif /** - * cpumask_local_spread - select the i'th cpu with local numa cpu's first + * cpumask_local_spread - select the i'th cpu based on NUMA distances * @i: index number * @node: local numa_node * - * This function selects an online CPU according to a numa aware policy; - * local cpus are returned first, followed by non-local ones, then it - * wraps around. + * Returns online CPU according to a numa aware policy; local cpus are returned + * first, followed by non-local ones, then it wraps around. * - * It's not very efficient, but useful for setup. + * For those who wants to enumerate all CPUs based on their NUMA distances, + * i.e. call this function in a loop, like: + * + * for (i = 0; i < num_online_cpus(); i++) { + * cpu = cpumask_local_spread(i, node); + * do_something(cpu); + * } + * + * There's a better alternative based on for_each()-like iterators: + * + * for_each_numa_hop_mask(mask, node) { + * for_each_cpu_andnot(cpu, mask, prev) + * do_something(cpu); + * prev = mask; + * } + * + * It's simpler and more verbose than above. Complexity of iterator-based + * enumeration is O(sched_domains_numa_levels * nr_cpu_ids), while + * cpumask_local_spread() when called for each cpu is + * O(sched_domains_numa_levels * nr_cpu_ids * log(nr_cpu_ids)). */ unsigned int cpumask_local_spread(unsigned int i, int node) { @@ -127,24 +145,12 @@ unsigned int cpumask_local_spread(unsigned int i, int node) /* Wrap: we always want a cpu. */ i %= num_online_cpus(); - if (node == NUMA_NO_NODE) { - cpu = cpumask_nth(i, cpu_online_mask); - if (cpu < nr_cpu_ids) - return cpu; - } else { - /* NUMA first. */ - cpu = cpumask_nth_and(i, cpu_online_mask, cpumask_of_node(node)); - if (cpu < nr_cpu_ids) - return cpu; - - i -= cpumask_weight_and(cpu_online_mask, cpumask_of_node(node)); - - /* Skip NUMA nodes, done above. */ - cpu = cpumask_nth_andnot(i, cpu_online_mask, cpumask_of_node(node)); - if (cpu < nr_cpu_ids) - return cpu; - } - BUG(); + cpu = (node == NUMA_NO_NODE) ? + cpumask_nth(i, cpu_online_mask) : + sched_numa_find_nth_cpu(cpu_online_mask, i, node); + + WARN_ON(cpu >= nr_cpu_ids); + return cpu; } EXPORT_SYMBOL(cpumask_local_spread); diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c index 7d77dea15587..d0634ed6a937 100644 --- a/lib/crypto/blake2s-selftest.c +++ b/lib/crypto/blake2s-selftest.c @@ -545,7 +545,7 @@ static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = { 0xd6, 0x98, 0x6b, 0x07, 0x10, 0x65, 0x52, 0x65, }, }; -bool __init blake2s_selftest(void) +static bool __init noinline_for_stack blake2s_digest_test(void) { u8 key[BLAKE2S_KEY_SIZE]; u8 buf[ARRAY_SIZE(blake2s_testvecs)]; @@ -589,11 +589,20 @@ bool __init blake2s_selftest(void) } } + return success; +} + +static bool __init noinline_for_stack blake2s_random_test(void) +{ + struct blake2s_state state; + bool success = true; + int i, l; + for (i = 0; i < 32; ++i) { enum { TEST_ALIGNMENT = 16 }; - u8 unaligned_block[BLAKE2S_BLOCK_SIZE + TEST_ALIGNMENT - 1] + u8 blocks[BLAKE2S_BLOCK_SIZE * 2 + TEST_ALIGNMENT - 1] __aligned(TEST_ALIGNMENT); - u8 blocks[BLAKE2S_BLOCK_SIZE * 2]; + u8 *unaligned_block = blocks + BLAKE2S_BLOCK_SIZE; struct blake2s_state state1, state2; get_random_bytes(blocks, sizeof(blocks)); @@ -630,3 +639,13 @@ bool __init blake2s_selftest(void) return success; } + +bool __init blake2s_selftest(void) +{ + bool success; + + success = blake2s_digest_test(); + success &= blake2s_random_test(); + + return success; +} diff --git a/lib/dec_and_lock.c b/lib/dec_and_lock.c index 9555b68bb774..1dcca8f2e194 100644 --- a/lib/dec_and_lock.c +++ b/lib/dec_and_lock.c @@ -49,3 +49,34 @@ int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, return 0; } EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave); + +int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + + /* Otherwise do it the slow way */ + raw_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + raw_spin_unlock(lock); + return 0; +} +EXPORT_SYMBOL(_atomic_dec_and_raw_lock); + +int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, + unsigned long *flags) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + + /* Otherwise do it the slow way */ + raw_spin_lock_irqsave(lock, *flags); + if (atomic_dec_and_test(atomic)) + return 1; + raw_spin_unlock_irqrestore(lock, *flags); + return 0; +} +EXPORT_SYMBOL(_atomic_dec_and_raw_lock_irqsave); diff --git a/lib/find_bit.c b/lib/find_bit.c index 18bc0a7ac8ee..c10920e66788 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -155,6 +155,15 @@ unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned l } EXPORT_SYMBOL(__find_nth_andnot_bit); +unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, + const unsigned long *addr2, + const unsigned long *addr3, + unsigned long size, unsigned long n) +{ + return FIND_NTH_BIT(addr1[idx] & addr2[idx] & ~addr3[idx], size, n); +} +EXPORT_SYMBOL(__find_nth_and_andnot_bit); + #ifndef find_next_and_bit unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start) diff --git a/lib/group_cpus.c b/lib/group_cpus.c new file mode 100644 index 000000000000..9c837a35fef7 --- /dev/null +++ b/lib/group_cpus.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2016 Thomas Gleixner. + * Copyright (C) 2016-2017 Christoph Hellwig. + */ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/sort.h> +#include <linux/group_cpus.h> + +#ifdef CONFIG_SMP + +static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, + unsigned int cpus_per_grp) +{ + const struct cpumask *siblmsk; + int cpu, sibl; + + for ( ; cpus_per_grp > 0; ) { + cpu = cpumask_first(nmsk); + + /* Should not happen, but I'm too lazy to think about it */ + if (cpu >= nr_cpu_ids) + return; + + cpumask_clear_cpu(cpu, nmsk); + cpumask_set_cpu(cpu, irqmsk); + cpus_per_grp--; + + /* If the cpu has siblings, use them first */ + siblmsk = topology_sibling_cpumask(cpu); + for (sibl = -1; cpus_per_grp > 0; ) { + sibl = cpumask_next(sibl, siblmsk); + if (sibl >= nr_cpu_ids) + break; + if (!cpumask_test_and_clear_cpu(sibl, nmsk)) + continue; + cpumask_set_cpu(sibl, irqmsk); + cpus_per_grp--; + } + } +} + +static cpumask_var_t *alloc_node_to_cpumask(void) +{ + cpumask_var_t *masks; + int node; + + masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL); + if (!masks) + return NULL; + + for (node = 0; node < nr_node_ids; node++) { + if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) + goto out_unwind; + } + + return masks; + +out_unwind: + while (--node >= 0) + free_cpumask_var(masks[node]); + kfree(masks); + return NULL; +} + +static void free_node_to_cpumask(cpumask_var_t *masks) +{ + int node; + + for (node = 0; node < nr_node_ids; node++) + free_cpumask_var(masks[node]); + kfree(masks); +} + +static void build_node_to_cpumask(cpumask_var_t *masks) +{ + int cpu; + + for_each_possible_cpu(cpu) + cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); +} + +static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, + const struct cpumask *mask, nodemask_t *nodemsk) +{ + int n, nodes = 0; + + /* Calculate the number of nodes in the supplied affinity mask */ + for_each_node(n) { + if (cpumask_intersects(mask, node_to_cpumask[n])) { + node_set(n, *nodemsk); + nodes++; + } + } + return nodes; +} + +struct node_groups { + unsigned id; + + union { + unsigned ngroups; + unsigned ncpus; + }; +}; + +static int ncpus_cmp_func(const void *l, const void *r) +{ + const struct node_groups *ln = l; + const struct node_groups *rn = r; + + return ln->ncpus - rn->ncpus; +} + +/* + * Allocate group number for each node, so that for each node: + * + * 1) the allocated number is >= 1 + * + * 2) the allocated number is <= active CPU number of this node + * + * The actual allocated total groups may be less than @numgrps when + * active total CPU number is less than @numgrps. + * + * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' + * for each node. + */ +static void alloc_nodes_groups(unsigned int numgrps, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + const nodemask_t nodemsk, + struct cpumask *nmsk, + struct node_groups *node_groups) +{ + unsigned n, remaining_ncpus = 0; + + for (n = 0; n < nr_node_ids; n++) { + node_groups[n].id = n; + node_groups[n].ncpus = UINT_MAX; + } + + for_each_node_mask(n, nodemsk) { + unsigned ncpus; + + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + ncpus = cpumask_weight(nmsk); + + if (!ncpus) + continue; + remaining_ncpus += ncpus; + node_groups[n].ncpus = ncpus; + } + + numgrps = min_t(unsigned, remaining_ncpus, numgrps); + + sort(node_groups, nr_node_ids, sizeof(node_groups[0]), + ncpus_cmp_func, NULL); + + /* + * Allocate groups for each node according to the ratio of this + * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is + * bigger than number of active numa nodes. Always start the + * allocation from the node with minimized nr_cpus. + * + * This way guarantees that each active node gets allocated at + * least one group, and the theory is simple: over-allocation + * is only done when this node is assigned by one group, so + * other nodes will be allocated >= 1 groups, since 'numgrps' is + * bigger than number of numa nodes. + * + * One perfect invariant is that number of allocated groups for + * each node is <= CPU count of this node: + * + * 1) suppose there are two nodes: A and B + * ncpu(X) is CPU count of node X + * grps(X) is the group count allocated to node X via this + * algorithm + * + * ncpu(A) <= ncpu(B) + * ncpu(A) + ncpu(B) = N + * grps(A) + grps(B) = G + * + * grps(A) = max(1, round_down(G * ncpu(A) / N)) + * grps(B) = G - grps(A) + * + * both N and G are integer, and 2 <= G <= N, suppose + * G = N - delta, and 0 <= delta <= N - 2 + * + * 2) obviously grps(A) <= ncpu(A) because: + * + * if grps(A) is 1, then grps(A) <= ncpu(A) given + * ncpu(A) >= 1 + * + * otherwise, + * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N + * + * 3) prove how grps(B) <= ncpu(B): + * + * if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be + * over-allocated, so grps(B) <= ncpu(B), + * + * otherwise: + * + * grps(A) = + * round_down(G * ncpu(A) / N) = + * round_down((N - delta) * ncpu(A) / N) = + * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >= + * round_down((N * ncpu(A) - delta * N) / N) = + * cpu(A) - delta + * + * then: + * + * grps(A) - G >= ncpu(A) - delta - G + * => + * G - grps(A) <= G + delta - ncpu(A) + * => + * grps(B) <= N - ncpu(A) + * => + * grps(B) <= cpu(B) + * + * For nodes >= 3, it can be thought as one node and another big + * node given that is exactly what this algorithm is implemented, + * and we always re-calculate 'remaining_ncpus' & 'numgrps', and + * finally for each node X: grps(X) <= ncpu(X). + * + */ + for (n = 0; n < nr_node_ids; n++) { + unsigned ngroups, ncpus; + + if (node_groups[n].ncpus == UINT_MAX) + continue; + + WARN_ON_ONCE(numgrps == 0); + + ncpus = node_groups[n].ncpus; + ngroups = max_t(unsigned, 1, + numgrps * ncpus / remaining_ncpus); + WARN_ON_ONCE(ngroups > ncpus); + + node_groups[n].ngroups = ngroups; + + remaining_ncpus -= ncpus; + numgrps -= ngroups; + } +} + +static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + struct cpumask *nmsk, struct cpumask *masks) +{ + unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0; + unsigned int last_grp = numgrps; + unsigned int curgrp = startgrp; + nodemask_t nodemsk = NODE_MASK_NONE; + struct node_groups *node_groups; + + if (cpumask_empty(cpu_mask)) + return 0; + + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); + + /* + * If the number of nodes in the mask is greater than or equal the + * number of groups we just spread the groups across the nodes. + */ + if (numgrps <= nodes) { + for_each_node_mask(n, nodemsk) { + /* Ensure that only CPUs which are in both masks are set */ + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + cpumask_or(&masks[curgrp], &masks[curgrp], nmsk); + if (++curgrp == last_grp) + curgrp = 0; + } + return numgrps; + } + + node_groups = kcalloc(nr_node_ids, + sizeof(struct node_groups), + GFP_KERNEL); + if (!node_groups) + return -ENOMEM; + + /* allocate group number for each node */ + alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask, + nodemsk, nmsk, node_groups); + for (i = 0; i < nr_node_ids; i++) { + unsigned int ncpus, v; + struct node_groups *nv = &node_groups[i]; + + if (nv->ngroups == UINT_MAX) + continue; + + /* Get the cpus on this node which are in the mask */ + cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]); + ncpus = cpumask_weight(nmsk); + if (!ncpus) + continue; + + WARN_ON_ONCE(nv->ngroups > ncpus); + + /* Account for rounding errors */ + extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups); + + /* Spread allocated groups on CPUs of the current node */ + for (v = 0; v < nv->ngroups; v++, curgrp++) { + cpus_per_grp = ncpus / nv->ngroups; + + /* Account for extra groups to compensate rounding errors */ + if (extra_grps) { + cpus_per_grp++; + --extra_grps; + } + + /* + * wrapping has to be considered given 'startgrp' + * may start anywhere + */ + if (curgrp >= last_grp) + curgrp = 0; + grp_spread_init_one(&masks[curgrp], nmsk, + cpus_per_grp); + } + done += nv->ngroups; + } + kfree(node_groups); + return done; +} + +/** + * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality + * @numgrps: number of groups + * + * Return: cpumask array if successful, NULL otherwise. And each element + * includes CPUs assigned to this group + * + * Try to put close CPUs from viewpoint of CPU and NUMA locality into + * same group, and run two-stage grouping: + * 1) allocate present CPUs on these groups evenly first + * 2) allocate other possible CPUs on these groups evenly + * + * We guarantee in the resulted grouping that all CPUs are covered, and + * no same CPU is assigned to multiple groups + */ +struct cpumask *group_cpus_evenly(unsigned int numgrps) +{ + unsigned int curgrp = 0, nr_present = 0, nr_others = 0; + cpumask_var_t *node_to_cpumask; + cpumask_var_t nmsk, npresmsk; + int ret = -ENOMEM; + struct cpumask *masks = NULL; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + return NULL; + + if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) + goto fail_nmsk; + + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) + goto fail_npresmsk; + + masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto fail_node_to_cpumask; + + /* Stabilize the cpumasks */ + cpus_read_lock(); + build_node_to_cpumask(node_to_cpumask); + + /* grouping present CPUs first */ + ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, + cpu_present_mask, nmsk, masks); + if (ret < 0) + goto fail_build_affinity; + nr_present = ret; + + /* + * Allocate non present CPUs starting from the next group to be + * handled. If the grouping of present CPUs already exhausted the + * group space, assign the non present CPUs to the already + * allocated out groups. + */ + if (nr_present >= numgrps) + curgrp = 0; + else + curgrp = nr_present; + cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); + ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, + npresmsk, nmsk, masks); + if (ret >= 0) + nr_others = ret; + + fail_build_affinity: + cpus_read_unlock(); + + if (ret >= 0) + WARN_ON(nr_present + nr_others < numgrps); + + fail_node_to_cpumask: + free_node_to_cpumask(node_to_cpumask); + + fail_npresmsk: + free_cpumask_var(npresmsk); + + fail_nmsk: + free_cpumask_var(nmsk); + if (ret < 0) { + kfree(masks); + return NULL; + } + return masks; +} +#else /* CONFIG_SMP */ +struct cpumask *group_cpus_evenly(unsigned int numgrps) +{ + struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); + + if (!masks) + return NULL; + + /* assign all CPUs(cpu 0) to the 1st group only */ + cpumask_copy(&masks[0], cpu_possible_mask); + return masks; +} +#endif /* CONFIG_SMP */ diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f9a3ff37ecd1..274014e4eafe 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -186,12 +186,6 @@ static int copyin(void *to, const void __user *from, size_t n) return res; } -static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe, - unsigned int slot) -{ - return &pipe->bufs[slot & (pipe->ring_size - 1)]; -} - #ifdef PIPE_PARANOIA static bool sanity(const struct iov_iter *i) { @@ -1432,9 +1426,9 @@ static struct page *first_bvec_segment(const struct iov_iter *i, static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, size_t *start, - unsigned int gup_flags) + iov_iter_extraction_t extraction_flags) { - unsigned int n; + unsigned int n, gup_flags = 0; if (maxsize > i->count) maxsize = i->count; @@ -1442,6 +1436,8 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, return 0; if (maxsize > MAX_RW_COUNT) maxsize = MAX_RW_COUNT; + if (extraction_flags & ITER_ALLOW_P2PDMA) + gup_flags |= FOLL_PCI_P2PDMA; if (likely(user_backed_iter(i))) { unsigned long addr; @@ -1495,14 +1491,14 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, - size_t *start, unsigned gup_flags) + size_t *start, iov_iter_extraction_t extraction_flags) { if (!maxpages) return 0; BUG_ON(!pages); return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, - start, gup_flags); + start, extraction_flags); } EXPORT_SYMBOL_GPL(iov_iter_get_pages); @@ -1515,14 +1511,14 @@ EXPORT_SYMBOL(iov_iter_get_pages2); ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, - size_t *start, unsigned gup_flags) + size_t *start, iov_iter_extraction_t extraction_flags) { ssize_t len; *pages = NULL; len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start, - gup_flags); + extraction_flags); if (len <= 0) { kvfree(*pages); *pages = NULL; @@ -1877,6 +1873,17 @@ int import_single_range(int rw, void __user *buf, size_t len, } EXPORT_SYMBOL(import_single_range); +int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) +{ + if (len > MAX_RW_COUNT) + len = MAX_RW_COUNT; + if (unlikely(!access_ok(buf, len))) + return -EFAULT; + + iov_iter_ubuf(i, rw, buf, len); + return 0; +} + /** * iov_iter_restore() - Restore a &struct iov_iter to the same state as when * iov_iter_save_state() was called. @@ -1891,8 +1898,8 @@ EXPORT_SYMBOL(import_single_range); */ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) { - if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) && - !iov_iter_is_kvec(i) && !iter_is_ubuf(i)) + if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) && + !iter_is_ubuf(i)) && !iov_iter_is_kvec(i)) return; i->iov_offset = state->iov_offset; i->count = state->count; @@ -1914,3 +1921,267 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) i->iov -= state->nr_segs - i->nr_segs; i->nr_segs = state->nr_segs; } + +/* + * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not + * get references on the pages, nor does it get a pin on them. + */ +static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, + struct page ***pages, size_t maxsize, + unsigned int maxpages, + iov_iter_extraction_t extraction_flags, + size_t *offset0) +{ + struct page *page, **p; + unsigned int nr = 0, offset; + loff_t pos = i->xarray_start + i->iov_offset; + pgoff_t index = pos >> PAGE_SHIFT; + XA_STATE(xas, i->xarray, index); + + offset = pos & ~PAGE_MASK; + *offset0 = offset; + + maxpages = want_pages_array(pages, maxsize, offset, maxpages); + if (!maxpages) + return -ENOMEM; + p = *pages; + + rcu_read_lock(); + for (page = xas_load(&xas); page; page = xas_next(&xas)) { + if (xas_retry(&xas, page)) + continue; + + /* Has the page moved or been split? */ |
