Diffstat (limited to 'kernel')
-rw-r--r--  kernel/auditsc.c              |  25
-rw-r--r--  kernel/bpf/arraymap.c         |   6
-rw-r--r--  kernel/bpf/bpf_iter.c         |  11
-rw-r--r--  kernel/bpf/hashtab.c          |   8
-rw-r--r--  kernel/bpf/reuseport_array.c  |   9
-rw-r--r--  kernel/bpf/syscall.c          |  35
-rw-r--r--  kernel/bpf/trampoline.c       |   5
-rw-r--r--  kernel/cgroup/cpuset.c        |   2
-rw-r--r--  kernel/crash_core.c           |  28
-rw-r--r--  kernel/dma/coherent.c         |  10
-rw-r--r--  kernel/dma/direct.c           |  43
-rw-r--r--  kernel/dma/direct.h           |   8
-rw-r--r--  kernel/dma/mapping.c          |  47
-rw-r--r--  kernel/dma/swiotlb.c          | 263
-rw-r--r--  kernel/events/core.c          |   2
-rw-r--r--  kernel/exit.c                 |   2
-rw-r--r--  kernel/hung_task.c            |   2
-rw-r--r--  kernel/kallsyms.c             |  23
-rw-r--r--  kernel/kallsyms_internal.h    |  30
-rw-r--r--  kernel/kexec_file.c           |  10
-rw-r--r--  kernel/kprobes.c              |   3
-rw-r--r--  kernel/module/Kconfig         | 293
-rw-r--r--  kernel/module/decompress.c    |   8
-rw-r--r--  kernel/module/internal.h      |   2
-rw-r--r--  kernel/module/kallsyms.c      |  41
-rw-r--r--  kernel/module/main.c          |  43
-rw-r--r--  kernel/module/procfs.c        |   2
-rw-r--r--  kernel/profile.c              |  15
-rw-r--r--  kernel/resource.c             | 185
-rw-r--r--  kernel/sched/core.c           |  16
-rw-r--r--  kernel/sched/sched.h          |   7
-rw-r--r--  kernel/sysctl.c               |  20
-rw-r--r--  kernel/watchdog.c             |  21
33 files changed, 991 insertions, 234 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3a8c9d744800..dd8d9ab747c3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1073,31 +1073,6 @@ int audit_alloc(struct task_struct *tsk)
return 0;
}
-/**
- * audit_alloc_kernel - allocate an audit_context for a kernel task
- * @tsk: the kernel task
- *
- * Similar to the audit_alloc() function, but intended for kernel private
- * threads. Returns zero on success, negative values on failure.
- */
-int audit_alloc_kernel(struct task_struct *tsk)
-{
- /*
- * At the moment we are just going to call into audit_alloc() to
- * simplify the code, but there two things to keep in mind with this
- * approach:
- *
- * 1. Filtering internal kernel tasks is a bit laughable in almost all
- * cases, but there is at least one case where there is a benefit:
- * the '-a task,never' case allows the admin to effectively disable
- * task auditing at runtime.
- *
- * 2. The {set,clear}_task_syscall_work() ops likely have zero effect
- * on these internal kernel tasks, but they probably don't hurt either.
- */
- return audit_alloc(tsk);
-}
-
static inline void audit_free_context(struct audit_context *context)
{
/* resetting is extra work, but it is likely just noise */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index d3e734bf8056..624527401d4d 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -649,6 +649,11 @@ static int bpf_iter_init_array_map(void *priv_data,
seq_info->percpu_value_buf = value_buf;
}
+ /* bpf_iter_attach_map() acquires a map uref, and the uref may be
+ * released before or in the middle of iterating map elements, so
+ * acquire an extra map uref for iterator.
+ */
+ bpf_map_inc_with_uref(map);
seq_info->map = map;
return 0;
}
@@ -657,6 +662,7 @@ static void bpf_iter_fini_array_map(void *priv_data)
{
struct bpf_iter_seq_array_map_info *seq_info = priv_data;
+ bpf_map_put_with_uref(seq_info->map);
kfree(seq_info->percpu_value_buf);
}
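
The extra uref taken in bpf_iter_init_array_map() (and, below, in the hash map iterator) is easiest to see from user space: the map fd used to attach the iterator can be closed before or while the iterator fd is read. A minimal libbpf sketch, assuming prog is an already loaded iter/bpf_map_elem program; the function name and error handling are illustrative, not part of this patch:

#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

static int dump_map_elems(struct bpf_program *prog, int map_fd)
{
	LIBBPF_OPTS(bpf_iter_attach_opts, opts);
	union bpf_iter_link_info linfo = { .map = { .map_fd = map_fd } };
	struct bpf_link *link;
	char buf[4096];
	int iter_fd;

	opts.link_info = &linfo;
	opts.link_info_len = sizeof(linfo);

	link = bpf_program__attach_iter(prog, &opts);
	if (!link)
		return -1;

	iter_fd = bpf_iter_create(bpf_link__fd(link));
	if (iter_fd < 0) {
		bpf_link__destroy(link);
		return -1;
	}

	/* The caller may drop its own map reference here ... */
	close(map_fd);

	/* ... yet reading must stay safe: the seq_file private data now
	 * holds the extra uref taken in bpf_iter_init_array_map(). */
	while (read(iter_fd, buf, sizeof(buf)) > 0)
		;

	close(iter_fd);
	bpf_link__destroy(link);
	return 0;
}
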
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 2726a5950cfa..24b755eca0b3 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -68,13 +68,18 @@ static void bpf_iter_done_stop(struct seq_file *seq)
iter_priv->done_stop = true;
}
+static inline bool bpf_iter_target_support_resched(const struct bpf_iter_target_info *tinfo)
+{
+ return tinfo->reg_info->feature & BPF_ITER_RESCHED;
+}
+
static bool bpf_iter_support_resched(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
- return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED;
+ return bpf_iter_target_support_resched(iter_priv->tinfo);
}
/* maximum visited objects before bailing out */
@@ -537,6 +542,10 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
if (!tinfo)
return -ENOENT;
+ /* Only allow sleepable program for resched-able iterator */
+ if (prog->aux->sleepable && !bpf_iter_target_support_resched(tinfo))
+ return -EINVAL;
+
link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
if (!link)
return -ENOMEM;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index da7578426a46..6c530a5e560a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -311,12 +311,8 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
struct htab_elem *l;
if (node) {
- u32 key_size = htab->map.key_size;
-
l = container_of(node, struct htab_elem, lru_node);
- memcpy(l->key, key, key_size);
- check_and_init_map_value(&htab->map,
- l->key + round_up(key_size, 8));
+ memcpy(l->key, key, htab->map.key_size);
return l;
}
@@ -2064,6 +2060,7 @@ static int bpf_iter_init_hash_map(void *priv_data,
seq_info->percpu_value_buf = value_buf;
}
+ bpf_map_inc_with_uref(map);
seq_info->map = map;
seq_info->htab = container_of(map, struct bpf_htab, map);
return 0;
@@ -2073,6 +2070,7 @@ static void bpf_iter_fini_hash_map(void *priv_data)
{
struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
+ bpf_map_put_with_uref(seq_info->map);
kfree(seq_info->percpu_value_buf);
}
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index e2618fb5870e..85fa9dbfa8bf 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -21,14 +21,11 @@ static struct reuseport_array *reuseport_array(struct bpf_map *map)
/* The caller must hold the reuseport_lock */
void bpf_sk_reuseport_detach(struct sock *sk)
{
- uintptr_t sk_user_data;
+ struct sock __rcu **socks;
write_lock_bh(&sk->sk_callback_lock);
- sk_user_data = (uintptr_t)sk->sk_user_data;
- if (sk_user_data & SK_USER_DATA_BPF) {
- struct sock __rcu **socks;
-
- socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
+ socks = __rcu_dereference_sk_user_data_with_flags(sk, SK_USER_DATA_BPF);
+ if (socks) {
WRITE_ONCE(sk->sk_user_data, NULL);
/*
* Do not move this NULL assignment outside of
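
For reference, the new __rcu_dereference_sk_user_data_with_flags() helper (defined in include/net/sock.h) takes over the open-coded pattern deleted above. A rough equivalent of that old logic, kept only to show what the helper hides; the function name is invented for illustration:

static inline struct sock __rcu **reuseport_socks_of(struct sock *sk)
{
	uintptr_t sk_user_data = (uintptr_t)sk->sk_user_data;

	/* Only hand back the pointer when it is tagged as BPF-owned. */
	if (!(sk_user_data & SK_USER_DATA_BPF))
		return NULL;

	return (struct sock __rcu **)(sk_user_data & SK_USER_DATA_PTRMASK);
}
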
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 83c7136c5788..a4d40d98428a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3886,6 +3886,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
union bpf_attr __user *uattr)
{
struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct btf *attach_btf = bpf_prog_get_target_btf(prog);
struct bpf_prog_info info;
u32 info_len = attr->info.info_len;
struct bpf_prog_kstats stats;
@@ -4088,10 +4089,8 @@ static int bpf_prog_get_info_by_fd(struct file *file,
if (prog->aux->btf)
info.btf_id = btf_obj_id(prog->aux->btf);
info.attach_btf_id = prog->aux->attach_btf_id;
- if (prog->aux->attach_btf)
- info.attach_btf_obj_id = btf_obj_id(prog->aux->attach_btf);
- else if (prog->aux->dst_prog)
- info.attach_btf_obj_id = btf_obj_id(prog->aux->dst_prog->aux->attach_btf);
+ if (attach_btf)
+ info.attach_btf_obj_id = btf_obj_id(attach_btf);
ulen = info.nr_func_info;
info.nr_func_info = prog->aux->func_info_cnt;
@@ -5072,9 +5071,6 @@ static bool syscall_prog_is_valid_access(int off, int size,
BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
{
- struct bpf_prog * __maybe_unused prog;
- struct bpf_tramp_run_ctx __maybe_unused run_ctx;
-
switch (cmd) {
case BPF_MAP_CREATE:
case BPF_MAP_UPDATE_ELEM:
@@ -5084,6 +5080,26 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
case BPF_LINK_CREATE:
case BPF_RAW_TRACEPOINT_OPEN:
break;
+ default:
+ return -EINVAL;
+ }
+ return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
+}
+
+
+/* To shut up -Wmissing-prototypes.
+ * This function is used by the kernel light skeleton
+ * to load bpf programs when modules are loaded or during kernel boot.
+ * See tools/lib/bpf/skel_internal.h
+ */
+int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+
+int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
+{
+ struct bpf_prog * __maybe_unused prog;
+ struct bpf_tramp_run_ctx __maybe_unused run_ctx;
+
+ switch (cmd) {
#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
case BPF_PROG_TEST_RUN:
if (attr->test.data_in || attr->test.data_out ||
@@ -5114,11 +5130,10 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
return 0;
#endif
default:
- return -EINVAL;
+ return ____bpf_sys_bpf(cmd, attr, size);
}
- return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
}
-EXPORT_SYMBOL(bpf_sys_bpf);
+EXPORT_SYMBOL(kern_sys_bpf);
static const struct bpf_func_proto bpf_sys_bpf_proto = {
.func = bpf_sys_bpf,
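
A hedged sketch of how kernel-side light-skeleton code (see tools/lib/bpf/skel_internal.h) can drive the syscall path through the exported kern_sys_bpf(). The wrapper function and attr values below are illustrative only, and a caller needs its own extern declaration since the prototype lives in syscall.c:

#include <linux/bpf.h>
#include <linux/string.h>

int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);

static int example_create_array_map(void)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_ARRAY;
	attr.key_size    = 4;
	attr.value_size  = 8;
	attr.max_entries = 16;

	/* Commands not handled in kern_sys_bpf() fall through to
	 * ____bpf_sys_bpf(), so this behaves like bpf(BPF_MAP_CREATE, ...)
	 * issued from the kernel and returns a map fd (>= 0) installed in
	 * the current task, or -errno. */
	return kern_sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
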
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 0f532e6a717f..ff87e38af8a7 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -841,7 +841,10 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
* multiple rcu callbacks.
*/
hlist_del(&tr->hlist);
- kfree(tr->fops);
+ if (tr->fops) {
+ ftrace_free_filter(tr->fops);
+ kfree(tr->fops);
+ }
kfree(tr);
out:
mutex_unlock(&trampoline_mutex);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 71a418858a5e..58aadfda9b8b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2239,7 +2239,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
goto out_unlock;
cgroup_taskset_for_each(task, css, tset) {
- ret = task_can_attach(task, cs->cpus_allowed);
+ ret = task_can_attach(task, cs->effective_cpus);
if (ret)
goto out_unlock;
ret = security_task_setscheduler(task);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 71122e01623c..07b26df453a9 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -9,12 +9,15 @@
#include <linux/init.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
+#include <linux/sizes.h>
#include <asm/page.h>
#include <asm/sections.h>
#include <crypto/sha1.h>
+#include "kallsyms_internal.h"
+
/* vmcoreinfo stuff */
unsigned char *vmcoreinfo_data;
size_t vmcoreinfo_size;
@@ -43,6 +46,15 @@ static int __init parse_crashkernel_mem(char *cmdline,
unsigned long long *crash_base)
{
char *cur = cmdline, *tmp;
+ unsigned long long total_mem = system_ram;
+
+ /*
+ * Firmware sometimes reserves some memory regions for its own use,
+ * so the system memory size is less than the actual physical memory
+ * size. Work around this by rounding up the total size to 128M,
+ * which is enough for most test cases.
+ */
+ total_mem = roundup(total_mem, SZ_128M);
/* for each entry of the comma-separated list */
do {
@@ -87,13 +99,13 @@ static int __init parse_crashkernel_mem(char *cmdline,
return -EINVAL;
}
cur = tmp;
- if (size >= system_ram) {
+ if (size >= total_mem) {
pr_warn("crashkernel: invalid size\n");
return -EINVAL;
}
/* match ? */
- if (system_ram >= start && system_ram < end) {
+ if (total_mem >= start && total_mem < end) {
*crash_size = size;
break;
}
@@ -480,6 +492,18 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#endif
+#ifdef CONFIG_KALLSYMS
+ VMCOREINFO_SYMBOL(kallsyms_names);
+ VMCOREINFO_SYMBOL(kallsyms_token_table);
+ VMCOREINFO_SYMBOL(kallsyms_token_index);
+#ifdef CONFIG_KALLSYMS_BASE_RELATIVE
+ VMCOREINFO_SYMBOL(kallsyms_offsets);
+ VMCOREINFO_SYMBOL(kallsyms_relative_base);
+#else
+ VMCOREINFO_SYMBOL(kallsyms_addresses);
+#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */
+#endif /* CONFIG_KALLSYMS */
+
arch_crash_save_vmcoreinfo();
update_vmcoreinfo_note();
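
The rounding above matters for range matching in "crashkernel=<range>:<size>[,...]". A small stand-alone sketch of the arithmetic with made-up numbers; the macros below are local to the example, not kernel code:

#include <stdio.h>

#define SZ_128M		(128ULL << 20)
#define ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	/* Firmware reservations make a nominal 2 GiB box report ~2028 MiB. */
	unsigned long long system_ram = 0x7ec00000ULL;
	unsigned long long total_mem  = ROUNDUP(system_ram, SZ_128M);

	printf("reported %llu MiB, rounded %llu MiB\n",
	       system_ram >> 20, total_mem >> 20);

	/* With crashkernel=512M-2G:64M,2G-:128M, the "2G-" entry only
	 * matches after rounding: 2028 MiB < 2 GiB, but 2048 MiB >= 2 GiB. */
	return 0;
}
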
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 375fb3c9538d..c21abc77c53e 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -74,7 +74,7 @@ out_unmap_membase:
return ERR_PTR(-ENOMEM);
}
-static void dma_release_coherent_memory(struct dma_coherent_mem *mem)
+static void _dma_release_coherent_memory(struct dma_coherent_mem *mem)
{
if (!mem)
return;
@@ -126,10 +126,16 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
ret = dma_assign_coherent_memory(dev, mem);
if (ret)
- dma_release_coherent_memory(mem);
+ _dma_release_coherent_memory(mem);
return ret;
}
+void dma_release_coherent_memory(struct device *dev)
+{
+ if (dev)
+ _dma_release_coherent_memory(dev->dma_mem);
+}
+
static void *__dma_alloc_from_coherent(struct device *dev,
struct dma_coherent_mem *mem,
ssize_t size, dma_addr_t *dma_handle)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 8d0b68a17042..63859a101ed8 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -453,29 +453,60 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
arch_sync_dma_for_cpu_all();
}
+/*
+ * Unmaps segments, except for ones marked as pci_p2pdma which do not
+ * require any further action as they contain a bus address.
+ */
void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir, unsigned long attrs)
{
struct scatterlist *sg;
int i;
- for_each_sg(sgl, sg, nents, i)
- dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
- attrs);
+ for_each_sg(sgl, sg, nents, i) {
+ if (sg_is_dma_bus_address(sg))
+ sg_dma_unmark_bus_address(sg);
+ else
+ dma_direct_unmap_page(dev, sg->dma_address,
+ sg_dma_len(sg), dir, attrs);
+ }
}
#endif
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs)
{
- int i;
+ struct pci_p2pdma_map_state p2pdma_state = {};
+ enum pci_p2pdma_map_type map;
struct scatterlist *sg;
+ int i, ret;
for_each_sg(sgl, sg, nents, i) {
+ if (is_pci_p2pdma_page(sg_page(sg))) {
+ map = pci_p2pdma_map_segment(&p2pdma_state, dev, sg);
+ switch (map) {
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ continue;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ /*
+ * Any P2P mapping that traverses the PCI
+ * host bridge must be mapped with CPU physical
+ * address and not PCI bus addresses. This is
+ * done with dma_direct_map_page() below.
+ */
+ break;
+ default:
+ ret = -EREMOTEIO;
+ goto out_unmap;
+ }
+ }
+
sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
sg->offset, sg->length, dir, attrs);
- if (sg->dma_address == DMA_MAPPING_ERROR)
+ if (sg->dma_address == DMA_MAPPING_ERROR) {
+ ret = -EIO;
goto out_unmap;
+ }
sg_dma_len(sg) = sg->length;
}
@@ -483,7 +514,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
out_unmap:
dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
- return -EIO;
+ return ret;
}
dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index a78c0ba70645..e38ffc5e6bdd 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -8,6 +8,7 @@
#define _KERNEL_DMA_DIRECT_H
#include <linux/dma-direct.h>
+#include <linux/memremap.h>
int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
@@ -87,10 +88,15 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
phys_addr_t phys = page_to_phys(page) + offset;
dma_addr_t dma_addr = phys_to_dma(dev, phys);
- if (is_swiotlb_force_bounce(dev))
+ if (is_swiotlb_force_bounce(dev)) {
+ if (is_pci_p2pdma_page(page))
+ return DMA_MAPPING_ERROR;
return swiotlb_map(dev, phys, size, dir, attrs);
+ }
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
+ if (is_pci_p2pdma_page(page))
+ return DMA_MAPPING_ERROR;
if (is_swiotlb_active(dev))
return swiotlb_map(dev, phys, size, dir, attrs);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index db7244291b74..49cbf3e33de7 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -197,7 +197,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
if (ents > 0)
debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
- ents != -EIO))
+ ents != -EIO && ents != -EREMOTEIO))
return -EIO;
return ents;
@@ -249,12 +249,15 @@ EXPORT_SYMBOL(dma_map_sg_attrs);
* Returns 0 on success or a negative error code on error. The following
* error codes are supported with the given meaning:
*
- * -EINVAL An invalid argument, unaligned access or other error
- * in usage. Will not succeed if retried.
- * -ENOMEM Insufficient resources (like memory or IOVA space) to
- * complete the mapping. Should succeed if retried later.
- * -EIO Legacy error code with an unknown meaning. eg. this is
- * returned if a lower level call returned DMA_MAPPING_ERROR.
+ * -EINVAL An invalid argument, unaligned access or other error
+ * in usage. Will not succeed if retried.
+ * -ENOMEM Insufficient resources (like memory or IOVA space) to
+ * complete the mapping. Should succeed if retried later.
+ * -EIO Legacy error code with an unknown meaning. eg. this is
+ * returned if a lower level call returned
+ * DMA_MAPPING_ERROR.
+ * -EREMOTEIO The DMA device cannot access P2PDMA memory specified
+ * in the sg_table. This will not succeed if retried.
*/
int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
enum dma_data_direction dir, unsigned long attrs)
@@ -720,6 +723,24 @@ int dma_supported(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_supported);
+bool dma_pci_p2pdma_supported(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ /* if ops is not set, dma direct will be used which supports P2PDMA */
+ if (!ops)
+ return true;
+
+ /*
+ * Note: dma_ops_bypass is not checked here because P2PDMA should
+ * not be used with dma mapping ops that do not have support even
+ * if the specific device is bypassing them.
+ */
+
+ return ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED;
+}
+EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported);
+
#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
void arch_dma_set_mask(struct device *dev, u64 mask);
#else
@@ -773,6 +794,18 @@ size_t dma_max_mapping_size(struct device *dev)
}
EXPORT_SYMBOL_GPL(dma_max_mapping_size);
+size_t dma_opt_mapping_size(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ size_t size = SIZE_MAX;
+
+ if (ops && ops->opt_mapping_size)
+ size = ops->opt_mapping_size();
+
+ return min(dma_max_mapping_size(dev), size);
+}
+EXPORT_SYMBOL_GPL(dma_opt_mapping_size);
+
bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
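
A hypothetical driver-side sketch of consuming the additions above: the -EREMOTEIO return from dma_map_sgtable(), plus dma_pci_p2pdma_supported() and dma_opt_mapping_size(). The function names here are invented for illustration:

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

static int example_map(struct device *dev, struct sg_table *sgt,
		       enum dma_data_direction dir)
{
	int ret = dma_map_sgtable(dev, sgt, dir, 0);

	/* -EREMOTEIO means the device cannot reach the P2PDMA pages in the
	 * sg_table; unlike -ENOMEM, retrying will not help. */
	if (ret == -EREMOTEIO)
		dev_err(dev, "unreachable P2PDMA memory in request\n");
	return ret;
}

static void example_limits(struct device *dev)
{
	/* Only build P2PDMA-backed requests when the DMA ops allow it. */
	if (!dma_pci_p2pdma_supported(dev))
		dev_info(dev, "P2PDMA not usable with these DMA ops\n");

	/* Prefer the optimal mapping size over the absolute maximum when
	 * sizing per-request scatter lists. */
	dev_info(dev, "opt mapping size: %zu\n", dma_opt_mapping_size(dev));
}
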
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index cb50f8d38360..c5a9190b218f 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -62,6 +62,12 @@
#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
+struct io_tlb_slot {
+ phys_addr_t orig_addr;
+ size_t alloc_size;
+ unsigned int list;
+};
+
static bool swiotlb_force_bounce;
static bool swiotlb_force_disable;
@@ -70,6 +76,62 @@ struct io_tlb_mem io_tlb_default_mem;
phys_addr_t swiotlb_unencrypted_base;
static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
+static unsigned long default_nareas;
+
+/**
+ * struct io_tlb_area - IO TLB memory area descriptor
+ *
+ * This is a single area with a single lock.
+ *
+ * @used: The number of used IO TLB blocks.
+ * @index: The slot index to start searching in this area for next round.
+ * @lock: The lock to protect the above data structures in the map and
+ * unmap calls.
+ */
+struct io_tlb_area {
+ unsigned long used;
+ unsigned int index;
+ spinlock_t lock;
+};
+
+/*
+ * Round up the number of slabs to the next power of 2. The last area is
+ * going to be smaller than the rest if default_nslabs is not a power of
+ * two. The number of slots in an area should be a multiple of
+ * IO_TLB_SEGSIZE, otherwise a segment may span two or more areas. That
+ * conflicts with free contiguous slots tracking: free slots are treated
+ * as contiguous no matter whether they cross an area boundary.
+ *
+ * Return true if default_nslabs is rounded up.
+ */
+static bool round_up_default_nslabs(void)
+{
+ if (!default_nareas)
+ return false;
+
+ if (default_nslabs < IO_TLB_SEGSIZE * default_nareas)
+ default_nslabs = IO_TLB_SEGSIZE * default_nareas;
+ else if (is_power_of_2(default_nslabs))
+ return false;
+ default_nslabs = roundup_pow_of_two(default_nslabs);
+ return true;
+}
+
+static void swiotlb_adjust_nareas(unsigned int nareas)
+{
+ /* use a single area when none is specified */
+ if (!nareas)
+ nareas = 1;
+ else if (!is_power_of_2(nareas))
+ nareas = roundup_pow_of_two(nareas);
+
+ default_nareas = nareas;
+
+ pr_info("area num %d.\n", nareas);
+ if (round_up_default_nslabs())
+ pr_info("SWIOTLB bounce buffer size roundup to %luMB",
+ (default_nslabs << IO_TLB_SHIFT) >> 20);
+}
static int __init
setup_io_tlb_npages(char *str)
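
A worked example of the constraint described in the comment above, under the default configuration (IO_TLB_DEFAULT_SIZE = 64 MiB, IO_TLB_SHIFT = 11, IO_TLB_SEGSIZE = 128) and an assumed 6 possible CPUs; this is a user-space sketch, not kernel code:

#include <stdio.h>

#define IO_TLB_SHIFT		11
#define IO_TLB_SEGSIZE		128
#define IO_TLB_DEFAULT_SIZE	(64UL << 20)

static unsigned long roundup_pow_of_two(unsigned long x)
{
	unsigned long p = 1;

	while (p < x)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned long nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; /* 32768 */
	unsigned long nareas = roundup_pow_of_two(6);               /* 8 */

	/* 32768 is already a power of two, so no further rounding; each of
	 * the 8 areas gets 4096 slabs, a multiple of IO_TLB_SEGSIZE (128),
	 * so no IO_TLB_SEGSIZE-sized segment straddles an area boundary. */
	printf("slabs/area = %lu\n", nslabs / nareas);
	return 0;
}
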
@@ -81,6 +143,10 @@ setup_io_tlb_npages(char *str)
}
if (*str == ',')
++str;
+ if (isdigit(*str))
+ swiotlb_adjust_nareas(simple_strtoul(str, &str, 0));
+ if (*str == ',')
+ ++str;
if (!strcmp(str, "force"))
swiotlb_force_bounce = true;
else if (!strcmp(str, "noforce"))
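
With the new second field, the boot parameter becomes swiotlb=<nslabs>[,<nareas>][,force|noforce]. A minimal user-space sketch of the parsing order added above, using an assumed example string; it mirrors setup_io_tlb_npages() but is not kernel code:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char buf[] = "65536,4,force";	/* e.g. from swiotlb=65536,4,force */
	char *str = buf;
	unsigned long nslabs = 0, nareas = 1;
	int force = 0;

	if (isdigit((unsigned char)*str))
		nslabs = strtoul(str, &str, 0);	/* slab count, first field */
	if (*str == ',')
		++str;
	if (isdigit((unsigned char)*str))	/* new: area count, second field */
		nareas = strtoul(str, &str, 0);
	if (*str == ',')
		++str;
	if (!strcmp(str, "force"))
		force = 1;

	printf("nslabs=%lu nareas=%lu force=%d\n", nslabs, nareas, force);
	return 0;
}
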
@@ -112,8 +178,11 @@ void __init swiotlb_adjust_size(unsigned long size)
*/
if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT)
return;
+
size = ALIGN(size, IO_TLB_SIZE);
default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+ if (round_up_default_nslabs())
+ size = default_nslabs << IO_TLB_SHIFT;
pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
}
@@ -192,7 +261,8 @@ void __init swiotlb_update_mem_attributes(void)
}
static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
- unsigned long nslabs, unsigned int flags, bool late_alloc)
+ unsigned long nslabs, unsigned int flags,
+ bool late_alloc, unsigned int nareas)
{
void *vaddr = phys_to_virt(start);
unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
@@ -200,12 +270,18 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->nslabs = nslabs;
mem->start = start;
mem->end = mem->start + bytes;
- mem->index = 0;
mem->late_alloc = late_alloc;
+ mem->nareas = nareas;
+ mem->area_nslabs = nslabs / mem->nareas;
mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
- spin_lock_init(&mem->lock);
+ for (i = 0; i < mem->nareas; i++) {
+ spin_lock_init(&mem->areas[i].lock);
+ mem->areas[i].index = 0;
+ mem->areas[i].used = 0;
+ }
+
for (i = 0; i < mem->nslabs; i++) {
mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
@@ -232,7 +308,7 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
int (*remap)(void *tlb, unsigned long nslabs))
{
struct io_tlb_mem *mem = &io_tlb_default_mem;
- unsigned long nslabs = default_nslabs;
+ unsigned long nslabs;
size_t alloc_size;
size_t bytes;
void *tlb;
@@ -243,6 +319,17 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
return;
/*
+ * default_nslabs may be changed when the area number is adjusted, so
+ * allocate the bounce buffer after adjusting the area number.
+ */
+ if (!default_nareas)
+ swiotlb_adjust_nareas(num_possible_cpus());
+
+ nslabs = default_nslabs;
+ if (nslabs < IO_TLB_MIN_SLABS)
+ panic("%s: nslabs = %lu too small\n", __func__, nslabs);
+
+ /*
* By default allocate the bounce buffer memory from low memory, but
* allow to pick a location everywhere for hypervisors with guest
* memory encryption.
@@ -254,7 +341,8 @@ retry:
else
tlb = memblock_alloc_low(bytes, PAGE_SIZE);
if (!tlb) {
- pr_warn("%s: failed to allocate tlb structure\n", __func__);
+ pr_warn("%s: Failed to allocate %zu bytes tlb structure\n",
+ __func__, bytes);
return;
}
@@ -274,7 +362,13 @@ retry:
panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
__func__, alloc_size, PAGE_SIZE);
- swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false);
+ mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area),
+ default_nareas), SMP_CACHE_BYTES);
+ if (!mem->areas)
+ panic("%s: Failed to allocate mem->areas.\n", __func__);
+
+ swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false,
+ default_nareas);
if (flags & SWIOTLB_VERBOSE)
swiotlb_print_info();
@@ -282,7 +376,7 @@ retry:
void __init swiotlb_init(bool addressing_limit, unsigned int flags)
{
- return swiotlb_init_remap(addressing_limit, flags, NULL);
+ swiotlb_init_remap(addressing_limit, flags, NULL);
}
/*
@@ -296,7 +390,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
struct io_tlb_mem *mem = &io_tlb_default_mem;
unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
unsigned char *vstart = NULL;
- unsigned int order;
+ unsigned int order, area_order;
bool retried = false;
int rc = 0;
@@ -337,19 +431,34 @@ retry:
(PAGE_SIZE << order) >> 20);
}
+ if (!default_nareas)
+ swiotlb_adjust_nareas(num_possible_cpus());
+
+ area_order = get_order(array_size(sizeof(*mem->areas),
+ default_nareas));
+ mem->areas = (struct io_tlb_area *)
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order);
+ if (!mem->areas)
+ goto error_area;
+
mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(array_size(sizeof(*mem->slots), nslabs)));
- if (!mem->slots) {
- free_pages((unsigned long)vstart, order);
- return -ENOMEM;
- }
+ if (!mem->slots)
+ goto error_slots;
set_memory_decrypted((unsigned long)vstart,
(nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
- swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true);
+ swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true,