diff options
| author | Jason Gunthorpe <jgg@mellanox.com> | 2019-08-21 14:12:29 -0300 |
|---|---|---|
| committer | Jason Gunthorpe <jgg@mellanox.com> | 2019-08-21 20:58:18 -0300 |
| commit | daa138a58c802e7b4c2fb73f9b85bb082616ef43 (patch) | |
| tree | be913e8e3745bb367d2ba371598f447649102cfc /kernel | |
| parent | 6869b7b206595ae0e326f59719090351eb8f4f5d (diff) | |
| parent | fba0e448a2c5b297a4ddc1ec4e48f4aa6600a1c9 (diff) | |
| download | linux-daa138a58c802e7b4c2fb73f9b85bb082616ef43.tar.gz linux-daa138a58c802e7b4c2fb73f9b85bb082616ef43.tar.bz2 linux-daa138a58c802e7b4c2fb73f9b85bb082616ef43.zip | |
Merge branch 'odp_fixes' into hmm.git
From rdma.git
Jason Gunthorpe says:
====================
This is a collection of general cleanups for ODP to clarify some of the
flows around umem creation and use of the interval tree.
====================
The branch is based on v5.3-rc5 due to dependencies, and is being taken
into hmm.git due to dependencies in the next patches.
* odp_fixes:
RDMA/mlx5: Use odp instead of mr->umem in pagefault_mr
RDMA/mlx5: Use ib_umem_start instead of umem.address
RDMA/core: Make invalidate_range a device operation
RDMA/odp: Use kvcalloc for the dma_list and page_list
RDMA/odp: Check for overflow when computing the umem_odp end
RDMA/odp: Provide ib_umem_odp_release() to undo the allocs
RDMA/odp: Split creating a umem_odp from ib_umem_get
RDMA/odp: Make the three ways to create a umem_odp clear
RMDA/odp: Consolidate umem_odp initialization
RDMA/odp: Make it clearer when a umem is an implicit ODP umem
RDMA/odp: Iterate over the whole rbtree directly
RDMA/odp: Use the common interval tree library instead of generic
RDMA/mlx5: Fix MR npages calculation for IB_ACCESS_HUGETLB
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/Kconfig.preempt | 8 | ||||
| -rw-r--r-- | kernel/Makefile | 1 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 4 | ||||
| -rw-r--r-- | kernel/configs.c | 16 | ||||
| -rw-r--r-- | kernel/cred.c | 21 | ||||
| -rw-r--r-- | kernel/dma/contiguous.c | 8 | ||||
| -rw-r--r-- | kernel/dma/direct.c | 10 | ||||
| -rw-r--r-- | kernel/dma/mapping.c | 32 | ||||
| -rw-r--r-- | kernel/dma/remap.c | 2 | ||||
| -rw-r--r-- | kernel/events/core.c | 2 | ||||
| -rw-r--r-- | kernel/exit.c | 6 | ||||
| -rw-r--r-- | kernel/fork.c | 2 | ||||
| -rw-r--r-- | kernel/irq/affinity.c | 6 | ||||
| -rw-r--r-- | kernel/locking/lockdep.c | 13 | ||||
| -rw-r--r-- | kernel/locking/lockdep_proc.c | 3 | ||||
| -rw-r--r-- | kernel/locking/mutex.c | 11 | ||||
| -rw-r--r-- | kernel/locking/rwsem.c | 28 | ||||
| -rw-r--r-- | kernel/memremap.c | 426 | ||||
| -rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 14 | ||||
| -rw-r--r-- | kernel/sched/deadline.c | 8 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 144 | ||||
| -rw-r--r-- | kernel/sched/psi.c | 4 | ||||
| -rw-r--r-- | kernel/signal.c | 3 | ||||
| -rw-r--r-- | kernel/trace/trace_functions_graph.c | 17 |
24 files changed, 236 insertions, 553 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index fc020c09b7e8..deff97217496 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -35,10 +35,10 @@ config PREEMPT_VOLUNTARY Select this if you are building a kernel for a desktop system. -config PREEMPT_LL +config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" depends on !ARCH_NO_PREEMPT - select PREEMPT + select PREEMPTION select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK help This option reduces the latency of the kernel by making @@ -58,7 +58,7 @@ config PREEMPT_LL config PREEMPT_RT bool "Fully Preemptible Kernel (Real-Time)" depends on EXPERT && ARCH_SUPPORTS_RT - select PREEMPT + select PREEMPTION help This option turns the kernel into a real-time kernel by replacing various locking primitives (spinlocks, rwlocks, etc.) with @@ -77,6 +77,6 @@ endchoice config PREEMPT_COUNT bool -config PREEMPT +config PREEMPTION bool select PREEMPT_COUNT diff --git a/kernel/Makefile b/kernel/Makefile index a8d923b5481b..ef0d95a190b4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -111,7 +111,6 @@ obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_HAS_IOMEM) += iomem.o -obj-$(CONFIG_ZONE_DEVICE) += memremap.o obj-$(CONFIG_RSEQ) += rseq.o obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5900cbb966b1..c84d83f86141 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8616,8 +8616,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (is_narrower_load && size < target_size) { - u8 shift = (off & (size_default - 1)) * 8; - + u8 shift = bpf_ctx_narrow_load_shift(off, size, + size_default); if (ctx_field_size <= 4) { if (shift) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, diff --git a/kernel/configs.c b/kernel/configs.c index b062425ccf8d..c09ea4c995e1 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * kernel/configs.c * Echo the kernel .config file used to build the kernel @@ -6,21 +7,6 @@ * Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net> * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com> * Copyright (C) 2002 Hewlett-Packard Company - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/kernel.h> diff --git a/kernel/cred.c b/kernel/cred.c index f9a0ce66c9c3..c0a4c12d38b2 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -144,7 +144,10 @@ void __put_cred(struct cred *cred) BUG_ON(cred == current->cred); BUG_ON(cred == current->real_cred); - call_rcu(&cred->rcu, put_cred_rcu); + if (cred->non_rcu) + put_cred_rcu(&cred->rcu); + else + call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); @@ -261,6 +264,7 @@ struct cred *prepare_creds(void) old = task->cred; memcpy(new, old, sizeof(struct cred)); + new->non_rcu = 0; atomic_set(&new->usage, 1); set_cred_subscribers(new, 0); get_group_info(new->group_info); @@ -544,7 +548,19 @@ const struct cred *override_creds(const struct cred *new) validate_creds(old); validate_creds(new); - get_cred(new); + + /* + * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'. + * + * That means that we do not clear the 'non_rcu' flag, since + * we are only installing the cred into the thread-synchronous + * '->cred' pointer, not the '->real_cred' pointer that is + * visible to other threads under RCU. + * + * Also note that we did validate_creds() manually, not depending + * on the validation in 'get_cred()'. + */ + get_new_cred((struct cred *)new); alter_cred_subscribers(new, 1); rcu_assign_pointer(current->cred, new); alter_cred_subscribers(old, -1); @@ -681,6 +697,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) validate_creds(old); *new = *old; + new->non_rcu = 0; atomic_set(&new->usage, 1); set_cred_subscribers(new, 0); get_uid(new->user); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index bfc0c17f2a3d..2bd410f934b3 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -243,8 +243,9 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) /* CMA can be used only in the context which permits sleeping */ if (cma && gfpflags_allow_blocking(gfp)) { - align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); - page = cma_alloc(cma, count, align, gfp & __GFP_NOWARN); + size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); + + page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN); } /* Fallback allocation of normal pages */ @@ -266,7 +267,8 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) */ void dma_free_contiguous(struct device *dev, struct page *page, size_t size) { - if (!cma_release(dev_get_cma_area(dev), page, size >> PAGE_SHIFT)) + if (!cma_release(dev_get_cma_area(dev), page, + PAGE_ALIGN(size) >> PAGE_SHIFT)) __free_pages(page, get_order(size)); } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 59bdceea3737..795c9b095d75 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -47,9 +47,6 @@ u64 dma_direct_get_required_mask(struct device *dev) { u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); - if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma) - max_dma = dev->bus_dma_mask; - return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } @@ -130,10 +127,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, if (!page) return NULL; - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && + !force_dma_unencrypted(dev)) { /* remove any dirty cache lines on the kernel alias */ if (!PageHighMem(page)) arch_dma_prep_coherent(page, size); + *dma_handle = phys_to_dma(dev, page_to_phys(page)); /* return the page pointer as the opaque cookie */ return page; } @@ -178,7 +177,8 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, { unsigned int page_order = get_order(size); - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && + !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ __dma_direct_free_pages(dev, size, cpu_addr); return; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 1f628e7ac709..b0038ca3aa92 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -116,11 +116,16 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, int ret; if (!dev_is_dma_coherent(dev)) { + unsigned long pfn; + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) return -ENXIO; - page = pfn_to_page(arch_dma_coherent_to_pfn(dev, cpu_addr, - dma_addr)); + /* If the PFN is not valid, we do not have a struct page */ + pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); + if (!pfn_valid(pfn)) + return -ENXIO; + page = pfn_to_page(pfn); } else { page = virt_to_page(cpu_addr); } @@ -145,6 +150,23 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, } EXPORT_SYMBOL(dma_get_sgtable_attrs); +#ifdef CONFIG_MMU +/* + * Return the page attributes used for mapping dma_alloc_* memory, either in + * kernel space if remapping is needed, or to userspace through dma_mmap_*. + */ +pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) +{ + if (dev_is_dma_coherent(dev) || + (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) && + (attrs & DMA_ATTR_NON_CONSISTENT))) + return prot; + if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_MMAP_PGPROT)) + return arch_dma_mmap_pgprot(dev, prot, attrs); + return pgprot_noncached(prot); +} +#endif /* CONFIG_MMU */ + /* * Create userspace mapping for the DMA-coherent memory. */ @@ -159,7 +181,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long pfn; int ret = -ENXIO; - vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs); + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; @@ -170,7 +192,11 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, if (!dev_is_dma_coherent(dev)) { if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) return -ENXIO; + + /* If the PFN is not valid, we do not have a struct page */ pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); + if (!pfn_valid(pfn)) + return -ENXIO; } else { pfn = page_to_pfn(virt_to_page(cpu_addr)); } diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index a594aec07882..ffe78f0b2fe4 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -218,7 +218,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, /* create a coherent mapping */ ret = dma_common_contiguous_remap(page, size, VM_USERMAP, - arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), + dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) { __dma_direct_free_pages(dev, size, page); diff --git a/kernel/events/core.c b/kernel/events/core.c index 026a14541a38..0463c1151bae 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11274,7 +11274,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } - perf_install_in_context(ctx, event, cpu); + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); diff --git a/kernel/exit.c b/kernel/exit.c index a75b6a7f458a..5b4a5dcce8f8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -720,6 +720,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); + tsk->exit_state = EXIT_ZOMBIE; if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && @@ -733,9 +734,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) autoreap = true; } - tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; - if (tsk->exit_state == EXIT_DEAD) + if (autoreap) { + tsk->exit_state = EXIT_DEAD; list_add(&tsk->ptrace_entry, &dead); + } /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) diff --git a/kernel/fork.c b/kernel/fork.c index bd4a0762f12f..92c8559d9745 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -726,7 +726,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(tsk == current); cgroup_free(tsk); - task_numa_free(tsk); + task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 4352b08ae48d..6fef48033f96 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -251,11 +251,9 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) * Determine the number of vectors which need interrupt affinities * assigned. If the pre/post request exhausts the available vectors * then nothing to do here except for invoking the calc_sets() - * callback so the device driver can adjust to the situation. If there - * is only a single vector, then managing the queue is pointless as - * well. + * callback so the device driver can adjust to the situation. */ - if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors) + if (nvecs > affd->pre_vectors + affd->post_vectors) affvecs = nvecs - affd->pre_vectors - affd->post_vectors; else affvecs = 0; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 341f52117f88..4861cf8e274b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -448,7 +448,7 @@ static void print_lockdep_off(const char *bug_msg) unsigned long nr_stack_trace_entries; -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_PROVE_LOCKING /* * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the graph_lock. @@ -491,7 +491,7 @@ unsigned int max_lockdep_depth; DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); #endif -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_PROVE_LOCKING /* * Locking printouts: */ @@ -2969,7 +2969,7 @@ static void check_chain_key(struct task_struct *curr) #endif } -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_PROVE_LOCKING static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); @@ -3608,7 +3608,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return ret; } -#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ +#else /* CONFIG_PROVE_LOCKING */ static inline int mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) @@ -3627,7 +3627,7 @@ static inline int separate_irq_context(struct task_struct *curr, return 0; } -#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ +#endif /* CONFIG_PROVE_LOCKING */ /* * Initialize a lock instance's lock-class mapping info: @@ -4321,8 +4321,7 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie */ static void check_flags(unsigned long flags) { -#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \ - defined(CONFIG_TRACE_IRQFLAGS) +#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) if (!debug_locks) return; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 65b6a1600c8f..bda006f8a88b 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -200,7 +200,6 @@ static void lockdep_stats_debug_show(struct seq_file *m) static int lockdep_stats_show(struct seq_file *m, void *v) { - struct lock_class *class; unsigned long nr_unused = 0, nr_uncategorized = 0, nr_irq_safe = 0, nr_irq_unsafe = 0, nr_softirq_safe = 0, nr_softirq_unsafe = 0, @@ -211,6 +210,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) sum_forward_deps = 0; #ifdef CONFIG_PROVE_LOCKING + struct lock_class *class; + list_for_each_entry(class, &all_lock_classes, lock_entry) { if (class->usage_mask == 0) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index edd1c082dbf5..5e069734363c 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -908,6 +908,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, might_sleep(); +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(lock->magic != lock); +#endif + ww = container_of(lock, struct ww_mutex, base); if (use_ww_ctx && ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) @@ -1379,8 +1383,13 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { - bool locked = __mutex_trylock(lock); + bool locked; + +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(lock->magic != lock); +#endif + locked = __mutex_trylock(lock); if (locked) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 37524a47f002..bd0f0d05724c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -666,7 +666,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, preempt_disable(); rcu_read_lock(); owner = rwsem_owner_flags(sem, &flags); - if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner))) + /* + * Don't check the read-owner as the entry may be stale. + */ + if ((flags & nonspinnable) || + (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner))) ret = false; rcu_read_unlock(); preempt_enable(); @@ -1000,6 +1004,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) atomic_long_add(-RWSEM_READER_BIAS, &sem->count); adjustment = 0; if (rwsem_optimistic_spin(sem, false)) { + /* rwsem_optimistic_spin() implies ACQUIRE on success */ /* * Wake up other readers in the wait list if the front * waiter is a reader. @@ -1014,6 +1019,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) } return sem; } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) { + /* rwsem_reader_phase_trylock() implies ACQUIRE on success */ return sem; } @@ -1032,6 +1038,8 @@ queue: */ if (adjustment && !(atomic_long_read(&sem->count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + /* Provide lock ACQUIRE */ + smp_acquire__after_ctrl_dep(); raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); lockevent_inc(rwsem_rlock_fast); @@ -1065,15 +1073,18 @@ queue: wake_up_q(&wake_q); /* wait to be given the lock */ - while (true) { + for (;;) { set_current_state(state); - if (!waiter.task) + if (!smp_load_acquire(&waiter.task)) { + /* Matches rwsem_mark_wake()'s smp_store_release(). */ break; + } if (signal_pending_state(state, current)) { raw_spin_lock_irq(&sem->wait_lock); if (waiter.task) goto out_nolock; raw_spin_unlock_irq(&sem->wait_lock); + /* Ordered by sem->wait_lock against rwsem_mark_wake(). */ break; } schedule(); @@ -1083,6 +1094,7 @@ queue: __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock); return sem; + out_nolock: list_del(&waiter.list); if (list_empty(&sem->wait_list)) { @@ -1123,8 +1135,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) /* do optimistic spinning and steal lock if possible */ if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) && - rwsem_optimistic_spin(sem, true)) + rwsem_optimistic_spin(sem, true)) { + /* rwsem_optimistic_spin() implies ACQUIRE on success */ return sem; + } /* * Disable reader optimistic spinning for this rwsem after @@ -1184,9 +1198,11 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) wait: /* wait until we successfully acquire the lock */ set_current_state(state); - while (true) { - if (rwsem_try_write_lock(sem, wstate)) + for (;;) { + if (rwsem_try_write_lock(sem, wstate)) { + /* rwsem_try_write_lock() implies ACQUIRE on success */ break; + } raw_spin_unlock_irq(&sem->wait_lock); diff --git a/kernel/memremap.c b/kernel/memremap.c deleted file mode 100644 index 77a77704eb28..000000000000 --- a/kernel/memremap.c +++ /dev/null @@ -1,426 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright(c) 2015 Intel Corporation. All rights reserved. */ -#include <linux/device.h> -#include <linux/io.h> -#include <linux/kasan.h> -#include <linux/memory_hotplug.h> -#include <linux/mm.h> -#include <linux/pfn_t.h> -#include <linux/swap.h> -#include <linux/swapops.h> -#include <linux/types.h> -#include <linux/wait_bit.h> -#include <linux/xarray.h> - -static DEFINE_XARRAY(pgmap_array); -#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) -#define SECTION_SIZE (1UL << PA_SECTION_SHIFT) - -#ifdef CONFIG_DEV_PAGEMAP_OPS -DEFINE_STATIC_KEY_FALSE(devmap_managed_key); -EXPORT_SYMBOL(devmap_managed_key); -static atomic_t devmap_managed_enable; - -static void devmap_managed_enable_put(void) -{ - if (atomic_dec_and_test(&devmap_managed_enable)) - static_branch_disable(&devmap_managed_key); -} - -static int devmap_managed_enable_get(struct dev_pagemap *pgmap) -{ - if (!pgmap->ops || !pgmap->ops->page_free) { - WARN(1, "Missing page_free method\n"); - return -EINVAL; - } - - if (atomic_inc_return(&devmap_managed_enable) == 1) - static_branch_enable(&devmap_managed_key); - return 0; -} -#else -static int devmap_managed_enable_get(struct dev_pagemap *pgmap) -{ - return -EINVAL; -} -static void devmap_managed_enable_put(void) -{ -} -#endif /* CONFIG_DEV_PAGEMAP_OPS */ - -static void pgmap_array_delete(struct resource *res) -{ - xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), - NULL, GFP_KERNEL); - synchronize_rcu(); -} - -static unsigned long pfn_first(struct dev_pagemap *pgmap) -{ - return PHYS_PFN(pgmap->res.start) + - vmem_altmap_offset(pgmap_altmap(pgmap)); -} - -static unsigned long pfn_end(struct dev_pagemap *pgmap) -{ - const struct resource *res = &pgmap->res; - - return (res->start + resource_size(res)) >> PAGE_SHIFT; -} - -static unsigned long pfn_next(unsigned long pfn) -{ - if (pfn % 1024 == 0) - cond_resched(); - return pfn + 1; -} - -#define for_each_device_pfn(pfn, map) \ - for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) - -static void dev_pagemap_kill(struct dev_pagemap *pgmap) -{ - if (pgmap->ops && pgmap->ops->kill) - pgmap->ops->kill(pgmap); - else - percpu_ref_kill(pgmap->ref); -} - -static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) -{ - if (pgmap->ops && pgmap->ops->cleanup) { - pgmap->ops->cleanup(pgmap); - } else { - wait_for_completion(&pgmap->done); - percpu_ref_exit(pgmap->ref); - } -} - -void memunmap_pages(struct dev_pagemap *pgmap) -{ - struct resource *res = &pgmap->res; - unsigned long pfn; - int nid; - - dev_pagemap_kill(pgmap); - for_each_device_pfn(pfn, pgmap) - put_page(pfn_to_page(pfn)); - dev_pagemap_cleanup(pgmap); - - /* pages are dead and unused, undo the arch mapping */ - nid = page_to_nid(pfn_to_page(PHYS_PFN(res->start))); - - mem_hotplug_begin(); - if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - pfn = PHYS_PFN(res->start); - __remove_pages(page_zone(pfn_to_page(pfn)), pfn, - PHYS_PFN(resource_size(res)), NULL); - } else { - arch_remove_memory(nid, res->start, resource_size(res), - pgmap_altmap(pgmap)); - kasan_remove_zero_shadow(__va(res->start), resource_size(res)); - } - mem_hotplug_done(); - - untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); - pgmap_array_delete(res); - WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n"); - devmap_managed_enable_put(); -} -EXPORT_SYMBOL_GPL(memunmap_pages); - -static void devm_memremap_pages_release(void *data) -{ - memunmap_pages(data); -} - -static void dev_pagemap_percpu_release(struct percpu_ref *ref) -{ - struct dev_pagemap *pgmap = - container_of(ref, struct dev_pagemap, internal_ref); - - complete(&pgmap->done); -} - -/* - * Not device managed version of dev_memremap_pages, undone by - * memunmap_pages(). Please use dev_memremap_pages if you have a struct - * device available. - */ -void *memremap_pages(struct dev_pagemap *pgmap, int nid) -{ - struct resource *res = &pgmap->res; - struct dev_pagemap *conflict_pgmap; - struct mhp_restrictions restrictions = { - /* - * We do not want any optional features only our own memmap - */ - .altmap = pgmap_altmap(pgmap), - }; - pgprot_t pgprot = PAGE_KERNEL; - int error, is_ram; - bool need_devmap_managed = true; - - switch (pgmap->type) { - case MEMORY_DEVICE_PRIVATE: - if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) { - WARN(1, "Device private memory not supported\n"); - return ERR_PTR(-EINVAL); - } - if (!pgmap->ops || !pgmap->ops->migrate_to_ram) { - WARN(1, "Missing migrate_to_ram method\n"); - return ERR_PTR(-EINVAL); - } - break; - case MEMORY_DEVICE_FS_DAX: - if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || - IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { - WARN(1, "File system DAX not supported\n"); - return ERR_PTR(-EINVAL); - } - break; - case MEMORY_DEVICE_DEVDAX: - case MEMORY_DEVICE_PCI_P2PDMA: - need_devmap_managed = false; - break; - default: - WARN(1, "Invalid pgmap type %d\n", pgmap->type); - break; - } - - if (!pgmap->ref) { - if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) - return ERR_PTR(-EINVAL); - - init_completion(&pgmap->done); - error = percpu_ref_init(&pgmap->internal_ref, - dev_pagemap_percpu_release, 0, GFP_KERNEL); - if (error) - return ERR_PTR(error); - pgmap->ref = &pgmap->internal_ref; - } else { - if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { - WARN(1, "Missing reference count teardown definition\n"); - return ERR_PTR(-EINVAL); - } - } - - if (need_devmap_managed) { - error = devmap_managed_enable_get(pgmap); - if (error) - return ERR_PTR(error); - } - - conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL); - if (conflict_pgmap) { - WARN(1, "Conflicting mapping in same section\n"); - put_dev_pagemap(conflict_pgmap); - error = -ENOMEM; - goto err_array; - } - - conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL); - if (conflict_pgmap) { - WARN(1, "Conflicting mapping in same section\n"); - put_dev_pagemap(conflict_pgmap); - error = -ENOMEM; - goto err_array; - } - - is_ram = region_intersects(res->start, resource_size(res), - IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); - - if (is_ram != REGION_DISJOINT) { - WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__, - is_ram == REGION_MIXED ? "mixed" : "ram", res); - error = -ENXIO; - goto err_array; - } - - error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), - |
