// SPDX-License-Identifier: GPL-2.0
/*
* linux/mm/madvise.c
*
* Copyright (C) 1999 Linus Torvalds
* Copyright (C) 2002 Christoph Hellwig
*/
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
#include <asm/tlb.h>
#include "internal.h"
#include "swap.h"
struct madvise_walk_private {
struct mmu_gather *tlb;
bool pageout;
};
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
* take mmap_lock for writing. Others, which simply traverse vmas, need
* to only take it for reading.
*/
static int madvise_need_mmap_write(int behavior)
{
switch (behavior) {
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
case MADV_COLD:
case MADV_PAGEOUT:
case MADV_FREE:
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
case MADV_COLLAPSE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
return 1;
}
}
#ifdef CONFIG_ANON_VMA_NAME
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
struct anon_vma_name *anon_name;
size_t count;
/* Add 1 for NUL terminator at the end of the anon_name->name */
count = strlen(name) + 1;
anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
if (anon_name) {
kref_init(&anon_name->kref);
memcpy(anon_name->name, name, count);
}
return anon_name;
}
void anon_vma_name_free(struct kref *kref)
{
struct anon_vma_name *anon_name =
container_of(kref, struct anon_vma_name, kref);
kfree(anon_name);
}
struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
mmap_assert_locked(vma->vm_mm);
if (vma->vm_file)
return NULL;
return vma->anon_name;
}
/* mmap_lock should be write-locked */
static int replace_anon_vma_name(struct vm_area_struct *vma,
struct anon_vma_name *anon_name)
{
struct anon_vma_name *orig_name = anon_vma_name(vma);
if (!anon_name) {
vma->anon_name = NULL;
anon_vma_name_put(orig_name);
return 0;
}
if (anon_vma_name_eq(orig_name, anon_name))
return 0;
vma->anon_name = anon_vma_name_reuse(anon_name);
anon_vma_name_put(orig_name);
return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
static int replace_anon_vma_name(struct vm_area_struct *vma,
struct anon_vma_name *anon_name)
{
if (anon_name)
return -EINVAL;
return 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
* Update the vm_flags on region of a vma, splitting it or merging it as
* necessary. Must be called with mmap_sem held for writing;
* Caller should ensure anon_name stability by raising its refcount even when
* anon_name belongs to a valid vma because this function might free that vma.
*/
static int madvise_update_vma(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
unsigned long end, unsigned long new_flags,
struct anon_vma_name *anon_name)
{
struct mm_struct *mm = vma->vm_mm;
int error;
pgoff_t pgoff;
if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
*prev = vma;
return 0;
}
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_name);
if (*prev) {
vma = *prev;
goto success;
}
*prev = vma;
if (start != vma->vm_start) {
if (unlikely(mm->map_count >= sysctl_max_map_count))
return -ENOMEM;
error = __split_vma(mm, vma, start, 1);
if (error)
return error;
}
if (end != vma->vm_end) {
if (unlikely(mm->map_count >= sysctl_max_map_count))
return -ENOMEM;
error = __split_vma(mm, vma, end, 0);
if (error)
return error;
}
success:
/*
* vm_flags is protected by the mmap_lock held in write mode.
*/
vma->vm_flags = new_flags;
if (!vma->vm_file) {
error = replace_anon_vma_name(vma, anon_name);
if (error)
return error;
}
return 0;
}
#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->private;
unsigned long index;
struct swap_iocb *splug = NULL;
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
return 0;
for (index = start; index != end; index += PAGE_SIZE) {
pte_t pte;
swp_entry_t entry;
struct page *page;
spinlock_t *ptl;
pte_t *ptep;
ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl);
pte = *ptep;
pte_unmap_unlock(ptep, ptl);
if (!is_swap_pte(pte))
continue;
entry = pte_to_swp_entry(pte);
if (unlikely(non_swap_entry(entry)))
continue;
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
vma, index, false, &splug);
if (page)
put_page(page);
}
swap_read_unplug(splug);
return 0;
}
static const struct mm_walk_ops swapin_walk_ops = {
.pmd_entry = swapin_walk_pmd_entry,
};
static void force_shm_swapin_readahead(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct address_space *mapping)
{
XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
struct page *page;
struct swap_iocb *splug = NULL;
rcu_read_lock();
xas_for_each(&xas, page, end_index) {
swp_entry_t swap;
if (!xa_is_value(page))
continue;
swap = radix_to_swp_entry(page);
/* There might be swapin error entries in shmem mapping. */
if (non_swap_entry(swap))
continue;
xas_pause(&xas);
rcu_read_unlock();
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
NULL, 0, false, &splug);
if (page)
put_page(page);
rcu_read_lock();
}
rcu_read_unlock();
swap_read_unplug(splug);
lru_add_drain(); /* Push any new pages onto the LRU now */
}
#endif /* CONFIG_SWAP */
/*
* Schedule all required I/O operations. Do not wait for completion.
*/
static long madvise_willneed(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
loff_t offset;
*prev = vma;
#ifdef CONFIG_SWAP
if (!file) {
walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
lru_add_drain(); /* Push any new pages onto the LRU now */
return 0;
}
if (shmem_mapping(file->f_mapping)) {
force_shm_swapin_readahead(vma, start, end,
file->f_mapping);
return 0;
}
#else
if (!file)
return -EBADF;
#endif
if (IS_DAX(file_inode(file))) {
/* no bad return value, but ignore advice */
return 0;
}
/*
* Filesystem's fadvise may need to take various locks. We need to
* explicitly grab a reference because the vma (and hence the
* vma's reference to the file) can go away as soon as we drop
* mmap_lock.
*/
*prev = NULL; /* tell sys_madvise we drop mmap_lock */
get_file(file);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
mmap_read_unlock(mm);
vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
fput(file);
mmap_read_lock(mm);
return 0;
}
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct madvise_walk_private *private = walk->private;
struct mmu_gather *tlb = private->tlb;
bool pageout = private->pageout;
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
pte_t *orig_pte, *pte, ptent;
spinlock_t *ptl;
struct page *page = NULL;
LIST_HEAD(page_list);
if (fatal_signal_pending(current))
return -EINTR;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge(*pmd)) {
pmd_t orig_pmd;
unsign
|