// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1995 Linus Torvalds
* Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
* Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
*/
#include <linux/sched.h> /* test_thread_flag(), ... */
#include <linux/sched/task_stack.h> /* task_stack_*(), ... */
#include <linux/kdebug.h> /* oops_begin/end, ... */
#include <linux/memblock.h> /* max_low_pfn */
#include <linux/kfence.h> /* kfence_handle_page_fault */
#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
#include <linux/hugetlb.h> /* hstate_index_to_shift */
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
#include <linux/mm.h> /* find_and_lock_vma() */
#include <linux/vmalloc.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */
#include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <asm/desc.h> /* store_idt(), ... */
#include <asm/cpu_entry_area.h> /* exception stack */
#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
#include <asm/kvm_para.h> /* kvm_handle_async_pf */
#include <asm/vdso.h> /* fixup_vdso_exception() */
#include <asm/irq_stack.h>
#include <asm/fred.h>
#include <asm/sev.h> /* snp_dump_hva_rmpentry() */
#define CREATE_TRACE_POINTS
#include <trace/events/exceptions.h>
/*
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
*/
static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
if (unlikely(is_kmmio_active()))
if (kmmio_handler(regs, addr) == 1)
return -1;
return 0;
}
/*
* Prefetch quirks:
*
* 32-bit mode:
*
* Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
* Check that here and ignore it. This is AMD erratum #91.
*
* 64-bit mode:
*
* Sometimes the CPU reports invalid exceptions on prefetch.
* Check that here and ignore it.
*
* Opcode checker based on code by Richard Brunner.
*/
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
unsigned char opcode, int *prefetch)
{
unsigned char instr_hi = opcode & 0xf0;
unsigned char instr_lo = opcode & 0x0f;
switch (instr_hi) {
case 0x20:
case 0x30:
/*
* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
* In X86_64 long mode, the CPU will signal invalid
* opcode if some of these prefixes are present so
* X86_64 will never get here anyway
*/
return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
case 0x40:
/*
* In 64-bit mode 0x40..0x4F are valid REX prefixes
*/
return (!user_mode(regs) || user_64bit_mode(regs));
#endif
case 0x60:
/* 0x64 thru 0x67 are valid prefixes in all modes. */
return (instr_lo & 0xC) == 0x4;
case 0xF0:
/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
return !instr_lo || (instr_lo>>1) == 1;
case 0x00:
/* Prefetch instruction is 0x0F0D or 0x0F18 */
if (get_kernel_nofault(opcode, instr))
return 0;
*prefetch = (instr_lo == 0xF) &&
(opcode == 0x0D || opcode == 0x18);
return 0;
default:
return 0;
}
}
static bool is_amd_k8_pre_npt(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
c->x86_vendor == X86_VENDOR_AMD &&
c->x86 == 0xf && c->x86_model < 0x40);
}
static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
unsigned char *max_instr;
unsigned char *instr;
int prefetch = 0;
/* Erratum #91 affects AMD K8, pre-NPT CPUs */
if (!is_amd_k8_pre_npt())
return 0;
/*
* If it was a exec (instruction fetch) fault on NX page, then
* do not ignore the fault:
*/
if (error_code & X86_PF_INSTR)
return 0;
instr = (void *)convert_ip_to_linear(current, regs);
max_instr = instr + 15;
/*
* This code has historically always bailed out if IP points to a
* not-present page (e.g. due to a race). No one has ever
* complained about this.
*/
pagefault_disable();
while (instr < max_instr) {
unsigned char opcode;
if (user_mode(regs)) {
if (get_user(opcode, (unsigned char __user *) instr))
break;
} else {
if (get_kernel_nofault(opcode, instr))
break;
}
instr++;
if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
break;
}
pagefault_enable();
return prefetch;
}
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
unsigned index = pgd_index(address);
pgd_t *pgd_k;
p4d_t *p4d, *p4d_k;
pud_t *pud, *pud_k;
pmd_t *pmd, *pmd_k;
pgd += index;
pgd_k = init_mm.pgd + index;
if (!pgd_present(*pgd_k))
return NULL;
/*
* set_pgd(pgd, *pgd_k); here would be useless on PAE
* and redundant with the set_pmd() on non-PAE. As would
* set_p4d/set_pud.
*/
p4d = p4d_offset(pgd, address);
p4d_k = p4d_offset(pgd_k, address);
if (!p4d_present(*p4d_k))
return NULL;
pud = pud_offset(p4d, address);
pud_k = pud_offset(p4d_k, address);
if (!pud_present(*pud_k))
return NULL;
pmd = pmd_offset(pud, address);
pmd_k = pmd_offset(pud_k, address);
if (pmd_present(*pmd) != pmd_present(*pmd_k))
set_pmd(pmd, *pmd_k);
if (!pmd_present(*pmd_k))
return NULL;
else
BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));
return pmd_k;
}
/*
* Handle a fault on the vmalloc or module mapping area
*
* This is needed because there is a race condition between the time
* when the vmalloc mapping code updates the PMD to the point in time
* where it synchronizes this update with the other page-tables in the
* system.
*
* In this race window another thread/CPU can map an area on the same
* PMD, finds it already present and does not synchronize it with the
* rest of the system yet. As a result v[mz]alloc might return areas
* which are not mapped in every page-table in the system, causing an
* unhandled page-fault when they are accessed.
*/
static noinline int vmalloc_fault(unsigned long address)
{
unsigned long pgd_paddr;
pmd_t *pmd_k;
pte_t *pte_k;
/* Make sure we are in vmalloc area: */
if (!(address >= VMALLOC_START && address < VMALLOC_END))
return -1;
/*
* Synchronize this task's top level page-table
* with the 'reference' page table.
*
* Do _not_ use "current" here. We might be inside
* an interrupt in the middle of a task switch..
*/
pgd_paddr = read_cr3_pa();
pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
if (!pmd_k)
return -1;
if (pmd_leaf(*pmd_k))
return 0;
pte_k = pte_offset_kernel(pmd_k, address);
if (!pte_present(*pte_k))
return -1;
return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
unsigned long addr;
for (addr = start & PMD_MASK;
addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
addr += PMD_SIZE) {
struct page *page;
spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
spinlock_t *pgt_lock;
/* the pgt_lock only for Xen */
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
spin_lock(pgt_lock);
vmalloc_sync_one(page_addr
|