summaryrefslogtreecommitdiff
path: root/arch/x86/xen
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-09-08 11:46:48 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2015-09-08 11:46:48 -0700
commit752240e74d650faa24425adc523f1308973ea51c (patch)
tree47657b7d468352424f844156883302653252f70e /arch/x86/xen
parentb8cb642af98216fe6eeca1525345b8a5c9d7c9a4 (diff)
parent626d7508664c4bc8e67f496da4387ecd0c410b8c (diff)
downloadlinux-752240e74d650faa24425adc523f1308973ea51c.tar.gz
linux-752240e74d650faa24425adc523f1308973ea51c.tar.bz2
linux-752240e74d650faa24425adc523f1308973ea51c.zip
Merge tag 'for-linus-4.3-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
Pull xen updates from David Vrabel: "Xen features and fixes for 4.3: - Convert xen-blkfront to the multiqueue API - [arm] Support binding event channels to different VCPUs. - [x86] Support > 512 GiB in a PV guests (off by default as such a guest cannot be migrated with the current toolstack). - [x86] PMU support for PV dom0 (limited support for using perf with Xen and other guests)" * tag 'for-linus-4.3-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (33 commits) xen: switch extra memory accounting to use pfns xen: limit memory to architectural maximum xen: avoid another early crash of memory limited dom0 xen: avoid early crash of memory limited dom0 arm/xen: Remove helpers which are PV specific xen/x86: Don't try to set PCE bit in CR4 xen/PMU: PMU emulation code xen/PMU: Intercept PMU-related MSR and APIC accesses xen/PMU: Describe vendor-specific PMU registers xen/PMU: Initialization code for Xen PMU xen/PMU: Sysfs interface for setting Xen PMU mode xen: xensyms support xen: remove no longer needed p2m.h xen: allow more than 512 GB of RAM for 64 bit pv-domains xen: move p2m list if conflicting with e820 map xen: add explicit memblock_reserve() calls for special pages mm: provide early_memremap_ro to establish read-only mapping xen: check for initrd conflicting with e820 map xen: check pre-allocated page tables for conflict with memory map xen: check for kernel memory conflicting with memory layout ...
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--arch/x86/xen/Kconfig21
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/apic.c6
-rw-r--r--arch/x86/xen/enlighten.c20
-rw-r--r--arch/x86/xen/mmu.c399
-rw-r--r--arch/x86/xen/p2m.c43
-rw-r--r--arch/x86/xen/p2m.h15
-rw-r--r--arch/x86/xen/platform-pci-unplug.c2
-rw-r--r--arch/x86/xen/pmu.c570
-rw-r--r--arch/x86/xen/pmu.h15
-rw-r--r--arch/x86/xen/setup.c496
-rw-r--r--arch/x86/xen/smp.c29
-rw-r--r--arch/x86/xen/suspend.c23
-rw-r--r--arch/x86/xen/xen-head.S2
-rw-r--r--arch/x86/xen/xen-ops.h7
15 files changed, 1435 insertions, 215 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 484145368a24..c7b15f3e2cf3 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -7,6 +7,7 @@ config XEN
depends on PARAVIRT
select PARAVIRT_CLOCK
select XEN_HAVE_PVMMU
+ select XEN_HAVE_VPMU
depends on X86_64 || (X86_32 && X86_PAE)
depends on X86_LOCAL_APIC && X86_TSC
help
@@ -23,14 +24,18 @@ config XEN_PVHVM
def_bool y
depends on XEN && PCI && X86_LOCAL_APIC
-config XEN_MAX_DOMAIN_MEMORY
- int
- default 500 if X86_64
- default 64 if X86_32
- depends on XEN
- help
- This only affects the sizing of some bss arrays, the unused
- portions of which are freed.
+config XEN_512GB
+ bool "Limit Xen pv-domain memory to 512GB"
+ depends on XEN && X86_64
+ default y
+ help
+ Limit paravirtualized user domains to 512GB of RAM.
+
+ The Xen tools and crash dump analysis tools might not support
+ pv-domains with more than 512 GB of RAM. This option controls the
+ default setting of the kernel to use only up to 512 GB or more.
+ It is always possible to change the default via specifying the
+ boot parameter "xen_512gb_limit".
config XEN_SAVE_RESTORE
bool
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 4b6e29ac0968..e47e52787d32 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -13,7 +13,7 @@ CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o platform-pci-unplug.o \
- p2m.o apic.o
+ p2m.o apic.o pmu.o
obj-$(CONFIG_EVENT_TRACING) += trace.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 70e060ad879a..acda713ab5be 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -7,6 +7,7 @@
#include <xen/xen.h>
#include <xen/interface/physdev.h>
#include "xen-ops.h"
+#include "pmu.h"
#include "smp.h"
static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
@@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg)
static void xen_apic_write(u32 reg, u32 val)
{
+ if (reg == APIC_LVTPC) {
+ (void)pmu_apic_update(reg);
+ return;
+ }
+
/* Warn to see if there's any stray references */
WARN(1,"register: %x, value: %x\n", reg, val);
}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index d9cfa452da9d..30d12afe52ed 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -84,6 +84,7 @@
#include "mmu.h"
#include "smp.h"
#include "multicalls.h"
+#include "pmu.h"
EXPORT_SYMBOL_GPL(hypercall_page);
@@ -1010,8 +1011,7 @@ static void xen_write_cr0(unsigned long cr0)
static void xen_write_cr4(unsigned long cr4)
{
- cr4 &= ~X86_CR4_PGE;
- cr4 &= ~X86_CR4_PSE;
+ cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
native_write_cr4(cr4);
}
@@ -1030,6 +1030,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
{
u64 val;
+ if (pmu_msr_read(msr, &val, err))
+ return val;
+
val = native_read_msr_safe(msr, err);
switch (msr) {
case MSR_IA32_APICBASE:
@@ -1076,7 +1079,8 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
Xen console noise. */
default:
- ret = native_write_msr_safe(msr, low, high);
+ if (!pmu_msr_write(msr, low, high, &ret))
+ ret = native_write_msr_safe(msr, low, high);
}
return ret;
@@ -1215,7 +1219,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_msr = xen_read_msr_safe,
.write_msr = xen_write_msr_safe,
- .read_pmc = native_read_pmc,
+ .read_pmc = xen_read_pmc,
.iret = xen_iret,
#ifdef CONFIG_X86_64
@@ -1264,6 +1268,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = {
static void xen_reboot(int reason)
{
struct sched_shutdown r = { .reason = reason };
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ xen_pmu_finish(cpu);
if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
BUG();
@@ -1607,7 +1615,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
early_boot_irqs_disabled = true;
xen_raw_console_write("mapping kernel into physical memory\n");
- xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages);
+ xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
+ xen_start_info->nr_pages);
+ xen_reserve_special_pages();
/*
* Modify the cache mode translation tables to match Xen's PAT
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index dd151b2045b0..2c50b445884e 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
+static phys_addr_t xen_pt_base, xen_pt_size __initdata;
/*
* Just beyond the highest usermode address. STACK_TOP_MAX has a
@@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm)
static void xen_post_allocator_init(void);
+static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+ struct mmuext_op op;
+
+ op.cmd = cmd;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+}
+
#ifdef CONFIG_X86_64
static void __init xen_cleanhighmap(unsigned long vaddr,
unsigned long vaddr_end)
@@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
xen_mc_flush();
}
+/*
+ * Make a page range writeable and free it.
+ */
+static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
+{
+ void *vaddr = __va(paddr);
+ void *vaddr_end = vaddr + size;
+
+ for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
+ make_lowmem_page_readwrite(vaddr);
+
+ memblock_free(paddr, size);
+}
+
+static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
+{
+ unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
+
+ if (unpin)
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
+ ClearPagePinned(virt_to_page(__va(pa)));
+ xen_free_ro_pages(pa, PAGE_SIZE);
+}
+
+/*
+ * Since it is well isolated we can (and since it is perhaps large we should)
+ * also free the page tables mapping the initial P->M table.
+ */
+static void __init xen_cleanmfnmap(unsigned long vaddr)
+{
+ unsigned long va = vaddr & PMD_MASK;
+ unsigned long pa;
+ pgd_t *pgd = pgd_offset_k(va);
+ pud_t *pud_page = pud_offset(pgd, 0);
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned int i;
+ bool unpin;
+
+ unpin = (vaddr == 2 * PGDIR_SIZE);
+ set_pgd(pgd, __pgd(0));
+ do {
+ pud = pud_page + pud_index(va);
+ if (pud_none(*pud)) {
+ va += PUD_SIZE;
+ } else if (pud_large(*pud)) {
+ pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+ xen_free_ro_pages(pa, PUD_SIZE);
+ va += PUD_SIZE;
+ } else {
+ pmd = pmd_offset(pud, va);
+ if (pmd_large(*pmd)) {
+ pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+ xen_free_ro_pages(pa, PMD_SIZE);
+ } else if (!pmd_none(*pmd)) {
+ pte = pte_offset_kernel(pmd, va);
+ set_pmd(pmd, __pmd(0));
+ for (i = 0; i < PTRS_PER_PTE; ++i) {
+ if (pte_none(pte[i]))
+ break;
+ pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+ xen_free_ro_pages(pa, PAGE_SIZE);
+ }
+ xen_cleanmfnmap_free_pgtbl(pte, unpin);
+ }
+ va += PMD_SIZE;
+ if (pmd_index(va))
+ continue;
+ set_pud(pud, __pud(0));
+ xen_cleanmfnmap_free_pgtbl(pmd, unpin);
+ }
+
+ } while (pud_index(va) || pmd_index(va));
+ xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
+}
+
static void __init xen_pagetable_p2m_free(void)
{
unsigned long size;
@@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void)
/* using __ka address and sticking INVALID_P2M_ENTRY! */
memset((void *)xen_start_info->mfn_list, 0xff, size);
- /* We should be in __ka space. */
- BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
addr = xen_start_info->mfn_list;
- /* We roundup to the PMD, which means that if anybody at this stage is
- * using the __ka address of xen_start_info or xen_start_info->shared_info
- * they are in going to crash. Fortunatly we have already revectored
- * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
+ /*
+ * We could be in __ka space.
+ * We roundup to the PMD, which means that if anybody at this stage is
+ * using the __ka address of xen_start_info or
+ * xen_start_info->shared_info they are in going to crash. Fortunatly
+ * we have already revectored in xen_setup_kernel_pagetable and in
+ * xen_setup_shared_info.
+ */
size = roundup(size, PMD_SIZE);
- xen_cleanhighmap(addr, addr + size);
- size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
- memblock_free(__pa(xen_start_info->mfn_list), size);
+ if (addr >= __START_KERNEL_map) {
+ xen_cleanhighmap(addr, addr + size);
+ size = PAGE_ALIGN(xen_start_info->nr_pages *
+ sizeof(unsigned long));
+ memblock_free(__pa(addr), size);
+ } else {
+ xen_cleanmfnmap(addr);
+ }
+}
+
+static void __init xen_pagetable_cleanhighmap(void)
+{
+ unsigned long size;
+ unsigned long addr;
/* At this stage, cleanup_highmap has already cleaned __ka space
* from _brk_limit way up to the max_pfn_mapped (which is the end of
@@ -1172,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void)
#ifdef CONFIG_X86_64
xen_pagetable_p2m_free();
+
+ xen_pagetable_cleanhighmap();
#endif
/* And revector! Bye bye old array */
xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
@@ -1461,6 +1564,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
#else /* CONFIG_X86_64 */
static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
{
+ unsigned long pfn;
+
+ if (xen_feature(XENFEAT_writable_page_tables) ||
+ xen_feature(XENFEAT_auto_translated_physmap) ||
+ xen_start_info->mfn_list >= __START_KERNEL_map)
+ return pte;
+
+ /*
+ * Pages belonging to the initial p2m list mapped outside the default
+ * address range must be mapped read-only. This region contains the
+ * page tables for mapping the p2m list, too, and page tables MUST be
+ * mapped read-only.
+ */
+ pfn = pte_pfn(pte);
+ if (pfn >= xen_start_info->first_p2m_pfn &&
+ pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
+ pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
+
return pte;
}
#endif /* CONFIG_X86_64 */
@@ -1489,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
native_set_pte(ptep, pte);
}
-static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
-{
- struct mmuext_op op;
- op.cmd = cmd;
- op.arg1.mfn = pfn_to_mfn(pfn);
- if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
- BUG();
-}
-
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
@@ -1815,7 +1927,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
* mappings. Considering that on Xen after the kernel mappings we
* have the mappings of some pages that don't exist in pfn space, we
* set max_pfn_mapped to the last real pfn mapped. */
- max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
+ if (xen_start_info->mfn_list < __START_KERNEL_map)
+ max_pfn_mapped = xen_start_info->first_p2m_pfn;
+ else
+ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
pt_end = pt_base + xen_start_info->nr_pt_frames;
@@ -1855,6 +1970,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Graft it onto L4[511][510] */
copy_page(level2_kernel_pgt, l2);
+ /* Copy the initial P->M table mappings if necessary. */
+ i = pgd_index(xen_start_info->mfn_list);
+ if (i && i < pgd_index(__START_KERNEL_map))
+ init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
+
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
/* Make pagetable pieces RO */
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
@@ -1894,10 +2014,192 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
check_pt_base(&pt_base, &pt_end, addr[i]);
/* Our (by three pages) smaller Xen pagetable that we are using */
- memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
+ xen_pt_base = PFN_PHYS(pt_base);
+ xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
+ memblock_reserve(xen_pt_base, xen_pt_size);
+
/* Revector the xen_start_info */
xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
}
+
+/*
+ * Read a value from a physical address.
+ */
+static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
+{
+ unsigned long *vaddr;
+ unsigned long val;
+
+ vaddr = early_memremap_ro(addr, sizeof(val));
+ val = *vaddr;
+ early_memunmap(vaddr, sizeof(val));
+ return val;
+}
+
+/*
+ * Translate a virtual address to a physical one without relying on mapped
+ * page tables.
+ */
+static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
+{
+ phys_addr_t pa;
+ pgd_t pgd;
+ pud_t pud;
+ pmd_t pmd;
+ pte_t pte;
+
+ pa = read_cr3();
+ pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
+ sizeof(pgd)));
+ if (!pgd_present(pgd))
+ return 0;
+
+ pa = pgd_val(pgd) & PTE_PFN_MASK;
+ pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
+ sizeof(pud)));
+ if (!pud_present(pud))
+ return 0;
+ pa = pud_pfn(pud) << PAGE_SHIFT;
+ if (pud_large(pud))
+ return pa + (vaddr & ~PUD_MASK);
+
+ pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
+ sizeof(pmd)));
+ if (!pmd_present(pmd))
+ return 0;
+ pa = pmd_pfn(pmd) << PAGE_SHIFT;
+ if (pmd_large(pmd))
+ return pa + (vaddr & ~PMD_MASK);
+
+ pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
+ sizeof(pte)));
+ if (!pte_present(pte))
+ return 0;
+ pa = pte_pfn(pte) << PAGE_SHIFT;
+
+ return pa | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
+ * this area.
+ */
+void __init xen_relocate_p2m(void)
+{
+ phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
+ unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
+ int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
+ pte_t *pt;
+ pmd_t *pmd;
+ pud_t *pud;
+ pgd_t *pgd;
+ unsigned long *new_p2m;
+
+ size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+ n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
+ n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
+ n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
+ n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
+ n_frames = n_pte + n_pt + n_pmd + n_pud;
+
+ new_area = xen_find_free_area(PFN_PHYS(n_frames));
+ if (!new_area) {
+ xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
+ BUG();
+ }
+
+ /*
+ * Setup the page tables for addressing the new p2m list.
+ * We have asked the hypervisor to map the p2m list at the user address
+ * PUD_SIZE. It may have done so, or it may have used a kernel space
+ * address depending on the Xen version.
+ * To avoid any possible virtual address collision, just use
+ * 2 * PUD_SIZE for the new area.
+ */
+ pud_phys = new_area;
+ pmd_phys = pud_phys + PFN_PHYS(n_pud);
+ pt_phys = pmd_phys + PFN_PHYS(n_pmd);
+ p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
+
+ pgd = __va(read_cr3());
+ new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
+ for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
+ pud = early_memremap(pud_phys, PAGE_SIZE);
+ clear_page(pud);
+ for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
+ idx_pmd++) {
+ pmd = early_memremap(pmd_phys, PAGE_SIZE);
+ clear_page(pmd);
+ for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
+ idx_pt++) {
+ pt = early_memremap(pt_phys, PAGE_SIZE);
+ clear_page(pt);
+ for (idx_pte = 0;
+ idx_pte < min(n_pte, PTRS_PER_PTE);
+ idx_pte++) {
+ set_pte(pt + idx_pte,
+ pfn_pte(p2m_pfn, PAGE_KERNEL));
+ p2m_pfn++;
+ }
+ n_pte -= PTRS_PER_PTE;
+ early_memunmap(pt, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pt_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
+ PFN_DOWN(pt_phys));
+ set_pmd(pmd + idx_pt,
+ __pmd(_PAGE_TABLE | pt_phys));
+ pt_phys += PAGE_SIZE;
+ }
+ n_pt -= PTRS_PER_PMD;
+ early_memunmap(pmd, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pmd_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
+ PFN_DOWN(pmd_phys));
+ set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
+ pmd_phys += PAGE_SIZE;
+ }
+ n_pmd -= PTRS_PER_PUD;
+ early_memunmap(pud, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pud_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
+ set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
+ pud_phys += PAGE_SIZE;
+ }
+
+ /* Now copy the old p2m info to the new area. */
+ memcpy(new_p2m, xen_p2m_addr, size);
+ xen_p2m_addr = new_p2m;
+
+ /* Release the old p2m list and set new list info. */
+ p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
+ BUG_ON(!p2m_pfn);
+ p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
+
+ if (xen_start_info->mfn_list < __START_KERNEL_map) {
+ pfn = xen_start_info->first_p2m_pfn;
+ pfn_end = xen_start_info->first_p2m_pfn +
+ xen_start_info->nr_p2m_frames;
+ set_pgd(pgd + 1, __pgd(0));
+ } else {
+ pfn = p2m_pfn;
+ pfn_end = p2m_pfn_end;
+ }
+
+ memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
+ while (pfn < pfn_end) {
+ if (pfn == p2m_pfn) {
+ pfn = p2m_pfn_end;
+ continue;
+ }
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+ pfn++;
+ }
+
+ xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
+ xen_start_info->first_p2m_pfn = PFN_DOWN(new_area);
+ xen_start_info->nr_p2m_frames = n_frames;
+}
+
#else /* !CONFIG_X86_64 */
static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
@@ -1938,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3)
pv_mmu_ops.write_cr3 = &xen_write_cr3;
}
+/*
+ * For 32 bit domains xen_start_info->pt_base is the pgd address which might be
+ * not the first page table in the page table pool.
+ * Iterate through the initial page tables to find the real page table base.
+ */
+static phys_addr_t xen_find_pt_base(pmd_t *pmd)
+{
+ phys_addr_t pt_base, paddr;
+ unsigned pmdidx;
+
+ pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
+
+ for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
+ if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
+ paddr = m2p(pmd[pmdidx].pmd);
+ pt_base = min(pt_base, paddr);
+ }
+
+ return pt_base;
+}
+
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
pmd_t *kernel_pmd;
+ kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+
+ xen_pt_base = xen_find_pt_base(kernel_pmd);
+ xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
+
initial_kernel_pmd =
extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
- max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
- xen_start_info->nr_pt_frames * PAGE_SIZE +
- 512*1024);
+ max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
- kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
copy_page(initial_kernel_pmd, kernel_pmd);
xen_map_identity_early(initial_kernel_pmd, max_pfn);
@@ -1968,11 +2293,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
PFN_DOWN(__pa(initial_page_table)));
xen_write_cr3(__pa(initial_page_table));
- memblock_reserve(__pa(xen_start_info->pt_base),
- xen_start_info->nr_pt_frames * PAGE_SIZE);
+ memblock_reserve(xen_pt_base, xen_pt_size);
}
#endif /* CONFIG_X86_64 */
+void __init xen_reserve_special_pages(void)
+{
+ phys_addr_t paddr;
+
+ memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
+ if (xen_start_info->store_mfn) {
+ paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
+ memblock_reserve(paddr, PAGE_SIZE);
+ }
+ if (!xen_initial_domain()) {
+ paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
+ memblock_reserve(paddr, PAGE_SIZE);
+ }
+}
+
+void __init xen_pt_check_e820(void)
+{
+ if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
+ xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
+ BUG();
+ }
+}
+
static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 8b7f18e200aa..bfc08b13044b 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -79,10 +79,14 @@
#include <xen/balloon.h>
#include <xen/grant_table.h>
-#include "p2m.h"
#include "multicalls.h"
#include "xen-ops.h"
+#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
#define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE)
unsigned long *xen_p2m_addr __read_mostly;
@@ -199,7 +203,8 @@ void __ref xen_build_mfn_list_list(void)
unsigned int level, topidx, mididx;
unsigned long *mid_mfn_p;
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (xen_feature(XENFEAT_auto_translated_physmap) ||
+ xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
return;
/* Pre-initialize p2m_top_mfn to be completely missing */
@@ -260,9 +265,16 @@ void xen_setup_mfn_list_list(void)
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- virt_to_mfn(p2m_top_mfn);
+ if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL;
+ else
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ virt_to_mfn(p2m_top_mfn);
HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
+ HYPERVISOR_shared_info->arch.p2m_generation = 0;
+ HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr;
+ HYPERVISOR_shared_info->arch.p2m_cr3 =
+ xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
}
/* Set up p2m_top to point to the domain-builder provided p2m pages */
@@ -478,8 +490,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
ptechk = lookup_address(vaddr, &level);
if (ptechk == pte_pg) {
+ HYPERVISOR_shared_info->arch.p2m_generation++;
+ wmb(); /* Tools are synchronizing via p2m_generation. */
set_pmd(pmdp,
__pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
+ wmb(); /* Tools are synchronizing via p2m_generation. */
+ HYPERVISOR_shared_info->arch.p2m_generation++;
pte_newpg[i] = NULL;
}
@@ -505,7 +521,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
*/
static bool alloc_p2m(unsigned long pfn)
{
- unsigned topidx, mididx;
+ unsigned topidx;
unsigned long *top_mfn_p, *mid_mfn;
pte_t *ptep, *pte_pg;
unsigned int level;
@@ -513,9 +529,6 @@ static bool alloc_p2m(unsigned long pfn)
unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
unsigned long p2m_pfn;
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
-
ptep = lookup_address(addr, &level);
BUG_ON(!ptep || level != PG_LEVEL_4K);
pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
@@ -527,7 +540,8 @@ static bool alloc_p2m(unsigned long pfn)
return false;
}
- if (p2m_top_mfn) {
+ if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
+ topidx = p2m_top_index(pfn);
top_mfn_p = &p2m_top_mfn[topidx];
mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
@@ -577,10 +591,14 @@ static bool alloc_p2m(unsigned long pfn)
spin_lock_irqsave(&p2m_update_lock, flags);
if (pte_pfn(*ptep) == p2m_pfn) {
+ HYPERVISOR_shared_info->arch.p2m_generation++;
+ wmb(); /* Tools are synchronizing via p2m_generation. */
set_pte(ptep,
pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
+ wmb(); /* Tools are synchronizing via p2m_generation. */
+ HYPERVISOR_shared_info->arch.p2m_generation++;
if (mid_mfn)
- mid_mfn[mididx] = virt_to_mfn(p2m);
+ mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m);
p2m = NULL;
}
@@ -630,6 +648,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
return true;
}
+ /*
+ * The interface requires atomic updates on p2m elements.
+ * xen_safe_write_ulong() is using __put_user which does an atomic
+ * store via asm().
+ */
if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
return true;
diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h
deleted file mode 100644
index ad8aee24ab72..000000000000
--- a/arch/x86/xen/p2m.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _XEN_P2M_H
-#define _XEN_P2M_H
-
-#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
-#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
-#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
-
-#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
-
-#define MAX_REMAP_RANGES 10
-
-extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
- unsigned long pfn_e);
-
-#endif /* _XEN_P2M_H */
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index a8261716d58d..9586ff32810c 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -68,7 +68,7 @@ static int check_platform_magic(void)
return 0;
}
-bool xen_has_pv_devices()
+bool xen_has_pv_devices(void)
{
if (!xen_domain())
return false;
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
new file mode 100644
index 000000000000..724a08740a04
--- /dev/null
+++ b/arch/x86/xen/pmu.c
@@ -0,0 +1,570 @@
+#include <linux/types.h>
+#include <linux/interrupt.h>
+
+#include <asm/xen/hypercall.h>
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/xenpmu.h>
+
+#include "xen-ops.h"
+#include "pmu.h"
+
+/* x86_pmu.handle_irq definition */
+#include "../kernel/cpu/perf_event.h"
+
+#define XENPMU_IRQ_PROCESSING 1
+struct xenpmu {
+ /* Shared page between hypervisor and domain */
+ struct xen_pmu_data *xenpmu_data;
+
+ uint8_t flags;
+};
+static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared);
+#define get_xenpmu_data() (this_cpu_ptr(&xenpmu_shared)->xenpmu_data)
+#define get_xenpmu_flags() (this_cpu_ptr(&xenpmu_shared)->flags)
+
+/* Macro for computing address of a PMU MSR bank */
+#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \
+ (uintptr_t)ctxt->field))
+
+/* AMD PMU */
+#define F15H_NUM_COUNTERS 6
+#define F10H_NUM_COUNTERS 4
+
+static __read_mostly uint32_t amd_counters_base;
+static __read_mostly uint32_t amd_ctrls_base;
+static __read_mostly int amd_msr_step;
+static __read_mostly int k7_counters_mirrored;
+static __read_mostly int amd_num_counters;
+
+/* Intel PMU */
+#define MSR_TYPE_COUNTER 0
+#define MSR_TYPE_CTRL 1
+#define MSR_TYPE_GLOBAL 2
+#define MSR_TYPE_ARCH_COUNTER 3
+#define MSR_TYPE_ARCH_CTRL 4
+
+/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */
+#define PMU_GENERAL_NR_SHIFT 8
+#define PMU_GENERAL_NR_BITS 8
+#define PMU_GENERAL_NR_MASK (((1 << PMU_GENERAL_NR_BITS) - 1) \
+ << PMU_GENERAL_NR_SHIFT)
+
+/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */
+#define PMU_FIXED_NR_SHIFT 0
+#define PMU_FIXED_NR_BITS 5
+#define PMU_FIXED_NR_MASK (((1 << PMU_FIXED_NR_BITS) - 1) \
+ << PMU_FIXED_NR_SHIFT)
+
+/* Alias registers (0x4c1) for full-width writes to PMCs */
+#define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0))
+
+#define INTEL_PMC_TYPE_SHIFT 30
+
+static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters;
+
+
+static void xen_pmu_arch_init(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+
+ switch (boot_cpu_data.x86) {
+ case 0x15:
+ amd_num_counters = F15H_NUM_COUNTERS;
+ amd_counters_base = MSR_F15H_PERF_CTR;
+ amd_ctrls_base = MSR_F15H_PERF_CTL;
+ amd_msr_step = 2;
+ k7_counters_mirrored = 1;
+ break;
+ case 0x10:
+ case 0x12:
+ case 0x14:
+ case 0x16:
+ default:
+ amd_num_counters = F10H_NUM_COUNTERS;
+ amd_counters_base = MSR_K7_PERFCTR0;
+ amd_ctrls_base = MSR_K7_EVNTSEL0;
+ amd_msr_step = 1;
+ k7_counters_mirrored = 0;