// SPDX-License-Identifier: MIT
/*
* Copyright © 2020 Intel Corporation
*/
#include "i915_drv.h"
#include "intel_context.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gtt.h"
#include "intel_migrate.h"
#include "intel_ring.h"
#include "gem/i915_gem_lmem.h"
/*
 * Cursor handed, through the opaque data pointer, to the PTE-insertion
 * callbacks in this file (insert_pte, xehpsdv_insert_pte and
 * xehpsdv_toggle_pdes).
 *
 * offset: value passed as the offset argument of vm->insert_page() by the
 * next callback invocation; each callback advances it by its own stride
 * (PAGE_SIZE, SZ_64K or SZ_2M respectively) after inserting.
 */
struct insert_pte_data {
u64 offset;
};
/*
 * Size of each fixed window in the migration VM (see the layout
 * description in migrate_vm()).
 */
#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
/*
 * Evaluates to size / NUM_BYTES_PER_CCS_BYTE, rounded up, when
 * HAS_FLAT_CCS(i915) is true, and to 0 otherwise.
 */
#define GET_CCS_BYTES(i915, size) (HAS_FLAT_CCS(i915) ? \
DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE) : 0)
/*
 * Report whether @engine can be used for migration.
 *
 * Returns false when @engine is NULL and true otherwise. A non-NULL
 * engine is asserted (GEM_BUG_ON) to belong to COPY_ENGINE_CLASS.
 */
static bool engine_supports_migration(struct intel_engine_cs *engine)
{
	if (engine) {
		/*
		 * Migration relies on three capabilities: suppressing
		 * arbitration (MI_ARB_ON_OFF), writing PTEs from inline
		 * data (MI_STORE_DATA), and performing the block
		 * transfer itself (blits).
		 */
		GEM_BUG_ON(engine->class != COPY_ENGINE_CLASS);
		return true;
	}

	return false;
}
/*
 * Insert a dummy LMEM PTE for page table @pt.
 *
 * @data points to a struct insert_pte_data; its offset is used for the
 * insertion and is then advanced by SZ_2M. @pt is asserted (GEM_BUG_ON)
 * to be compact once the page has been inserted.
 */
static void xehpsdv_toggle_pdes(struct i915_address_space *vm,
				struct i915_page_table *pt,
				void *data)
{
	struct insert_pte_data *cursor = data;
	const unsigned int pat_index =
		i915_gem_get_pat_index(vm->i915, I915_CACHE_NONE);

	/*
	 * Every PT that is going to map LMEM gets a dummy PTE, so that the
	 * PDE structure is correctly set up for later use.
	 */
	vm->insert_page(vm, 0, cursor->offset, pat_index, PTE_LM);

	GEM_BUG_ON(!pt->is_compact);

	cursor->offset += SZ_2M;
}
/*
 * Map the backing page of page table @pt into @vm as an LMEM page.
 *
 * @data points to a struct insert_pte_data; its offset is used for the
 * insertion and is then advanced by SZ_64K.
 */
static void xehpsdv_insert_pte(struct i915_address_space *vm,
			       struct i915_page_table *pt,
			       void *data)
{
	struct insert_pte_data *cursor = data;
	const unsigned int pat_index =
		i915_gem_get_pat_index(vm->i915, I915_CACHE_NONE);

	/*
	 * A bit of trickery: as far as the hw is concerned the pt is only
	 * 256 bytes (32 entries) or 4096 bytes (512 entries). The physical
	 * alignment underneath the pt is nevertheless guaranteed to be 64K,
	 * and we take care never to touch the unused space beyond the pt.
	 */
	vm->insert_page(vm, px_dma(pt), cursor->offset, pat_index, PTE_LM);

	cursor->offset += SZ_64K;
}
/*
 * Map the backing page of page table @pt into @vm.
 *
 * The page is flagged PTE_LM when the object backing @pt is in lmem, and
 * inserted with no flags otherwise. @data points to a struct
 * insert_pte_data; its offset is used for the insertion and is then
 * advanced by PAGE_SIZE.
 */
static void insert_pte(struct i915_address_space *vm,
		       struct i915_page_table *pt,
		       void *data)
{
	struct insert_pte_data *cursor = data;
	const unsigned int pat_index =
		i915_gem_get_pat_index(vm->i915, I915_CACHE_NONE);
	unsigned int flags = 0;

	if (i915_gem_object_is_lmem(pt->base))
		flags = PTE_LM;

	vm->insert_page(vm, px_dma(pt), cursor->offset, pat_index, flags);

	cursor->offset += PAGE_SIZE;
}
static struct i915_address_space *migrate_vm(struct intel_gt *gt)
{
struct i915_vm_pt_stash stash = {};
struct i915_ppgtt *vm;
int err;
int i;
/*
* We construct a very special VM for use by all migration contexts,
* it is kept pinned so that it can be used at any time. As we need
* to pre-allocate the page directories for the migration VM, this
* limits us to only using a small number of prepared vma.
*
* To be able to pipeline and reschedule migration operations while
* avoiding unnecessary contention on the vm itself, the PTE updates
* are inline with the blits. All the blits use the same fixed
* addresses, with the backing store redirection being updated on the
* fly. Only 2 implicit vma are used for all migration operations.
*
* We lay the ppGTT out as:
*
* [0, CHUNK_SZ) -> first object
* [CHUNK_SZ, 2 * CHUNK_SZ) -> second object
* [2 * CHUNK_SZ, 2 * CHUNK_SZ + 2 * CHUNK_SZ >> 9] -> PTE
*
* By exposing the dma addresses of the page directories themselves
* within the ppGTT, we are then able to rewrite the PTE prior to use.
* But the PTE update and subsequent migration operation must be atomic,
* i.e. within the same non-preemptible window so that we do not switch
* to another migration context that overwrites the PTE.
*
* This changes quite a bit on platforms with HAS_64K_PAGES support,
* where we instead have three windows, each CHUNK_SZ in size. The
* first is reserved for mapping system-memory, and that just uses the
* 512 entry layout using 4K GTT pages. The other two windows just map
* lmem pages and must use the new compact 32 entry layout using 64K GTT
* pages, which ensures we can address any lmem object that the user
* throws at us. We then also use the xehpsdv_toggle_pdes as a way of
* just toggling the PDE bit(GEN12_PDE_64K) for us, to enable the
* compact layout for each of these page-tables, that fall within the
* [CHUNK_SZ, 3 * CHUNK_SZ) range.
*
* We lay the ppGTT out as:
*
* [0, CHUNK_SZ) -> first window/object, maps smem
* [CHUNK_SZ, 2 * CHUNK_SZ) -> second window/object, maps lmem src
* [2 * CHUNK_SZ, 3 * CHUNK_SZ) -> third window/object, maps lmem dst
*
* For the PTE window it's also quite different, since each PTE must
* point to some 64K page, one for each PT(since it's in lmem), and yet
* each is only <= 4096bytes, but since the unused space within that PTE
* range is never touched, this should be fine.
*
* So basically each PT now needs 64K of virtual memory, instead of 4K,
* which looks like:
*
* [3 * CHUNK_SZ, 3 * CHUNK_SZ + ((3 * CHUNK_SZ / SZ_2M) * SZ_64K)] -> PTE
*/
vm = i915_ppgtt_create(gt, I915_BO_ALLOC_PM_EARLY);
if (IS_ERR(vm))
return ERR_CAST(vm);
if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
err = -ENODEV;
goto err_vm;
}
if (HAS_64K_PAGES(gt->i915))
stash.pt_sz = I915_GTT_PAGE_SIZE_64K;
/*
* Each engine instance is assigned its own chunk in the VM, so
* that we can run multiple instances concurrently
*/
for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
struct intel_engine_cs *engine;
u64 base = (u64)i << 32;
struct insert_pte_data d = {};
struct i915_gem_ww_ctx ww;
u64 sz;
engine = gt->engine_class[COPY_ENGINE_CLASS][i];
if (!engine_supports_migration(engine