Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_svm.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 3085
1 file changed, 3085 insertions, 0 deletions
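The new file below tracks shared virtual memory ranges as inclusive page-number intervals [start, last] (byte addresses shifted right by PAGE_SHIFT) and trims a faulting range to one granularity-aligned block around the fault address (see svm_range_new() and svm_range_split_by_granularity() in the diff). As a rough, user-space model of that arithmetic only — every name and value here is illustrative and not part of the patch:

/*
 * Hypothetical sketch of the page-index and granularity-alignment
 * arithmetic used by the SVM range code in the patch below.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

/* Round down/up to a power-of-two boundary, as the kernel ALIGN macros do. */
static uint64_t align_down(uint64_t x, uint64_t a) { return x & ~(a - 1); }
static uint64_t align_up(uint64_t x, uint64_t a)   { return (x + a - 1) & ~(a - 1); }

int main(void)
{
	uint64_t addr = 0x7f12345678;              /* example faulting byte address */
	uint64_t fault_page = addr >> PAGE_SHIFT;  /* page index, like prange->start/last */
	unsigned granularity = 9;                  /* default set by svm_range_set_default_attributes() */
	uint64_t size = 1ULL << granularity;       /* block size in pages */

	/* Same alignment as svm_range_split_by_granularity(): one aligned block. */
	uint64_t start = align_down(fault_page, size);
	uint64_t last  = align_up(fault_page + 1, size) - 1;

	printf("fault page 0x%llx -> aligned block [0x%llx 0x%llx], %llu pages\n",
	       (unsigned long long)fault_page,
	       (unsigned long long)start, (unsigned long long)last,
	       (unsigned long long)(last - start + 1));
	return 0;
}

With 4 KiB pages and the default granularity of 9, each block covers 512 pages (2 MiB), so the split-by-granularity path in the patch operates on a whole 2 MiB block around the fault rather than a single page.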
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
new file mode 100644
index 000000000000..b665e9ff77e3
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -0,0 +1,3085 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright 2020-2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <linux/sched/task.h>
+#include "amdgpu_sync.h"
+#include "amdgpu_object.h"
+#include "amdgpu_vm.h"
+#include "amdgpu_mn.h"
+#include "amdgpu.h"
+#include "amdgpu_xgmi.h"
+#include "kfd_priv.h"
+#include "kfd_svm.h"
+#include "kfd_migrate.h"
+
+#define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1
+
+/* Long enough to ensure no retry fault comes after svm range is restored and
+ * page table is updated.
+ */
+#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING 2000
+
+static void svm_range_evict_svm_bo_worker(struct work_struct *work);
+static bool
+svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
+				    const struct mmu_notifier_range *range,
+				    unsigned long cur_seq);
+
+static const struct mmu_interval_notifier_ops svm_range_mn_ops = {
+	.invalidate = svm_range_cpu_invalidate_pagetables,
+};
+
+/**
+ * svm_range_unlink - unlink svm_range from lists and interval tree
+ * @prange: svm range structure to be removed
+ *
+ * Remove the svm_range from the svms and svm_bo lists and the svms
+ * interval tree.
+ * + * Context: The caller must hold svms->lock + */ +static void svm_range_unlink(struct svm_range *prange) +{ + pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, + prange, prange->start, prange->last); + + if (prange->svm_bo) { + spin_lock(&prange->svm_bo->list_lock); + list_del(&prange->svm_bo_list); + spin_unlock(&prange->svm_bo->list_lock); + } + + list_del(&prange->list); + if (prange->it_node.start != 0 && prange->it_node.last != 0) + interval_tree_remove(&prange->it_node, &prange->svms->objects); +} + +static void +svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange) +{ + pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, + prange, prange->start, prange->last); + + mmu_interval_notifier_insert_locked(&prange->notifier, mm, + prange->start << PAGE_SHIFT, + prange->npages << PAGE_SHIFT, + &svm_range_mn_ops); +} + +/** + * svm_range_add_to_svms - add svm range to svms + * @prange: svm range structure to be added + * + * Add the svm range to svms interval tree and link list + * + * Context: The caller must hold svms->lock + */ +static void svm_range_add_to_svms(struct svm_range *prange) +{ + pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, + prange, prange->start, prange->last); + + list_add_tail(&prange->list, &prange->svms->list); + prange->it_node.start = prange->start; + prange->it_node.last = prange->last; + interval_tree_insert(&prange->it_node, &prange->svms->objects); +} + +static void svm_range_remove_notifier(struct svm_range *prange) +{ + pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", + prange->svms, prange, + prange->notifier.interval_tree.start >> PAGE_SHIFT, + prange->notifier.interval_tree.last >> PAGE_SHIFT); + + if (prange->notifier.interval_tree.start != 0 && + prange->notifier.interval_tree.last != 0) + mmu_interval_notifier_remove(&prange->notifier); +} + +static int +svm_range_dma_map_dev(struct device *dev, dma_addr_t **dma_addr, + unsigned long *hmm_pfns, uint64_t npages) +{ + enum dma_data_direction dir = DMA_BIDIRECTIONAL; + dma_addr_t *addr = *dma_addr; + struct page *page; + int i, r; + + if (!addr) { + addr = kvmalloc_array(npages, sizeof(*addr), + GFP_KERNEL | __GFP_ZERO); + if (!addr) + return -ENOMEM; + *dma_addr = addr; + } + + for (i = 0; i < npages; i++) { + if (WARN_ONCE(addr[i] && !dma_mapping_error(dev, addr[i]), + "leaking dma mapping\n")) + dma_unmap_page(dev, addr[i], PAGE_SIZE, dir); + + page = hmm_pfn_to_page(hmm_pfns[i]); + addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir); + r = dma_mapping_error(dev, addr[i]); + if (r) { + pr_debug("failed %d dma_map_page\n", r); + return r; + } + pr_debug("dma mapping 0x%llx for page addr 0x%lx\n", + addr[i] >> PAGE_SHIFT, page_to_pfn(page)); + } + return 0; +} + +static int +svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap, + unsigned long *hmm_pfns) +{ + struct kfd_process *p; + uint32_t gpuidx; + int r; + + p = container_of(prange->svms, struct kfd_process, svms); + + for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { + struct kfd_process_device *pdd; + struct amdgpu_device *adev; + + pr_debug("mapping to gpu idx 0x%x\n", gpuidx); + pdd = kfd_process_device_from_gpuidx(p, gpuidx); + if (!pdd) { + pr_debug("failed to find device idx %d\n", gpuidx); + return -EINVAL; + } + adev = (struct amdgpu_device *)pdd->dev->kgd; + + r = svm_range_dma_map_dev(adev->dev, &prange->dma_addr[gpuidx], + hmm_pfns, prange->npages); + if (r) + break; + } + + return r; +} + +void svm_range_dma_unmap(struct device *dev, 
dma_addr_t *dma_addr, + unsigned long offset, unsigned long npages) +{ + enum dma_data_direction dir = DMA_BIDIRECTIONAL; + int i; + + if (!dma_addr) + return; + + for (i = offset; i < offset + npages; i++) { + if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i])) + continue; + pr_debug("dma unmapping 0x%llx\n", dma_addr[i] >> PAGE_SHIFT); + dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir); + dma_addr[i] = 0; + } +} + +void svm_range_free_dma_mappings(struct svm_range *prange) +{ + struct kfd_process_device *pdd; + dma_addr_t *dma_addr; + struct device *dev; + struct kfd_process *p; + uint32_t gpuidx; + + p = container_of(prange->svms, struct kfd_process, svms); + + for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) { + dma_addr = prange->dma_addr[gpuidx]; + if (!dma_addr) + continue; + + pdd = kfd_process_device_from_gpuidx(p, gpuidx); + if (!pdd) { + pr_debug("failed to find device idx %d\n", gpuidx); + continue; + } + dev = &pdd->dev->pdev->dev; + svm_range_dma_unmap(dev, dma_addr, 0, prange->npages); + kvfree(dma_addr); + prange->dma_addr[gpuidx] = NULL; + } +} + +static void svm_range_free(struct svm_range *prange) +{ + pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange, + prange->start, prange->last); + + svm_range_vram_node_free(prange); + svm_range_free_dma_mappings(prange); + mutex_destroy(&prange->lock); + mutex_destroy(&prange->migrate_mutex); + kfree(prange); +} + +static void +svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc, + uint8_t *granularity, uint32_t *flags) +{ + *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; + *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; + *granularity = 9; + *flags = + KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT; +} + +static struct +svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, + uint64_t last) +{ + uint64_t size = last - start + 1; + struct svm_range *prange; + struct kfd_process *p; + + prange = kzalloc(sizeof(*prange), GFP_KERNEL); + if (!prange) + return NULL; + prange->npages = size; + prange->svms = svms; + prange->start = start; + prange->last = last; + INIT_LIST_HEAD(&prange->list); + INIT_LIST_HEAD(&prange->update_list); + INIT_LIST_HEAD(&prange->remove_list); + INIT_LIST_HEAD(&prange->insert_list); + INIT_LIST_HEAD(&prange->svm_bo_list); + INIT_LIST_HEAD(&prange->deferred_list); + INIT_LIST_HEAD(&prange->child_list); + atomic_set(&prange->invalid, 0); + prange->validate_timestamp = 0; + mutex_init(&prange->migrate_mutex); + mutex_init(&prange->lock); + + p = container_of(svms, struct kfd_process, svms); + if (p->xnack_enabled) + bitmap_fill(prange->bitmap_access, MAX_GPU_INSTANCE); + + svm_range_set_default_attributes(&prange->preferred_loc, + &prange->prefetch_loc, + &prange->granularity, &prange->flags); + + pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last); + + return prange; +} + +static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo) +{ + if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref)) + return false; + + return true; +} + +static struct svm_range_bo *svm_range_bo_ref(struct svm_range_bo *svm_bo) +{ + if (svm_bo) + kref_get(&svm_bo->kref); + + return svm_bo; +} + +static void svm_range_bo_release(struct kref *kref) +{ + struct svm_range_bo *svm_bo; + + svm_bo = container_of(kref, struct svm_range_bo, kref); + spin_lock(&svm_bo->list_lock); + while (!list_empty(&svm_bo->range_list)) { + struct svm_range *prange = + list_first_entry(&svm_bo->range_list, + struct svm_range, svm_bo_list); + /* list_del_init tells a 
concurrent svm_range_vram_node_new when + * it's safe to reuse the svm_bo pointer and svm_bo_list head. + */ + list_del_init(&prange->svm_bo_list); + spin_unlock(&svm_bo->list_lock); + + pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, + prange->start, prange->last); + mutex_lock(&prange->lock); + prange->svm_bo = NULL; + mutex_unlock(&prange->lock); + + spin_lock(&svm_bo->list_lock); + } + spin_unlock(&svm_bo->list_lock); + if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) { + /* We're not in the eviction worker. + * Signal the fence and synchronize with any + * pending eviction work. + */ + dma_fence_signal(&svm_bo->eviction_fence->base); + cancel_work_sync(&svm_bo->eviction_work); + } + dma_fence_put(&svm_bo->eviction_fence->base); + amdgpu_bo_unref(&svm_bo->bo); + kfree(svm_bo); +} + +static void svm_range_bo_unref(struct svm_range_bo *svm_bo) +{ + if (!svm_bo) + return; + + kref_put(&svm_bo->kref, svm_range_bo_release); +} + +static bool +svm_range_validate_svm_bo(struct amdgpu_device *adev, struct svm_range *prange) +{ + struct amdgpu_device *bo_adev; + + mutex_lock(&prange->lock); + if (!prange->svm_bo) { + mutex_unlock(&prange->lock); + return false; + } + if (prange->ttm_res) { + /* We still have a reference, all is well */ + mutex_unlock(&prange->lock); + return true; + } + if (svm_bo_ref_unless_zero(prange->svm_bo)) { + /* + * Migrate from GPU to GPU, remove range from source bo_adev + * svm_bo range list, and return false to allocate svm_bo from + * destination adev. + */ + bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); + if (bo_adev != adev) { + mutex_unlock(&prange->lock); + + spin_lock(&prange->svm_bo->list_lock); + list_del_init(&prange->svm_bo_list); + spin_unlock(&prange->svm_bo->list_lock); + + svm_range_bo_unref(prange->svm_bo); + return false; + } + if (READ_ONCE(prange->svm_bo->evicting)) { + struct dma_fence *f; + struct svm_range_bo *svm_bo; + /* The BO is getting evicted, + * we need to get a new one + */ + mutex_unlock(&prange->lock); + svm_bo = prange->svm_bo; + f = dma_fence_get(&svm_bo->eviction_fence->base); + svm_range_bo_unref(prange->svm_bo); + /* wait for the fence to avoid long spin-loop + * at list_empty_careful + */ + dma_fence_wait(f, false); + dma_fence_put(f); + } else { + /* The BO was still around and we got + * a new reference to it + */ + mutex_unlock(&prange->lock); + pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n", + prange->svms, prange->start, prange->last); + + prange->ttm_res = &prange->svm_bo->bo->tbo.mem; + return true; + } + + } else { + mutex_unlock(&prange->lock); + } + + /* We need a new svm_bo. Spin-loop to wait for concurrent + * svm_range_bo_release to finish removing this range from + * its range list. After this, it is safe to reuse the + * svm_bo pointer and svm_bo_list head. 
+ */ + while (!list_empty_careful(&prange->svm_bo_list)) + ; + + return false; +} + +static struct svm_range_bo *svm_range_bo_new(void) +{ + struct svm_range_bo *svm_bo; + + svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL); + if (!svm_bo) + return NULL; + + kref_init(&svm_bo->kref); + INIT_LIST_HEAD(&svm_bo->range_list); + spin_lock_init(&svm_bo->list_lock); + + return svm_bo; +} + +int +svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange, + bool clear) +{ + struct amdgpu_bo_param bp; + struct svm_range_bo *svm_bo; + struct amdgpu_bo_user *ubo; + struct amdgpu_bo *bo; + struct kfd_process *p; + struct mm_struct *mm; + int r; + + p = container_of(prange->svms, struct kfd_process, svms); + pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms, + prange->start, prange->last); + + if (svm_range_validate_svm_bo(adev, prange)) + return 0; + + svm_bo = svm_range_bo_new(); + if (!svm_bo) { + pr_debug("failed to alloc svm bo\n"); + return -ENOMEM; + } + mm = get_task_mm(p->lead_thread); + if (!mm) { + pr_debug("failed to get mm\n"); + kfree(svm_bo); + return -ESRCH; + } + svm_bo->svms = prange->svms; + svm_bo->eviction_fence = + amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), + mm, + svm_bo); + mmput(mm); + INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker); + svm_bo->evicting = 0; + memset(&bp, 0, sizeof(bp)); + bp.size = prange->npages * PAGE_SIZE; + bp.byte_align = PAGE_SIZE; + bp.domain = AMDGPU_GEM_DOMAIN_VRAM; + bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; + bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0; + bp.flags |= AMDGPU_AMDKFD_CREATE_SVM_BO; + bp.type = ttm_bo_type_device; + bp.resv = NULL; + + r = amdgpu_bo_create_user(adev, &bp, &ubo); + if (r) { + pr_debug("failed %d to create bo\n", r); + goto create_bo_failed; + } + bo = &ubo->bo; + r = amdgpu_bo_reserve(bo, true); + if (r) { + pr_debug("failed %d to reserve bo\n", r); + goto reserve_bo_failed; + } + + r = dma_resv_reserve_shared(bo->tbo.base.resv, 1); + if (r) { + pr_debug("failed %d to reserve bo\n", r); + amdgpu_bo_unreserve(bo); + goto reserve_bo_failed; + } + amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true); + + amdgpu_bo_unreserve(bo); + + svm_bo->bo = bo; + prange->svm_bo = svm_bo; + prange->ttm_res = &bo->tbo.mem; + prange->offset = 0; + + spin_lock(&svm_bo->list_lock); + list_add(&prange->svm_bo_list, &svm_bo->range_list); + spin_unlock(&svm_bo->list_lock); + + return 0; + +reserve_bo_failed: + amdgpu_bo_unref(&bo); +create_bo_failed: + dma_fence_put(&svm_bo->eviction_fence->base); + kfree(svm_bo); + prange->ttm_res = NULL; + + return r; +} + +void svm_range_vram_node_free(struct svm_range *prange) +{ + svm_range_bo_unref(prange->svm_bo); + prange->ttm_res = NULL; +} + +struct amdgpu_device * +svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id) +{ + struct kfd_process_device *pdd; + struct kfd_process *p; + int32_t gpu_idx; + + p = container_of(prange->svms, struct kfd_process, svms); + + gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id); + if (gpu_idx < 0) { + pr_debug("failed to get device by id 0x%x\n", gpu_id); + return NULL; + } + pdd = kfd_process_device_from_gpuidx(p, gpu_idx); + if (!pdd) { + pr_debug("failed to get device by idx 0x%x\n", gpu_idx); + return NULL; + } + + return (struct amdgpu_device *)pdd->dev->kgd; +} + +static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo) +{ + struct ttm_operation_ctx ctx = { false, false }; + + amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM); + + return 
ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); +} + +static int +svm_range_check_attr(struct kfd_process *p, + uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) +{ + uint32_t i; + int gpuidx; + + for (i = 0; i < nattr; i++) { + switch (attrs[i].type) { + case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: + if (attrs[i].value != KFD_IOCTL_SVM_LOCATION_SYSMEM && + attrs[i].value != KFD_IOCTL_SVM_LOCATION_UNDEFINED && + kfd_process_gpuidx_from_gpuid(p, + attrs[i].value) < 0) { + pr_debug("no GPU 0x%x found\n", attrs[i].value); + return -EINVAL; + } + break; + case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: + if (attrs[i].value != KFD_IOCTL_SVM_LOCATION_SYSMEM && + kfd_process_gpuidx_from_gpuid(p, + attrs[i].value) < 0) { + pr_debug("no GPU 0x%x found\n", attrs[i].value); + return -EINVAL; + } + break; + case KFD_IOCTL_SVM_ATTR_ACCESS: + case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: + case KFD_IOCTL_SVM_ATTR_NO_ACCESS: + gpuidx = kfd_process_gpuidx_from_gpuid(p, + attrs[i].value); + if (gpuidx < 0) { + pr_debug("no GPU 0x%x found\n", attrs[i].value); + return -EINVAL; + } + break; + case KFD_IOCTL_SVM_ATTR_SET_FLAGS: + break; + case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: + break; + case KFD_IOCTL_SVM_ATTR_GRANULARITY: + break; + default: + pr_debug("unknown attr type 0x%x\n", attrs[i].type); + return -EINVAL; + } + } + + return 0; +} + +static void +svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange, + uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) +{ + uint32_t i; + int gpuidx; + + for (i = 0; i < nattr; i++) { + switch (attrs[i].type) { + case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: + prange->preferred_loc = attrs[i].value; + break; + case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: + prange->prefetch_loc = attrs[i].value; + break; + case KFD_IOCTL_SVM_ATTR_ACCESS: + case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: + case KFD_IOCTL_SVM_ATTR_NO_ACCESS: + gpuidx = kfd_process_gpuidx_from_gpuid(p, + attrs[i].value); + if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) { + bitmap_clear(prange->bitmap_access, gpuidx, 1); + bitmap_clear(prange->bitmap_aip, gpuidx, 1); + } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) { + bitmap_set(prange->bitmap_access, gpuidx, 1); + bitmap_clear(prange->bitmap_aip, gpuidx, 1); + } else { + bitmap_clear(prange->bitmap_access, gpuidx, 1); + bitmap_set(prange->bitmap_aip, gpuidx, 1); + } + break; + case KFD_IOCTL_SVM_ATTR_SET_FLAGS: + prange->flags |= attrs[i].value; + break; + case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: + prange->flags &= ~attrs[i].value; + break; + case KFD_IOCTL_SVM_ATTR_GRANULARITY: + prange->granularity = attrs[i].value; + break; + default: + WARN_ONCE(1, "svm_range_check_attrs wasn't called?"); + } + } +} + +/** + * svm_range_debug_dump - print all range information from svms + * @svms: svm range list header + * + * debug output svm range start, end, prefetch location from svms + * interval tree and link list + * + * Context: The caller must hold svms->lock + */ +static void svm_range_debug_dump(struct svm_range_list *svms) +{ + struct interval_tree_node *node; + struct svm_range *prange; + + pr_debug("dump svms 0x%p list\n", svms); + pr_debug("range\tstart\tpage\tend\t\tlocation\n"); + + list_for_each_entry(prange, &svms->list, list) { + pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n", + prange, prange->start, prange->npages, + prange->start + prange->npages - 1, + prange->actual_loc); + } + + pr_debug("dump svms 0x%p interval tree\n", svms); + pr_debug("range\tstart\tpage\tend\t\tlocation\n"); + node = interval_tree_iter_first(&svms->objects, 0, ~0ULL); + while (node) { 
+ prange = container_of(node, struct svm_range, it_node); + pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n", + prange, prange->start, prange->npages, + prange->start + prange->npages - 1, + prange->actual_loc); + node = interval_tree_iter_next(node, 0, ~0ULL); + } +} + +static bool +svm_range_is_same_attrs(struct svm_range *old, struct svm_range *new) +{ + return (old->prefetch_loc == new->prefetch_loc && + old->flags == new->flags && + old->granularity == new->granularity); +} + +static int +svm_range_split_array(void *ppnew, void *ppold, size_t size, + uint64_t old_start, uint64_t old_n, + uint64_t new_start, uint64_t new_n) +{ + unsigned char *new, *old, *pold; + uint64_t d; + + if (!ppold) + return 0; + pold = *(unsigned char **)ppold; + if (!pold) + return 0; + + new = kvmalloc_array(new_n, size, GFP_KERNEL); + if (!new) + return -ENOMEM; + + d = (new_start - old_start) * size; + memcpy(new, pold + d, new_n * size); + + old = kvmalloc_array(old_n, size, GFP_KERNEL); + if (!old) { + kvfree(new); + return -ENOMEM; + } + + d = (new_start == old_start) ? new_n * size : 0; + memcpy(old, pold + d, old_n * size); + + kvfree(pold); + *(void **)ppold = old; + *(void **)ppnew = new; + + return 0; +} + +static int +svm_range_split_pages(struct svm_range *new, struct svm_range *old, + uint64_t start, uint64_t last) +{ + uint64_t npages = last - start + 1; + int i, r; + + for (i = 0; i < MAX_GPU_INSTANCE; i++) { + r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i], + sizeof(*old->dma_addr[i]), old->start, + npages, new->start, new->npages); + if (r) + return r; + } + + return 0; +} + +static int +svm_range_split_nodes(struct svm_range *new, struct svm_range *old, + uint64_t start, uint64_t last) +{ + uint64_t npages = last - start + 1; + + pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n", + new->svms, new, new->start, start, last); + + if (new->start == old->start) { + new->offset = old->offset; + old->offset += new->npages; + } else { + new->offset = old->offset + npages; + } + + new->svm_bo = svm_range_bo_ref(old->svm_bo); + new->ttm_res = old->ttm_res; + + spin_lock(&new->svm_bo->list_lock); + list_add(&new->svm_bo_list, &new->svm_bo->range_list); + spin_unlock(&new->svm_bo->list_lock); + + return 0; +} + +/** + * svm_range_split_adjust - split range and adjust + * + * @new: new range + * @old: the old range + * @start: the old range adjust to start address in pages + * @last: the old range adjust to last address in pages + * + * Copy system memory dma_addr or vram ttm_res in old range to new + * range from new_start up to size new->npages, the remaining old range is from + * start to last + * + * Return: + * 0 - OK, -ENOMEM - out of memory + */ +static int +svm_range_split_adjust(struct svm_range *new, struct svm_range *old, + uint64_t start, uint64_t last) +{ + int r; + + pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n", + new->svms, new->start, old->start, old->last, start, last); + + if (new->start < old->start || + new->last > old->last) { + WARN_ONCE(1, "invalid new range start or last\n"); + return -EINVAL; + } + + r = svm_range_split_pages(new, old, start, last); + if (r) + return r; + + if (old->actual_loc && old->ttm_res) { + r = svm_range_split_nodes(new, old, start, last); + if (r) + return r; + } + + old->npages = last - start + 1; + old->start = start; + old->last = last; + new->flags = old->flags; + new->preferred_loc = old->preferred_loc; + new->prefetch_loc = old->prefetch_loc; + new->actual_loc = old->actual_loc; + 
new->granularity = old->granularity; + bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); + bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); + + return 0; +} + +/** + * svm_range_split - split a range in 2 ranges + * + * @prange: the svm range to split + * @start: the remaining range start address in pages + * @last: the remaining range last address in pages + * @new: the result new range generated + * + * Two cases only: + * case 1: if start == prange->start + * prange ==> prange[start, last] + * new range [last + 1, prange->last] + * + * case 2: if last == prange->last + * prange ==> prange[start, last] + * new range [prange->start, start - 1] + * + * Return: + * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last + */ +static int +svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last, + struct svm_range **new) +{ + uint64_t old_start = prange->start; + uint64_t old_last = prange->last; + struct svm_range_list *svms; + int r = 0; + + pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms, + old_start, old_last, start, last); + + if (old_start != start && old_last != last) + return -EINVAL; + if (start < old_start || last > old_last) + return -EINVAL; + + svms = prange->svms; + if (old_start == start) + *new = svm_range_new(svms, last + 1, old_last); + else + *new = svm_range_new(svms, old_start, start - 1); + if (!*new) + return -ENOMEM; + + r = svm_range_split_adjust(*new, prange, start, last); + if (r) { + pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", + r, old_start, old_last, start, last); + svm_range_free(*new); + *new = NULL; + } + + return r; +} + +static int +svm_range_split_tail(struct svm_range *prange, struct svm_range *new, + uint64_t new_last, struct list_head *insert_list) +{ + struct svm_range *tail; + int r = svm_range_split(prange, prange->start, new_last, &tail); + + if (!r) + list_add(&tail->insert_list, insert_list); + return r; +} + +static int +svm_range_split_head(struct svm_range *prange, struct svm_range *new, + uint64_t new_start, struct list_head *insert_list) +{ + struct svm_range *head; + int r = svm_range_split(prange, new_start, prange->last, &head); + + if (!r) + list_add(&head->insert_list, insert_list); + return r; +} + +static void +svm_range_add_child(struct svm_range *prange, struct mm_struct *mm, + struct svm_range *pchild, enum svm_work_list_ops op) +{ + pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n", + pchild, pchild->start, pchild->last, prange, op); + + pchild->work_item.mm = mm; + pchild->work_item.op = op; + list_add_tail(&pchild->child_list, &prange->child_list); +} + +/** + * svm_range_split_by_granularity - collect ranges within granularity boundary + * + * @p: the process with svms list + * @mm: mm structure + * @addr: the vm fault address in pages, to split the prange + * @parent: parent range if prange is from child list + * @prange: prange to split + * + * Trims @prange to be a single aligned block of prange->granularity if + * possible. The head and tail are added to the child_list in @parent. 
+ * + * Context: caller must hold mmap_read_lock and prange->lock + * + * Return: + * 0 - OK, otherwise error code + */ +int +svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm, + unsigned long addr, struct svm_range *parent, + struct svm_range *prange) +{ + struct svm_range *head, *tail; + unsigned long start, last, size; + int r; + + /* Align splited range start and size to granularity size, then a single + * PTE will be used for whole range, this reduces the number of PTE + * updated and the L1 TLB space used for translation. + */ + size = 1UL << prange->granularity; + start = ALIGN_DOWN(addr, size); + last = ALIGN(addr + 1, size) - 1; + + pr_debug("svms 0x%p split [0x%lx 0x%lx] to [0x%lx 0x%lx] size 0x%lx\n", + prange->svms, prange->start, prange->last, start, last, size); + + if (start > prange->start) { + r = svm_range_split(prange, start, prange->last, &head); + if (r) + return r; + svm_range_add_child(parent, mm, head, SVM_OP_ADD_RANGE); + } + + if (last < prange->last) { + r = svm_range_split(prange, prange->start, last, &tail); + if (r) + return r; + svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); + } + + /* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */ + if (p->xnack_enabled && prange->work_item.op == SVM_OP_ADD_RANGE) { + prange->work_item.op = SVM_OP_ADD_RANGE_AND_MAP; + pr_debug("change prange 0x%p [0x%lx 0x%lx] op %d\n", + prange, prange->start, prange->last, + SVM_OP_ADD_RANGE_AND_MAP); + } + return 0; +} + +static uint64_t +svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange) +{ + struct amdgpu_device *bo_adev; + uint32_t flags = prange->flags; + uint32_t mapping_flags = 0; + uint64_t pte_flags; + bool snoop = !prange->ttm_res; + bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT; + + if (prange->svm_bo && prange->ttm_res) + bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); + + switch (adev->asic_type) { + case CHIP_ARCTURUS: + if (prange->svm_bo && prange->ttm_res) { + if (bo_adev == adev) { + mapping_flags |= coherent ? + AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; + } else { + mapping_flags |= AMDGPU_VM_MTYPE_UC; + if (amdgpu_xgmi_same_hive(adev, bo_adev)) + snoop = true; + } + } else { + mapping_flags |= coherent ? + AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; + } + break; + case CHIP_ALDEBARAN: + if (prange->svm_bo && prange->ttm_res) { + if (bo_adev == adev) { + mapping_flags |= coherent ? + AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; + if (adev->gmc.xgmi.connected_to_cpu) + snoop = true; + } else { + mapping_flags |= AMDGPU_VM_MTYPE_UC; + if (amdgpu_xgmi_same_hive(adev, bo_adev)) + snoop = true; + } + } else { + mapping_flags |= coherent ? + AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; + } + break; + default: + mapping_flags |= coherent ? + AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; + } + + mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE; + + if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO) + mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE; + if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC) + mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; + + pte_flags = AMDGPU_PTE_VALID; + pte_flags |= prange->ttm_res ? 0 : AMDGPU_PTE_SYSTEM; + pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0; + + pte_flags |= amdgpu_gem_va_map_flags(adev, mapping_flags); + + pr_debug("svms 0x%p [0x%lx 0x%lx] vram %d PTE 0x%llx mapping 0x%x\n", + prange->svms, prange->start, prange->last, + prange->ttm_res ? 
1:0, pte_flags, mapping_flags); + + return pte_flags; +} + +static int +svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, + uint64_t start, uint64_t last, + struct dma_fence **fence) +{ + uint64_t init_pte_value = 0; + + pr_debug("[0x%llx 0x%llx]\n", start, last); + + return amdgpu_vm_bo_update_mapping(adev, adev, vm, false, true, NULL, + start, last, init_pte_value, 0, + NULL, NULL, fence); +} + +static int +svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, + unsigned long last) +{ + DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); + struct kfd_process_device *pdd; + struct dma_fence *fence = NULL; + struct amdgpu_device *adev; + struct kfd_process *p; + uint32_t gpuidx; + int r = 0; + + bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip, + MAX_GPU_INSTANCE); + p = container_of(prange->svms, struct kfd_process, svms); + + for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { + pr_debug("unmap from gpu idx 0x%x\n", gpuidx); + pdd = kfd_process_device_from_gpuidx(p, gpuidx); + if (!pdd) { + pr_debug("failed to find device idx %d\n", gpuidx); + return -EINVAL; + } + adev = (struct amdgpu_device *)pdd->dev->kgd; + + r = svm_range_unmap_from_gpu(adev, drm_priv_to_vm(pdd->drm_priv), + start, last, &fence); + if (r) + break; + + if (fence) { + r = dma_fence_wait(fence, false); + dma_fence_put(fence); + fence = NULL; + if (r) + break; + } + amdgpu_amdkfd_flush_gpu_tlb_pasid((struct kgd_dev *)adev, + p->pasid); + } + + return r; +} + +static int +svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, + struct svm_range *prange, dma_addr_t *dma_addr, + st |