diff options
Diffstat (limited to 'drivers/gpu')
74 files changed, 6023 insertions, 1358 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 7fe41a3c2541..e095572458cd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1164,6 +1164,7 @@ struct amdgpu_device { bool debug_disable_soft_recovery; bool debug_use_vram_fw_buf; bool debug_enable_ras_aca; + bool debug_exp_resets; bool enforce_isolation[MAX_XCP]; /* Added this mutex for cleaner shader isolation between GFX and compute processes */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index c63528a4e894..1254a43ec96b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -1151,6 +1151,10 @@ uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev, uint32_t low, high; uint64_t queue_addr = 0; + if (!adev->debug_exp_resets && + !adev->gfx.num_gfx_rings) + return 0; + kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst); amdgpu_gfx_rlc_enter_safe_mode(adev, inst); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index cf2b4dd4d865..5ac59b62020c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -28,8 +28,8 @@ #include "atom.h" #ifndef CONFIG_DEV_COREDUMP -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, - struct amdgpu_reset_context *reset_context) +void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, + bool vram_lost, struct amdgpu_job *job) { } #else @@ -315,7 +315,9 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, } } - if (coredump->reset_vram_lost) + if (coredump->skip_vram_check) + drm_printf(&p, "VRAM lost check is skipped!\n"); + else if (coredump->reset_vram_lost) drm_printf(&p, "VRAM is lost due to GPU reset!\n"); return count - iter.remain; @@ -326,12 +328,11 @@ static void amdgpu_devcoredump_free(void *data) kfree(data); } -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, - struct amdgpu_reset_context *reset_context) +void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, + bool vram_lost, struct amdgpu_job *job) { - struct amdgpu_coredump_info *coredump; struct drm_device *dev = adev_to_drm(adev); - struct amdgpu_job *job = reset_context->job; + struct amdgpu_coredump_info *coredump; struct drm_sched_job *s_job; coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT); @@ -341,11 +342,12 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, return; } + coredump->skip_vram_check = skip_vram_check; coredump->reset_vram_lost = vram_lost; - if (reset_context->job && reset_context->job->vm) { + if (job && job->vm) { + struct amdgpu_vm *vm = job->vm; struct amdgpu_task_info *ti; - struct amdgpu_vm *vm = reset_context->job->vm; ti = amdgpu_vm_get_task_info_vm(vm); if (ti) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h index 52459512cb2b..ef9772c6bcc9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h @@ -26,7 +26,6 @@ #define __AMDGPU_DEV_COREDUMP_H__ #include "amdgpu.h" -#include "amdgpu_reset.h" #ifdef CONFIG_DEV_COREDUMP @@ -36,12 +35,12 @@ struct amdgpu_coredump_info { struct amdgpu_device *adev; struct amdgpu_task_info reset_task_info; struct timespec64 reset_time; + bool skip_vram_check; bool reset_vram_lost; struct amdgpu_ring *ring; }; #endif -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, - struct amdgpu_reset_context *reset_context); - +void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, + bool vram_lost, struct amdgpu_job *job); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 49ef22dcf7fb..f4628412dac4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4531,6 +4531,9 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) { dev_info(adev->dev, "amdgpu: finishing device.\n"); flush_delayed_work(&adev->delayed_init_work); + + if (adev->mman.initialized) + drain_workqueue(adev->mman.bdev.wq); adev->shutdown = true; /* make sure IB test finished before entering exclusive mode @@ -4551,9 +4554,6 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) } amdgpu_fence_driver_hw_fini(adev); - if (adev->mman.initialized) - drain_workqueue(adev->mman.bdev.wq); - if (adev->pm.sysfs_initialized) amdgpu_pm_sysfs_fini(adev); if (adev->ucode_sysfs_en) @@ -5489,7 +5489,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, vram_lost = amdgpu_device_check_vram_lost(tmp_adev); if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) - amdgpu_coredump(tmp_adev, vram_lost, reset_context); + amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); if (vram_lost) { DRM_INFO("VRAM is lost due to GPU reset!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 5dd39e6c6223..8dee7c62c801 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -131,6 +131,7 @@ enum AMDGPU_DEBUG_MASK { AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2), AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3), AMDGPU_DEBUG_ENABLE_RAS_ACA = BIT(4), + AMDGPU_DEBUG_ENABLE_EXP_RESETS = BIT(5), }; unsigned int amdgpu_vram_limit = UINT_MAX; @@ -2199,6 +2200,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev) pr_info("debug: enable RAS ACA\n"); adev->debug_enable_ras_aca = true; } + + if (amdgpu_debug_mask & AMDGPU_DEBUG_ENABLE_EXP_RESETS) { + pr_info("debug: enable experimental reset features\n"); + adev->debug_exp_resets = true; + } } static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index aad2027e5c7c..0e617dff8765 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -348,6 +348,9 @@ int amdgpu_gem_create_ioctl(struct drm_device *dev, void *data, return -EINVAL; } + /* always clear VRAM */ + flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED; + /* create a gem object to contain this object in */ if (args->in.domains & (AMDGPU_GEM_DOMAIN_GDS | AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA)) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index b4efeef848de..b779d47a546a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -660,7 +660,7 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id) uint64_t queue_mask = 0; int r, i, j; - if (adev->enable_mes) + if (adev->mes.enable_legacy_queue_map) return amdgpu_gfx_mes_enable_kcq(adev, xcc_id); if (!kiq->pmf || !kiq->pmf->kiq_map_queues || !kiq->pmf->kiq_set_resources) @@ -722,7 +722,7 @@ int amdgpu_gfx_enable_kgq(struct amdgpu_device *adev, int xcc_id) amdgpu_device_flush_hdp(adev, NULL); - if (adev->enable_mes) { + if (adev->mes.enable_legacy_queue_map) { for (i = 0; i < adev->gfx.num_gfx_rings; i++) { j = i + xcc_id * adev->gfx.num_gfx_rings; r = amdgpu_mes_map_legacy_queue(adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 597489dea114..ad6bf5d4e0a9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -30,6 +30,60 @@ #include "amdgpu.h" #include "amdgpu_trace.h" #include "amdgpu_reset.h" +#include "amdgpu_dev_coredump.h" +#include "amdgpu_xgmi.h" + +static void amdgpu_job_do_core_dump(struct amdgpu_device *adev, + struct amdgpu_job *job) +{ + int i; + + dev_info(adev->dev, "Dumping IP State\n"); + for (i = 0; i < adev->num_ip_blocks; i++) + if (adev->ip_blocks[i].version->funcs->dump_ip_state) + adev->ip_blocks[i].version->funcs + ->dump_ip_state((void *)adev); + dev_info(adev->dev, "Dumping IP State Completed\n"); + + amdgpu_coredump(adev, true, false, job); +} + +static void amdgpu_job_core_dump(struct amdgpu_device *adev, + struct amdgpu_job *job) +{ + struct list_head device_list, *device_list_handle = NULL; + struct amdgpu_device *tmp_adev = NULL; + struct amdgpu_hive_info *hive = NULL; + + if (!amdgpu_sriov_vf(adev)) + hive = amdgpu_get_xgmi_hive(adev); + if (hive) + mutex_lock(&hive->hive_lock); + /* + * Reuse the logic in amdgpu_device_gpu_recover() to build list of + * devices for code dump + */ + INIT_LIST_HEAD(&device_list); + if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) + list_add_tail(&tmp_adev->reset_list, &device_list); + if (!list_is_first(&adev->reset_list, &device_list)) + list_rotate_to_front(&adev->reset_list, &device_list); + device_list_handle = &device_list; + } else { + list_add_tail(&adev->reset_list, &device_list); + device_list_handle = &device_list; + } + + /* Do the coredump for each device */ + list_for_each_entry(tmp_adev, device_list_handle, reset_list) + amdgpu_job_do_core_dump(tmp_adev, job); + + if (hive) { + mutex_unlock(&hive->hive_lock); + amdgpu_put_xgmi_hive(hive); + } +} static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) { @@ -48,9 +102,14 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) return DRM_GPU_SCHED_STAT_ENODEV; } - adev->job_hang = true; + /* + * Do the coredump immediately after a job timeout to get a very + * close dump/snapshot/representation of GPU's current error status + */ + amdgpu_job_core_dump(adev, job); + if (amdgpu_gpu_recovery && amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) { dev_err(adev->dev, "ring %s timeout, but soft recovered\n", @@ -101,6 +160,12 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) reset_context.src = AMDGPU_RESET_SRC_JOB; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + /* + * To avoid an unnecessary extra coredump, as we have already + * got the very close representation of GPU's error status + */ + set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); + r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context); if (r) dev_err(adev->dev, "GPU Recovery Failed: %d\n", r); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 44c74a08987d..f7d5d4f08a53 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -826,6 +826,24 @@ int amdgpu_mes_reset_hw_queue(struct amdgpu_device *adev, int queue_id) return 0; } +int amdgpu_mes_reset_hw_queue_mmio(struct amdgpu_device *adev, int queue_type, + int me_id, int pipe_id, int queue_id, int vmid) +{ + struct mes_reset_queue_input queue_input; + int r; + + queue_input.use_mmio = true; + queue_input.me_id = me_id; + queue_input.pipe_id = pipe_id; + queue_input.queue_id = queue_id; + queue_input.vmid = vmid; + r = adev->mes.funcs->reset_hw_queue(&adev->mes, &queue_input); + if (r) + DRM_ERROR("failed to reset hardware queue by mmio, queue id = %d\n", + queue_id); + return r; +} + int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev, struct amdgpu_ring *ring) { @@ -873,7 +891,8 @@ int amdgpu_mes_unmap_legacy_queue(struct amdgpu_device *adev, int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev, struct amdgpu_ring *ring, - unsigned int vmid) + unsigned int vmid, + bool use_mmio) { struct mes_reset_legacy_queue_input queue_input; int r; @@ -882,11 +901,13 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev, queue_input.queue_type = ring->funcs->type; queue_input.doorbell_offset = ring->doorbell_index; + queue_input.me_id = ring->me; queue_input.pipe_id = ring->pipe; queue_input.queue_id = ring->queue; queue_input.mqd_addr = amdgpu_bo_gpu_offset(ring->mqd_obj); queue_input.wptr_addr = ring->wptr_gpu_addr; queue_input.vmid = vmid; + queue_input.use_mmio = use_mmio; r = adev->mes.funcs->reset_legacy_queue(&adev->mes, &queue_input); if (r) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index a5b1ea60cac8..96788c0f42f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -75,6 +75,7 @@ struct amdgpu_mes { uint32_t sched_version; uint32_t kiq_version; + bool enable_legacy_queue_map; uint32_t total_max_queue; uint32_t max_doorbell_slices; @@ -251,6 +252,13 @@ struct mes_remove_queue_input { struct mes_reset_queue_input { uint32_t doorbell_offset; uint64_t gang_context_addr; + bool use_mmio; + uint32_t queue_type; + uint32_t me_id; + uint32_t pipe_id; + uint32_t queue_id; + uint32_t xcc_id; + uint32_t vmid; }; struct mes_map_legacy_queue_input { @@ -287,6 +295,8 @@ struct mes_resume_gang_input { struct mes_reset_legacy_queue_input { uint32_t queue_type; uint32_t doorbell_offset; + bool use_mmio; + uint32_t me_id; uint32_t pipe_id; uint32_t queue_id; uint64_t mqd_addr; @@ -396,6 +406,8 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id, int *queue_id); int amdgpu_mes_remove_hw_queue(struct amdgpu_device *adev, int queue_id); int amdgpu_mes_reset_hw_queue(struct amdgpu_device *adev, int queue_id); +int amdgpu_mes_reset_hw_queue_mmio(struct amdgpu_device *adev, int queue_type, + int me_id, int pipe_id, int queue_id, int vmid); int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev, struct amdgpu_ring *ring); @@ -405,7 +417,8 @@ int amdgpu_mes_unmap_legacy_queue(struct amdgpu_device *adev, u64 gpu_addr, u64 seq); int amdgp |