diff options
| author | Dave Airlie <airlied@redhat.com> | 2023-06-19 11:57:25 +1000 |
|---|---|---|
| committer | Dave Airlie <airlied@redhat.com> | 2023-06-19 11:57:26 +1000 |
| commit | bcbede6fbeb0e1eb85ccbb532faf06d3b31f0e73 (patch) | |
| tree | 53062368790eb070705726f24a49beb34b8737d0 | |
| parent | 4e237d84eec8783eb6dfbccb04a4e570b27a9b09 (diff) | |
| parent | 72f1de49ffb90b29748284f27f1d6b829ab1de95 (diff) | |
| download | linux-bcbede6fbeb0e1eb85ccbb532faf06d3b31f0e73.tar.gz linux-bcbede6fbeb0e1eb85ccbb532faf06d3b31f0e73.tar.bz2 linux-bcbede6fbeb0e1eb85ccbb532faf06d3b31f0e73.zip | |
Merge tag 'amd-drm-next-6.5-2023-06-16' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.5-2023-06-16:
amdgpu:
- Misc display fixes
- W=1 fixes
- Improve scheduler naming
- DCN 3.1.4 fixes
- kdoc fixes
- Enable W=1
- VCN 4.0 fix
- xgmi fixes
- TOPDOWN fix for large BAR systems
- eDP fix
- PSR fixes
- SubVP fixes
- Freesync fix
- DPIA fix
- SMU 13.0.5 fixes
- vblflash fix
- RAS fixes
- SDMA 4 fix
- BO locking fix
- BO backing store fix
- NBIO 7.9 fixes
- GC 9.4.3 fixes
- GPU reset recovery fixes
- HMM fix
amdkfd:
- Fix NULL check
- Trap fixes
- Queue count fix
- Add event age tracking
radeon:
- fbdev client fix
scheduler:
- Avoid an infinite loop
UAPI:
- Add KFD event age tracking:
Proposed ROCT-Thunk-Interface:
https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/commit/efdbf6cfbc026bd68ac3c35d00dacf84370eb81e
https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/commit/1820ae0a2db85b6f584611dc0cde1a00e7c22915
Proposed ROCR-Runtime:
https://github.com/RadeonOpenCompute/ROCR-Runtime/compare/master...zhums:ROCR-Runtime:new_event_wait_review
https://github.com/RadeonOpenCompute/ROCR-Runtime/commit/e1f5bdb88eb882ac798aeca2c00ea3fbb2dba459
https://github.com/RadeonOpenCompute/ROCR-Runtime/commit/7d26afd14107b5c2a754c1a3f415d89f3aabb503
drm:
- DP MST fix
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230616163548.7706-1-alexander.deucher@amd.com
93 files changed, 1222 insertions, 431 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index 86b833085f19..8d16f280b695 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -40,7 +40,18 @@ ccflags-y := -I$(FULL_AMD_PATH)/include/asic_reg \ -I$(FULL_AMD_PATH)/amdkfd subdir-ccflags-y := -Wextra -subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable) +subdir-ccflags-y += -Wunused +subdir-ccflags-y += -Wmissing-prototypes +subdir-ccflags-y += -Wmissing-declarations +subdir-ccflags-y += -Wmissing-include-dirs +subdir-ccflags-y += -Wold-style-definition +subdir-ccflags-y += -Wmissing-format-attribute +# Need this to avoid recursive variable evaluation issues +cond-flags := $(call cc-option, -Wunused-but-set-variable) \ + $(call cc-option, -Wunused-const-variable) \ + $(call cc-option, -Wstringop-truncation) \ + $(call cc-option, -Wpacked-not-aligned) +subdir-ccflags-y += $(cond-flags) subdir-ccflags-y += -Wno-unused-parameter subdir-ccflags-y += -Wno-type-limits subdir-ccflags-y += -Wno-sign-compare diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 6e1d331af01f..d9503882ea97 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -309,7 +309,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, } p->gang_leader = p->jobs[p->gang_leader_idx]; - if (p->ctx->vram_lost_counter != p->gang_leader->vram_lost_counter) { + if (p->ctx->generation != p->gang_leader->generation) { ret = -ECANCELED; goto free_all_kdata; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 410acdd4554c..0dc9c655c4fb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -333,7 +333,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority, ctx->reset_counter = atomic_read(&mgr->adev->gpu_reset_counter); ctx->reset_counter_query = ctx->reset_counter; - ctx->vram_lost_counter = atomic_read(&mgr->adev->vram_lost_counter); + ctx->generation = amdgpu_vm_generation(mgr->adev, &fpriv->vm); ctx->init_priority = priority; ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET; @@ -432,6 +432,7 @@ int amdgpu_ctx_get_entity(struct amdgpu_ctx *ctx, u32 hw_ip, u32 instance, u32 ring, struct drm_sched_entity **entity) { int r; + struct drm_sched_entity *ctx_entity; if (hw_ip >= AMDGPU_HW_IP_NUM) { DRM_ERROR("unknown HW IP type: %d\n", hw_ip); @@ -455,7 +456,14 @@ int amdgpu_ctx_get_entity(struct amdgpu_ctx *ctx, u32 hw_ip, u32 instance, return r; } - *entity = &ctx->entities[hw_ip][ring]->entity; + ctx_entity = &ctx->entities[hw_ip][ring]->entity; + r = drm_sched_entity_error(ctx_entity); + if (r) { + DRM_DEBUG("error entity %p\n", ctx_entity); + return r; + } + + *entity = ctx_entity; return 0; } @@ -586,7 +594,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, if (ctx->reset_counter != atomic_read(&adev->gpu_reset_counter)) out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RESET; - if (ctx->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) + if (ctx->generation != amdgpu_vm_generation(adev, &fpriv->vm)) out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST; if (atomic_read(&ctx->guilty)) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h index f1e27b6e16f4..85376baaa92f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h @@ -47,7 +47,7 @@ struct amdgpu_ctx { struct amdgpu_ctx_mgr *mgr; unsigned reset_counter; unsigned reset_counter_query; - uint32_t vram_lost_counter; + uint64_t generation; spinlock_t ring_lock; struct amdgpu_ctx_entity *entities[AMDGPU_HW_IP_NUM][AMDGPU_MAX_ENTITY_NUM]; bool preamble_presented; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index 859882109f55..8e1cfc87122d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -1515,6 +1515,7 @@ static int amdgpu_discovery_get_mall_info(struct amdgpu_device *adev) mall_size += mall_size_per_umc; } adev->gmc.mall_size = mall_size; + adev->gmc.m_half_use = half_use; break; default: dev_err(adev->dev, @@ -1896,6 +1897,8 @@ static int amdgpu_discovery_set_gc_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &gfx_v9_0_ip_block); break; case IP_VERSION(9, 4, 3): + if (!amdgpu_exp_hw_support) + return -EINVAL; amdgpu_device_ip_block_add(adev, &gfx_v9_4_3_ip_block); break; case IP_VERSION(10, 1, 10): diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 876ec35b8f83..c694b41f6461 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -692,6 +692,30 @@ void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring) } /** + * amdgpu_fence_driver_set_error - set error code on fences + * @ring: the ring which contains the fences + * @error: the error code to set + * + * Set an error code to all the fences pending on the ring. + */ +void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error) +{ + struct amdgpu_fence_driver *drv = &ring->fence_drv; + unsigned long flags; + + spin_lock_irqsave(&drv->lock, flags); + for (unsigned int i = 0; i <= drv->num_fences_mask; ++i) { + struct dma_fence *fence; + + fence = rcu_dereference_protected(drv->fences[i], + lockdep_is_held(&drv->lock)); + if (fence && !dma_fence_is_signaled_locked(fence)) + dma_fence_set_error(fence, error); + } + spin_unlock_irqrestore(&drv->lock, flags); +} + +/** * amdgpu_fence_driver_force_completion - force signal latest fence of ring * * @ring: fence of the ring to signal @@ -699,6 +723,7 @@ void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring) */ void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring) { + amdgpu_fence_driver_set_error(ring, -ECANCELED); amdgpu_fence_write(ring, ring->fence_drv.sync_seq); amdgpu_fence_process(ring); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index 6794edd1d2d2..56d73fade568 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -301,6 +301,8 @@ struct amdgpu_gmc { /* MALL size */ u64 mall_size; + uint32_t m_half_use; + /* number of UMC instances */ int num_umc; /* mode2 save restore */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c index 2dadcfe43d03..081267161d40 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c @@ -190,8 +190,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, pr_debug("hmm range: start = 0x%lx, end = 0x%lx", hmm_range->start, hmm_range->end); - /* Assuming 512MB takes maxmium 1 second to fault page address */ - timeout = max((hmm_range->end - hmm_range->start) >> 29, 1UL); + /* Assuming 128MB takes maximum 1 second to fault page address */ + timeout = max((hmm_range->end - hmm_range->start) >> 27, 1UL); timeout *= HMM_RANGE_DEFAULT_TIMEOUT; timeout = jiffies + msecs_to_jiffies(timeout); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c index 3481d2808ce5..5273decc5753 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c @@ -467,7 +467,8 @@ void amdgpu_irq_dispatch(struct amdgpu_device *adev, handled = true; } else { - DRM_DEBUG("Unhandled interrupt src_id: %d\n", src_id); + DRM_DEBUG("Unregistered interrupt src_id: %d of client_id:%d\n", + src_id, client_id); } /* Send it to amdkfd as well if it isn't already handled */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index aca3a2bfe8d2..78476bc75b4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/ |
