summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdkfd
diff options
context:
space:
mode:
authorMukul Joshi <mukul.joshi@amd.com>2024-08-12 11:11:28 -0400
committerAlex Deucher <alexander.deucher@amd.com>2024-08-20 22:14:13 -0400
commiteb067d65c33eecd4b81771384183ad42eec259bf (patch)
tree9277ebbe2644ae4a581a6f5c0a968c2bd478c9b6 /drivers/gpu/drm/amd/amdkfd
parent9a16042f02cd08bbd0a5a2d8e9c95347717165a3 (diff)
downloadlinux-eb067d65c33eecd4b81771384183ad42eec259bf.tar.gz
linux-eb067d65c33eecd4b81771384183ad42eec259bf.tar.bz2
linux-eb067d65c33eecd4b81771384183ad42eec259bf.zip
drm/amdkfd: Update BadOpcode Interrupt handling with MES
Based on the recommendation of MEC FW, update BadOpcode interrupt handling by unmapping all queues, removing the queue that got the interrupt from scheduling and remapping rest of the queues back when using MES scheduler. This is done to prevent the case where unmapping of the bad queue can fail thereby causing a GPU reset. Signed-off-by: Mukul Joshi <mukul.joshi@amd.com> Acked-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com> Acked-by: Alex Deucher <alexander.deucher@amd.com> Reviewed-by: Felix Kuehling <felix.kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c51
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c9
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_priv.h1
3 files changed, 58 insertions, 3 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 5825e805da50..577d121cc6d1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2931,6 +2931,57 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
kfree(dqm);
}
+int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id)
+{
+ struct kfd_process_device *pdd;
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct device_queue_manager *dqm = knode->dqm;
+ struct device *dev = dqm->dev->adev->dev;
+ struct qcm_process_device *qpd;
+ struct queue *q = NULL;
+ int ret = 0;
+
+ if (!p)
+ return -EINVAL;
+
+ dqm_lock(dqm);
+
+ pdd = kfd_get_process_device_data(dqm->dev, p);
+ if (pdd) {
+ qpd = &pdd->qpd;
+
+ list_for_each_entry(q, &qpd->queues_list, list) {
+ if (q->doorbell_id == doorbell_id && q->properties.is_active) {
+ ret = suspend_all_queues_mes(dqm);
+ if (ret) {
+ dev_err(dev, "Suspending all queues failed");
+ goto out;
+ }
+
+ q->properties.is_evicted = true;
+ q->properties.is_active = false;
+ decrement_queue_count(dqm, qpd, q);
+
+ ret = remove_queue_mes(dqm, q, qpd);
+ if (ret) {
+ dev_err(dev, "Removing bad queue failed");
+ goto out;
+ }
+
+ ret = resume_all_queues_mes(dqm);
+ if (ret)
+ dev_err(dev, "Resuming all queues failed");
+
+ break;
+ }
+ }
+ }
+
+out:
+ dqm_unlock(dqm);
+ return ret;
+}
+
static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index f524a55eee11..b3f988b275a8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -330,11 +330,14 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
kfd_signal_event_interrupt(pasid, context_id0, 32);
else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
- KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)))
- kfd_set_dbg_ev_from_interrupt(dev, pasid,
- KFD_CTXID0_DOORBELL_ID(context_id0),
+ KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) {
+ u32 doorbell_id = KFD_CTXID0_DOORBELL_ID(context_id0);
+
+ kfd_set_dbg_ev_from_interrupt(dev, pasid, doorbell_id,
KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)),
NULL, 0);
+ kfd_dqm_suspend_bad_queue_mes(dev, pasid, doorbell_id);
+ }
/* SDMA */
else if (source_id == SOC21_INTSRC_SDMA_TRAP)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index f7c12d4f0abb..7bba6bed2f48 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1324,6 +1324,7 @@ struct kernel_queue *kernel_queue_init(struct kfd_node *dev,
enum kfd_queue_type type);
void kernel_queue_uninit(struct kernel_queue *kq);
int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid);
+int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id);
/* Process Queue Manager */
struct process_queue_node {