From cfbb6b0047448e2d986160d9f30d60f604d9ad0f Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Fri, 21 Jan 2022 17:23:32 -0500 Subject: drm/amdgpu: Rework reset domain to be refcounted. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reset domain contains register access semaphor now and so needs to be present as long as each device in a hive needs it and so it cannot be binded to XGMI hive life cycle. Adress this by making reset domain refcounted and pointed by each member of the hive and the hive itself. v4: Fix crash on boot witrh XGMI hive by adding type to reset_domain. XGMI will only create a new reset_domain if prevoius was of single device type meaning it's first boot. Otherwsie it will take a refocunt to exsiting reset_domain from the amdgou device. Add a wrapper around reset_domain->refcount get/put and a wrapper around send to reset wq (Lijo) Signed-off-by: Andrey Grodzovsky Acked-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74121.html --- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 35 +++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index e00d38d9160a..cc625e441fa0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -70,6 +70,19 @@ struct amdgpu_reset_control { void (*async_reset)(struct work_struct *work); }; + +enum amdgpu_reset_domain_type { + SINGLE_DEVICE, + XGMI_HIVE +}; + +struct amdgpu_reset_domain { + struct kref refcount; + struct workqueue_struct *wq; + enum amdgpu_reset_domain_type type; +}; + + int amdgpu_reset_init(struct amdgpu_device *adev); int amdgpu_reset_fini(struct amdgpu_device *adev); @@ -82,4 +95,26 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev, int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl, struct amdgpu_reset_handler *handler); +struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type, + char *wq_name); + +void amdgpu_reset_destroy_reset_domain(struct kref *ref); + +static inline bool amdgpu_reset_get_reset_domain(struct amdgpu_reset_domain *domain) +{ + return kref_get_unless_zero(&domain->refcount) != 0; +} + +static inline void amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *domain) +{ + kref_put(&domain->refcount, amdgpu_reset_destroy_reset_domain); +} + +static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *domain, + struct work_struct *work) +{ + return queue_work(domain->wq, work); +} + + #endif -- cgit v1.2.3 From d0fb18b535679a28b1f55a312b7454563b9bb36e Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 19 Jan 2022 17:09:58 -0500 Subject: drm/amdgpu: Move reset sem into reset_domain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want single instance of reset sem across all reset clients because in case of XGMI we should stop access cross device MMIO because any of them could be in a reset in the moment. Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74117.html --- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index cc625e441fa0..80f918e87d4f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -80,6 +80,7 @@ struct amdgpu_reset_domain { struct kref refcount; struct workqueue_struct *wq; enum amdgpu_reset_domain_type type; + struct rw_semaphore sem; }; -- cgit v1.2.3 From 89a7a87093d67e2c633e1ed400ba00ffd15bdae5 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 19 Jan 2022 17:20:00 -0500 Subject: drm/amdgpu: Move in_gpu_reset into reset_domain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We should have a single instance per entrire reset domain. Signed-off-by: Andrey Grodzovsky Suggested-by: Lijo Lazar Reviewed-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74116.html --- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index 80f918e87d4f..ea6fc98ea927 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -81,6 +81,7 @@ struct amdgpu_reset_domain { struct workqueue_struct *wq; enum amdgpu_reset_domain_type type; struct rw_semaphore sem; + atomic_t in_gpu_reset; }; -- cgit v1.2.3 From e923be9934a9c54a94e443f9e77bda5b9fbd1ce5 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Tue, 25 Jan 2022 11:32:47 -0500 Subject: drm/amdgpu: Rework amdgpu_device_lock_adev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This functions needs to be split into 2 parts where one is called only once for locking single instance of reset_domain's sem and reset flag and the other part which handles MP1 states should still be called for each device in XGMI hive. Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74118.html --- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index ea6fc98ea927..92de3b7965a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -118,5 +118,9 @@ static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *doma return queue_work(domain->wq, work); } +void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain, + struct amdgpu_hive_info *hive); + +void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain); #endif -- cgit v1.2.3 From f5666d482305900b9622a2c9dd73a864a3b0d281 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 9 Feb 2022 22:17:24 -0500 Subject: drm/amdgpu: Fix compile error. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Seems I forgot to add this to the relevant commit when submitting. Signed-off-by: Andrey Grodzovsky Reported-by: kernel test robot Reviewed-by: Christian König Signed-off-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20220210031724.440943-1-andrey.grodzovsky@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index 92de3b7965a1..1949dbe28a86 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -118,8 +118,7 @@ static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *doma return queue_work(domain->wq, work); } -void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain, - struct amdgpu_hive_info *hive); +void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain); void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain); -- cgit v1.2.3