From 6c2453861f48e4e779cafa01c09e78ddc2c23c6b Mon Sep 17 00:00:00 2001 From: yipechai Date: Tue, 4 Jan 2022 18:56:20 +0800 Subject: drm/amdgpu: Modify xgmi block to fit for the unified ras block data and ops 1.Modify gmc block to fit for the unified ras block data and ops. 2.Change amdgpu_xgmi_ras_funcs to amdgpu_xgmi_ras, and the corresponding variable name remove _funcs suffix. 3.Remove the const flag of gmc ras variable so that gmc ras block can be able to be inserted into amdgpu device ras block link list. 4.Invoke amdgpu_ras_register_ras_block function to register gmc ras block into amdgpu device ras block link list. 5.Remove the redundant code about gmc in amdgpu_ras.c after using the unified ras block. Signed-off-by: yipechai Reviewed-by: Hawking Zhang Reviewed-by: John Clements Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index e8b8f28c2f72..d29acd33eb11 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -732,7 +732,7 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) return psp_xgmi_terminate(&adev->psp); } -static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev) +static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, void *ras_info) { int r; struct ras_ih_if ih_info = { @@ -746,7 +746,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev) adev->gmc.xgmi.num_physical_nodes == 0) return 0; - adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev); + adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); if (!adev->gmc.xgmi.ras_if) { adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); @@ -865,7 +865,7 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, return 0; } -static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, +static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status) { struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; @@ -874,7 +874,7 @@ static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, uint32_t ue_cnt = 0, ce_cnt = 0; if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) - return -EINVAL; + return ; err_data->ue_count = 0; err_data->ce_count = 0; @@ -940,17 +940,23 @@ static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, break; } - adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev); + adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); err_data->ue_count += ue_cnt; err_data->ce_count += ce_cnt; - - return 0; } -const struct amdgpu_xgmi_ras_funcs xgmi_ras_funcs = { - .ras_late_init = amdgpu_xgmi_ras_late_init, - .ras_fini = amdgpu_xgmi_ras_fini, +struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = { .query_ras_error_count = amdgpu_xgmi_query_ras_error_count, .reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count, }; + +struct amdgpu_xgmi_ras xgmi_ras = { + .ras_block = { + .name = "xgmi", + .block = AMDGPU_RAS_BLOCK__XGMI_WAFL, + .hw_ops = &xgmi_ras_hw_ops, + .ras_late_init = amdgpu_xgmi_ras_late_init, + .ras_fini = amdgpu_xgmi_ras_fini, + }, +}; -- cgit v1.2.3 From 22d4ba53b1c10de6832e588f01d916e24306f6a1 Mon Sep 17 00:00:00 2001 From: yipechai Date: Wed, 5 Jan 2022 15:40:26 +0800 Subject: drm/amdgpu: Adjust error inject function code style in amdgpu_ras.c 1. Move xgmi special error inject function from amdgpu_ras.c to xgmi block. 2. Support to use psp_ras_trigger_error as default error inject function in amdgpu_ras.c. If .ras_error_inject isn't defined in ras block, default error inject function will take effect. v2: squash in warning fix (Alex) Signed-off-by: yipechai Reviewed-by: Hawking Zhang Reviewed-by: John Clements Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index d29acd33eb11..478457637d29 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -946,9 +946,36 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, err_data->ce_count += ce_cnt; } +/* Trigger XGMI/WAFL error */ +static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *inject_if) +{ + int ret = 0; + struct ta_ras_trigger_error_input *block_info = (struct ta_ras_trigger_error_input *)inject_if; + + if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) + dev_warn(adev->dev, "Failed to disallow df cstate"); + + if (amdgpu_dpm_allow_xgmi_power_down(adev, false)) + dev_warn(adev->dev, "Failed to disallow XGMI power down"); + + ret = psp_ras_trigger_error(&adev->psp, block_info); + + if (amdgpu_ras_intr_triggered()) + return ret; + + if (amdgpu_dpm_allow_xgmi_power_down(adev, true)) + dev_warn(adev->dev, "Failed to allow XGMI power down"); + + if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) + dev_warn(adev->dev, "Failed to allow df cstate"); + + return ret; +} + struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = { .query_ras_error_count = amdgpu_xgmi_query_ras_error_count, .reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count, + .ras_error_inject = amdgpu_ras_error_inject_xgmi, }; struct amdgpu_xgmi_ras xgmi_ras = { -- cgit v1.2.3 From 71b6c4a277dbb2594c260ccedcafaef5154b0da0 Mon Sep 17 00:00:00 2001 From: yipechai Date: Fri, 14 Jan 2022 10:40:15 +0800 Subject: drm/amdgpu: Fix the code style warnings in hdp xgmi mca and umc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drm/amdgpu: Fix the code style warnings in hdp xgmi mca and umc: 1. WARNING: missing space after struct definition. 2. WARNING: please, no space before tabs. 3. WARNING: line length of xxx exceeds 100 columns. 4. ERROR: "foo* bar" should be "foo *bar". 5. ERROR: space required before the open parenthesis '('. 6. ERROR: space prohibited after that open parenthesis '('. Signed-off-by: yipechai Reviewed-by: Tao Zhou Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 478457637d29..5929d6f528c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -950,7 +950,8 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *inject_if) { int ret = 0; - struct ta_ras_trigger_error_input *block_info = (struct ta_ras_trigger_error_input *)inject_if; + struct ta_ras_trigger_error_input *block_info = + (struct ta_ras_trigger_error_input *)inject_if; if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) dev_warn(adev->dev, "Failed to disallow df cstate"); -- cgit v1.2.3 From a4c63cafa58b4bd9e15511bab77a4752b93d3aa0 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Tue, 30 Nov 2021 16:19:03 -0500 Subject: drm/amdgpu: Introduce reset domain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defined a reset_domain struct such that all the entities that go through reset together will be serialized one against another. Do it for both single device and XGMI hive cases. Signed-off-by: Andrey Grodzovsky Suggested-by: Daniel Vetter Suggested-by: Christian König Reviewed-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74111.html --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index e8b8f28c2f72..d406897346d6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -398,6 +398,14 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) goto pro_end; } + hive->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-hive", 0); + if (!hive->reset_domain.wq) { + dev_err(adev->dev, "XGMI: failed allocating wq for reset domain!\n"); + kfree(hive); + hive = NULL; + goto pro_end; + } + hive->hive_id = adev->gmc.xgmi.hive_id; INIT_LIST_HEAD(&hive->device_list); INIT_LIST_HEAD(&hive->node); @@ -407,6 +415,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) task_barrier_init(&hive->tb); hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; hive->hi_req_gpu = NULL; + /* * hive pstate on boot is high in vega20 so we have to go to low * pstate on after boot. -- cgit v1.2.3 From 681260df4dad45337b14ba762f94b402204e9ac3 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 15 Dec 2021 16:55:25 -0500 Subject: drm/amdgpu: Drop hive->in_reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we serialize all resets no need to protect from concurrent resets. Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74115.html --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index d406897346d6..89b682afe821 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -410,7 +410,6 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) INIT_LIST_HEAD(&hive->device_list); INIT_LIST_HEAD(&hive->node); mutex_init(&hive->hive_lock); - atomic_set(&hive->in_reset, 0); atomic_set(&hive->number_devices, 0); task_barrier_init(&hive->tb); hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; -- cgit v1.2.3 From cfbb6b0047448e2d986160d9f30d60f604d9ad0f Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Fri, 21 Jan 2022 17:23:32 -0500 Subject: drm/amdgpu: Rework reset domain to be refcounted. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reset domain contains register access semaphor now and so needs to be present as long as each device in a hive needs it and so it cannot be binded to XGMI hive life cycle. Adress this by making reset domain refcounted and pointed by each member of the hive and the hive itself. v4: Fix crash on boot witrh XGMI hive by adding type to reset_domain. XGMI will only create a new reset_domain if prevoius was of single device type meaning it's first boot. Otherwsie it will take a refocunt to exsiting reset_domain from the amdgou device. Add a wrapper around reset_domain->refcount get/put and a wrapper around send to reset wq (Lijo) Signed-off-by: Andrey Grodzovsky Acked-by: Christian König Link: https://www.spinics.net/lists/amd-gfx/msg74121.html --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 89b682afe821..eb06322d2972 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -32,6 +32,8 @@ #include "wafl/wafl2_4_0_0_smn.h" #include "wafl/wafl2_4_0_0_sh_mask.h" +#include "amdgpu_reset.h" + #define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210 #define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c #define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210 @@ -227,6 +229,9 @@ static void amdgpu_xgmi_hive_release(struct kobject *kobj) struct amdgpu_hive_info *hive = container_of( kobj, struct amdgpu_hive_info, kobj); + amdgpu_reset_put_reset_domain(hive->reset_domain); + hive->reset_domain = NULL; + mutex_destroy(&hive->hive_lock); kfree(hive); } @@ -398,12 +403,24 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) goto pro_end; } - hive->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-hive", 0); - if (!hive->reset_domain.wq) { - dev_err(adev->dev, "XGMI: failed allocating wq for reset domain!\n"); - kfree(hive); - hive = NULL; - goto pro_end; + /** + * Avoid recreating reset domain when hive is reconstructed for the case + * of reset the devices in the XGMI hive during probe for SRIOV + * See https://www.spinics.net/lists/amd-gfx/msg58836.html + */ + if (adev->reset_domain->type != XGMI_HIVE) { + hive->reset_domain = amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive"); + if (!hive->reset_domain) { + dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n"); + ret = -ENOMEM; + kobject_put(&hive->kobj); + kfree(hive); + hive = NULL; + goto pro_end; + } + } else { + amdgpu_reset_get_reset_domain(adev->reset_domain); + hive->reset_domain = adev->reset_domain; } hive->hive_id = adev->gmc.xgmi.hive_id; -- cgit v1.2.3 From bdb3489cfca16815e9a737359e9e90a4af5d0ff3 Mon Sep 17 00:00:00 2001 From: yipechai Date: Sun, 30 Jan 2022 17:03:32 +0800 Subject: drm/amdgpu: Optimize xxx_ras_late_init/xxx_ras_late_fini for each ras block 1. Define amdgpu_ras_block_late_init to create sysfs nodes and interrupt handles. 2. Define amdgpu_ras_block_late_fini to remove sysfs nodes and interrupt handles. 3. Replace ras block variable members in struct amdgpu_ras_block_object with struct ras_common_if, which can make it easy to associate each ras block instance with each ras block functional interface. 4. Add .ras_cb to struct amdgpu_ras_block_object. 5. Change each ras block to fit for the changement of struct amdgpu_ras_block_object. Signed-off-by: yipechai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 5929d6f528c9..15707af89212 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -981,8 +981,10 @@ struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = { struct amdgpu_xgmi_ras xgmi_ras = { .ras_block = { - .name = "xgmi", - .block = AMDGPU_RAS_BLOCK__XGMI_WAFL, + .ras_comm = { + .name = "xgmi", + .block = AMDGPU_RAS_BLOCK__XGMI_WAFL, + }, .hw_ops = &xgmi_ras_hw_ops, .ras_late_init = amdgpu_xgmi_ras_late_init, .ras_fini = amdgpu_xgmi_ras_fini, -- cgit v1.2.3 From 892a57a975c3bd51834ddb0afa5f27baa19a785b Mon Sep 17 00:00:00 2001 From: yipechai Date: Tue, 8 Feb 2022 11:24:31 +0800 Subject: drm/amdgpu: Optimize amdgpu_xgmi_ras_late_init/amdgpu_xgmi_ras_fini function code Optimize amdgpu_xgmi_ras_late_init/amdgpu_xgmi_ras_fini function code. Signed-off-by: yipechai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 40 ++++---------------------------- 1 file changed, 5 insertions(+), 35 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 15707af89212..a785b1e088cd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -734,51 +734,20 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, void *ras_info) { - int r; - struct ras_ih_if ih_info = { - .cb = NULL, - }; - struct ras_fs_if fs_info = { - .sysfs_name = "xgmi_wafl_err_count", - }; - if (!adev->gmc.xgmi.supported || adev->gmc.xgmi.num_physical_nodes == 0) return 0; adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); - if (!adev->gmc.xgmi.ras_if) { - adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); - if (!adev->gmc.xgmi.ras_if) - return -ENOMEM; - adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL; - adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; - adev->gmc.xgmi.ras_if->sub_block_index = 0; - } - ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if; - r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if, - &fs_info, &ih_info); - if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) { - kfree(adev->gmc.xgmi.ras_if); - adev->gmc.xgmi.ras_if = NULL; - } - - return r; + return amdgpu_ras_block_late_init(adev, adev->gmc.xgmi.ras_if); } static void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev) { if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) && - adev->gmc.xgmi.ras_if) { - struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if; - struct ras_ih_if ih_info = { - .cb = NULL, - }; - - amdgpu_ras_late_fini(adev, ras_if, &ih_info); - kfree(ras_if); - } + adev->gmc.xgmi.ras_if) + amdgpu_ras_block_late_fini(adev, adev->gmc.xgmi.ras_if); } uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, @@ -982,8 +951,9 @@ struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = { struct amdgpu_xgmi_ras xgmi_ras = { .ras_block = { .ras_comm = { - .name = "xgmi", + .name = "xgmi_wafl", .block = AMDGPU_RAS_BLOCK__XGMI_WAFL, + .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, }, .hw_ops = &xgmi_ras_hw_ops, .ras_late_init = amdgpu_xgmi_ras_late_init, -- cgit v1.2.3 From 4e9b1fa5a2757d11a5c40eed2b2b4837dcb2f12e Mon Sep 17 00:00:00 2001 From: yipechai Date: Mon, 14 Feb 2022 14:12:55 +0800 Subject: drm/amdgpu: Modify .ras_late_init function pointer parameter Modify .ras_late_init function pointer parameter so that it can remove redundant intermediate calls in some ras blocks. Signed-off-by: yipechai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index a785b1e088cd..91f788f6f6b5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -732,7 +732,7 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) return psp_xgmi_terminate(&adev->psp); } -static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, void *ras_info) +static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { if (!adev->gmc.xgmi.supported || adev->gmc.xgmi.num_physical_nodes == 0) -- cgit v1.2.3 From caae42f00924498e78da8a960561936aa7eba503 Mon Sep 17 00:00:00 2001 From: yipechai Date: Mon, 14 Feb 2022 14:38:02 +0800 Subject: drm/amdgpu: Optimize xxx_ras_late_init function of each ras block 1. Move calling ras block instance members from module internal function to the top calling xxx_ras_late_init. 2. Module internal function calls can only use parameter variables of xxx_ras_late_init instead of ras block instance members. Signed-off-by: yipechai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 91f788f6f6b5..77b65434ccc2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -740,7 +740,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_comm adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); - return amdgpu_ras_block_late_init(adev, adev->gmc.xgmi.ras_if); + return amdgpu_ras_block_late_init(adev, ras_block); } static void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev) -- cgit v1.2.3 From 01d468d9a420152e4a1270992e69a37ea0c98e04 Mon Sep 17 00:00:00 2001 From: yipechai Date: Thu, 17 Feb 2022 14:52:20 +0800 Subject: drm/amdgpu: Modify .ras_fini function pointer parameter Modify .ras_fini function pointer parameter so that we can remove redundant intermediate calls in some ras blocks. Signed-off-by: yipechai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 91817a31f3e1..3e56adea84a3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -768,7 +768,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_comm return amdgpu_ras_block_late_init(adev, ras_block); } -static void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev) +static void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block) { if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) && adev->gmc.xgmi.ras_if) -- cgit v1.2.3 From 667c7091a39e8b360d34f37aed5f8dd85bdc45f7 Mon Sep 17 00:00:00 2001 From: yipechai Date: Thu, 17 Feb 2022 14:55:19 +0800 Subject: drm/amdgpu: Optimize xxx_ras_fini function of each ras block 1. Move the variables of ras block instance members from specific xxx_ras_fini to general ras_fini call. 2. Function calls inside the modules only use parameters passed from xxx_ras_fini instead of ras block instance members. Signed-off-by: yipechai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 3e56adea84a3..a54257760f9e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -771,8 +771,8 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_comm static void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block) { if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) && - adev->gmc.xgmi.ras_if) - amdgpu_ras_block_late_fini(adev, adev->gmc.xgmi.ras_if); + ras_block) + amdgpu_ras_block_late_fini(adev, ras_block); } uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, -- cgit v1.2.3 From f148c143ef3f6e897f4a1012d1bcae3aa240bd8a Mon Sep 17 00:00:00 2001 From: yipechai Date: Mon, 14 Feb 2022 16:10:32 +0800 Subject: drm/amdgpu: Remove redundant calls of amdgpu_ras_block_late_fini in xgmi ras block Remove redundant calls of amdgpu_ras_block_late_fini in xgmi ras block. Signed-off-by: yipechai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index a54257760f9e..05dc76f6fdc6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -768,13 +768,6 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_comm return amdgpu_ras_block_late_init(adev, ras_block); } -static void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block) -{ - if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) && - ras_block) - amdgpu_ras_block_late_fini(adev, ras_block); -} - uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, uint64_t addr) { @@ -982,6 +975,6 @@ struct amdgpu_xgmi_ras xgmi_ras = { }, .hw_ops = &xgmi_ras_hw_ops, .ras_late_init = amdgpu_xgmi_ras_late_init, - .ras_fini = amdgpu_xgmi_ras_fini, + .ras_fini = amdgpu_ras_block_late_fini, }, }; -- cgit v1.2.3 From 80e0c2cb37b6e00ec0b41c7670e24f72b2d54ceb Mon Sep 17 00:00:00 2001 From: yipechai Date: Thu, 17 Feb 2022 15:40:12 +0800 Subject: drm/amdgpu: Remove redundant .ras_fini initialization in some ras blocks 1. Define amdgpu_ras_block_late_fini_default in amdgpu_ras.c as .ras_fini common function, which is called when .ras_fini of ras block isn't initialized. 2. Remove the code of using amdgpu_ras_block_late_fini to initialize .ras_fini in ras blocks. Signed-off-by: yipechai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 05dc76f6fdc6..05c0d6e2c75c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -975,6 +975,5 @@ struct amdgpu_xgmi_ras xgmi_ras = { }, .hw_ops = &xgmi_ras_hw_ops, .ras_late_init = amdgpu_xgmi_ras_late_init, - .ras_fini = amdgpu_ras_block_late_fini, }, }; -- cgit v1.2.3 From a03b288650abf2a92d5ecdaa737e3d04a2a77984 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 10 Mar 2022 15:53:04 +0800 Subject: drm/amdgpu: drop xmgi23 error query/reset support xgmi_ras is only initialized when host to GPU interface is PCIE. in such case, xgmi23 is disabled and protected by security firmware. Host access will results to security violation Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 05c0d6e2c75c..1b108d03e785 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -34,7 +34,6 @@ #include "amdgpu_reset.h" -#define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210 #define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c #define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210 @@ -69,17 +68,6 @@ static const int wafl_pcs_err_status_reg_arct[] = { smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000, }; -static const int xgmi23_pcs_err_status_reg_aldebaran[] = { - smnPCS_XGMI23_PCS_ERROR_STATUS, - smnPCS_XGMI23_PCS_ERROR_STATUS + 0x100000, - smnPCS_XGMI23_PCS_ERROR_STATUS + 0x200000, - smnPCS_XGMI23_PCS_ERROR_STATUS + 0x300000, - smnPCS_XGMI23_PCS_ERROR_STATUS + 0x400000, - smnPCS_XGMI23_PCS_ERROR_STATUS + 0x500000, - smnPCS_XGMI23_PCS_ERROR_STATUS + 0x600000, - smnPCS_XGMI23_PCS_ERROR_STATUS + 0x700000 -}; - static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = { smnPCS_XGMI3X16_PCS_ERROR_STATUS, smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000, @@ -797,9 +785,6 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev) xgmi_pcs_err_status_reg_vg20[i]); break; case CHIP_ALDEBARAN: - for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) - pcs_clear_status(adev, - xgmi23_pcs_err_status_reg_aldebaran[i]); for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) pcs_clear_status(adev, xgmi3x16_pcs_err_status_reg_aldebaran[i]); @@ -900,13 +885,6 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, } break; case CHIP_ALDEBARAN: - /* check xgmi23 pcs error */ - for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) { - data = RREG32_PCIE(xgmi23_pcs_err_status_reg_aldebaran[i]); - if (data) - amdgpu_xgmi_query_pcs_error_status(adev, - data, &ue_cnt, &ce_cnt, true); - } /* check xgmi3x16 pcs error */ for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) { data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]); -- cgit v1.2.3