diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
27 files changed, 478 insertions, 77 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 4009d2e30727..6125ba905faf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -49,6 +49,8 @@ #include <linux/rbtree.h> #include <linux/hashtable.h> #include <linux/dma-fence.h> +#include <linux/pci.h> +#include <linux/aer.h> #include <drm/ttm/ttm_bo_api.h> #include <drm/ttm/ttm_bo_driver.h> @@ -987,6 +989,9 @@ struct amdgpu_device { atomic_t throttling_logging_enabled; struct ratelimit_state throttling_logging_rs; uint32_t ras_features; + + bool in_pci_err_recovery; + struct pci_saved_state *pci_state; }; static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) @@ -1260,6 +1265,15 @@ static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev) { return void amdgpu_register_gpu_instance(struct amdgpu_device *adev); void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev); +pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, + pci_channel_state_t state); +pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev); +pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev); +void amdgpu_pci_resume(struct pci_dev *pdev); + +bool amdgpu_device_cache_pci_state(struct pci_dev *pdev); +bool amdgpu_device_load_pci_state(struct pci_dev *pdev); + #include "amdgpu_object.h" /* used by df_v3_6.c and amdgpu_pmu.c */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c index 3e35a8f2c5e5..7abe9500c0c6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c @@ -616,7 +616,7 @@ static bool amdgpu_atpx_detect(void) while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, pdev)) != NULL) { vga_count++; - has_atpx |= (amdgpu_atpx_pci_probe_handle(pdev) == true); + has_atpx |= amdgpu_atpx_pci_probe_handle(pdev); parent_pdev = pci_upstream_bridge(pdev); d3_supported |= parent_pdev && parent_pdev->bridge_d3; @@ -626,7 +626,7 @@ static bool amdgpu_atpx_detect(void) while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_OTHER << 8, pdev)) != NULL) { vga_count++; - has_atpx |= (amdgpu_atpx_pci_probe_handle(pdev) == true); + has_atpx |= amdgpu_atpx_pci_probe_handle(pdev); parent_pdev = pci_upstream_bridge(pdev); d3_supported |= parent_pdev && parent_pdev->bridge_d3; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f7307af76452..2ff43a3d52fc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -319,6 +319,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, { uint32_t ret; + if (adev->in_pci_err_recovery) + return 0; + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && down_read_trylock(&adev->reset_sem)) { ret = amdgpu_kiq_rreg(adev, reg); @@ -355,7 +358,11 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, * * Returns the 8 bit value from the offset specified. */ -uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { +uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) +{ + if (adev->in_pci_err_recovery) + return 0; + if (offset < adev->rmmio_size) return (readb(adev->rmmio + offset)); BUG(); @@ -376,7 +383,11 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { * * Writes the value specified to the offset specified. */ -void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { +void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) +{ + if (adev->in_pci_err_recovery) + return; + if (offset < adev->rmmio_size) writeb(value, adev->rmmio + offset); else @@ -387,6 +398,9 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { + if (adev->in_pci_err_recovery) + return; + trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); if ((reg * 4) < adev->rmmio_size) @@ -414,6 +428,9 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { + if (adev->in_pci_err_recovery) + return; + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && down_read_trylock(&adev->reset_sem)) { amdgpu_kiq_wreg(adev, reg, v); @@ -432,6 +449,9 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { + if (adev->in_pci_err_recovery) + return; + if (amdgpu_sriov_fullaccess(adev) && adev->gfx.rlc.funcs && adev->gfx.rlc.funcs->is_rlcg_access_range) { @@ -453,6 +473,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t */ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) { + if (adev->in_pci_err_recovery) + return 0; + if ((reg * 4) < adev->rio_mem_size) return ioread32(adev->rio_mem + (reg * 4)); else { @@ -472,6 +495,9 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) */ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) { + if (adev->in_pci_err_recovery) + return; + if ((reg * 4) < adev->rio_mem_size) iowrite32(v, adev->rio_mem + (reg * 4)); else { @@ -491,6 +517,9 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) */ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) { + if (adev->in_pci_err_recovery) + return 0; + if (index < adev->doorbell.num_doorbells) { return readl(adev->doorbell.ptr + index); } else { @@ -511,6 +540,9 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) { + if (adev->in_pci_err_recovery) + return; + if (index < adev->doorbell.num_doorbells) { writel(v, adev->doorbell.ptr + index); } else { @@ -529,6 +561,9 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) */ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) { + if (adev->in_pci_err_recovery) + return 0; + if (index < adev->doorbell.num_doorbells) { return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); } else { @@ -549,6 +584,9 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) { + if (adev->in_pci_err_recovery) + return; + if (index < adev->doorbell.num_doorbells) { atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); } else { @@ -1256,7 +1294,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; pci_set_power_state(dev->pdev, PCI_D0); - pci_restore_state(dev->pdev); + amdgpu_device_load_pci_state(dev->pdev); r = pci_enable_device(dev->pdev); if (r) DRM_WARN("pci_enable_device failed (%d)\n", r); @@ -1269,7 +1307,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, drm_kms_helper_poll_disable(dev); dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; amdgpu_device_suspend(dev, true); - pci_save_state(dev->pdev); + amdgpu_device_cache_pci_state(dev->pdev); /* Shut down the device */ pci_disable_device(dev->pdev); pci_set_power_state(dev->pdev, PCI_D3cold); @@ -2999,6 +3037,7 @@ static const struct attribute *amdgpu_dev_attributes[] = { NULL }; + /** * amdgpu_device_init - initialize the driver * @@ -3170,13 +3209,13 @@ int amdgpu_device_init(struct amdgpu_device *adev, r = amdgpu_device_get_job_timeout_settings(adev); if (r) { dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); - return r; + goto failed_unmap; } /* early init functions */ r = amdgpu_device_ip_early_init(adev); if (r) - return r; + goto failed_unmap; /* doorbell bar mapping and doorbell index init*/ amdgpu_device_doorbell_init(adev); @@ -3217,6 +3256,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, } } + pci_enable_pcie_error_reporting(adev->ddev.pdev); + /* Post card if necessary */ if (amdgpu_device_need_post(adev)) { if (!adev->bios) { @@ -3359,16 +3400,18 @@ fence_driver_init: flush_delayed_work(&adev->delayed_init_work); r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); - if (r) { + if (r) dev_err(adev->dev, "Could not create amdgpu device attr\n"); - return r; - } if (IS_ENABLED(CONFIG_PERF_EVENTS)) r = amdgpu_pmu_init(adev); if (r) dev_err(adev->dev, "amdgpu_pmu_init failed\n"); + /* Have stored pci confspace at hand for restore in sudden PCI error */ + if (amdgpu_device_cache_pci_state(adev->pdev)) + pci_restore_state(pdev); + return 0; failed: @@ -3376,6 +3419,10 @@ failed: if (boco) vga_switcheroo_fini_domain_pm_ops(adev->dev); +failed_unmap: + iounmap(adev->rmmio); + adev->rmmio = NULL; + return r; } @@ -3393,6 +3440,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev) flush_delayed_work(&adev->delayed_init_work); adev->shutdown = true; + kfree(adev->pci_state); + /* make sure IB test finished before entering exclusive mode * to avoid preemption on IB test * */ @@ -4072,7 +4121,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, struct list_head *device_list_handle, - bool *need_full_reset_arg) + bool *need_full_reset_arg, + bool skip_hw_reset) { struct amdgpu_device *tmp_adev = NULL; bool need_full_reset = *need_full_reset_arg, vram_lost = false; @@ -4082,7 +4132,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, * ASIC reset has to be done on all HGMI hive nodes ASAP * to allow proper links negotiation in FW (within 1 sec) */ - if (need_full_reset) { + if (!skip_hw_reset && need_full_reset) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { /* For XGMI run all resets in parallel to speed up the process */ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { @@ -4477,7 +4527,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ if (r) adev->asic_reset_res = r; } else { - r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); + r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); if (r && r == -EAGAIN) goto retry; } @@ -4705,3 +4755,235 @@ int amdgpu_device_baco_exit(struct drm_device *dev) return 0; } + +static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) +{ + int i; + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + + cancel_delayed_work_sync(&ring->sched.work_tdr); + } +} + +/** + * amdgpu_pci_error_detected - Called when a PCI error is detected. + * @pdev: PCI device struct + * @state: PCI channel state + * + * Description: Called when a PCI error is detected. + * + * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. + */ +pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int i; + + DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); + + if (adev->gmc.xgmi.num_physical_nodes > 1) { + DRM_WARN("No support for XGMI hive yet..."); + return PCI_ERS_RESULT_DISCONNECT; + } + + switch (state) { + case pci_channel_io_normal: + return PCI_ERS_RESULT_CAN_RECOVER; + /* Fatal error, prepare for slot reset */ + case pci_channel_io_frozen: + /* + * Cancel and wait for all TDRs in progress if failing to + * set adev->in_gpu_reset in amdgpu_device_lock_adev + * + * Locking adev->reset_sem will prevent any external access + * to GPU during PCI error recovery + */ + while (!amdgpu_device_lock_adev(adev, NULL)) + amdgpu_cancel_all_tdr(adev); + + /* + * Block any work scheduling as we do for regular GPU reset + * for the duration of the recovery + */ + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + + drm_sched_stop(&ring->sched, NULL); + } + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + /* Permanent error, prepare for device removal */ + return PCI_ERS_RESULT_DISCONNECT; + } + + return PCI_ERS_RESULT_NEED_RESET; +} + +/** + * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers + * @pdev: pointer to PCI device + */ +pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) +{ + + DRM_INFO("PCI error: mmio enabled callback!!\n"); + + /* TODO - dump whatever for debugging purposes */ + + /* This called only if amdgpu_pci_error_detected returns + * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still + * works, no need to reset slot. + */ + + return PCI_ERS_RESULT_RECOVERED; +} + +/** + * amdgpu_pci_slot_reset - Called when PCI slot has been reset. + * @pdev: PCI device struct + * + * Description: This routine is called by the pci error recovery + * code after the PCI slot has been reset, just before we + * should resume normal operations. + */ +pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int r, i; + bool need_full_reset = true; + u32 memsize; + struct list_head device_list; + + DRM_INFO("PCI error: slot reset callback!!\n"); + + INIT_LIST_HEAD(&device_list); + list_add_tail(&adev->gmc.xgmi.head, &device_list); + + /* wait for asic to come out of reset */ + msleep(500); + + /* Restore PCI confspace */ + amdgpu_device_load_pci_state(pdev); + + /* confirm ASIC came out of reset */ + for (i = 0; i < adev->usec_timeout; i++) { + memsize = amdgpu_asic_get_config_memsize(adev); + + if (memsize != 0xffffffff) + break; + udelay(1); + } + if (memsize == 0xffffffff) { + r = -ETIME; + goto out; + } + + adev->in_pci_err_recovery = true; + r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); + adev->in_pci_err_recovery = false; + if (r) + goto out; + + r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); + +out: + if (!r) { + if (amdgpu_device_cache_pci_state(adev->pdev)) + pci_restore_state(adev->pdev); + + DRM_INFO("PCIe error recovery succeeded\n"); + } else { + DRM_ERROR("PCIe error recovery failed, err:%d", r); + amdgpu_device_unlock_adev(adev); + } + + return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; +} + +/** + * amdgpu_pci_resume() - resume normal ops after PCI reset + * @pdev: pointer to PCI device + * + * Called when the error recovery driver tells us that its + * OK to resume normal operation. Use completion to allow + * halted scsi ops to resume. + */ +void amdgpu_pci_resume(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int i; + + + DRM_INFO("PCI error: resume callback!!\n"); + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + + + drm_sched_resubmit_jobs(&ring->sched); + drm_sched_start(&ring->sched, true); + } + + amdgpu_device_unlock_adev(adev); +} + +bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int r; + + r = pci_save_state(pdev); + if (!r) { + kfree(adev->pci_state); + + adev->pci_state = pci_store_saved_state(pdev); + + if (!adev->pci_state) { + DRM_ERROR("Failed to store PCI saved state"); + return false; + } + } else { + DRM_WARN("Failed to save PCI state, err:%d\n", r); + return false; + } + + return true; +} + +bool amdgpu_device_load_pci_state(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int r; + + if (!adev->pci_state) + return false; + + r = pci_load_saved_state(pdev, adev->pci_state); + + if (!r) { + pci_restore_state(pdev); + } else { + DRM_WARN("Failed to load PCI state, err:%d\n", r); + return false; + } + + return true; +} + + diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h index 61a26c15c8dd..373cdebe0e2f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h @@ -44,9 +44,9 @@ struct amdgpu_df_funcs { void (*enable_ecc_force_par_wr_rmw)(struct amdgpu_device *adev, bool enable); int (*pmc_start)(struct amdgpu_device *adev, uint64_t config, - int is_enable); + int is_add); int (*pmc_stop)(struct amdgpu_device *adev, uint64_t config, - int is_disable); + int is_remove); void (*pmc_get_count)(struct amdgpu_device *adev, uint64_t config, uint64_t *count); uint64_t (*get_fica)(struct amdgpu_device *adev, uint32_t ficaa_val); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 6edde2b9e402..a4b518211b1f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -32,7 +32,6 @@ #include <drm/drm_pciids.h> #include <linux/console.h> #include <linux/module.h> -#include <linux/pci.h> #include <linux/pm_runtime.h> #include <linux/vga_switcheroo.h> #include <drm/drm_probe_helper.h> @@ -1073,8 +1072,16 @@ static const struct pci_device_id pciidlist[] = { {0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU}, /* Navi12 */ - {0x1002, 0x7360, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI12|AMD_EXP_HW_SUPPORT}, - {0x1002, 0x7362, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI12|AMD_EXP_HW_SUPPORT}, + {0x1002, 0x7360, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI12}, + {0x1002, 0x7362, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI12}, + + /* Sienna_Cichlid */ + {0x1002, 0x73A0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID}, + {0x1002, 0x73A2, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID}, + {0x1002, 0x73A3, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID}, + {0x1002, 0x73AB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID}, + {0x1002, 0x73AE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID}, + {0x1002, 0x73BF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID}, {0, 0, 0} }; @@ -1102,6 +1109,16 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, return -ENODEV; } + /* Due to hardware bugs, S/G Display on raven requires a 1:1 IOMMU mapping, + * however, SME requires an indirect IOMMU mapping because the encryption + * bit is beyond the DMA mask of the chip. + */ + if (mem_encrypt_active() && ((flags & AMD_ASIC_MASK) == CHIP_RAVEN)) { + dev_info(&pdev->dev, + "SME is not compatible with RAVEN\n"); + return -ENOTSUPP; + } + #ifdef CONFIG_DRM_AMDGPU_SI if (!amdgpu_si_support) { switch (flags & AMD_ASIC_MASK) { @@ -1316,7 +1333,7 @@ static int amdgpu_pmops_runtime_suspend(struct device *dev) if (amdgpu_is_atpx_hybrid()) { pci_ignore_hotplug(pdev); } else { - pci_save_state(pdev); + amdgpu_device_cache_pci_state(pdev); pci_disable_device(pdev); pci_ignore_hotplug(pdev); pci_set_power_state(pdev, PCI_D3cold); @@ -1349,7 +1366,7 @@ static int amdgpu_pmops_runtime_resume(struct device *dev) pci_set_master(pdev); } else { pci_set_power_state(pdev, PCI_D0); - pci_restore_state(pdev); + amdgpu_device_load_pci_state(pdev); ret = pci_enable_device(pdev); if (ret) return ret; @@ -1528,6 +1545,13 @@ static struct drm_driver kms_driver = { .patchlevel = KMS_DRIVER_PATCHLEVEL, }; +static struct pci_error_handlers amdgpu_pci_err_handler = { + .error_detected = amdgpu_pci_error_detected, + .mmio_enabled = amdgpu_pci_mmio_enabled, + .slot_reset = amdgpu_pci_slot_reset, + .resume = amdgpu_pci_resume, +}; + static struct pci_driver amdgpu_kms_pci_driver = { .name = DRIVER_NAME, .id_table = pciidlist, @@ -1535,6 +1559,7 @@ static struct pci_driver amdgpu_kms_pci_driver = { .remove = amdgpu_pci_remove, .shutdown = amdgpu_pci_shutdown, .driver.pm = &amdgpu_pm_ops, + .err_handler = &amdgpu_pci_err_handler, }; static int __init amdgpu_init(void) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index d6981425ec51..8c9bacfdbc30 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -693,6 +693,9 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) struct amdgpu_kiq *kiq = &adev->gfx.kiq; struct amdgpu_ring *ring = &kiq->ring; + if (adev->in_pci_err_recovery) + return 0; + BUG_ON(!ring->funcs->emit_rreg); spin_lock_irqsave(&kiq->ring_lock, flags); @@ -757,6 +760,9 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) BUG_ON(!ring->funcs->emit_wreg); + if (adev->in_pci_err_recovery) + return; + spin_lock_irqsave(&kiq->ring_lock, flags); amdgpu_ring_alloc(ring, 32); amdgpu_ring_emit_wreg(ring, reg, v); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 456a4a93b337..bccaf4f77647 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -282,14 +282,25 @@ static int amdgpu_firmware_info(struct drm_amdgpu_info_firmware *fw_info, fw_info->feature = 0; break; case AMDGPU_INFO_FW_TA: - if (query_fw->index > 1) - return -EINVAL; - if (query_fw->index == 0) { + switch (query_fw->index) { + case 0: fw_info->ver = adev->psp.ta_fw_version; fw_info->feature = adev->psp.ta_xgmi_ucode_version; - } else { + break; + case 1: fw_info->ver = adev->psp.ta_fw_version; fw_info->feature = adev->psp.ta_ras_ucode_version; + break; + case 2: + fw_info->ver = adev->psp.ta_fw_version; + fw_info->feature = adev->psp.ta_hdcp_ucode_version; + break; + case 3: + fw_info->ver = adev->psp.ta_fw_version; + fw_info->feature = adev->psp.ta_dtm_ucode_version; + break; + default: + return -EINVAL; } break; case AMDGPU_INFO_FW_SDMA: @@ -1385,13 +1396,31 @@ static int amdgpu_debugfs_firmware_info(struct seq_file *m, void *data) fw_info.feature, fw_info.ver); query_fw.fw_type = AMDGPU_INFO_FW_TA; - for (i = 0; i < 2; i++) { + for (i = 0; i < 4; i++) { query_fw.index = i; ret = amdgpu_firmware_info(&fw_info, &query_fw, adev); if (ret) continue; - seq_printf(m, "TA %s feature version: %u, firmware version: 0x%08x\n", - i ? "RAS" : "XGMI", fw_info.feature, fw_info.ver); + switch (query_fw.index) { + case 0: + seq_printf(m, "TA %s feature version: 0x%08x, firmware version: 0x%08x\n", + "RAS", fw_info.feature, fw_info.ver); + break; + case 1: + seq_printf(m, "TA %s feature version: 0x%08x, firmware version: 0x%08x\n", + "XGMI", fw_info.feature, fw_info.ver); + break; + case 2: + seq_printf(m, "TA %s feature version: 0x%08x, firmware version: 0x%08x\n", + "HDCP", fw_info.feature, fw_info.ver); + break; + case 3: + seq_printf(m, "TA %s feature version: 0x%08x, firmware version: 0x%08x\n", + "DTM", fw_info.feature, fw_info.ver); + break; + default: + return -EINVAL; + } } /* SMC */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h index 04a430e0e2e1..a04decb934b0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h @@ -46,6 +46,7 @@ #include <drm/drm_dp_mst_helper.h> #include "modules/inc/mod_freesync.h" +#include "amdgpu_dm_irq_params.h" struct amdgpu_bo; struct amdgpu_device; @@ -404,7 +405,8 @@ struct amdgpu_crtc { struct amdgpu_flip_work *pflip_works; enum amdgpu_flip_status pflip_status; int deferred_flip_completion; - u32 last_flip_vblank; + /* parameters access from DM IRQ handler */ + struct dm_irq_params dm_irq_params; /* pll sharing */ struct amdgpu_atom_ss ss; bool ss_enabled; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index d6c38e24f130..2c66e20b2ed9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -178,7 +178,7 @@ static int psp_sw_init(void *handle) return ret; } - if (adev->asic_type == CHIP_NAVI10) { + if (adev->asic_type == CHIP_NAVI10 || adev->asic_type == CHIP_SIENNA_CICHLID) { ret= psp_sysfs_init(adev); if (ret) { return ret; @@ -219,6 +219,9 @@ int psp_wait_for(struct psp_context *psp, uint32_t reg_index, int i; struct amdgpu_device *adev = psp->adev; + if (psp->adev->in_pci_err_recovery) + return 0; + for (i = 0; i < adev->usec_timeout; i++) { val = RREG32(reg_index); if (check_changed) { @@ -245,6 +248,9 @@ psp_cmd_submit_buf(struct psp_context *psp, bool ras_intr = false; bool skip_unsupport = false; + if (psp->adev->in_pci_err_recovery) + return 0; + mutex_lock(&psp->mutex); memset(psp->cmd_buf_mem, 0, PSP_CMD_BUFFER_SIZE); @@ -929,6 +935,7 @@ static int psp_ras_load(struct psp_context *psp) { int ret; struct psp_gfx_cmd_resp *cmd; + struct ta_ras_shared_memory *ras_cmd; /* * TODO: bypass the loading in sriov for now @@ -952,11 +959,20 @@ static int psp_ras_load(struct psp_context *psp) ret = psp_cmd_submit_buf(psp, NULL, cmd, psp->fence_buf_mc_addr); + ras_cmd = (struct ta_ras_shared_memory*)psp->ras.ras_shared_buf; + if (!ret) { - psp->ras.ras_initialized = true; psp->ras.session_id = cmd->resp.session_id; + + if (!ras_cmd->ras_status) + psp->ras.ras_initialized = true; + else + dev_warn(psp->adev->dev, "RAS Init Status: 0x%X\n", ras_cmd->ras_status); } + if (ret || ras_cmd->ras_status) + amdgpu_ras_fini(psp->adev); + kfree(cmd); return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index a18dc878339a..4a85f8cedd77 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1021,6 +1021,7 @@ static int amdgpu_ttm_tt_pin_userptr(struct ttm_tt *ttm) release_sg: kfree(ttm->sg); + ttm->sg = NULL; return r; } @@ -1155,7 +1156,12 @@ static int amdgpu_ttm_backend_bind(struct ttm_tt *ttm, } /** - * amdgpu_ttm_alloc_gart - Allocate GART memory f |