diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-21 08:10:03 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-21 08:10:03 -0800 |
commit | d9de5ce8a5ec8f97c9468244fd85ff1a10363b60 (patch) | |
tree | 6de3a4ef226313aceae0ed920b1a3e28f42ac144 | |
parent | 0246725d7399d7d6acc8fd5a1a0a1ffce9a1eaa3 (diff) | |
parent | 28980db94742f9f2fb0f68ea35f2171b38007bae (diff) | |
download | linux-d9de5ce8a5ec8f97c9468244fd85ff1a10363b60.tar.gz linux-d9de5ce8a5ec8f97c9468244fd85ff1a10363b60.tar.bz2 linux-d9de5ce8a5ec8f97c9468244fd85ff1a10363b60.zip |
Merge tag 'edac_updates_for_v6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras
Pull EDAC updates from Borislav Petkov:
- Add a driver for the RAS functionality on Xilinx's on chip memory
controller
- Add support for decoding errors from the first and second level
memory on SKL-based hardware
- Add support for the memory controllers in Intel Granite Rapids and
Emerald Rapids machines
- First round of amd64_edac driver simplification and removal of
unneeded functionality
- The usual cleanups and fixes
* tag 'edac_updates_for_v6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras:
EDAC/amd64: Shut up an -Werror,-Wsometimes-uninitialized clang false positive
EDAC/amd64: Remove early_channel_count()
EDAC/amd64: Remove PCI Function 0
EDAC/amd64: Remove PCI Function 6
EDAC/amd64: Remove scrub rate control for Family 17h and later
EDAC/amd64: Don't set up EDAC PCI control on Family 17h+
EDAC/i10nm: Add driver decoder for Sapphire Rapids server
EDAC/i10nm: Add Intel Granite Rapids server support
EDAC/i10nm: Make more configurations CPU model specific
EDAC/i10nm: Add Intel Emerald Rapids server support
EDAC/skx_common: Delete duplicated and unreachable code
EDAC/skx_common: Enable EDAC support for the "near" memory
EDAC/qcom: Add platform_device_id table for module autoloading
EDAC/zynqmp: Add EDAC support for Xilinx ZynqMP OCM
dt-bindings: edac: Add bindings for Xilinx ZynqMP OCM
-rw-r--r-- | Documentation/devicetree/bindings/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml | 45 | ||||
-rw-r--r-- | MAINTAINERS | 7 | ||||
-rw-r--r-- | drivers/edac/Kconfig | 8 | ||||
-rw-r--r-- | drivers/edac/Makefile | 1 | ||||
-rw-r--r-- | drivers/edac/amd64_edac.c | 217 | ||||
-rw-r--r-- | drivers/edac/amd64_edac.h | 24 | ||||
-rw-r--r-- | drivers/edac/i10nm_base.c | 459 | ||||
-rw-r--r-- | drivers/edac/qcom_edac.c | 7 | ||||
-rw-r--r-- | drivers/edac/skx_common.c | 76 | ||||
-rw-r--r-- | drivers/edac/skx_common.h | 61 | ||||
-rw-r--r-- | drivers/edac/zynqmp_edac.c | 467 |
11 files changed, 1013 insertions, 359 deletions
diff --git a/Documentation/devicetree/bindings/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml b/Documentation/devicetree/bindings/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml new file mode 100644 index 000000000000..ca9fc747bf4f --- /dev/null +++ b/Documentation/devicetree/bindings/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Xilinx Zynqmp OCM(On-Chip Memory) Controller + +maintainers: + - Shubhrajyoti Datta <shubhrajyoti.datta@amd.com> + - Sai Krishna Potthuri <sai.krishna.potthuri@amd.com> + +description: | + The OCM supports 64-bit wide ECC functionality to detect multi-bit errors + and recover from a single-bit memory fault.On a write, if all bytes are + being written, the ECC is generated and written into the ECC RAM along with + the write-data that is written into the data RAM. If one or more bytes are + not written, then the read operation results in an correctable error or + uncorrectable error. + +properties: + compatible: + const: xlnx,zynqmp-ocmc-1.0 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - interrupts + +additionalProperties: false + +examples: + - | + #include <dt-bindings/interrupt-controller/arm-gic.h> + memory-controller@ff960000 { + compatible = "xlnx,zynqmp-ocmc-1.0"; + reg = <0xff960000 0x1000>; + interrupts = <GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>; + }; diff --git a/MAINTAINERS b/MAINTAINERS index d0b9548d26f7..82f6dfe8c215 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22743,6 +22743,13 @@ F: Documentation/devicetree/bindings/dma/xilinx/xlnx,zynqmp-dpdma.yaml F: drivers/dma/xilinx/xilinx_dpdma.c F: include/dt-bindings/dma/xlnx-zynqmp-dpdma.h +XILINX ZYNQMP OCM EDAC DRIVER +M: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com> +M: Sai Krishna Potthuri <sai.krishna.potthuri@amd.com> +S: Maintained +F: Documentation/devicetree/bindings/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml +F: drivers/edac/zynqmp_edac.c + XILINX ZYNQMP PSGTR PHY DRIVER M: Anurag Kumar Vulisha <anurag.kumar.vulisha@xilinx.com> M: Laurent Pinchart <laurent.pinchart@ideasonboard.com> diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 4cfdefbd744d..68f576700911 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -542,4 +542,12 @@ config EDAC_DMC520 Support for error detection and correction on the SoCs with ARM DMC-520 DRAM controller. +config EDAC_ZYNQMP + tristate "Xilinx ZynqMP OCM Controller" + depends on ARCH_ZYNQMP || COMPILE_TEST + help + This driver supports error detection and correction for the + Xilinx ZynqMP OCM (On Chip Memory) controller. It can also be + built as a module. In that case it will be called zynqmp_edac. + endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 2d1641a27a28..9b025c5b3061 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -84,3 +84,4 @@ obj-$(CONFIG_EDAC_QCOM) += qcom_edac.o obj-$(CONFIG_EDAC_ASPEED) += aspeed_edac.o obj-$(CONFIG_EDAC_BLUEFIELD) += bluefield_edac.o obj-$(CONFIG_EDAC_DMC520) += dmc520_edac.o +obj-$(CONFIG_EDAC_ZYNQMP) += zynqmp_edac.o diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index e3318e5575a3..5b42533f306a 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -182,21 +182,6 @@ static inline int amd64_read_dct_pci_cfg(struct amd64_pvt *pvt, u8 dct, * other archs, we might not have access to the caches directly. */ -static inline void __f17h_set_scrubval(struct amd64_pvt *pvt, u32 scrubval) -{ - /* - * Fam17h supports scrub values between 0x5 and 0x14. Also, the values - * are shifted down by 0x5, so scrubval 0x5 is written to the register - * as 0x0, scrubval 0x6 as 0x1, etc. - */ - if (scrubval >= 0x5 && scrubval <= 0x14) { - scrubval -= 0x5; - pci_write_bits32(pvt->F6, F17H_SCR_LIMIT_ADDR, scrubval, 0xF); - pci_write_bits32(pvt->F6, F17H_SCR_BASE_ADDR, 1, 0x1); - } else { - pci_write_bits32(pvt->F6, F17H_SCR_BASE_ADDR, 0, 0x1); - } -} /* * Scan the scrub rate mapping table for a close or matching bandwidth value to * issue. If requested is too big, then use last maximum value found. @@ -229,9 +214,7 @@ static int __set_scrub_rate(struct amd64_pvt *pvt, u32 new_bw, u32 min_rate) scrubval = scrubrates[i].scrubval; - if (pvt->umc) { - __f17h_set_scrubval(pvt, scrubval); - } else if (pvt->fam == 0x15 && pvt->model == 0x60) { + if (pvt->fam == 0x15 && pvt->model == 0x60) { f15h_select_dct(pvt, 0); pci_write_bits32(pvt->F2, F15H_M60H_SCRCTRL, scrubval, 0x001F); f15h_select_dct(pvt, 1); @@ -271,16 +254,7 @@ static int get_scrub_rate(struct mem_ctl_info *mci) int i, retval = -EINVAL; u32 scrubval = 0; - if (pvt->umc) { - amd64_read_pci_cfg(pvt->F6, F17H_SCR_BASE_ADDR, &scrubval); - if (scrubval & BIT(0)) { - amd64_read_pci_cfg(pvt->F6, F17H_SCR_LIMIT_ADDR, &scrubval); - scrubval &= 0xF; - scrubval += 0x5; - } else { - scrubval = 0; - } - } else if (pvt->fam == 0x15) { + if (pvt->fam == 0x15) { /* Erratum #505 */ if (pvt->model < 0x10) f15h_select_dct(pvt, 0); @@ -1454,9 +1428,6 @@ static void __dump_misc_regs_df(struct amd64_pvt *pvt) debug_display_dimm_sizes_df(pvt, i); } - - edac_dbg(1, "F0x104 (DRAM Hole Address): 0x%08x, base: 0x%08x\n", - pvt->dhar, dhar_base(pvt)); } /* Display and decode various NB registers for debug purposes. */ @@ -1491,6 +1462,8 @@ static void __dump_misc_regs(struct amd64_pvt *pvt) /* Only if NOT ganged does dclr1 have valid info */ if (!dct_ganging_enabled(pvt)) debug_dump_dramcfg_low(pvt, pvt->dclr1, 1); + + edac_dbg(1, " DramHoleValid: %s\n", dhar_valid(pvt) ? "yes" : "no"); } /* Display and decode various NB registers for debug purposes. */ @@ -1501,8 +1474,6 @@ static void dump_misc_regs(struct amd64_pvt *pvt) else __dump_misc_regs(pvt); - edac_dbg(1, " DramHoleValid: %s\n", dhar_valid(pvt) ? "yes" : "no"); - amd64_info("using x%u syndromes.\n", pvt->ecc_sym_sz); } @@ -1732,24 +1703,6 @@ ddr3: pvt->dram_type = (pvt->dclr0 & BIT(16)) ? MEM_DDR3 : MEM_RDDR3; } -/* Get the number of DCT channels the memory controller is using. */ -static int k8_early_channel_count(struct amd64_pvt *pvt) -{ - int flag; - - if (pvt->ext_model >= K8_REV_F) - /* RevF (NPT) and later */ - flag = pvt->dclr0 & WIDTH_128; - else - /* RevE and earlier */ - flag = pvt->dclr0 & REVE_WIDTH_128; - - /* not used */ - pvt->dclr1 = 0; - - return (flag) ? 2 : 1; -} - /* On F10h and later ErrAddr is MC4_ADDR[47:1] */ static u64 get_error_address(struct amd64_pvt *pvt, struct mce *m) { @@ -2001,69 +1954,6 @@ static int k8_dbam_to_chip_select(struct amd64_pvt *pvt, u8 dct, } } -/* - * Get the number of DCT channels in use. - * - * Return: - * number of Memory Channels in operation - * Pass back: - * contents of the DCL0_LOW register - */ -static int f1x_early_channel_count(struct amd64_pvt *pvt) -{ - int i, j, channels = 0; - - /* On F10h, if we are in 128 bit mode, then we are using 2 channels */ - if (pvt->fam == 0x10 && (pvt->dclr0 & WIDTH_128)) - return 2; - - /* - * Need to check if in unganged mode: In such, there are 2 channels, - * but they are not in 128 bit mode and thus the above 'dclr0' status - * bit will be OFF. - * - * Need to check DCT0[0] and DCT1[0] to see if only one of them has - * their CSEnable bit on. If so, then SINGLE DIMM case. - */ - edac_dbg(0, "Data width is not 128 bits - need more decoding\n"); - - /* - * Check DRAM Bank Address Mapping values for each DIMM to see if there - * is more than just one DIMM present in unganged mode. Need to check - * both controllers since DIMMs can be placed in either one. - */ - for (i = 0; i < 2; i++) { - u32 dbam = (i ? pvt->dbam1 : pvt->dbam0); - - for (j = 0; j < 4; j++) { - if (DBAM_DIMM(j, dbam) > 0) { - channels++; - break; - } - } - } - - if (channels > 2) - channels = 2; - - amd64_info("MCT channel count: %d\n", channels); - - return channels; -} - -static int f17_early_channel_count(struct amd64_pvt *pvt) -{ - int i, channels = 0; - - /* SDP Control bit 31 (SdpInit) is clear for unused UMC channels */ - for_each_umc(i) - channels += !!(pvt->umc[i].sdp_ctrl & UMC_SDP_INIT); - - amd64_info("MCT channel count: %d\n", channels); - - return channels; -} - static int ddr3_cs_size(unsigned i, bool dct_width) { unsigned shift = 0; @@ -2858,7 +2748,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, .max_mcs = 2, .ops = { - .early_channel_count = k8_early_channel_count, .map_sysaddr_to_csrow = k8_map_sysaddr_to_csrow, .dbam_to_cs = k8_dbam_to_chip_select, } @@ -2869,7 +2758,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_10H_NB_DRAM, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f10_dbam_to_chip_select, } @@ -2880,7 +2768,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_15H_NB_F2, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f15_dbam_to_chip_select, } @@ -2891,7 +2778,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F2, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f16_dbam_to_chip_select, } @@ -2902,7 +2788,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F2, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f15_m60h_dbam_to_chip_select, } @@ -2913,7 +2798,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_16H_NB_F2, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f16_dbam_to_chip_select, } @@ -2924,89 +2808,64 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F2, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f16_dbam_to_chip_select, } }, [F17_CPUS] = { .ctl_name = "F17h", - .f0_id = PCI_DEVICE_ID_AMD_17H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_17H_DF_F6, .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, [F17_M10H_CPUS] = { .ctl_name = "F17h_M10h", - .f0_id = PCI_DEVICE_ID_AMD_17H_M10H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_17H_M10H_DF_F6, .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, [F17_M30H_CPUS] = { .ctl_name = "F17h_M30h", - .f0_id = PCI_DEVICE_ID_AMD_17H_M30H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_17H_M30H_DF_F6, .max_mcs = 8, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, [F17_M60H_CPUS] = { .ctl_name = "F17h_M60h", - .f0_id = PCI_DEVICE_ID_AMD_17H_M60H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_17H_M60H_DF_F6, .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, [F17_M70H_CPUS] = { .ctl_name = "F17h_M70h", - .f0_id = PCI_DEVICE_ID_AMD_17H_M70H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_17H_M70H_DF_F6, .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, [F19_CPUS] = { .ctl_name = "F19h", - .f0_id = PCI_DEVICE_ID_AMD_19H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_19H_DF_F6, .max_mcs = 8, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, [F19_M10H_CPUS] = { .ctl_name = "F19h_M10h", - .f0_id = PCI_DEVICE_ID_AMD_19H_M10H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_19H_M10H_DF_F6, .max_mcs = 12, .flags.zn_regs_v2 = 1, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, [F19_M50H_CPUS] = { .ctl_name = "F19h_M50h", - .f0_id = PCI_DEVICE_ID_AMD_19H_M50H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_19H_M50H_DF_F6, .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -3316,36 +3175,12 @@ log_error: /* * Use pvt->F3 which contains the F3 CPU PCI device to get the related * F1 (AddrMap) and F2 (Dct) devices. Return negative value on error. - * Reserve F0 and F6 on systems with a UMC. */ static int reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) { - if (pvt->umc) { - pvt->F0 = pci_get_related_function(pvt->F3->vendor, pci_id1, pvt->F3); - if (!pvt->F0) { - edac_dbg(1, "F0 not found, device 0x%x\n", pci_id1); - return -ENODEV; - } - - pvt->F6 = pci_get_related_function(pvt->F3->vendor, pci_id2, pvt->F3); - if (!pvt->F6) { - pci_dev_put(pvt->F0); - pvt->F0 = NULL; - - edac_dbg(1, "F6 not found: device 0x%x\n", pci_id2); - return -ENODEV; - } - - if (!pci_ctl_dev) - pci_ctl_dev = &pvt->F0->dev; - - edac_dbg(1, "F0: %s\n", pci_name(pvt->F0)); - edac_dbg(1, "F3: %s\n", pci_name(pvt->F3)); - edac_dbg(1, "F6: %s\n", pci_name(pvt->F6)); - + if (pvt->umc) return 0; - } /* Reserve the ADDRESS MAP Device */ pvt->F1 = pci_get_related_function(pvt->F3->vendor, pci_id1, pvt->F3); @@ -3377,8 +3212,7 @@ reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) static void free_mc_sibling_devs(struct amd64_pvt *pvt) { if (pvt->umc) { - pci_dev_put(pvt->F0); - pci_dev_put(pvt->F6); + return; } else { pci_dev_put(pvt->F1); pci_dev_put(pvt->F2); @@ -3468,7 +3302,6 @@ static void read_mc_regs(struct amd64_pvt *pvt) if (pvt->umc) { __read_mc_regs_df(pvt); - amd64_read_pci_cfg(pvt->F0, DF_DHAR, &pvt->dhar); goto skip; } @@ -3691,7 +3524,7 @@ static int init_csrows(struct mem_ctl_info *mci) : EDAC_SECDED; } - for (j = 0; j < pvt->channel_count; j++) { + for (j = 0; j < fam_type->max_mcs; j++) { dimm = csrow->channels[j]->dimm; dimm->mtype = pvt->dram_type; dimm->edac_mode = edac_mode; @@ -3967,6 +3800,9 @@ static void setup_mci_misc_attrs(struct mem_ctl_info *mci) mci->dev_name = pci_name(pvt->F3); mci->ctl_page_to_phys = NULL; + if (pvt->fam >= 0x17) + return; + /* memory scrubber interface */ mci->set_sdram_scrub_rate = set_scrub_rate; mci->get_sdram_scrub_rate = get_scrub_rate; @@ -4092,16 +3928,13 @@ static const struct attribute_group *amd64_edac_attr_groups[] = { static int hw_info_get(struct amd64_pvt *pvt) { - u16 pci_id1, pci_id2; + u16 pci_id1 = 0, pci_id2 = 0; int ret; if (pvt->fam >= 0x17) { pvt->umc = kcalloc(fam_type->max_mcs, sizeof(struct amd64_umc), GFP_KERNEL); if (!pvt->umc) return -ENOMEM; - - pci_id1 = fam_type->f0_id; - pci_id2 = fam_type->f6_id; } else { pci_id1 = fam_type->f1_id; pci_id2 = fam_type->f2_id; @@ -4118,7 +3951,7 @@ static int hw_info_get(struct amd64_pvt *pvt) static void hw_info_put(struct amd64_pvt *pvt) { - if (pvt->F0 || pvt->F1) + if (pvt->F1) free_mc_sibling_devs(pvt); kfree(pvt->umc); @@ -4128,28 +3961,12 @@ static int init_one_instance(struct amd64_pvt *pvt) { struct mem_ctl_info *mci = NULL; struct edac_mc_layer layers[2]; - int ret = -EINVAL; - - /* - * We need to determine how many memory channels there are. Then use - * that information for calculating the size of the dynamic instance - * tables in the 'mci' structure. - */ - pvt->channel_count = pvt->ops->early_channel_count(pvt); - if (pvt->channel_count < 0) - return ret; + int ret = -ENOMEM; - ret = -ENOMEM; layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; layers[0].size = pvt->csels[0].b_cnt; layers[0].is_virt_csrow = true; layers[1].type = EDAC_MC_LAYER_CHANNEL; - - /* - * Always allocate two channels since we can have setups with DIMMs on - * only one channel. Also, this simplifies handling later for the price - * of a couple of KBs tops. - */ layers[1].size = fam_type->max_mcs; layers[1].is_virt_csrow = false; @@ -4370,12 +4187,12 @@ static int __init amd64_edac_init(void) } /* register stuff with EDAC MCE */ - if (boot_cpu_data.x86 >= 0x17) + if (boot_cpu_data.x86 >= 0x17) { amd_register_ecc_decoder(decode_umc_error); - else + } else { amd_register_ecc_decoder(decode_bus_error); - - setup_pci_device(); + setup_pci_device(); + } #ifdef CONFIG_X86_32 amd64_err("%s on 32-bit is unsupported. USE AT YOUR OWN RISK!\n", EDAC_MOD_STR); diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 38e5ad95d010..e4329dff8cf2 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -114,22 +114,6 @@ #define PCI_DEVICE_ID_AMD_16H_NB_F2 0x1532 #define PCI_DEVICE_ID_AMD_16H_M30H_NB_F1 0x1581 #define PCI_DEVICE_ID_AMD_16H_M30H_NB_F2 0x1582 -#define PCI_DEVICE_ID_AMD_17H_DF_F0 0x1460 -#define PCI_DEVICE_ID_AMD_17H_DF_F6 0x1466 -#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F0 0x15e8 -#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F6 0x15ee -#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F0 0x1490 -#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F6 0x1496 -#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F0 0x1448 -#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F6 0x144e -#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F0 0x1440 -#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F6 0x1446 -#define PCI_DEVICE_ID_AMD_19H_DF_F0 0x1650 -#define PCI_DEVICE_ID_AMD_19H_DF_F6 0x1656 -#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F0 0x14ad -#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F6 0x14b3 -#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F0 0x166a -#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F6 0x1670 /* * Function 1 - Address Map @@ -215,8 +199,6 @@ #define DCT_SEL_HI 0x114 #define F15H_M60H_SCRCTRL 0x1C8 -#define F17H_SCR_BASE_ADDR 0x48 -#define F17H_SCR_LIMIT_ADDR 0x4C /* * Function 3 - Misc Control @@ -356,7 +338,7 @@ struct amd64_pvt { struct low_ops *ops; /* pci_device handles which we utilize */ - struct pci_dev *F0, *F1, *F2, *F3, *F6; + struct pci_dev *F1, *F2, *F3; u16 mc_node_id; /* MC index of this MC node */ u8 fam; /* CPU family */ @@ -364,7 +346,6 @@ struct amd64_pvt { u8 stepping; /* ... stepping */ int ext_model; /* extended model value of this node */ - int channel_count; /* Raw registers */ u32 dclr0; /* DRAM Configuration Low DCT0 reg */ @@ -484,7 +465,6 @@ struct ecc_settings { * functions and per device encoding/decoding logic. */ struct low_ops { - int (*early_channel_count) (struct amd64_pvt *pvt); void (*map_sysaddr_to_csrow) (struct mem_ctl_info *mci, u64 sys_addr, struct err_info *); int (*dbam_to_cs) (struct amd64_pvt *pvt, u8 dct, @@ -503,7 +483,7 @@ struct amd64_family_flags { struct amd64_family_type { const char *ctl_name; - u16 f0_id, f1_id, f2_id, f6_id; + u16 f1_id, f2_id; /* Maximum number of memory controllers per die/node. */ u8 max_mcs; struct amd64_family_flags flags; diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 65aeea53e2df..0a4691792801 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -13,7 +13,7 @@ #include "edac_module.h" #include "skx_common.h" -#define I10NM_REVISION "v0.0.5" +#define I10NM_REVISION "v0.0.6" #define EDAC_MOD_STR "i10nm_edac" /* Debug macros */ @@ -22,25 +22,34 @@ #define I10NM_GET_SCK_BAR(d, reg) \ pci_read_config_dword((d)->uracu, 0xd0, &(reg)) -#define I10NM_GET_IMC_BAR(d, i, reg) \ - pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) +#define I10NM_GET_IMC_BAR(d, i, reg) \ + pci_read_config_dword((d)->uracu, \ + (res_cfg->type == GNR ? 0xd4 : 0xd8) + (i) * 4, &(reg)) #define I10NM_GET_SAD(d, offset, i, reg)\ - pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg)) + pci_read_config_dword((d)->sad_all, (offset) + (i) * \ + (res_cfg->type == GNR ? 12 : 8), &(reg)) #define I10NM_GET_HBM_IMC_BAR(d, reg) \ pci_read_config_dword((d)->uracu, 0xd4, &(reg)) #define I10NM_GET_CAPID3_CFG(d, reg) \ - pci_read_config_dword((d)->pcu_cr3, 0x90, &(reg)) + pci_read_config_dword((d)->pcu_cr3, \ + res_cfg->type == GNR ? 0x290 : 0x90, &(reg)) +#define I10NM_GET_CAPID5_CFG(d, reg) \ + pci_read_config_dword((d)->pcu_cr3, \ + res_cfg->type == GNR ? 0x298 : 0x98, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ - readl((m)->mbase + ((m)->hbm_mc ? 0x80c : 0x2080c) + \ + readl((m)->mbase + ((m)->hbm_mc ? 0x80c : \ + (res_cfg->type == GNR ? 0xc0c : 0x2080c)) + \ (i) * (m)->chan_mmio_sz + (j) * 4) #define I10NM_GET_MCDDRTCFG(m, i) \ readl((m)->mbase + ((m)->hbm_mc ? 0x970 : 0x20970) + \ (i) * (m)->chan_mmio_sz) #define I10NM_GET_MCMTR(m, i) \ - readl((m)->mbase + ((m)->hbm_mc ? 0xef8 : 0x20ef8) + \ + readl((m)->mbase + ((m)->hbm_mc ? 0xef8 : \ + (res_cfg->type == GNR ? 0xaf8 : 0x20ef8)) + \ (i) * (m)->chan_mmio_sz) #define I10NM_GET_AMAP(m, i) \ - readl((m)->mbase + ((m)->hbm_mc ? 0x814 : 0x20814) + \ + readl((m)->mbase + ((m)->hbm_mc ? 0x814 : \ + (res_cfg->type == GNR ? 0xc14 : 0x20814)) + \ (i) * (m)->chan_mmio_sz) #define I10NM_GET_REG32(m, i, offset) \ readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) @@ -56,7 +65,10 @@ #define I10NM_GET_HBM_IMC_MMIO_OFFSET(reg) \ ((GET_BITFIELD(reg, 0, 10) << 12) + 0x140000) +#define I10NM_GNR_IMC_MMIO_OFFSET 0x24c000 +#define I10NM_GNR_IMC_MMIO_SIZE 0x4000 #define I10NM_HBM_IMC_MMIO_SIZE 0x9000 +#define I10NM_DDR_IMC_CH_CNT(reg) GET_BITFIELD(reg, 21, 24) #define I10NM_IS_HBM_PRESENT(reg) GET_BITFIELD(reg, 27, 30) #define I10NM_IS_HBM_IMC(reg) GET_BITFIELD(reg, 29, 29) @@ -148,35 +160,47 @@ static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable static void enable_retry_rd_err_log(bool enable) { + int i, j, imc_num, chan_num; struct skx_imc *imc; struct skx_dev *d; - int i, j; edac_dbg(2, "\n"); - list_for_each_entry(d, i10nm_edac_list, list) - for (i = 0; i < I10NM_NUM_IMC; i++) { + list_for_each_entry(d, i10nm_edac_list, list) { + imc_num = res_cfg->ddr_imc_num; + chan_num = res_cfg->ddr_chan_num; + + for (i = 0; i < imc_num; i++) { imc = &d->imc[i]; if (!imc->mbase) continue; - for (j = 0; j < I10NM_NUM_CHANNELS; j++) { - if (imc->hbm_mc) { - __enable_retry_rd_err_log(imc, j, enable, - res_cfg->offsets_scrub_hbm0, - res_cfg->offsets_demand_hbm0, - NULL); - __enable_retry_rd_err_log(imc, j, enable, - res_cfg->offsets_scrub_hbm1, - res_cfg->offsets_demand_hbm1, - NULL); - } else { - __enable_retry_rd_err_log(imc, j, enable, - res_cfg->offsets_scrub, - res_cfg->offsets_demand, - res_cfg->offsets_demand2); - } + for (j = 0; j < chan_num; j++) + __enable_retry_rd_err_log(imc, j, enable, + res_cfg->offsets_scrub, + res_cfg->offsets_demand, + res_cfg->offsets_demand2); + } + + imc_num += res_cfg->hbm_imc_num; + chan_num = res_cfg->hbm_chan_num; + + for (; i < imc_num; i++) { + imc = &d->imc[i]; + if (!imc->mbase || !imc->hbm_mc) + continue; + + for (j = 0; j < chan_num; j++) { + __enable_retry_rd_err_log(imc, j, enable, + res_cfg->offsets_scrub_hbm0, + res_cfg->offsets_demand_hbm0, + NULL); + __enable_retry_rd_err_log(imc, j, enable, + res_cfg->offsets_scrub_hbm1, + res_cfg->offsets_demand_hbm1, + NULL); } + } } } @@ -311,6 +335,79 @@ static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, return pdev; } +/** + * i10nm_get_imc_num() - Get the number of present DDR memory controllers. + * + * @cfg : The pointer to the structure of EDAC resource configurations. + * + * For Granite Rapids CPUs, the number of present DDR memory controllers read + * at runtime overwrites the value statically configured in @cfg->ddr_imc_num. + * For other CPUs, the number of present DDR memory controllers is statically + * configured in @cfg->ddr_imc_num. + * + * RETURNS : 0 on success, < 0 on failure. + */ +static int i10nm_get_imc_num(struct res_config *cfg) +{ + int n, imc_num, chan_num = 0; + struct skx_dev *d; + u32 reg; + + list_for_each_entry(d, i10nm_edac_list, list) { + d->pcu_cr3 = pci_get_dev_wrapper(d->seg, d->bus[res_cfg->pcu_cr3_bdf.bus], + res_cfg->pcu_cr3_bdf.dev, + res_cfg->pcu_cr3_bdf.fun); + if (!d->pcu_cr3) + continue; + + if (I10NM_GET_CAPID5_CFG(d, reg)) + continue; + + n = I10NM_DDR_IMC_CH_CNT(reg); + + if (!chan_num) { + chan_num = n; + edac_dbg(2, "Get DDR CH number: %d\n", chan_num); + } else if (chan_num != n) { + i10nm_printk(KERN_NOTICE, "Get DDR CH numbers: %d, %d\n", chan_num, n); + } + } + + switch (cfg->type) { + case GNR: + /* + * One channel per DDR memory controller for Granite Rapids CPUs. + */ + imc_num = chan_num; + + if (!imc_num) { + i10nm_printk(KERN_ERR, "Invalid DDR MC number\n"); + return -ENODEV; + } + + if (imc_num > I10NM_NUM_DDR_IMC) { + i10nm_printk(KERN_ERR, "Need to make I10NM_NUM_DDR_IMC >= %d\n", imc_num); + return -EINVAL; + } + + if (cfg->ddr_imc_num != imc_num) { + /* + * Store the number of present DDR memory controllers. + */ + cfg->ddr_imc_num = imc_num; + edac_dbg(2, "Set DDR MC number: %d", imc_num); + } + + return 0; + default: + /* + * For other CPUs, the number of present DDR memory controllers + * is statically pre-configured in cfg->ddr_imc_num. + */ + return 0; + } +} + static bool i10nm_check_2lm(struct res_config *cfg) { struct skx_dev *d; @@ -318,9 +415,9 @@ static bool i10nm_check_2lm(struct res_config *cfg) int i; list_for_each_entry(d, i10nm_edac_list, list) { - d->sad_all = pci_get_dev_wrapper(d->seg, d->bus[1], - PCI_SLOT(cfg->sad_all_devfn), - PCI_FUNC(cfg->sad_all_devfn)); + d->sad_all = pci_get_dev_wrapper(d->seg, d->bus[res_cfg->sad_all_bdf.bus], + res_cfg->sad_all_bdf.dev, + res_cfg->sad_all_bdf.fun); if (!d->sad_all) continue; @@ -337,20 +434,39 @@ static bool i10nm_check_2lm(struct res_config *cfg) } /* - * Check whether the error comes from DDRT by ICX/Tremont model specific error code. - * Refer to SDM vol3B 16.11.3 Intel IMC MC error codes for IA32_MCi_STATUS. + * Check whether the error comes from DDRT by ICX/Tremont/SPR model specific error code. + * Refer to SDM vol3B 17.11.3/17.13.2 Intel IMC MC error codes for IA32_MCi_STATUS. */ static bool i10nm_mscod_is_ddrt(u32 mscod) { - switch (mscod) { - case 0x0106: case 0x0107: - case 0x0800: case 0x0804: - case 0x0806 ... 0x0808: - case 0x080a ... 0x080e: - case 0x0810: case 0x0811: - case 0x0816: case 0x081e: - case 0x081f: - return true; + switch (res_cfg->type) { + case I10NM: + switch (mscod) { + case 0x0106: case 0x0107: + case 0x0800: case 0x0804: + case 0x0806 ... 0x0808: + case 0x080a ... 0x080e: + case 0x0810: case 0x0811: + case 0x0816: case 0x081e: + case 0x081f: + return true; + } + + break; + case SPR: + switch (mscod) { + case 0x0800: case 0x0804: + case 0x0806 ... 0x0808: + case 0x080a ... 0x080e: + case 0x0810: case 0x0811: + case 0x0816: case 0x081e: + case 0x081f: + return true; + } + + break; + default: + return false; } return false; @@ -358,6 +474,7 @@ static bool i10nm_mscod_is_ddrt(u32 mscod) static bool i10nm_mc_decode_available(struct mce *mce) { +#define ICX_IMCx_CHy 0x06666000 u8 bank; if (!decoding_via_mca || mem_cfg_2lm) @@ -371,21 +488,26 @@ static bool i10nm_mc_decode_available(struct mce *mce) switch (res_cfg->type) { case I10NM: - if (bank < 13 || bank > 26) - return false; - - /* DDRT errors can't be decoded from MCA bank registers */ - if (MCI_MISC_ECC_MODE(mce->misc) == MCI_MISC_ECC_DDRT) + /* Check whether the bank is one of {13,14,17,18,21,22,25,26} */ + if (!(ICX_IMCx_CHy & (1 << bank))) return false; - - if (i10nm_mscod_is_ddrt(MCI_STATUS_MSCOD(mce->status))) + break; + case SPR: + if (bank < 13 || bank > 20) return false; - - /* Check whether one of {13,14,17,18,21,22,25,26} */ - return ((bank - 13) & BIT(1)) == 0; + break; default: return false; } + + /* DDRT errors can't be decoded from MCA bank registers */ + if (MCI_MISC_ECC_MODE(mce->misc) == MCI_MISC_ECC_DDRT) + return false; + + if (i10nm_mscod_is_ddrt(MCI_STATUS_MSCOD(mce->status))) + return false; + + return true; } static bool i10nm_mc_decode(struct decoded_addr *res) @@ -407,9 +529,29 @@ static bool i10nm_mc_decode(struct decoded_addr *res) switch (res_cfg->type) { case I10NM: - bank = m->bank - 13; - res->imc = bank / 4; - res->channel = bank % 2; + bank = m->bank - 13; + res->imc = bank / 4; + res->channel = bank % 2; + res->column = GET_BITFIELD(m->misc, 9, 18) << 2; + res->row = GET_BITFIELD(m->misc, 19, 39); + res->bank_group = GET_BITFIELD(m->misc, 40, 41); + |