diff options
author | Linus Torvalds | 2020-03-30 13:12:37 -0700 |
---|---|---|
committer | Linus Torvalds | 2020-03-30 13:12:37 -0700 |
commit | aaf985e21a4abb471df2a10ad7163367cbcd4088 (patch) | |
tree | 89938997de46c17a675739f53b1570c5c01d158c /drivers | |
parent | c271bdbf38e03ea0c19ce0041c1eaefb42227110 (diff) | |
parent | 41dac9a2ad4a3d5c96394a23dd53b7e6edcb80ba (diff) |
Merge tag 'edac_updates_for_5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras
Pull EDAC updates from Borislav Petkov:
- A substantial edac_mc cleanup, sanitizing object freeing,
streamlining and simplifying code flow, and getting rid of a lot of
needless complexity in memory controller representation code, by
Robert Richter.
- A new EDAC driver for the ARM DMC-520 memory controller, by Lei Wang,
Shiping Ji and others.
- The usual sprinkling of misc cleanups and fixes all over the
subsystem.
* tag 'edac_updates_for_5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras:
EDAC/armada_xp: Use scnprintf() for avoiding potential buffer overflow
EDAC/synopsys: Do not dump uninitialized pinf->col
EDAC: Add EDAC driver for DMC520
dt-bindings: edac: Dmc-520.yaml
EDAC/mce_amd: Print !SMCA processor warning only once
EDAC/mc: Remove per layer counters
EDAC/mc: Remove detail[] string and cleanup error string generation
EDAC/mc: Pass the error descriptor to error reporting functions
EDAC/mc: Remove enable_per_layer_report function argument
EDAC/mc: Report "unknown memory" on too many DIMM labels found
EDAC/mc: Carve out error increment into a separate function
EDAC/mc: Determine mci pointer from the error descriptor
EDAC: Store error type in struct edac_raw_error_desc
EDAC/mc: Reorder functions edac_mc_alloc*()
EDAC/mc: Split edac_mc_alloc() into smaller functions
EDAC/mc: Change mci device removal to use put_device()
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/edac/Kconfig | 7 | ||||
-rw-r--r-- | drivers/edac/Makefile | 1 | ||||
-rw-r--r-- | drivers/edac/armada_xp_edac.c | 26 | ||||
-rw-r--r-- | drivers/edac/dmc520_edac.c | 656 | ||||
-rw-r--r-- | drivers/edac/edac_mc.c | 511 | ||||
-rw-r--r-- | drivers/edac/edac_mc.h | 6 | ||||
-rw-r--r-- | drivers/edac/edac_mc_sysfs.c | 110 | ||||
-rw-r--r-- | drivers/edac/edac_module.h | 1 | ||||
-rw-r--r-- | drivers/edac/ghes_edac.c | 16 | ||||
-rw-r--r-- | drivers/edac/mce_amd.c | 2 | ||||
-rw-r--r-- | drivers/edac/synopsys_edac.c | 22 |
11 files changed, 984 insertions, 374 deletions
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index b3c99bb5fe77..fe2eb892a1bd 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -523,4 +523,11 @@ config EDAC_BLUEFIELD Support for error detection and correction on the Mellanox BlueField SoCs. +config EDAC_DMC520 + tristate "ARM DMC-520 ECC" + depends on ARM64 + help + Support for error detection and correction on the + SoCs with ARM DMC-520 DRAM controller. + endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index d77200c9680b..269e15118cea 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -87,3 +87,4 @@ obj-$(CONFIG_EDAC_TI) += ti_edac.o obj-$(CONFIG_EDAC_QCOM) += qcom_edac.o obj-$(CONFIG_EDAC_ASPEED) += aspeed_edac.o obj-$(CONFIG_EDAC_BLUEFIELD) += bluefield_edac.o +obj-$(CONFIG_EDAC_DMC520) += dmc520_edac.o diff --git a/drivers/edac/armada_xp_edac.c b/drivers/edac/armada_xp_edac.c index 7f227bdcbc84..a7502ebe9bdc 100644 --- a/drivers/edac/armada_xp_edac.c +++ b/drivers/edac/armada_xp_edac.c @@ -429,26 +429,26 @@ static void aurora_l2_check(struct edac_device_ctl_info *dci) src = (attr_cap & AURORA_ERR_ATTR_SRC_MSK) >> AURORA_ERR_ATTR_SRC_OFF; if (src <= 3) - len += snprintf(msg+len, size-len, "src=CPU%d ", src); + len += scnprintf(msg+len, size-len, "src=CPU%d ", src); else - len += snprintf(msg+len, size-len, "src=IO "); + len += scnprintf(msg+len, size-len, "src=IO "); txn = (attr_cap & AURORA_ERR_ATTR_TXN_MSK) >> AURORA_ERR_ATTR_TXN_OFF; switch (txn) { case 0: - len += snprintf(msg+len, size-len, "txn=Data-Read "); + len += scnprintf(msg+len, size-len, "txn=Data-Read "); break; case 1: - len += snprintf(msg+len, size-len, "txn=Isn-Read "); + len += scnprintf(msg+len, size-len, "txn=Isn-Read "); break; case 2: - len += snprintf(msg+len, size-len, "txn=Clean-Flush "); + len += scnprintf(msg+len, size-len, "txn=Clean-Flush "); break; case 3: - len += snprintf(msg+len, size-len, "txn=Eviction "); + len += scnprintf(msg+len, size-len, "txn=Eviction "); break; case 4: - len += snprintf(msg+len, size-len, + len += scnprintf(msg+len, size-len, "txn=Read-Modify-Write "); break; } @@ -456,19 +456,19 @@ static void aurora_l2_check(struct edac_device_ctl_info *dci) err = (attr_cap & AURORA_ERR_ATTR_ERR_MSK) >> AURORA_ERR_ATTR_ERR_OFF; switch (err) { case 0: - len += snprintf(msg+len, size-len, "err=CorrECC "); + len += scnprintf(msg+len, size-len, "err=CorrECC "); break; case 1: - len += snprintf(msg+len, size-len, "err=UnCorrECC "); + len += scnprintf(msg+len, size-len, "err=UnCorrECC "); break; case 2: - len += snprintf(msg+len, size-len, "err=TagParity "); + len += scnprintf(msg+len, size-len, "err=TagParity "); break; } - len += snprintf(msg+len, size-len, "addr=0x%x ", addr_cap & AURORA_ERR_ADDR_CAP_ADDR_MASK); - len += snprintf(msg+len, size-len, "index=0x%x ", (way_cap & AURORA_ERR_WAY_IDX_MSK) >> AURORA_ERR_WAY_IDX_OFF); - len += snprintf(msg+len, size-len, "way=0x%x", (way_cap & AURORA_ERR_WAY_CAP_WAY_MASK) >> AURORA_ERR_WAY_CAP_WAY_OFFSET); + len += scnprintf(msg+len, size-len, "addr=0x%x ", addr_cap & AURORA_ERR_ADDR_CAP_ADDR_MASK); + len += scnprintf(msg+len, size-len, "index=0x%x ", (way_cap & AURORA_ERR_WAY_IDX_MSK) >> AURORA_ERR_WAY_IDX_OFF); + len += scnprintf(msg+len, size-len, "way=0x%x", (way_cap & AURORA_ERR_WAY_CAP_WAY_MASK) >> AURORA_ERR_WAY_CAP_WAY_OFFSET); /* clear error capture registers */ writel(AURORA_ERR_ATTR_CAP_VALID, drvdata->base + AURORA_ERR_ATTR_CAP_REG); diff --git a/drivers/edac/dmc520_edac.c b/drivers/edac/dmc520_edac.c new file mode 100644 index 000000000000..fc1153ab1ebb --- /dev/null +++ b/drivers/edac/dmc520_edac.c @@ -0,0 +1,656 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * EDAC driver for DMC-520 memory controller. + * + * The driver supports 10 interrupt lines, + * though only dram_ecc_errc and dram_ecc_errd are currently handled. + * + * Authors: Rui Zhao <ruizhao@microsoft.com> + * Lei Wang <lewan@microsoft.com> + * Shiping Ji <shji@microsoft.com> + */ + +#include <linux/bitfield.h> +#include <linux/edac.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/platform_device.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include "edac_mc.h" + +/* DMC-520 registers */ +#define REG_OFFSET_FEATURE_CONFIG 0x130 +#define REG_OFFSET_ECC_ERRC_COUNT_31_00 0x158 +#define REG_OFFSET_ECC_ERRC_COUNT_63_32 0x15C +#define REG_OFFSET_ECC_ERRD_COUNT_31_00 0x160 +#define REG_OFFSET_ECC_ERRD_COUNT_63_32 0x164 +#define REG_OFFSET_INTERRUPT_CONTROL 0x500 +#define REG_OFFSET_INTERRUPT_CLR 0x508 +#define REG_OFFSET_INTERRUPT_STATUS 0x510 +#define REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_31_00 0x528 +#define REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_63_32 0x52C +#define REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_31_00 0x530 +#define REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_63_32 0x534 +#define REG_OFFSET_ADDRESS_CONTROL_NOW 0x1010 +#define REG_OFFSET_MEMORY_TYPE_NOW 0x1128 +#define REG_OFFSET_SCRUB_CONTROL0_NOW 0x1170 +#define REG_OFFSET_FORMAT_CONTROL 0x18 + +/* DMC-520 types, masks and bitfields */ +#define RAM_ECC_INT_CE_BIT BIT(0) +#define RAM_ECC_INT_UE_BIT BIT(1) +#define DRAM_ECC_INT_CE_BIT BIT(2) +#define DRAM_ECC_INT_UE_BIT BIT(3) +#define FAILED_ACCESS_INT_BIT BIT(4) +#define FAILED_PROG_INT_BIT BIT(5) +#define LINK_ERR_INT_BIT BIT(6) +#define TEMPERATURE_EVENT_INT_BIT BIT(7) +#define ARCH_FSM_INT_BIT BIT(8) +#define PHY_REQUEST_INT_BIT BIT(9) +#define MEMORY_WIDTH_MASK GENMASK(1, 0) +#define SCRUB_TRIGGER0_NEXT_MASK GENMASK(1, 0) +#define REG_FIELD_DRAM_ECC_ENABLED GENMASK(1, 0) +#define REG_FIELD_MEMORY_TYPE GENMASK(2, 0) +#define REG_FIELD_DEVICE_WIDTH GENMASK(9, 8) +#define REG_FIELD_ADDRESS_CONTROL_COL GENMASK(2, 0) +#define REG_FIELD_ADDRESS_CONTROL_ROW GENMASK(10, 8) +#define REG_FIELD_ADDRESS_CONTROL_BANK GENMASK(18, 16) +#define REG_FIELD_ADDRESS_CONTROL_RANK GENMASK(25, 24) +#define REG_FIELD_ERR_INFO_LOW_VALID BIT(0) +#define REG_FIELD_ERR_INFO_LOW_COL GENMASK(10, 1) +#define REG_FIELD_ERR_INFO_LOW_ROW GENMASK(28, 11) +#define REG_FIELD_ERR_INFO_LOW_RANK GENMASK(31, 29) +#define REG_FIELD_ERR_INFO_HIGH_BANK GENMASK(3, 0) +#define REG_FIELD_ERR_INFO_HIGH_VALID BIT(31) + +#define DRAM_ADDRESS_CONTROL_MIN_COL_BITS 8 +#define DRAM_ADDRESS_CONTROL_MIN_ROW_BITS 11 + +#define DMC520_SCRUB_TRIGGER_ERR_DETECT 2 +#define DMC520_SCRUB_TRIGGER_IDLE 3 + +/* Driver settings */ +/* + * The max-length message would be: "rank:7 bank:15 row:262143 col:1023". + * Max length is 34. Using a 40-size buffer is enough. + */ +#define DMC520_MSG_BUF_SIZE 40 +#define EDAC_MOD_NAME "dmc520-edac" +#define EDAC_CTL_NAME "dmc520" + +/* the data bus width for the attached memory chips. */ +enum dmc520_mem_width { + MEM_WIDTH_X32 = 2, + MEM_WIDTH_X64 = 3 +}; + +/* memory type */ +enum dmc520_mem_type { + MEM_TYPE_DDR3 = 1, + MEM_TYPE_DDR4 = 2 +}; + +/* memory device width */ +enum dmc520_dev_width { + DEV_WIDTH_X4 = 0, + DEV_WIDTH_X8 = 1, + DEV_WIDTH_X16 = 2 +}; + +struct ecc_error_info { + u32 col; + u32 row; + u32 bank; + u32 rank; +}; + +/* The interrupt config */ +struct dmc520_irq_config { + char *name; + int mask; +}; + +/* The interrupt mappings */ +static struct dmc520_irq_config dmc520_irq_configs[] = { + { + .name = "ram_ecc_errc", + .mask = RAM_ECC_INT_CE_BIT + }, + { + .name = "ram_ecc_errd", + .mask = RAM_ECC_INT_UE_BIT + }, + { + .name = "dram_ecc_errc", + .mask = DRAM_ECC_INT_CE_BIT + }, + { + .name = "dram_ecc_errd", + .mask = DRAM_ECC_INT_UE_BIT + }, + { + .name = "failed_access", + .mask = FAILED_ACCESS_INT_BIT + }, + { + .name = "failed_prog", + .mask = FAILED_PROG_INT_BIT + }, + { + .name = "link_err", + .mask = LINK_ERR_INT_BIT + }, + { + .name = "temperature_event", + .mask = TEMPERATURE_EVENT_INT_BIT + }, + { + .name = "arch_fsm", + .mask = ARCH_FSM_INT_BIT + }, + { + .name = "phy_request", + .mask = PHY_REQUEST_INT_BIT + } +}; + +#define NUMBER_OF_IRQS ARRAY_SIZE(dmc520_irq_configs) + +/* + * The EDAC driver private data. + * error_lock is to protect concurrent writes to the mci->error_desc through + * edac_mc_handle_error(). + */ +struct dmc520_edac { + void __iomem *reg_base; + spinlock_t error_lock; + u32 mem_width_in_bytes; + int irqs[NUMBER_OF_IRQS]; + int masks[NUMBER_OF_IRQS]; +}; + +static int dmc520_mc_idx; + +static u32 dmc520_read_reg(struct dmc520_edac *pvt, u32 offset) +{ + return readl(pvt->reg_base + offset); +} + +static void dmc520_write_reg(struct dmc520_edac *pvt, u32 val, u32 offset) +{ + writel(val, pvt->reg_base + offset); +} + +static u32 dmc520_calc_dram_ecc_error(u32 value) +{ + u32 total = 0; + + /* Each rank's error counter takes one byte. */ + while (value > 0) { + total += (value & 0xFF); + value >>= 8; + } + return total; +} + +static u32 dmc520_get_dram_ecc_error_count(struct dmc520_edac *pvt, + bool is_ce) +{ + u32 reg_offset_low, reg_offset_high; + u32 err_low, err_high; + u32 err_count; + + reg_offset_low = is_ce ? REG_OFFSET_ECC_ERRC_COUNT_31_00 : + REG_OFFSET_ECC_ERRD_COUNT_31_00; + reg_offset_high = is_ce ? REG_OFFSET_ECC_ERRC_COUNT_63_32 : + REG_OFFSET_ECC_ERRD_COUNT_63_32; + + err_low = dmc520_read_reg(pvt, reg_offset_low); + err_high = dmc520_read_reg(pvt, reg_offset_high); + /* Reset error counters */ + dmc520_write_reg(pvt, 0, reg_offset_low); + dmc520_write_reg(pvt, 0, reg_offset_high); + + err_count = dmc520_calc_dram_ecc_error(err_low) + + dmc520_calc_dram_ecc_error(err_high); + + return err_count; +} + +static void dmc520_get_dram_ecc_error_info(struct dmc520_edac *pvt, + bool is_ce, + struct ecc_error_info *info) +{ + u32 reg_offset_low, reg_offset_high; + u32 reg_val_low, reg_val_high; + bool valid; + + reg_offset_low = is_ce ? REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_31_00 : + REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_31_00; + reg_offset_high = is_ce ? REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_63_32 : + REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_63_32; + + reg_val_low = dmc520_read_reg(pvt, reg_offset_low); + reg_val_high = dmc520_read_reg(pvt, reg_offset_high); + + valid = (FIELD_GET(REG_FIELD_ERR_INFO_LOW_VALID, reg_val_low) != 0) && + (FIELD_GET(REG_FIELD_ERR_INFO_HIGH_VALID, reg_val_high) != 0); + + if (valid) { + info->col = FIELD_GET(REG_FIELD_ERR_INFO_LOW_COL, reg_val_low); + info->row = FIELD_GET(REG_FIELD_ERR_INFO_LOW_ROW, reg_val_low); + info->rank = FIELD_GET(REG_FIELD_ERR_INFO_LOW_RANK, reg_val_low); + info->bank = FIELD_GET(REG_FIELD_ERR_INFO_HIGH_BANK, reg_val_high); + } else { + memset(info, 0, sizeof(*info)); + } +} + +static bool dmc520_is_ecc_enabled(void __iomem *reg_base) +{ + u32 reg_val = readl(reg_base + REG_OFFSET_FEATURE_CONFIG); + + return FIELD_GET(REG_FIELD_DRAM_ECC_ENABLED, reg_val); +} + +static enum scrub_type dmc520_get_scrub_type(struct dmc520_edac *pvt) +{ + enum scrub_type type = SCRUB_NONE; + u32 reg_val, scrub_cfg; + + reg_val = dmc520_read_reg(pvt, REG_OFFSET_SCRUB_CONTROL0_NOW); + scrub_cfg = FIELD_GET(SCRUB_TRIGGER0_NEXT_MASK, reg_val); + + if (scrub_cfg == DMC520_SCRUB_TRIGGER_ERR_DETECT || + scrub_cfg == DMC520_SCRUB_TRIGGER_IDLE) + type = SCRUB_HW_PROG; + + return type; +} + +/* Get the memory data bus width, in number of bytes. */ +static u32 dmc520_get_memory_width(struct dmc520_edac *pvt) +{ + enum dmc520_mem_width mem_width_field; + u32 mem_width_in_bytes = 0; + u32 reg_val; + + reg_val = dmc520_read_reg(pvt, REG_OFFSET_FORMAT_CONTROL); + mem_width_field = FIELD_GET(MEMORY_WIDTH_MASK, reg_val); + + if (mem_width_field == MEM_WIDTH_X32) + mem_width_in_bytes = 4; + else if (mem_width_field == MEM_WIDTH_X64) + mem_width_in_bytes = 8; + return mem_width_in_bytes; +} + +static enum mem_type dmc520_get_mtype(struct dmc520_edac *pvt) +{ + enum mem_type mt = MEM_UNKNOWN; + enum dmc520_mem_type type; + u32 reg_val; + + reg_val = dmc520_read_reg(pvt, REG_OFFSET_MEMORY_TYPE_NOW); + type = FIELD_GET(REG_FIELD_MEMORY_TYPE, reg_val); + + switch (type) { + case MEM_TYPE_DDR3: + mt = MEM_DDR3; + break; + + case MEM_TYPE_DDR4: + mt = MEM_DDR4; + break; + } + + return mt; +} + +static enum dev_type dmc520_get_dtype(struct dmc520_edac *pvt) +{ + enum dmc520_dev_width device_width; + enum dev_type dt = DEV_UNKNOWN; + u32 reg_val; + + reg_val = dmc520_read_reg(pvt, REG_OFFSET_MEMORY_TYPE_NOW); + device_width = FIELD_GET(REG_FIELD_DEVICE_WIDTH, reg_val); + + switch (device_width) { + case DEV_WIDTH_X4: + dt = DEV_X4; + break; + + case DEV_WIDTH_X8: + dt = DEV_X8; + break; + + case DEV_WIDTH_X16: + dt = DEV_X16; + break; + } + + return dt; +} + +static u32 dmc520_get_rank_count(void __iomem *reg_base) +{ + u32 reg_val, rank_bits; + + reg_val = readl(reg_base + REG_OFFSET_ADDRESS_CONTROL_NOW); + rank_bits = FIELD_GET(REG_FIELD_ADDRESS_CONTROL_RANK, reg_val); + + return BIT(rank_bits); +} + +static u64 dmc520_get_rank_size(struct dmc520_edac *pvt) +{ + u32 reg_val, col_bits, row_bits, bank_bits; + + reg_val = dmc520_read_reg(pvt, REG_OFFSET_ADDRESS_CONTROL_NOW); + + col_bits = FIELD_GET(REG_FIELD_ADDRESS_CONTROL_COL, reg_val) + + DRAM_ADDRESS_CONTROL_MIN_COL_BITS; + row_bits = FIELD_GET(REG_FIELD_ADDRESS_CONTROL_ROW, reg_val) + + DRAM_ADDRESS_CONTROL_MIN_ROW_BITS; + bank_bits = FIELD_GET(REG_FIELD_ADDRESS_CONTROL_BANK, reg_val); + + return (u64)pvt->mem_width_in_bytes << (col_bits + row_bits + bank_bits); +} + +static void dmc520_handle_dram_ecc_errors(struct mem_ctl_info *mci, + bool is_ce) +{ + struct dmc520_edac *pvt = mci->pvt_info; + char message[DMC520_MSG_BUF_SIZE]; + struct ecc_error_info info; + u32 cnt; + + dmc520_get_dram_ecc_error_info(pvt, is_ce, &info); + + cnt = dmc520_get_dram_ecc_error_count(pvt, is_ce); + if (!cnt) + return; + + snprintf(message, ARRAY_SIZE(message), + "rank:%d bank:%d row:%d col:%d", + info.rank, info.bank, + info.row, info.col); + + spin_lock(&pvt->error_lock); + edac_mc_handle_error((is_ce ? HW_EVENT_ERR_CORRECTED : + HW_EVENT_ERR_UNCORRECTED), + mci, cnt, 0, 0, 0, info.rank, -1, -1, + message, ""); + spin_unlock(&pvt->error_lock); +} + +static irqreturn_t dmc520_edac_dram_ecc_isr(int irq, struct mem_ctl_info *mci, + bool is_ce) +{ + struct dmc520_edac *pvt = mci->pvt_info; + u32 i_mask; + + i_mask = is_ce ? DRAM_ECC_INT_CE_BIT : DRAM_ECC_INT_UE_BIT; + + dmc520_handle_dram_ecc_errors(mci, is_ce); + + dmc520_write_reg(pvt, i_mask, REG_OFFSET_INTERRUPT_CLR); + + return IRQ_HANDLED; +} + +static irqreturn_t dmc520_edac_dram_all_isr(int irq, struct mem_ctl_info *mci, + u32 irq_mask) +{ + struct dmc520_edac *pvt = mci->pvt_info; + irqreturn_t irq_ret = IRQ_NONE; + u32 status; + + status = dmc520_read_reg(pvt, REG_OFFSET_INTERRUPT_STATUS); + + if ((irq_mask & DRAM_ECC_INT_CE_BIT) && + (status & DRAM_ECC_INT_CE_BIT)) + irq_ret = dmc520_edac_dram_ecc_isr(irq, mci, true); + + if ((irq_mask & DRAM_ECC_INT_UE_BIT) && + (status & DRAM_ECC_INT_UE_BIT)) + irq_ret = dmc520_edac_dram_ecc_isr(irq, mci, false); + + return irq_ret; +} + +static irqreturn_t dmc520_isr(int irq, void *data) +{ + struct mem_ctl_info *mci = data; + struct dmc520_edac *pvt = mci->pvt_info; + u32 mask = 0; + int idx; + + for (idx = 0; idx < NUMBER_OF_IRQS; idx++) { + if (pvt->irqs[idx] == irq) { + mask = pvt->masks[idx]; + break; + } + } + return dmc520_edac_dram_all_isr(irq, mci, mask); +} + +static void dmc520_init_csrow(struct mem_ctl_info *mci) +{ + struct dmc520_edac *pvt = mci->pvt_info; + struct csrow_info *csi; + struct dimm_info *dimm; + u32 pages_per_rank; + enum dev_type dt; + enum mem_type mt; + int row, ch; + u64 rs; + + dt = dmc520_get_dtype(pvt); + mt = dmc520_get_mtype(pvt); + rs = dmc520_get_rank_size(pvt); + pages_per_rank = rs >> PAGE_SHIFT; + + for (row = 0; row < mci->nr_csrows; row++) { + csi = mci->csrows[row]; + + for (ch = 0; ch < csi->nr_channels; ch++) { + dimm = csi->channels[ch]->dimm; + dimm->grain = pvt->mem_width_in_bytes; + dimm->dtype = dt; + dimm->mtype = mt; + dimm->edac_mode = EDAC_FLAG_SECDED; + dimm->nr_pages = pages_per_rank / csi->nr_channels; + } + } +} + +static int dmc520_edac_probe(struct platform_device *pdev) +{ + bool registered[NUMBER_OF_IRQS] = { false }; + int irqs[NUMBER_OF_IRQS] = { -ENXIO }; + int masks[NUMBER_OF_IRQS] = { 0 }; + struct edac_mc_layer layers[1]; + struct dmc520_edac *pvt = NULL; + struct mem_ctl_info *mci; + void __iomem *reg_base; + u32 irq_mask_all = 0; + struct resource *res; + struct device *dev; + int ret, idx, irq; + u32 reg_val; + + /* Parse the device node */ + dev = &pdev->dev; + + for (idx = 0; idx < NUMBER_OF_IRQS; idx++) { + irq = platform_get_irq_byname(pdev, dmc520_irq_configs[idx].name); + irqs[idx] = irq; + masks[idx] = dmc520_irq_configs[idx].mask; + if (irq >= 0) { + irq_mask_all |= dmc520_irq_configs[idx].mask; + edac_dbg(0, "Discovered %s, irq: %d.\n", dmc520_irq_configs[idx].name, irq); + } + } + + if (!irq_mask_all) { + edac_printk(KERN_ERR, EDAC_MOD_NAME, + "At least one valid interrupt line is expected.\n"); + return -EINVAL; + } + + /* Initialize dmc520 edac */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + reg_base = devm_ioremap_resource(dev, res); + if (IS_ERR(reg_base)) + return PTR_ERR(reg_base); + + if (!dmc520_is_ecc_enabled(reg_base)) + return -ENXIO; + + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = dmc520_get_rank_count(reg_base); + layers[0].is_virt_csrow = true; + + mci = edac_mc_alloc(dmc520_mc_idx++, ARRAY_SIZE(layers), layers, sizeof(*pvt)); + if (!mci) { + edac_printk(KERN_ERR, EDAC_MOD_NAME, + "Failed to allocate memory for mc instance\n"); + ret = -ENOMEM; + goto err; + } + + pvt = mci->pvt_info; + + pvt->reg_base = reg_base; + spin_lock_init(&pvt->error_lock); + memcpy(pvt->irqs, irqs, sizeof(irqs)); + memcpy(pvt->masks, masks, sizeof(masks)); + + platform_set_drvdata(pdev, mci); + + mci->pdev = dev; + mci->mtype_cap = MEM_FLAG_DDR3 | MEM_FLAG_DDR4; + mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED; + mci->edac_cap = EDAC_FLAG_SECDED; + mci->scrub_cap = SCRUB_FLAG_HW_SRC; + mci->scrub_mode = dmc520_get_scrub_type(pvt); + mci->ctl_name = EDAC_CTL_NAME; + mci->dev_name = dev_name(mci->pdev); + mci->mod_name = EDAC_MOD_NAME; + + edac_op_state = EDAC_OPSTATE_INT; + + pvt->mem_width_in_bytes = dmc520_get_memory_width(pvt); + + dmc520_init_csrow(mci); + + /* Clear interrupts, not affecting other unrelated interrupts */ + reg_val = dmc520_read_reg(pvt, REG_OFFSET_INTERRUPT_CONTROL); + dmc520_write_reg(pvt, reg_val & (~irq_mask_all), + REG_OFFSET_INTERRUPT_CONTROL); + dmc520_write_reg(pvt, irq_mask_all, REG_OFFSET_INTERRUPT_CLR); + + for (idx = 0; idx < NUMBER_OF_IRQS; idx++) { + irq = irqs[idx]; + if (irq >= 0) { + ret = devm_request_irq(&pdev->dev, irq, + dmc520_isr, IRQF_SHARED, + dev_name(&pdev->dev), mci); + if (ret < 0) { + edac_printk(KERN_ERR, EDAC_MC, + "Failed to request irq %d\n", irq); + goto err; + } + registered[idx] = true; + } + } + + /* Reset DRAM CE/UE counters */ + if (irq_mask_all & DRAM_ECC_INT_CE_BIT) + dmc520_get_dram_ecc_error_count(pvt, true); + + if (irq_mask_all & DRAM_ECC_INT_UE_BIT) + dmc520_get_dram_ecc_error_count(pvt, false); + + ret = edac_mc_add_mc(mci); + if (ret) { + edac_printk(KERN_ERR, EDAC_MOD_NAME, + "Failed to register with EDAC core\n"); + goto err; + } + + /* Enable interrupts, not affecting other unrelated interrupts */ + dmc520_write_reg(pvt, reg_val | irq_mask_all, + REG_OFFSET_INTERRUPT_CONTROL); + + return 0; + +err: + for (idx = 0; idx < NUMBER_OF_IRQS; idx++) { + if (registered[idx]) + devm_free_irq(&pdev->dev, pvt->irqs[idx], mci); + } + if (mci) + edac_mc_free(mci); + + return ret; +} + +static int dmc520_edac_remove(struct platform_device *pdev) +{ + u32 reg_val, idx, irq_mask_all = 0; + struct mem_ctl_info *mci; + struct dmc520_edac *pvt; + + mci = platform_get_drvdata(pdev); + pvt = mci->pvt_info; + + /* Disable interrupts */ + reg_val = dmc520_read_reg(pvt, REG_OFFSET_INTERRUPT_CONTROL); + dmc520_write_reg(pvt, reg_val & (~irq_mask_all), + REG_OFFSET_INTERRUPT_CONTROL); + + /* free irq's */ + for (idx = 0; idx < NUMBER_OF_IRQS; idx++) { + if (pvt->irqs[idx] >= 0) { + irq_mask_all |= pvt->masks[idx]; + devm_free_irq(&pdev->dev, pvt->irqs[idx], mci); + } + } + + edac_mc_del_mc(&pdev->dev); + edac_mc_free(mci); + + return 0; +} + +static const struct of_device_id dmc520_edac_driver_id[] = { + { .compatible = "arm,dmc-520", }, + { /* end of table */ } +}; + +MODULE_DEVICE_TABLE(of, dmc520_edac_driver_id); + +static struct platform_driver dmc520_edac_driver = { + .driver = { + .name = "dmc520", + .of_match_table = dmc520_edac_driver_id, + }, + + .probe = dmc520_edac_probe, + .remove = dmc520_edac_remove +}; + +module_platform_driver(dmc520_edac_driver); + +MODULE_AUTHOR("Rui Zhao <ruizhao@microsoft.com>"); +MODULE_AUTHOR("Lei Wang <lewan@microsoft.com>"); +MODULE_AUTHOR("Shiping Ji <shji@microsoft.com>"); +MODULE_DESCRIPTION("DMC-520 ECC driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 69e0d90460e6..75ede27bdf6a 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -55,6 +55,11 @@ static LIST_HEAD(mc_devices); */ static const char *edac_mc_owner; +static struct mem_ctl_info *error_desc_to_mci(struct edac_raw_error_desc *e) +{ + return container_of(e, struct mem_ctl_info, error_desc); +} + int edac_get_report_status(void) { return edac_report; @@ -278,6 +283,12 @@ void *edac_align_ptr(void **p, unsigned int size, int n_elems) static void _edac_mc_free(struct mem_ctl_info *mci) { + put_device(&mci->dev); +} + +static void mci_release(struct device *dev) +{ + struct mem_ctl_info *mci = container_of(dev, struct mem_ctl_info, dev); struct csrow_info *csr; int i, chn, row; @@ -305,103 +316,26 @@ static void _edac_mc_free(struct mem_ctl_info *mci) kfree(mci); } -struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, - unsigned int n_layers, - struct edac_mc_layer *layers, - unsigned int sz_pvt) +static int edac_mc_alloc_csrows(struct mem_ctl_info *mci) { - struct mem_ctl_info *mci; - struct edac_mc_layer *layer; - struct csrow_info *csr; - struct rank_info *chan; - struct dimm_info *dimm; - u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; - unsigned int pos[EDAC_MAX_LAYERS]; - unsigned int idx, size, tot_dimms = 1, count = 1; - unsigned int tot_csrows = 1, tot_channels = 1, tot_errcount = 0; - void *pvt, *p, *ptr = NULL; - int i, j, row, chn, n, len; - bool per_rank = false; - - if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0)) - return NULL; - - /* - * Calculate the total amount of dimms and csrows/cschannels while - * in the old API emulation mode - */ - for (idx = 0; idx < n_layers; idx++) { - tot_dimms *= layers[idx].size; - - if (layers[idx].is_virt_csrow) - tot_csrows *= layers[idx].size; - else - tot_channels *= layers[idx].size; - - if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT) - per_rank = true; - } - - /* Figure out the offsets of the various items from the start of an mc - * structure. We want the alignment of each item to be at least as - * stringent as what the compiler would provide if we could simply - * hardcode everything into a single struct. - */ - mci = edac_align_ptr(&ptr, sizeof(*mci), 1); - layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers); - for (i = 0; i < n_layers; i++) { - count *= layers[i].size; - edac_dbg(4, "errcount layer %d size %d\n", i, count); - ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count); - ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count); - tot_errcount += 2 * count; - } - - edac_dbg(4, "allocating %d error counters\n", tot_errcount); - pvt = edac_align_ptr(&ptr, sz_pvt, 1); - size = ((unsigned long)pvt) + sz_pvt; - - edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n", - size, - tot_dimms, - per_rank ? "ranks" : "dimms", - tot_csrows * tot_channels); - - mci = kzalloc(size, GFP_KERNEL); - if (mci == NULL) - return NULL; - - /* Adjust pointers so they point within the memory we just allocated - * rather than an imaginary chunk of memory located at address 0. - */ - layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer)); - for (i = 0; i < n_layers; i++) { - mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i])); - mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i])); - } - pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL; - - /* setup index and various internal pointers */ - mci->mc_idx = mc_num; - mci->tot_dimms = tot_dimms; - mci->pvt_info = pvt; - mci->n_layers = n_layers; - mci->layers = layer; - memcpy(mci->layers, layers, sizeof(*layer) * n_layers); - mci->nr_csrows = tot_csrows; - mci->num_cschannel = tot_channels; - mci->csbased = per_rank; + unsigned int tot_channels = mci->num_cschannel; + unsigned int tot_csrows = mci->nr_csrows; + unsigned int row, chn; /* * Alocate and fill the csrow/channels structs */ mci->csrows = kcalloc(tot_csrows, sizeof(*mci->csrows), GFP_KERNEL); if (!mci->csrows) - goto error; + return -ENOMEM; + for (row = 0; row < tot_csrows; row++) { + struct csrow_info *csr; + csr = kzalloc(sizeof(**mci->csrows), GFP_KERNEL); if (!csr) - goto error; + return -ENOMEM; + mci->csrows[row] = csr; csr->csrow_idx = row; csr->mci = mci; @@ -409,34 +343,51 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, csr->channels = kcalloc(tot_channels, sizeof(*csr->channels), GFP_KERNEL); if (!csr->channels) - goto error; + return -ENOMEM; for (chn = 0; chn < tot_channels; chn++) { + struct rank_info *chan; + chan = kzalloc(sizeof(**csr->channels), GFP_KERNEL); if (!chan) - goto error; + return -ENOMEM; + csr->channels[chn] = chan; chan->chan_idx = chn; chan->csrow = csr; } } + return 0; +} + +static int edac_mc_alloc_dimms(struct mem_ctl_info *mci) +{ + unsigned int pos[EDAC_MAX_LAYERS]; + unsigned int row, chn, idx; + int layer; + void *p; + /* * Allocate and fill the dimm structs */ - mci->dimms = kcalloc(tot_dimms, sizeof(*mci->dimms), GFP_KERNEL); + mci->dimms = kcalloc(mci->tot_dimms, sizeof(*mci->dimms), GFP_KERNEL); if (!mci->dimms) - goto error; + return -ENOMEM; memset(&pos, 0, sizeof(pos)); row = 0; chn = 0; - for (idx = 0; idx < tot_dimms; idx++) { + for (idx = 0; idx < mci->tot_dimms; idx++) { + struct dimm_info *dimm; + struct rank_info *chan; + int n, len; + chan = mci->csrows[row]->channels[chn]; dimm = kzalloc(sizeof(**mci->dimms), GFP_KERNEL); if (!dimm) - goto error; + return -ENOMEM; mci->dimms[idx] = dimm; dimm->mci = mci; dimm->idx = idx; @@ -446,16 +397,16 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, */ len = sizeof(dimm->label); p = dimm->label; - n = snprintf(p, len, "mc#%u", mc_num); + n = snprintf(p, len, "mc#%u", mci->mc_idx); p += n; len -= n; - for (j = 0; j < n_layers; j++) { + for (layer = 0; layer < mci->n_layers; layer++) { n = snprintf(p, len, "%s#%u", - edac_layer_name[layers[j].type], - pos[j]); + edac_layer_name[mci->layers[layer].type], + pos[layer]); p += n; len -= n; - dimm->location[j] = pos[j]; + dimm->location[layer] = pos[layer]; if (len <= 0) break; @@ -467,29 +418,109 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, dimm->cschannel = chn; /* Increment csrow location */ - if (layers[0].is_virt_csrow) { + if (mci->layers[0].is_virt_csrow) { chn++; - if (chn == tot_channels) { + if (chn == mci->num_cschannel) { chn = 0; row++; } } else { row++; - if (row == tot_csrows) { + if (row == mci->nr_csrows) { row = 0; chn++; } } /* Increment dimm location */ - for (j = n_layers - 1; j >= 0; j--) { - pos[j]++; - if (pos[j] < layers[j].size) + for (layer = mci->n_layers - 1; layer >= 0; layer--) { + pos[layer]++; + if (pos[layer] < mci->layers[layer].size) break; - pos[j] = 0; + pos[layer] = 0; } } + return 0; +} + +struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, + unsigned int n_layers, + struct edac_mc_layer *layers, + unsigned int sz_pvt) +{ + struct mem_ctl_info *mci; + struct edac_mc_layer *layer; + unsigned int idx, size, tot_dimms = 1; + unsigned int tot_csrows = 1, tot_channels = 1; + void *pvt, *ptr = NULL; + bool per_rank = false; + + if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0)) + return NULL; + + /* + * Calculate the total amount of dimms and csrows/cschannels while + * in the old API emulation mode + */ + for (idx = 0; idx < n_layers; idx++) { + tot_dimms *= layers[idx].size; + + if (layers[idx].is_virt_csrow) + tot_csrows *= layers[idx].size; + else + tot_channels *= layers[idx].size; + + if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT) + per_rank = true; + } + + /* Figure out the offsets of the various items from the start of an mc + * structure. We want the alignment of each item to be at least as + * stringent as what the compiler would provide if we could simply + * hardcode everything into a single struct. + */ + mci = edac_align_ptr(&ptr, sizeof(*mci), 1); + layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers); + pvt = edac_align_ptr(&ptr, sz_pvt, 1); + size = ((unsigned long)pvt) + sz_pvt; + + edac_dbg(1, "allocating %u bytes for mci data (%d %s, %d csrows/channels)\n", + size, + tot_dimms, + per_rank ? "ranks" : "dimms", + tot_csrows * tot_channels); + + mci = kzalloc(size, GFP_KERNEL); + if (mci == NULL) + return NULL; + + mci->dev.release = mci_release; + device_initialize(&mci->dev); + + /* Adjust pointers so they point within the memory we just allocated + * rather than an imaginary chunk of memory located at address 0. + */ + layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer)); + pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL; + + /* setup index and various internal pointers */ + mci->mc_idx = mc_num; + mci->tot_dimms = tot_dimms; + mci->pvt_info = pvt; + mci->n_layers = n_layers; + mci->layers = layer; + memcpy(mci->layers, layers, sizeof(*layer) * n_layers); + mci->nr_csrows = tot_csrows; + mci->num_cschannel = tot_channels; + mci->csbased = per_rank; + + if (edac_mc_alloc_csrows(mci)) + goto error; + + if (edac_mc_alloc_dimms(mci)) + goto error; + mci->op_state = OP_ALLOC; return mci; @@ -505,9 +536,6 @@ void edac_mc_free(struct mem_ctl_info *mci) { edac_dbg(1, "\n"); - if (device_is_registered(&mci->dev)) - edac_unregister_sysfs(mci); - _edac_mc_free(mci); } EXPORT_SYMBOL_GPL(edac_mc_free); @@ -902,88 +930,51 @@ const char *edac_layer_name[] = { }; EXPORT_SYMBOL_GPL(edac_layer_name); -static void edac_inc_ce_error(struct mem_ctl_info *mci, - bool enable_per_layer_report, - const int pos[EDAC_MAX_LAYERS], - const u16 count) +static void edac_inc_ce_error(struct edac_raw_error_desc *e) { - int i, index = 0; - - mci->ce_mc += count; - - if (!enable_per_layer_report) { - mci->ce_noinfo_count += count; - return; - } + int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer }; + struct mem_ctl_info *mci = error_desc_to_mci(e); + struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]); - for (i = 0; i < mci->n_layers; i++) { - if (pos[i] < 0) - break; - index += pos[i]; - mci->ce_per_layer[i][index] += count; + mci->ce_mc += e->error_count; - if (i < mci->n_layers - 1) - index *= mci->layers[i + 1].size; - } + if (dimm) + dimm->ce_count += e->error_count; + else + mci->ce_noinfo_count += e->error_count; } -static void edac_inc_ue_error(struct mem_ctl_info *mci, - bool enable_per_layer_report, - const int pos[EDAC_MAX_LAYERS], - const u16 count) +static void edac_inc_ue_error(struct edac_raw_error_desc *e) { - int i, index = 0; - - mci->ue_mc += count; - - if (!enable_per_layer_report) { - mci->ue_noinfo_count += count; - return; - } + int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer }; + struct mem_ctl_info *mci = error_desc_to_mci(e); + struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]); - for (i = 0; i < mci->n_layers; i++) { - if (pos[i] < 0) - break; - index += pos[i]; - mci->ue_per_layer[i][index] += count; + mci->ue_mc += e->error_count; - if (i < mci->n_layers - 1) - index *= mci->layers[i + 1].size; - } + if (dimm) + dimm->ue_count += e->error_count; + else + mci->ue_noinfo_count += e->error_count; } -static void edac_ce_error(struct mem_ctl_info *mci, - const u16 error_count, - const int pos[EDAC_MAX_LAYERS], - const char *msg, - const char *location, - const char *label, - const char *detail, - const char *other_detail, - const bool enable_per_layer_report, - const unsigned long page_frame_number, - const unsigned long offset_in_page, - long grain) +static void edac_ce_error(struct edac_raw_error_desc *e) { + struct mem_ctl_info *mci = error_desc_to_mci(e); unsigned long remapped_page; - char *msg_aux = ""; - - if (*msg) - msg_aux = " "; if (edac_mc_get_log_ce()) { - if (other_detail && *other_detail) - edac_mc_printk(mci, KERN_WARNING, - "%d CE %s%son %s (%s %s - %s)\n", - error_count, msg, msg_aux, label, - location, detail, other_detail); - else - edac_mc_printk(mci, KERN_WARNING, - "%d CE %s%son %s (%s %s)\n", - error_count, msg, msg_aux, label, - location, detail); + edac_mc_printk(mci, KERN_WARNING, + "%d CE %s%son %s (%s page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx%s%s)\n", + e->error_count, e->msg, + *e->msg ? " " : "", + e->label, e->location, e->page_frame_number, e->offset_in_page, + e->grain, e->syndrome, + *e->other_detail ? " - " : "", + e->other_detail); } - edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count); + + edac_inc_ce_error(e); if (mci->scrub_mode == SCRUB_SW_SRC) { /* @@ -998,60 +989,64 @@ static void edac_ce_error(struct mem_ctl_info *mci, * be scrubbed. */ remapped_page = mci->ctl_page_to_phys ? - mci->ctl_page_to_phys(mci, page_frame_number) : - page_frame_number; + mci->ctl_page_to_phys(mci, e->page_frame_number) : + e->page_frame_number; - edac_mc_scrub_block(remapped_page, - offset_in_page, grain); + edac_mc_scrub_block(remapped_page, e->offset_in_page, e->grain); } } -static void edac_ue_error(struct mem_ctl_info *mci, - const u16 error_count, - const int pos[EDAC_MAX_LAYERS], - const char *msg, - const char *location, - const char *label, - const char *detail, - const char *other_detail, - const bool enable_per_layer_report) +static void edac_ue_error(struct edac_raw_error_desc *e) { - char *msg_aux = ""; - - if (*msg) - msg_aux = " "; + struct mem_ctl_info *mci = error_desc_to_mci(e); if (edac_mc_get_log_ue()) { - if (other_detail && *other_detail) - edac_mc_printk(mci, KERN_WARNING, - "%d UE %s%son %s (%s %s - %s)\n", - error_count, msg, msg_aux, label, - location, detail, other_detail); - else - edac_mc_printk(mci, KERN_WARNING, - "%d UE %s%son %s (%s %s)\n", - error_count, msg, msg_aux, label, - location, detail); + edac_mc_printk(mci, KERN_WARNING, + "%d UE %s%son %s (%s page:0x%lx offset:0x%lx grain:%ld%s%s)\n", + e->error_count, e->msg, + *e->msg ? " " : "", + e->label, e->location, e->page_frame_number, e->offset_in_page, + e->grain, + *e->other_detail ? " - " : "", + e->other_detail); } if (edac_mc_get_panic_on_ue()) { - if (other_detail && *other_detail) - panic("UE %s%son %s (%s%s - %s)\n", - msg, msg_aux, label, location, detail, other_detail); - else - panic("UE %s%son %s (%s%s)\n", - msg, msg_aux, label, location, detail); + panic("UE %s%son %s (%s page:0x%lx offset:0x%lx grain:%ld%s%s)\n", + e->msg, + *e->msg ? " " : "", + e->label, e->location, e->page_frame_number, e->offset_in_page, + e->grain, + *e->other_detail ? " - " : "", + e->other_detail); } - edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count); + edac_inc_ue_error(e); } -void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type, - struct mem_ctl_info *mci, - struct edac_raw_error_desc *e) +static void edac_inc_csrow(struct edac_raw_error_desc *e, int row, int chan) { - char detail[80]; - int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer }; + struct mem_ctl_info *mci = error_desc_to_mci(e); + enum hw_event_mc_err_type type = e->type; + u16 count = e->error_count; + + if (row < 0) + return; + + edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan); + + if (type == HW_EVENT_ERR_CORRECTED) { + mci->csrows[row]->ce_count += count; + if (chan >= 0) + mci->csrows[row]->channels[chan]->ce_count += count; + } else { + mci->csrows[row]->ue_count += count; + } +} + +void edac_raw_mc_handle_error(struct edac_raw_error_desc *e) +{ + struct mem_ctl_info *mci = error_desc_to_mci(e); u8 grain_bits; /* Sanity-check driver-supplied grain value. */ @@ -1062,31 +1057,16 @@ void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type, /* Report the error via the trace interface */ if (IS_ENABLED(CONFIG_RAS)) - trace_mc_event(type, e->msg, e->label, e->error_count, + trace_mc_event(e->type, e->msg, e->label, e->error_count, mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer, (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page, grain_bits, e->syndrome, e->other_detail); - /* Memory type dependent details about the error */ - if (type == HW_EVENT_ERR_CORRECTED) { - snprintf(detail, sizeof(detail), - "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx", - e->page_frame_number, e->offset_in_page, - e->grain, e->syndrome); - edac_ce_error(mci, e->error_count, pos, e->msg, e->location, e->label, - detail, e->other_detail, e->enable_per_layer_report, - e->page_frame_number, e->offset_in_page, e->grain); - } else { - snprintf(detail, sizeof(detail), - "page:0x%lx offset:0x%lx grain:%ld", - e->page_frame_number, e->offset_in_page, e->grain); - - edac_ue_error(mci, e->error_count, pos, e->msg, e->location, e->label, - detail, e->other_detail, e->enable_per_layer_report); - } - - + if (e->type == HW_EVENT_ERR_CORRECTED) + edac_ce_error(e); + else + edac_ue_error(e); } EXPORT_SYMBOL_GPL(edac_raw_mc_handle_error); @@ -1108,25 +1088,27 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer }; int i, n_labels = 0; struct edac_raw_error_desc *e = &mci->error_desc; + bool any_memory = true; edac_dbg(3, "MC%d\n", mci->mc_idx); /* Fills the error report buffer */ memset(e, 0, sizeof (*e)); e->error_count = error_count; + e->type = type; e->top_layer = top_layer; e->mid_layer = mid_layer; e->low_layer = low_layer; e->page_frame_number = page_frame_number; e->offset_in_page = offset_in_page; e->syndrome = syndrome; - e->msg = msg; - e->other_detail = other_detail; + /* need valid strings here for both: */ + e->msg = msg ?: ""; + e->other_detail = other_detail ?: ""; /* - * Check if the event report is consistent and if the memory - * location is known. If it is known, enable_per_layer_report will be - * true, the DIMM(s) label info will be filled and the per-layer + * Check if the event report is consistent and if the memory location is + * known. If it is, the DIMM(s) label info will be filled and the DIMM's * error counters will be incremented. */ for (i = 0; i < mci->n_layers; i++) { @@ -1145,7 +1127,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, pos[i] = -1; } if (pos[i] >= 0) - e->enable_per_layer_report = true; + any_memory = false; } /* @@ -1176,24 +1158,25 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, /* * If the error is memory-controller wide, there's no need to - * seek for the affected DIMMs because the whole - * channel/memory controller/... may be affected. - * Also, don't show errors for empty DIMM slots. + * seek for the affected DIMMs because the whole channel/memory + * controller/... may be affected. Also, don't show errors for + * empty DIMM slots. */ - if (!e->enable_per_layer_report || !dimm->nr_pages) + if (!dimm->nr_pages) continue; - if (n_labels >= EDAC_MAX_LABELS) { - e->enable_per_layer_report = false; - break; - } n_labels++; - if (p != e->label) { - strcpy(p, OTHER_LABEL); - p += strlen(OTHER_LABEL); + if (n_labels > EDAC_MAX_LABELS) { + p = e->label; + *p = '\0'; + } else { + if (p != e->label) { + strcpy(p, OTHER_LABEL); + p += strlen(OTHER_LABEL); + } + strcpy(p, dimm->label); + p += strlen(p); } - strcpy(p, dimm->label); - p += strlen(p); /* * get csrow/channel of the DIMM, in order to allow @@ -1213,22 +1196,12 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, chan = -2; } - if (!e->enable_per_layer_report) { + if (any_memory) strcpy(e->label, "any memory"); - } else { - edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan); - if (p == e->label) - strcpy(e->label, "unknown memory"); - if (type == HW_EVENT_ERR_CORRECTED) { - if (row >= 0) { - mci->csrows[row]->ce_count += error_count; - if (chan >= 0) - mci->csrows[row]->channels[chan]->ce_count += error_count; - } - } else - if (row >= 0) - mci->csrows[row]->ue_count += error_count; - } + else if (!*e->label) + strcpy(e->label, "unknown memory"); + + edac_inc_csrow(e, row, chan); /* Fill the RAM location data */ p = e->location; @@ -1244,6 +1217,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, if (p > e->location) *(p - 1) = '\0'; - edac_raw_mc_handle_error(type, mci, e); + edac_raw_mc_handle_error(e); } EXPORT_SYMBOL_GPL(edac_mc_handle_error); diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h index 02aac5c61d00..881b00eadf7a 100644 --- a/drivers/edac/edac_mc.h +++ b/drivers/edac/edac_mc.h @@ -212,17 +212,13 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, * edac_raw_mc_handle_error() - Reports a memory event to userspace without * doing anything to discover the error location. * - * @type: severity of the error (CE/UE/Fatal) - * @mci: a struct mem_ctl_info pointer * @e: error description * * This raw function is used internally by edac_mc_handle_error(). It should * only be called directly when the hardware error come directly from BIOS, * like in the case of APEI GHES driver. */ -void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type, - struct mem_ctl_info *mci, - struct edac_raw_error_desc *e); +void edac_raw_mc_handle_error(struct edac_raw_error_desc *e); /** * edac_mc_handle_error() - Reports a memory event to userspace. diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index c70ec0a306d8..4e6aca595133 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -274,14 +274,8 @@ static const struct attribute_group *csrow_attr_groups[] = { NULL }; -static void csrow_attr_release(struct device *dev) -{ - /* release device with _edac_mc_free() */ -} - static const struct device_type csrow_attr_type = { .groups = csrow_attr_groups, - .release = csrow_attr_release, }; /* @@ -387,6 +381,14 @@ static const struct attribute_group *csrow_dev_groups[] = { NULL }; +static void csrow_release(struct device *dev) +{ + /* + * Nothing to do, just unregister sysfs here. The mci + * device owns the data and will also release it. + */ +} + static inline int nr_pages_per_csrow(struct csrow_info *csrow) { int chan, nr_pages = 0; @@ -405,6 +407,7 @@ static int edac_create_csrow_object(struct mem_ctl_info *mci, csrow->dev.type = &csrow_attr_type; csrow->dev.groups = csrow_dev_groups; + csrow->dev.release = csrow_release; device_initialize(&csrow->dev); csrow->dev.parent = &mci->dev; csrow->mci = mci; @@ -441,10 +444,8 @@ static int edac_create_csrow_objects(struct mem_ctl_info *mci) error: for (--i; i >= 0; i--) { - csrow = mci->csrows[i]; - if (!nr_pages_per_csrow(csrow)) - continue; - device_unregister(&mci->csrows[i]->dev); + if (device_is_registered(&mci->csrows[i]->dev)) + device_unregister(&mci->csrows[i]->dev); } return err; @@ -453,15 +454,13 @@ error: static void edac_delete_csrow_objects(struct mem_ctl_info *mci) { int i; - struct csrow_info *csrow; - for (i = mci->nr_csrows - 1; i >= 0; i--) { - csrow = mci->csrows[i]; - if (!nr_pages_per_csrow(csrow)) - continue; - device_unregister(&mci->csrows[i]->dev); + for (i = 0; i < mci->nr_csrows; i++) { + if (device_is_registered(&mci->csrows[i]->dev)) + device_unregister(&mci->csrows[i]->dev); } } + #endif /* @@ -552,10 +551,8 @@ static ssize_t dimmdev_ce_count_show(struct device *dev, char *data) { struct dimm_info *dimm = to_dimm(dev); - u32 count; - count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx]; - return sprintf(data, "%u\n", count); + return sprintf(data, "%u\n", dimm->ce_count); } static ssize_t dimmdev_ue_count_show(struct device *dev, @@ -563,10 +560,8 @@ static ssize_t dimmdev_ue_count_show(struct device *dev, char *data) { struct dimm_info *dimm = to_dimm(dev); - u32 count; - count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][dimm->idx]; - return sprintf(data, "%u\n", count); + return sprintf(data, "%u\n", dimm->ue_count); } /* dimm/rank attribute files */ @@ -602,16 +597,18 @@ static const struct attribute_group *dimm_attr_groups[] = { NULL }; -static void dimm_attr_release(struct device *dev) -{ - /* release device with _edac_mc_free() */ -} - static const struct device_type dimm_attr_type = { .groups = dimm_attr_groups, - .release = dimm_attr_release, }; +static void dimm_release(struct device *dev) +{ + /* + * Nothing to do, just unregister sysfs here. The mci + * device owns the data and will also release it. + */ +} + /* Create a DIMM object under specifed memory controller device */ static int edac_create_dimm_object(struct mem_ctl_info *mci, struct dimm_info *dimm) @@ -620,6 +617,7 @@ static int edac_create_dimm_object(struct mem_ctl_info *mci, dimm->mci = mci; dimm->dev.type = &dimm_attr_type; + dimm->dev.release = dimm_release; device_initialize(&dimm->dev); dimm->dev.parent = &mci->dev; @@ -659,7 +657,9 @@ static ssize_t mci_reset_counters_store(struct device *dev, const char *data, size_t count) { struct mem_ctl_info *mci = to_mci(dev); - int cnt, row, chan, i; + struct dimm_info *dimm; + int row, chan; + mci->ue_mc = 0; mci->ce_mc = 0; mci->ue_noinfo_count = 0; @@ -675,11 +675,9 @@ static ssize_t mci_reset_counters_store(struct device *dev, ri->channels[chan]->ce_count = 0; } - cnt = 1; - for (i = 0; i < mci->n_layers; i++) { - cnt *= mci->layers[i].size; - memset(mci->ce_per_layer[i], 0, cnt * sizeof(u32)); - memset(mci->ue_per_layer[i], 0, cnt * sizeof(u32)); + mci_for_each_dimm(mci, dimm) { + dimm->ue_count = 0; + dimm->ce_count = 0; } mci->start_time = jiffies; @@ -884,14 +882,8 @@ static const struct attribute_group *mci_attr_groups[] = { NULL }; -static void mci_attr_release(struct device *dev) -{ - /* release device with _edac_mc_free() */ -} - static const struct device_type mci_attr_type = { .groups = mci_attr_groups, - .release = mci_attr_release, }; /* @@ -910,8 +902,6 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, /* get the /sys/devices/system/edac subsys reference */ mci->dev.type = &mci_attr_type; - device_initialize(&mci->dev); - mci->dev.parent = mci_pdev; mci->dev.groups = groups; dev_set_name(&mci->dev, "mc%d", mci->mc_idx); @@ -921,7 +911,7 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, err = device_add(&mci->dev); if (err < 0) { edac_dbg(1, "failure: create device %s\n", dev_name(&mci->dev)); - put_device(&mci->dev); + /* no put_device() here, free mci with _edac_mc_free() */ return err; } @@ -937,24 +927,20 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, err = edac_create_dimm_object(mci, dimm); if (err) - goto fail_unregister_dimm; + goto fail; } #ifdef CONFIG_EDAC_LEGACY_SYSFS err = edac_create_csrow_objects(mci); if (err < 0) - goto fail_unregister_dimm; + goto fail; #endif edac_create_debugfs_nodes(mci); return 0; -fail_unregister_dimm: - mci_for_each_dimm(mci, dimm) { - if (device_is_registered(&dimm->dev)) - device_unregister(&dimm->dev); - } - device_unregister(&mci->dev); +fail: + edac_remove_sysfs_mci_device(mci); return err; } @@ -966,6 +952,9 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci) { struct dimm_info *dimm; + if (!device_is_registered(&mci->dev)) + return; + edac_dbg(0, "\n"); #ifdef CONFIG_EDAC_DEBUG @@ -976,17 +965,14 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci) #endif mci_for_each_dimm(mci, dimm) { - if (dimm->nr_pages == 0) + if (!device_is_registered(&dimm->dev)) continue; edac_dbg(1, "unregistering device %s\n", dev_name(&dimm->dev)); device_unregister(&dimm->dev); } -} -void edac_unregister_sysfs(struct mem_ctl_info *mci) -{ - edac_dbg(1, "unregistering device %s\n", dev_name(&mci->dev)); - device_unregister(&mci->dev); + /* only remove the device, but keep mci */ + device_del(&mci->dev); } static void mc_attr_release(struct device *dev) @@ -1000,9 +986,6 @@ static void mc_attr_release(struct device *dev) kfree(dev); } -static const struct device_type mc_attr_type = { - .release = mc_attr_release, -}; /* * Init/exit code for the module. Basically, creates/removes /sys/class/rc */ @@ -1015,11 +998,10 @@ int __init edac_mc_sysfs_init(void) return -ENOMEM; mci_pdev->bus = edac_get_sysfs_subsys(); - mci_pdev->type = &mc_attr_type; - device_initialize(mci_pdev); - dev_set_name(mci_pdev, "mc"); + mci_pdev->release = mc_attr_release; + mci_pdev->init_name = "mc"; - err = device_add(mci_pdev); + err = device_register(mci_pdev); if (err < 0) { edac_dbg(1, "failure: create device %s\n", dev_name(mci_pdev)); put_device(mci_pdev); diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h index 388427d378b1..aa1f91688eb8 100644 --- a/drivers/edac/edac_module.h +++ b/drivers/edac/edac_module.h @@ -28,7 +28,6 @@ void edac_mc_sysfs_exit(void); extern int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, const struct attribute_group **groups); extern void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci); -void edac_unregister_sysfs(struct mem_ctl_info *mci); extern int edac_get_log_ue(void); extern int edac_get_log_ce(void); extern int edac_get_panic_on_ue(void); diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index b99080d8a10c..cb3dab56a875 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -201,7 +201,6 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg) void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) { - enum hw_event_mc_err_type type; struct edac_raw_error_desc *e; struct mem_ctl_info *mci; struct ghes_edac_pvt *pvt; @@ -240,17 +239,17 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) switch (sev) { case GHES_SEV_CORRECTED: - type = HW_EVENT_ERR_CORRECTED; + e->type = HW_EVENT_ERR_CORRECTED; break; case GHES_SEV_RECOVERABLE: - type = HW_EVENT_ERR_UNCORRECTED; + e->type = HW_EVENT_ERR_UNCORRECTED; break; case GHES_SEV_PANIC: - type = HW_EVENT_ERR_FATAL; + e->type = HW_EVENT_ERR_FATAL; break; default: case GHES_SEV_NO: - type = HW_EVENT_ERR_INFO; + e->type = HW_EVENT_ERR_INFO; } edac_dbg(1, "error validation_bits: 0x%08llx\n", @@ -356,11 +355,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) mem_err->mem_dev_handle); index = get_dimm_smbios_index(mci, mem_err->mem_dev_handle); - if (index >= 0) { + if (index >= 0) e->top_layer = index; - e->enable_per_layer_report = true; - } - } if (p > e->location) *(p - 1) = '\0'; @@ -442,7 +438,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) if (p > pvt->other_detail) *(p - 1) = '\0'; - edac_raw_mc_handle_error(type, mci, e); + edac_raw_mc_handle_error(e); unlock: spin_unlock_irqrestore(&ghes_lock, flags); diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index ea980c556f2e..8874b7722b2f 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1239,7 +1239,7 @@ static int __init mce_amd_init(void) case 0x17: case 0x18: - pr_warn("Decoding supported only on Scalable MCA processors.\n"); + pr_warn_once("Decoding supported only on Scalable MCA processors.\n"); return -EINVAL; default: diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index 880ffd833718..12211dc040e8 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -477,16 +477,16 @@ static void handle_error(struct mem_ctl_info *mci, struct synps_ecc_status *p) if (p->ce_cnt) { pinf = &p->ceinfo; - if (!priv->p_data->quirks) { + if (priv->p_data->quirks & DDR_ECC_INTR_SUPPORT) { snprintf(priv->message, SYNPS_EDAC_MSG_SIZE, - "DDR ECC error type:%s Row %d Bank %d Col %d Bit Position: %d Data: 0x%08x", - "CE", pinf->row, pinf->bank, pinf->col, + "DDR ECC error type:%s Row %d Bank %d BankGroup Number %d Block Number %d Bit Position: %d Data: 0x%08x", + "CE", pinf->row, pinf->bank, + pinf->bankgrpnr, pinf->blknr, pinf->bitpos, pinf->data); } else { snprintf(priv->message, SYNPS_EDAC_MSG_SIZE, - "DDR ECC error type:%s Row %d Bank %d Col %d BankGroup Number %d Block Number %d Bit Position: %d Data: 0x%08x", + "DDR ECC error type:%s Row %d Bank %d Col %d Bit Position: %d Data: 0x%08x", "CE", pinf->row, pinf->bank, pinf->col, - pinf->bankgrpnr, pinf->blknr, pinf->bitpos, pinf->data); } @@ -497,15 +497,15 @@ static void handle_error(struct mem_ctl_info *mci, struct synps_ecc_status *p) if (p->ue_cnt) { pinf = &p->ueinfo; - if (!priv->p_data->quirks) { + if (priv->p_data->quirks & DDR_ECC_INTR_SUPPORT) { snprintf(priv->message, SYNPS_EDAC_MSG_SIZE, - "DDR ECC error type :%s Row %d Bank %d Col %d ", - "UE", pinf->row, pinf->bank, pinf->col); + "DDR ECC error type :%s Row %d Bank %d BankGroup Number %d Block Number %d", + "UE", pinf->row, pinf->bank, + pinf->bankgrpnr, pinf->blknr); } else { snprintf(priv->message, SYNPS_EDAC_MSG_SIZE, - "DDR ECC error type :%s Row %d Bank %d Col %d BankGroup Number %d Block Number %d", - "UE", pinf->row, pinf->bank, pinf->col, - pinf->bankgrpnr, pinf->blknr); + "DDR ECC error type :%s Row %d Bank %d Col %d ", + "UE", pinf->row, pinf->bank, pinf->col); } edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, |