From bff7b812465a797bc563e9938fa11316fcd2ac0d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Aug 2011 19:25:24 +0200 Subject: EDAC, MCE, AMD: Print CPU number when reporting the error Currently, correctable ECCs go through mcelog and do not print the scary MCE banner. In that case, however, reporting the core where the CECC happened is important information so dump it along with the decoded string albeit at risk of having a minor redundancy. Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 795cfbc0bf50..5bfe6997d9ff 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -769,8 +769,8 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (amd_filter_mce(m)) return NOTIFY_STOP; - pr_emerg(HW_ERR "MC%d_STATUS[%s|%s|%s|%s|%s", - m->bank, + pr_emerg(HW_ERR "CPU:%d MC%d_STATUS[%s|%s|%s|%s|%s", + m->extcpu, m->bank, ((m->status & MCI_STATUS_OVER) ? "Over" : "-"), ((m->status & MCI_STATUS_UC) ? "UE" : "CE"), ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), -- cgit v1.2.3 From 086be786ca10af7a9783ab06a9b5594c2c6facbf Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 30 Sep 2011 16:34:44 +0200 Subject: EDAC, MCE, AMD: Print valid addr when reporting an error The MCi_STATUS bank has a AddrV bit which, when set, denotes that the corresponding MCi_ADDR MSR contains a valid address belonging to the MCE currently being reported. Dump it since it is definitely relevant information. Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 5bfe6997d9ff..a6d25dac8a88 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -769,7 +769,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (amd_filter_mce(m)) return NOTIFY_STOP; - pr_emerg(HW_ERR "CPU:%d MC%d_STATUS[%s|%s|%s|%s|%s", + pr_emerg(HW_ERR "CPU:%d\tMC%d_STATUS[%s|%s|%s|%s|%s", m->extcpu, m->bank, ((m->status & MCI_STATUS_OVER) ? "Over" : "-"), ((m->status & MCI_STATUS_UC) ? "UE" : "CE"), @@ -789,6 +789,8 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) pr_cont("]: 0x%016llx\n", m->status); + if (m->status & MCI_STATUS_ADDRV) + pr_emerg(HW_ERR "\tMC%d_ADDR: 0x%016llx\n", m->bank, m->addr); switch (m->bank) { case 0: -- cgit v1.2.3 From 295d8cda2689a74ae88bcece7b4cfe0bf8bf9a91 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 24 Aug 2011 17:47:11 +0200 Subject: EDAC, MCE, AMD: Drop local coreid reporting MCE decoding code is reporting the core which encountered the error unconditionally now so drop this piece. Besides, it reported the coreid in the local processor package which is not that valuable as a datapoint. Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index a6d25dac8a88..b81c5da542c0 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -597,26 +597,8 @@ void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) struct cpuinfo_x86 *c = &boot_cpu_data; u16 ec = EC(m->status); u8 xec = XEC(m->status, 0x1f); - u32 nbsh = (u32)(m->status >> 32); - int core = -1; - pr_emerg(HW_ERR "Northbridge Error (node %d", node_id); - - /* F10h, revD can disable ErrCpu[3:0] through ErrCpuVal */ - if (c->x86 == 0x10 && c->x86_model > 7) { - if (nbsh & NBSH_ERR_CPU_VAL) - core = nbsh & nb_err_cpumask; - } else { - u8 assoc_cpus = nbsh & nb_err_cpumask; - - if (assoc_cpus > 0) - core = fls(assoc_cpus) - 1; - } - - if (core >= 0) - pr_cont(", core %d): ", core); - else - pr_cont("): "); + pr_emerg(HW_ERR "Northbridge Error (node %d): ", node_id); switch (xec) { case 0x2: -- cgit v1.2.3 From b0b07a2bd4fbb6198d4e7142337214eeb77c417a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 24 Aug 2011 18:44:22 +0200 Subject: EDAC, MCE, AMD: Simplify NB MCE decoder interface Drop third nbcfg argument which is old remains and not required anymore. No functionality change. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 6 ++---- drivers/edac/mce_amd.c | 20 ++++++++++---------- drivers/edac/mce_amd.h | 6 +++--- 3 files changed, 15 insertions(+), 17 deletions(-) (limited to 'drivers') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 9a8bebcf6b17..9bf0b6228529 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1953,11 +1953,9 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, amd64_handle_ue(mci, m); } -void amd64_decode_bus_error(int node_id, struct mce *m, u32 nbcfg) +void amd64_decode_bus_error(int node_id, struct mce *m) { - struct mem_ctl_info *mci = mcis[node_id]; - - __amd64_decode_bus_error(mci, m); + __amd64_decode_bus_error(mcis[node_id], m); } /* diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index b81c5da542c0..d0864d9c38ad 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -9,7 +9,7 @@ static u8 xec_mask = 0xf; static u8 nb_err_cpumask = 0xf; static bool report_gart_errors; -static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); +static void (*nb_bus_decoder)(int node_id, struct mce *m); void amd_report_gart_errors(bool v) { @@ -17,13 +17,13 @@ void amd_report_gart_errors(bool v) } EXPORT_SYMBOL_GPL(amd_report_gart_errors); -void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)) +void amd_register_ecc_decoder(void (*f)(int, struct mce *)) { nb_bus_decoder = f; } EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); -void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)) +void amd_unregister_ecc_decoder(void (*f)(int, struct mce *)) { if (nb_bus_decoder) { WARN_ON(nb_bus_decoder != f); @@ -592,11 +592,12 @@ static bool nb_noop_mce(u16 ec, u8 xec) return false; } -void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) +void amd_decode_nb_mce(struct mce *m) { struct cpuinfo_x86 *c = &boot_cpu_data; - u16 ec = EC(m->status); - u8 xec = XEC(m->status, 0x1f); + int node_id = amd_get_nb_id(m->extcpu); + u16 ec = EC(m->status); + u8 xec = XEC(m->status, 0x1f); pr_emerg(HW_ERR "Northbridge Error (node %d): ", node_id); @@ -630,7 +631,7 @@ void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x15) if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder) - nb_bus_decoder(node_id, m, nbcfg); + nb_bus_decoder(node_id, m); return; @@ -746,7 +747,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; struct cpuinfo_x86 *c = &boot_cpu_data; - int node, ecc; + int ecc; if (amd_filter_mce(m)) return NOTIFY_STOP; @@ -795,8 +796,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) break; case 4: - node = amd_get_nb_id(m->extcpu); - amd_decode_nb_mce(node, m, 0); + amd_decode_nb_mce(m); break; case 5: diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 795a3206acf5..0106747e240c 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -86,9 +86,9 @@ struct amd_decoder_ops { }; void amd_report_gart_errors(bool); -void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)); -void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)); -void amd_decode_nb_mce(int, struct mce *, u32); +void amd_register_ecc_decoder(void (*f)(int, struct mce *)); +void amd_unregister_ecc_decoder(void (*f)(int, struct mce *)); +void amd_decode_nb_mce(struct mce *); int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data); #endif /* _EDAC_MCE_AMD_H */ -- cgit v1.2.3 From 73ba85937b2a07b6401ba0b7ca06a112762de9f7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 19 Sep 2011 17:34:45 +0200 Subject: amd64_edac: Add a fix for Erratum 505 When accessing the scrub rate control register (F3x58) on F15h, the DRAM controller selector (F1x10C[DctCfgSel]) has to point to DCT0 so that the scrub rate configuration can take effect. See Erratum 505 in the AMD F15h revision guide for more details. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 9bf0b6228529..367756a48ebe 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -114,10 +114,22 @@ static int f10_read_dct_pci_cfg(struct amd64_pvt *pvt, int addr, u32 *val, return __amd64_read_pci_cfg_dword(pvt->F2, addr, val, func); } +/* + * Select DCT to which PCI cfg accesses are routed + */ +static void f15h_select_dct(struct amd64_pvt *pvt, u8 dct) +{ + u32 reg = 0; + + amd64_read_pci_cfg(pvt->F1, DCT_CFG_SEL, ®); + reg &= 0xfffffffe; + reg |= dct; + amd64_write_pci_cfg(pvt->F1, DCT_CFG_SEL, reg); +} + static int f15_read_dct_pci_cfg(struct amd64_pvt *pvt, int addr, u32 *val, const char *func) { - u32 reg = 0; u8 dct = 0; if (addr >= 0x140 && addr <= 0x1a0) { @@ -125,10 +137,7 @@ static int f15_read_dct_pci_cfg(struct amd64_pvt *pvt, int addr, u32 *val, addr -= 0x100; } - amd64_read_pci_cfg(pvt->F1, DCT_CFG_SEL, ®); - reg &= 0xfffffffe; - reg |= dct; - amd64_write_pci_cfg(pvt->F1, DCT_CFG_SEL, reg); + f15h_select_dct(pvt, dct); return __amd64_read_pci_cfg_dword(pvt->F2, addr, val, func); } @@ -198,6 +207,10 @@ static int amd64_set_scrub_rate(struct mem_ctl_info *mci, u32 bw) if (boot_cpu_data.x86 == 0xf) min_scrubrate = 0x0; + /* F15h Erratum #505 */ + if (boot_cpu_data.x86 == 0x15) + f15h_select_dct(pvt, 0); + return __amd64_set_scrub_rate(pvt->F3, bw, min_scrubrate); } @@ -207,6 +220,10 @@ static int amd64_get_scrub_rate(struct mem_ctl_info *mci) u32 scrubval = 0; int i, retval = -EINVAL; + /* F15h Erratum #505 */ + if (boot_cpu_data.x86 == 0x15) + f15h_select_dct(pvt, 0); + amd64_read_pci_cfg(pvt->F3, SCRCTRL, &scrubval); scrubval = scrubval & 0x001F; -- cgit v1.2.3 From 1f6189ed18cbd99d90bffdbc76c3adc54418b2fd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 6 Oct 2011 02:30:25 -0400 Subject: amd64_edac: Cleanup return type of amd64_determine_edac_cap() Sparse complains that edac_cap was declared as dev_type and we are returning edac_type. Historically, edac_type was correct but since then we have changed it to return a bit field. Signed-off-by: Dan Carpenter Link: http://lkml.kernel.org/r/20111006063025.GA2615@mwanda Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 367756a48ebe..c9eee6d33e9a 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -768,10 +768,10 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *, u16); * Determine if the DIMMs have ECC enabled. ECC is enabled ONLY if all the DIMMs * are ECC capable. */ -static enum edac_type amd64_determine_edac_cap(struct amd64_pvt *pvt) +static unsigned long amd64_determine_edac_cap(struct amd64_pvt *pvt) { u8 bit; - enum dev_type edac_cap = EDAC_FLAG_NONE; + unsigned long edac_cap = EDAC_FLAG_NONE; bit = (boot_cpu_data.x86 > 0xf || pvt->ext_model >= K8_REV_F) ? 19 -- cgit v1.2.3