Skip to content

Commit e23a7cd

Browse files
Talel Shenhar authored and suryasaimadhu committed
EDAC/al-mc-edac: Add Amazon's Annapurna Labs Memory Controller driver
The Amazon's Annapurna Labs Memory Controller EDAC supports ECC capability for error detection and correction (single-bit error correction, double-bit error detection). This commit introduces an EDAC driver for that capability.

[ bp: Remove "EDAC" string from Kconfig tristate as it is redundant. ]

Signed-off-by: Talel Shenhar <talel@amazon.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: James Morse <james.morse@arm.com>
Link: https://lkml.kernel.org/r/20200816185551.19108-3-talel@amazon.com
1 parent eb3411c commit e23a7cd

4 files changed

Lines changed: 369 additions & 0 deletions

File tree

MAINTAINERS

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -802,6 +802,13 @@ S: Maintained
802802
F: Documentation/devicetree/bindings/interrupt-controller/amazon,al-fic.txt
803803
F: drivers/irqchip/irq-al-fic.c
804804

805+
AMAZON ANNAPURNA LABS MEMORY CONTROLLER EDAC
806+
M: Talel Shenhar <talel@amazon.com>
807+
M: Talel Shenhar <talelshenhar@gmail.com>
808+
S: Maintained
809+
F: Documentation/devicetree/bindings/edac/amazon,al-mc-edac.yaml
810+
F: drivers/edac/al_mc_edac.c
811+
805812
AMAZON ANNAPURNA LABS THERMAL MMIO DRIVER
806813
M: Talel Shenhar <talel@amazon.com>
807814
S: Maintained

drivers/edac/Kconfig

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,13 @@ config EDAC_AMD64_ERROR_INJECTION
100100
In addition, there are two control files, inject_read and inject_write,
101101
which trigger the DRAM ECC Read and Write respectively.
102102

103+
config EDAC_AL_MC
104+
tristate "Amazon's Annapurna Lab Memory Controller"
105+
depends on (ARCH_ALPINE || COMPILE_TEST)
106+
help
107+
Support for error detection and correction for Amazon's Annapurna
108+
Labs Alpine chips which allow 1 bit correction and 2 bits detection.
109+
103110
config EDAC_AMD76X
104111
tristate "AMD 76x (760, 762, 768)"
105112
depends on PCI && X86_32

drivers/edac/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ obj-$(CONFIG_EDAC_GHES) += ghes_edac.o
2222
edac_mce_amd-y := mce_amd.o
2323
obj-$(CONFIG_EDAC_DECODE_MCE) += edac_mce_amd.o
2424

25+
obj-$(CONFIG_EDAC_AL_MC) += al_mc_edac.o
2526
obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o
2627
obj-$(CONFIG_EDAC_CPC925) += cpc925_edac.o
2728
obj-$(CONFIG_EDAC_I5000) += i5000_edac.o

drivers/edac/al_mc_edac.c

Lines changed: 354 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,354 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
/*
3+
* Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
*/
5+
#include <linux/bitfield.h>
6+
#include <linux/bitops.h>
7+
#include <linux/edac.h>
8+
#include <linux/of_irq.h>
9+
#include <linux/platform_device.h>
10+
#include <linux/spinlock.h>
11+
#include "edac_module.h"
12+
13+
/*
 * Register offsets of the memory controller ECC block. They are added to
 * the controller's mapped MMIO base when accessed.
 */
#define AL_MC_ECC_CFG			0x70
#define AL_MC_ECC_CLEAR			0x7c
#define AL_MC_ECC_ERR_COUNT		0x80

/* Correctable error (CE) capture registers: address, then syndrome */
#define AL_MC_ECC_CE_ADDR0		0x84
#define AL_MC_ECC_CE_ADDR1		0x88
#define AL_MC_ECC_CE_SYND0		0x8c
#define AL_MC_ECC_CE_SYND1		0x90
#define AL_MC_ECC_CE_SYND2		0x94

/* Uncorrectable error (UE) capture registers: address, then syndrome */
#define AL_MC_ECC_UE_ADDR0		0xa4
#define AL_MC_ECC_UE_ADDR1		0xa8
#define AL_MC_ECC_UE_SYND0		0xac
#define AL_MC_ECC_UE_SYND1		0xb0
#define AL_MC_ECC_UE_SYND2		0xb4

/* AL_MC_ECC_CFG fields */
#define AL_MC_ECC_CFG_SCRUB_DISABLED	BIT(4)

/* AL_MC_ECC_CLEAR fields, written to reset the counters / captured errors */
#define AL_MC_ECC_CLEAR_CE_ERR		BIT(0)
#define AL_MC_ECC_CLEAR_UE_ERR		BIT(1)
#define AL_MC_ECC_CLEAR_CE_COUNT	BIT(2)
#define AL_MC_ECC_CLEAR_UE_COUNT	BIT(3)

/* AL_MC_ECC_ERR_COUNT fields: CE count in the low half, UE in the high half */
#define AL_MC_ECC_ERR_COUNT_CE		GENMASK(15, 0)
#define AL_MC_ECC_ERR_COUNT_UE		GENMASK(31, 16)

/* AL_MC_ECC_CE_ADDR0 fields */
#define AL_MC_ECC_CE_ADDR0_ROW		GENMASK(17, 0)
#define AL_MC_ECC_CE_ADDR0_RANK		GENMASK(25, 24)

/* AL_MC_ECC_CE_ADDR1 fields */
#define AL_MC_ECC_CE_ADDR1_COLUMN	GENMASK(11, 0)
#define AL_MC_ECC_CE_ADDR1_BANK		GENMASK(18, 16)
#define AL_MC_ECC_CE_ADDR1_BG		GENMASK(25, 24)

/* AL_MC_ECC_UE_ADDR0 fields */
#define AL_MC_ECC_UE_ADDR0_ROW		GENMASK(17, 0)
#define AL_MC_ECC_UE_ADDR0_RANK		GENMASK(25, 24)

/* AL_MC_ECC_UE_ADDR1 fields */
#define AL_MC_ECC_UE_ADDR1_COLUMN	GENMASK(11, 0)
#define AL_MC_ECC_UE_ADDR1_BANK		GENMASK(18, 16)
#define AL_MC_ECC_UE_ADDR1_BG		GENMASK(25, 24)

/* Driver / module name */
#define DRV_NAME			"al_mc_edac"

/* Size of the on-stack buffer an error description is formatted into */
#define AL_MC_EDAC_MSG_MAX		256
55+
56+
struct al_mc_edac {
57+
void __iomem *mmio_base;
58+
spinlock_t lock;
59+
int irq_ce;
60+
int irq_ue;
61+
};
62+
63+
/*
 * Format a one-line, human readable description of an ECC event.
 *
 * @message:     destination buffer
 * @buffer_size: size of @message in bytes; output is bounded by snprintf()
 * @type:        HW_EVENT_ERR_UNCORRECTED is tagged "UE", anything else "CE"
 * @rank, @row, @bg, @bank, @column: decoded location of the failing access
 * @syn0, @syn1, @syn2: raw contents of the three syndrome registers
 */
static void prepare_msg(char *message, size_t buffer_size,
			enum hw_event_mc_err_type type,
			u8 rank, u32 row, u8 bg, u8 bank, u16 column,
			u32 syn0, u32 syn1, u32 syn2)
{
	const char *tag = "CE";

	if (type == HW_EVENT_ERR_UNCORRECTED)
		tag = "UE";

	snprintf(message, buffer_size,
		 "%s rank=0x%x row=0x%x bg=0x%x bank=0x%x col=0x%x syn0: 0x%x syn1: 0x%x syn2: 0x%x",
		 tag, rank, row, bg, bank, column, syn0, syn1, syn2);
}
73+
74+
static int handle_ce(struct mem_ctl_info *mci)
75+
{
76+
u32 eccerrcnt, ecccaddr0, ecccaddr1, ecccsyn0, ecccsyn1, ecccsyn2, row;
77+
struct al_mc_edac *al_mc = mci->pvt_info;
78+
char msg[AL_MC_EDAC_MSG_MAX];
79+
u16 ce_count, column;
80+
unsigned long flags;
81+
u8 rank, bg, bank;
82+
83+
eccerrcnt = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_ERR_COUNT);
84+
ce_count = FIELD_GET(AL_MC_ECC_ERR_COUNT_CE, eccerrcnt);
85+
if (!ce_count)
86+
return 0;
87+
88+
ecccaddr0 = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_CE_ADDR0);
89+
ecccaddr1 = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_CE_ADDR1);
90+
ecccsyn0 = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_CE_SYND0);
91+
ecccsyn1 = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_CE_SYND1);
92+
ecccsyn2 = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_CE_SYND2);
93+
94+
writel_relaxed(AL_MC_ECC_CLEAR_CE_COUNT | AL_MC_ECC_CLEAR_CE_ERR,
95+
al_mc->mmio_base + AL_MC_ECC_CLEAR);
96+
97+
dev_dbg(mci->pdev, "eccuaddr0=0x%08x eccuaddr1=0x%08x\n",
98+
ecccaddr0, ecccaddr1);
99+
100+
rank = FIELD_GET(AL_MC_ECC_CE_ADDR0_RANK, ecccaddr0);
101+
row = FIELD_GET(AL_MC_ECC_CE_ADDR0_ROW, ecccaddr0);
102+
103+
bg = FIELD_GET(AL_MC_ECC_CE_ADDR1_BG, ecccaddr1);
104+
bank = FIELD_GET(AL_MC_ECC_CE_ADDR1_BANK, ecccaddr1);
105+
column = FIELD_GET(AL_MC_ECC_CE_ADDR1_COLUMN, ecccaddr1);
106+
107+
prepare_msg(msg, sizeof(msg), HW_EVENT_ERR_CORRECTED,
108+
rank, row, bg, bank, column,
109+
ecccsyn0, ecccsyn1, ecccsyn2);
110+
111+
spin_lock_irqsave(&al_mc->lock, flags);
112+
edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
113+
ce_count, 0, 0, 0, 0, 0, -1, mci->ctl_name, msg);
114+
spin_unlock_irqrestore(&al_mc->lock, flags);
115+
116+
return ce_count;
117+
}
118+
119+
static int handle_ue(struct mem_ctl_info *mci)
120+
{
121+
u32 eccerrcnt, eccuaddr0, eccuaddr1, eccusyn0, eccusyn1, eccusyn2, row;
122+
struct al_mc_edac *al_mc = mci->pvt_info;
123+
char msg[AL_MC_EDAC_MSG_MAX];
124+
u16 ue_count, column;
125+
unsigned long flags;
126+
u8 rank, bg, bank;
127+
128+
eccerrcnt = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_ERR_COUNT);
129+
ue_count = FIELD_GET(AL_MC_ECC_ERR_COUNT_UE, eccerrcnt);
130+
if (!ue_count)
131+
return 0;
132+
133+
eccuaddr0 = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_UE_ADDR0);
134+
eccuaddr1 = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_UE_ADDR1);
135+
eccusyn0 = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_UE_SYND0);
136+
eccusyn1 = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_UE_SYND1);
137+
eccusyn2 = readl_relaxed(al_mc->mmio_base + AL_MC_ECC_UE_SYND2);
138+
139+
writel_relaxed(AL_MC_ECC_CLEAR_UE_COUNT | AL_MC_ECC_CLEAR_UE_ERR,
140+
al_mc->mmio_base + AL_MC_ECC_CLEAR);
141+
142+
dev_dbg(mci->pdev, "eccuaddr0=0x%08x eccuaddr1=0x%08x\n",
143+
eccuaddr0, eccuaddr1);
144+
145+
rank = FIELD_GET(AL_MC_ECC_UE_ADDR0_RANK, eccuaddr0);
146+
row = FIELD_GET(AL_MC_ECC_UE_ADDR0_ROW, eccuaddr0);
147+
148+
bg = FIELD_GET(AL_MC_ECC_UE_ADDR1_BG, eccuaddr1);
149+
bank = FIELD_GET(AL_MC_ECC_UE_ADDR1_BANK, eccuaddr1);
150+
column = FIELD_GET(AL_MC_ECC_UE_ADDR1_COLUMN, eccuaddr1);
151+
152+
prepare_msg(msg, sizeof(msg), HW_EVENT_ERR_UNCORRECTED,
153+
rank, row, bg, bank, column,
154+
eccusyn0, eccusyn1, eccusyn2);
155+
156+
spin_lock_irqsave(&al_mc->lock, flags);
157+
edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
158+
ue_count, 0, 0, 0, 0, 0, -1, mci->ctl_name, msg);
159+
spin_unlock_irqrestore(&al_mc->lock, flags);
160+
161+
return ue_count;
162+
}
163+
164+
static void al_mc_edac_check(struct mem_ctl_info *mci)
165+
{
166+
struct al_mc_edac *al_mc = mci->pvt_info;
167+
168+
if (al_mc->irq_ue <= 0)
169+
handle_ue(mci);
170+
171+
if (al_mc->irq_ce <= 0)
172+
handle_ce(mci);
173+
}
174+
175+
static irqreturn_t al_mc_edac_irq_handler_ue(int irq, void *info)
176+
{
177+
struct platform_device *pdev = info;
178+
struct mem_ctl_info *mci = platform_get_drvdata(pdev);
179+
180+
if (handle_ue(mci))
181+
return IRQ_HANDLED;
182+
return IRQ_NONE;
183+
}
184+
185+
static irqreturn_t al_mc_edac_irq_handler_ce(int irq, void *info)
186+
{
187+
struct platform_device *pdev = info;
188+
struct mem_ctl_info *mci = platform_get_drvdata(pdev);
189+
190+
if (handle_ce(mci))
191+
return IRQ_HANDLED;
192+
return IRQ_NONE;
193+
}
194+
195+
/*
 * Derive the scrub mode from the ECC configuration register.
 *
 * Return: SCRUB_NONE when the scrub-disabled bit is set, SCRUB_HW_SRC
 * otherwise.
 */
static enum scrub_type get_scrub_mode(void __iomem *mmio_base)
{
	u32 cfg = readl(mmio_base + AL_MC_ECC_CFG);

	return (cfg & AL_MC_ECC_CFG_SCRUB_DISABLED) ? SCRUB_NONE : SCRUB_HW_SRC;
}
206+
207+
/* devres action: free the mem_ctl_info passed as @data. */
static void devm_al_mc_edac_free(void *data)
{
	struct mem_ctl_info *mci = data;

	edac_mc_free(mci);
}
211+
212+
/* devres action: unregister the memory controller of the device in @data. */
static void devm_al_mc_edac_del(void *data)
{
	struct device *dev = data;

	edac_mc_del_mc(dev);
}
216+
217+
static int al_mc_edac_probe(struct platform_device *pdev)
218+
{
219+
struct edac_mc_layer layers[1];
220+
struct mem_ctl_info *mci;
221+
struct al_mc_edac *al_mc;
222+
void __iomem *mmio_base;
223+
struct dimm_info *dimm;
224+
int ret;
225+
226+
mmio_base = devm_platform_ioremap_resource(pdev, 0);
227+
if (IS_ERR(mmio_base)) {
228+
dev_err(&pdev->dev, "failed to ioremap memory (%ld)\n",
229+
PTR_ERR(mmio_base));
230+
return PTR_ERR(mmio_base);
231+
}
232+
233+
layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
234+
layers[0].size = 1;
235+
layers[0].is_virt_csrow = false;
236+
mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers,
237+
sizeof(struct al_mc_edac));
238+
if (!mci)
239+
return -ENOMEM;
240+
241+
ret = devm_add_action(&pdev->dev, devm_al_mc_edac_free, mci);
242+
if (ret) {
243+
edac_mc_free(mci);
244+
return ret;
245+
}
246+
247+
platform_set_drvdata(pdev, mci);
248+
al_mc = mci->pvt_info;
249+
250+
al_mc->mmio_base = mmio_base;
251+
252+
al_mc->irq_ue = of_irq_get_byname(pdev->dev.of_node, "ue");
253+
if (al_mc->irq_ue <= 0)
254+
dev_dbg(&pdev->dev,
255+
"no IRQ defined for UE - falling back to polling\n");
256+
257+
al_mc->irq_ce = of_irq_get_byname(pdev->dev.of_node, "ce");
258+
if (al_mc->irq_ce <= 0)
259+
dev_dbg(&pdev->dev,
260+
"no IRQ defined for CE - falling back to polling\n");
261+
262+
/*
263+
* In case both interrupts (ue/ce) are to be found, use interrupt mode.
264+
* In case none of the interrupt are foud, use polling mode.
265+
* In case only one interrupt is found, use interrupt mode for it but
266+
* keep polling mode enable for the other.
267+
*/
268+
if (al_mc->irq_ue <= 0 || al_mc->irq_ce <= 0) {
269+
edac_op_state = EDAC_OPSTATE_POLL;
270+
mci->edac_check = al_mc_edac_check;
271+
} else {
272+
edac_op_state = EDAC_OPSTATE_INT;
273+
}
274+
275+
spin_lock_init(&al_mc->lock);
276+
277+
mci->mtype_cap = MEM_FLAG_DDR3 | MEM_FLAG_DDR4;
278+
mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED;
279+
mci->edac_cap = EDAC_FLAG_SECDED;
280+
mci->mod_name = DRV_NAME;
281+
mci->ctl_name = "al_mc";
282+
mci->pdev = &pdev->dev;
283+
mci->scrub_mode = get_scrub_mode(mmio_base);
284+
285+
dimm = *mci->dimms;
286+
dimm->grain = 1;
287+
288+
ret = edac_mc_add_mc(mci);
289+
if (ret < 0) {
290+
dev_err(&pdev->dev,
291+
"fail to add memory controller device (%d)\n",
292+
ret);
293+
return ret;
294+
}
295+
296+
ret = devm_add_action(&pdev->dev, devm_al_mc_edac_del, &pdev->dev);
297+
if (ret) {
298+
edac_mc_del_mc(&pdev->dev);
299+
return ret;
300+
}
301+
302+
if (al_mc->irq_ue > 0) {
303+
ret = devm_request_irq(&pdev->dev,
304+
al_mc->irq_ue,
305+
al_mc_edac_irq_handler_ue,
306+
IRQF_SHARED,
307+
pdev->name,
308+
pdev);
309+
if (ret != 0) {
310+
dev_err(&pdev->dev,
311+
"failed to request UE IRQ %d (%d)\n",
312+
al_mc->irq_ue, ret);
313+
return ret;
314+
}
315+
}
316+
317+
if (al_mc->irq_ce > 0) {
318+
ret = devm_request_irq(&pdev->dev,
319+
al_mc->irq_ce,
320+
al_mc_edac_irq_handler_ce,
321+
IRQF_SHARED,
322+
pdev->name,
323+
pdev);
324+
if (ret != 0) {
325+
dev_err(&pdev->dev,
326+
"failed to request CE IRQ %d (%d)\n",
327+
al_mc->irq_ce, ret);
328+
return ret;
329+
}
330+
}
331+
332+
return 0;
333+
}
334+
335+
static const struct of_device_id al_mc_edac_of_match[] = {
336+
{ .compatible = "amazon,al-mc-edac", },
337+
{},
338+
};
339+
340+
MODULE_DEVICE_TABLE(of, al_mc_edac_of_match);
341+
342+
static struct platform_driver al_mc_edac_driver = {
343+
.probe = al_mc_edac_probe,
344+
.driver = {
345+
.name = DRV_NAME,
346+
.of_match_table = al_mc_edac_of_match,
347+
},
348+
};
349+
350+
module_platform_driver(al_mc_edac_driver);
351+
352+
MODULE_LICENSE("GPL v2");
353+
MODULE_AUTHOR("Talel Shenhar");
354+
MODULE_DESCRIPTION("Amazon's Annapurna Lab's Memory Controller EDAC Driver");

0 commit comments

Comments
 (0)