Mirror of https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/
Merge tag 'edac_updates_for_v6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras

Pull EDAC updates from Borislav Petkov:

- Add infrastructure support to EDAC in order to be able to register
  memory scrubbing RAS functionality with the kernel and expose sysfs
  nodes to control such scrubbing functionality. The main use case is
  CXL devices which provide different scrubbers for their built-in
  memories so that tools like rasdaemon can configure and control
  memory scrubbing and other, more advanced RAS functionality (Shiju
  Jose and Jonathan Cameron)

- Add support to ie31200_edac for client SoCs like Raptor Lake-S which
  have multiple memory controllers and out-of-band ECC capability
  (Qiuxu Zhuo)

- The usual round of cleanups, simplifications and fixlets

* tag 'edac_updates_for_v6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras: (25 commits)
  MAINTAINERS: Add a secondary maintainer for bluefield_edac
  EDAC/ie31200: Switch Raptor Lake-S to interrupt mode
  EDAC/ie31200: Add Intel Raptor Lake-S SoCs support
  EDAC/ie31200: Break up ie31200_probe1()
  EDAC/ie31200: Fold the two channel loops into one loop
  EDAC/ie31200: Make struct dimm_data contain decoded information
  EDAC/ie31200: Make the memory controller resources configurable
  EDAC/ie31200: Simplify the pci_device_id table
  EDAC/ie31200: Fix the 3rd parameter name of *populate_dimm_info()
  EDAC/ie31200: Fix the error path order of ie31200_init()
  EDAC/ie31200: Fix the DIMM size mask for several SoCs
  EDAC/ie31200: Fix the size of EDAC_MC_LAYER_CHIP_SELECT layer
  EDAC/device: Fix dev_set_name() format string
  EDAC/pnd2: Make read-only const array intlv static
  EDAC/igen6: Constify struct res_config
  EDAC/amd64: Simplify return statement in dct_ecc_enabled()
  EDAC: Update memory repair control interface for memory sparing feature
  EDAC: Add a memory repair control feature
  EDAC: Use string choice helper functions
  EDAC: Add a Error Check Scrub control feature
  ...
commit ae8371a46e
74  Documentation/ABI/testing/sysfs-edac-ecs  Normal file
@@ -0,0 +1,74 @@
What:		/sys/bus/edac/devices/<dev-name>/ecs_fruX
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		The sysfs EDAC bus devices /<dev-name>/ecs_fruX subdirectory
		pertains to the memory media ECS (Error Check Scrub) control
		feature, where the <dev-name> directory corresponds to a
		device registered with the EDAC device driver for the ECS
		feature. /ecs_fruX belongs to the media FRUs (Field
		Replaceable Units) under the memory device.

		The sysfs ECS attr nodes are only present if the parent
		driver has implemented the corresponding attr callback
		function and provided the necessary operations to the EDAC
		device driver during registration.

What:		/sys/bus/edac/devices/<dev-name>/ecs_fruX/log_entry_type
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RW) The log entry type of how the DDR5 ECS log is reported.

		- 0 - per DRAM.

		- 1 - per memory media FRU.

		- All other values are reserved.

What:		/sys/bus/edac/devices/<dev-name>/ecs_fruX/mode
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RW) The mode of how the DDR5 ECS counts the errors.
		The error count is tracked based on one of two modes
		selected by the DDR5 ECS Control Feature - Codeword mode
		and Row Count mode. If the ECS is under Codeword mode, the
		error count increments each time a codeword with check
		bit errors is detected. If the ECS is under Row Count mode,
		the error counter increments each time a row with check
		bit errors is detected.

		- 0 - ECS counts rows in the memory media that have ECC
		  errors.

		- 1 - ECS counts codewords with errors, specifically, it
		  counts the number of ECC-detected errors in the memory
		  media.

		- All other values are reserved.

What:		/sys/bus/edac/devices/<dev-name>/ecs_fruX/reset
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(WO) Reset the ECS ECC counter.

		- 1 - reset the ECC counter to its default value.

		- All other values are reserved.

What:		/sys/bus/edac/devices/<dev-name>/ecs_fruX/threshold
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RW) DDR5 ECS threshold count per gigabits of memory cells.
		The ECS error count is subject to the ECS Threshold count
		per Gbit, which masks error counts less than the Threshold.

		Supported values are 256, 1024 and 4096.

		All other values are reserved.
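For illustration, a minimal userspace sketch of driving these attributes follows. The device name cxl_mem0 and FRU index 0 are assumptions, and the nodes only exist if the parent driver implemented the corresponding callbacks::

	#include <stdio.h>
	#include <stdlib.h>

	/* Write one value, as a plain string, to a sysfs attribute. */
	static void sysfs_write(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f) {
			perror(path);
			exit(1);
		}
		fprintf(f, "%s\n", val);
		fclose(f);
	}

	int main(void)
	{
		/* Hypothetical device/FRU; substitute a real <dev-name>/ecs_fruX. */
		const char *dir = "/sys/bus/edac/devices/cxl_mem0/ecs_fru0";
		char path[256];

		snprintf(path, sizeof(path), "%s/mode", dir);
		sysfs_write(path, "1");		/* count codewords with errors */
		snprintf(path, sizeof(path), "%s/threshold", dir);
		sysfs_write(path, "1024");	/* one of 256, 1024, 4096 */
		snprintf(path, sizeof(path), "%s/reset", dir);
		sysfs_write(path, "1");		/* reset the ECC counter */
		return 0;
	}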
206  Documentation/ABI/testing/sysfs-edac-memory-repair  Normal file
@@ -0,0 +1,206 @@
What:		/sys/bus/edac/devices/<dev-name>/mem_repairX
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		The sysfs EDAC bus devices /<dev-name>/mem_repairX
		subdirectory pertains to the memory media repair features
		control, such as PPR (Post Package Repair), memory sparing
		etc, where the <dev-name> directory corresponds to a device
		registered with the EDAC device driver for the memory repair
		features.

		Post Package Repair is a maintenance operation which requests
		the memory device to perform a repair operation on its media.
		It is a memory self-healing feature that fixes a failing
		memory location by replacing it with a spare row in a DRAM
		device. For example, a CXL memory device with DRAM components
		that support PPR features may implement PPR maintenance
		operations. DRAM components may support two types of PPR
		functions: hard PPR, for a permanent row repair, and soft
		PPR, for a temporary row repair. Soft PPR may be much faster
		than hard PPR, but the repair is lost with a power cycle.

		The sysfs attribute nodes for a repair feature are only
		present if the parent driver has implemented the
		corresponding attr callback function and provided the
		necessary operations to the EDAC device driver during
		registration.

		In some states of system configuration (e.g. before address
		decoders have been configured), memory devices (e.g. CXL)
		may not have an active mapping in the main host physical
		address map. As such, the memory to repair must be
		identified by a device-specific physical addressing scheme
		using a device physical address (DPA). The DPA and other
		control attributes to use will be presented in related error
		records.

What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/repair_type
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RO) Memory repair type, e.g. post package repair, memory
		sparing etc. Valid values are:

		- ppr - Post package repair.

		- cacheline-sparing

		- row-sparing

		- bank-sparing

		- rank-sparing

		- All other values are reserved.

What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/persist_mode
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RW) Get/Set the current persist repair mode set for a
		repair function. Depending on the memory repair function,
		the persist repair mode supported by the device is either
		temporary, which is lost with a power cycle, or permanent.
		Valid values are:

		- 0 - Soft memory repair (temporary repair).

		- 1 - Hard memory repair (permanent repair).

		- All other values are reserved.

What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/repair_safe_when_in_use
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RO) True if memory media is accessible and data is retained
		during the memory repair operation.
		The data may not be retained and memory requests may not be
		correctly processed during a repair operation. In such a
		case the repair operation cannot be executed at runtime; the
		memory must be taken offline.

What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/hpa
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RW) Host Physical Address (HPA) of the memory to repair.
		The HPA to use will be provided in related error records.

What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/dpa
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RW) Device Physical Address (DPA) of the memory to repair.
		The specific DPA to use will be provided in related error
		records.

		In some states of system configuration (e.g. before address
		decoders have been configured), memory devices (e.g. CXL)
		may not have an active mapping in the main host physical
		address map. As such, the memory to repair must be
		identified by a device-specific physical addressing scheme
		using a DPA. The DPA to use will be presented in related
		error records.

What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/nibble_mask
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RW) Nibble mask of the memory to repair.
		The nibble mask identifies one or more nibbles in error on
		the memory bus that produced the error event. Nibble Mask
		bit 0 shall be set if nibble 0 on the memory bus produced
		the event, etc. For example, for CXL PPR and sparing, a
		nibble mask bit set to 1 indicates the request to perform
		the repair operation in the specific device. All nibble
		mask bits set to 1 indicate the request to perform the
		operation in all devices. E.g. for CXL memory repair, the
		specific value of the nibble mask to use will be provided
		in related error records. For more details, see the nibble
		mask field in CXL spec ver 3.1, section 8.2.9.7.1.2,
		Table 8-103 (soft PPR), section 8.2.9.7.1.3, Table 8-104
		(hard PPR) and section 8.2.9.7.1.4, Table 8-105 (memory
		sparing).

What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/min_hpa
What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/max_hpa
What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/min_dpa
What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/max_dpa
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RW) The supported range of the memory address that is to
		be repaired. The memory device may give the supported range
		of attributes to use; this depends on the memory device and
		the portion of memory to repair.
		Userspace may receive the specific value of the attributes
		to use for a repair operation from the memory device via
		related error records and trace events, e.g. CXL DRAM and
		CXL general media error records for CXL memory devices.

What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/bank_group
What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/bank
What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/rank
What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/row
What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/column
What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/channel
What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/sub_channel
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RW) The control attributes for the memory to be repaired.
		The specific value of the attributes to use depends on the
		portion of memory to repair and will be reported to the host
		in related error records and be available to userspace
		in trace events, such as CXL DRAM and CXL general media
		error records of CXL memory devices.

		When reading back these attributes, the current value of
		the memory requested to be repaired is returned.

		bank_group - The bank group of the memory to repair.

		bank - The bank number of the memory to repair.

		rank - The rank of the memory to repair. Rank is defined as
		a set of memory devices on a channel that together execute
		a transaction.

		row - The row number of the memory to repair.

		column - The column number of the memory to repair.

		channel - The channel of the memory to repair. Channel is
		defined as an interface that can be independently accessed
		for a transaction.

		sub_channel - The subchannel of the memory to repair.

		The requirement to set these attributes varies based on the
		repair function. The attributes in sysfs are not present
		unless required for a repair function.

		For example, for CXL spec ver 3.1, Section 8.2.9.7.1.2
		Table 8-103 (soft PPR) and Section 8.2.9.7.1.3 Table 8-104
		(hard PPR) operations, these attributes do not need to be
		set. For CXL spec ver 3.1, Section 8.2.9.7.1.4 Table 8-105
		(memory sparing), these attributes must be set based on the
		memory sparing granularity.

What:		/sys/bus/edac/devices/<dev-name>/mem_repairX/repair
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(WO) Issue the memory repair operation for the specified
		memory repair attributes. The operation may fail if
		resources are insufficient based on the requirements of the
		memory device and repair function.

		- 1 - Issue the repair operation.

		- All other values are reserved.
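As a worked illustration, a hedged userspace sketch of issuing a repair follows. The instance name cxl_mem0/mem_repair0 and the DPA/nibble-mask values are placeholders; in practice they come from a related error record::

	#include <stdio.h>
	#include <stdlib.h>

	/* Write one value, as a plain string, to a sysfs attribute. */
	static void sysfs_write(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f) {
			perror(path);
			exit(1);
		}
		fprintf(f, "%s\n", val);
		fclose(f);
	}

	int main(void)
	{
		/* Hypothetical instance; substitute a real <dev-name>/mem_repairX. */
		const char *dir = "/sys/bus/edac/devices/cxl_mem0/mem_repair0";
		char path[256];

		/* DPA and nibble mask would come from a related error record. */
		snprintf(path, sizeof(path), "%s/dpa", dir);
		sysfs_write(path, "0x8a0000");
		snprintf(path, sizeof(path), "%s/nibble_mask", dir);
		sysfs_write(path, "0x2");
		snprintf(path, sizeof(path), "%s/persist_mode", dir);
		sysfs_write(path, "0");		/* soft (temporary) repair */

		snprintf(path, sizeof(path), "%s/repair", dir);
		sysfs_write(path, "1");		/* issue the repair operation */
		return 0;
	}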
69  Documentation/ABI/testing/sysfs-edac-scrub  Normal file
@@ -0,0 +1,69 @@
What:		/sys/bus/edac/devices/<dev-name>/scrubX
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		The sysfs EDAC bus devices /<dev-name>/scrubX subdirectory
		belongs to an instance of the memory scrub control feature,
		where the <dev-name> directory corresponds to a device or
		memory region registered with the EDAC device driver for
		the scrub control feature.

		The sysfs scrub attr nodes are only present if the parent
		driver has implemented the corresponding attr callback
		function and provided the necessary operations to the EDAC
		device driver during registration.

What:		/sys/bus/edac/devices/<dev-name>/scrubX/addr
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RW) The base address of the memory region to be scrubbed
		for on-demand scrubbing. Setting the address starts the
		scrubbing; the size must be set before that.

		The readback addr value is non-zero if the requested
		on-demand scrubbing is in progress, zero otherwise.

What:		/sys/bus/edac/devices/<dev-name>/scrubX/size
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RW) The size of the memory region to be scrubbed
		(on-demand scrubbing).

What:		/sys/bus/edac/devices/<dev-name>/scrubX/enable_background
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RW) Start/Stop background (patrol) scrubbing if supported.

What:		/sys/bus/edac/devices/<dev-name>/scrubX/min_cycle_duration
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RO) The minimum scrub cycle duration in seconds supported
		by the memory scrubber.

What:		/sys/bus/edac/devices/<dev-name>/scrubX/max_cycle_duration
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RO) The maximum scrub cycle duration in seconds supported
		by the memory scrubber.

What:		/sys/bus/edac/devices/<dev-name>/scrubX/current_cycle_duration
Date:		March 2025
KernelVersion:	6.15
Contact:	linux-edac@vger.kernel.org
Description:
		(RW) The current scrub cycle duration in seconds. It must
		be within the range supported by the memory scrubber.

		Scrubbing incurs an overhead while it is running; this can
		be reduced by selecting a longer cycle duration.
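A minimal userspace sketch of an on-demand scrub follows, assuming a hypothetical instance acpi_ras_mem0/scrub0 and placeholder address/size values; the numeric readback format of addr is an assumption::

	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	static void sysfs_write(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f) {
			perror(path);
			exit(1);
		}
		fprintf(f, "%s\n", val);
		fclose(f);
	}

	int main(void)
	{
		/* Hypothetical instance; substitute a real <dev-name>/scrubX. */
		const char *dir = "/sys/bus/edac/devices/acpi_ras_mem0/scrub0";
		char path[256];
		long long v;
		FILE *f;

		/* Size first, then address: writing addr kicks off the scrub. */
		snprintf(path, sizeof(path), "%s/size", dir);
		sysfs_write(path, "0x200000");
		snprintf(path, sizeof(path), "%s/addr", dir);
		sysfs_write(path, "0x150000000");

		/* addr reads back non-zero while the scrub is in flight. */
		snprintf(path, sizeof(path), "%s/addr", dir);
		do {
			sleep(1);
			v = 0;
			f = fopen(path, "r");
			if (f) {
				if (fscanf(f, "%lli", &v) != 1)
					v = 0;
				fclose(f);
			}
		} while (v != 0);
		return 0;
	}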
103  Documentation/edac/features.rst  Normal file
@@ -0,0 +1,103 @@
.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later

=================
EDAC/RAS features
=================

Copyright (c) 2024-2025 HiSilicon Limited.

:Author:   Shiju Jose <shiju.jose@huawei.com>
:License:  The GNU Free Documentation License, Version 1.2 without
           Invariant Sections, Front-Cover Texts nor Back-Cover Texts.
           (dual licensed under the GPL v2)

- Written for: 6.15

Introduction
------------

EDAC/RAS components plugging and high-level design:

1. Scrub control

2. Error Check Scrub (ECS) control

3. ACPI RAS2 features

4. Post Package Repair (PPR) control

5. Memory Sparing Repair control

The high-level design is illustrated in the following diagram::

  +--------------------------------------------------------------+
  | Userspace - Rasdaemon                                        |
  |                                                              |
  |  RAS CXL mem error handler ----+                             |
  |                                +--> RAS dynamic scrub and    |
  |  RAS memory error handler -----+    memory repair control    |
  +--------------------------------|-----------------------------+
                                   |
  +--------------------------------|-----------------------------+
  | Kernel EDAC extension for      | controlling RAS features   |
  |                                v                             |
  |  Sysfs EDAC bus (EDAC device, EDAC MC, EDAC sysfs):          |
  |     /sys/bus/edac/devices/<dev>/scrubX/                      |
  |     /sys/bus/edac/devices/<dev>/ecsX/                        |
  |     /sys/bus/edac/devices/<dev>/repairX                      |
  |                 ^                                            |
  |                 | EDAC bus                                   |
  |                 v                                            |
  |  EDAC scrub <---+                                            |
  |  EDAC ECS <-----+-- EDAC device driver for RAS feature       |
  |  EDAC mem <-----+   control (gets feature descriptors,       |
  |  repair             registers RAS features)                  |
  +--------|----------------|-------------------|----------------+
           |                |                   |
  +--------|----------------|-------------------|----------------+
  |  ACPI RAS2 driver   CXL mem driver    Client driver          |
  |                     (scrub, ECS,      (memory repair         |
  |                      sparing, PPR)     features)             |
  |                                                              |
  |                  Platform HW and Firmware                    |
  +--------------------------------------------------------------+

1. EDAC Features components - Create feature-specific descriptors. For
   example: scrub, ECS, memory repair in the above diagram.

2. EDAC device driver for controlling RAS Features - Gets a feature's
   attribute descriptors from the EDAC RAS feature component, registers the
   device's RAS features with the EDAC bus and exposes the feature control
   attributes via sysfs. For example,
   /sys/bus/edac/devices/<dev-name>/<feature>X/ (a registration sketch
   follows this list).

3. RAS dynamic feature controller - Userspace sample modules in rasdaemon
   for dynamic scrub/repair control, which issue scrubbing/repair when an
   excessive number of corrected memory errors is reported in a short span
   of time.
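As an illustration of step 2, a condensed sketch of how a parent driver might register its RAS features. The call shape follows the edac_dev_register() and struct edac_dev_feature fields visible in this series; the my_scrub_ops callbacks, the field values and the cxl_mem0 name are hypothetical::

	/* Sketch: register scrub + ECS features for a hypothetical device. */
	static int my_ras_probe(struct device *dev)
	{
		static char name[] = "cxl_mem0";	/* hypothetical name */
		struct edac_dev_feature features[] = {
			{
				.ft_type   = RAS_FEAT_SCRUB,
				.instance  = 0,
				.scrub_ops = &my_scrub_ops,	/* driver callbacks */
			},
			{
				.ft_type  = RAS_FEAT_ECS,
				.ecs_info = { .num_media_frus = 2 },
			},
		};

		/* Creates /sys/bus/edac/devices/cxl_mem0/scrub0/, ecs_fru0/ ... */
		return edac_dev_register(dev, name, NULL,
					 ARRAY_SIZE(features), features);
	}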
RAS features
------------

1. Memory Scrub

Memory scrub features are documented in `Documentation/edac/scrub.rst`.

2. Memory Repair

Memory repair features are documented in `Documentation/edac/memory_repair.rst`.
12  Documentation/edac/index.rst  Normal file
@@ -0,0 +1,12 @@
.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later

==============
EDAC Subsystem
==============

.. toctree::
   :maxdepth: 1

   features
   memory_repair
   scrub
121  Documentation/edac/memory_repair.rst  Normal file
@@ -0,0 +1,121 @@
.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later

==========================
EDAC Memory Repair Control
==========================

Copyright (c) 2024-2025 HiSilicon Limited.

:Author:   Shiju Jose <shiju.jose@huawei.com>
:License:  The GNU Free Documentation License, Version 1.2 without
           Invariant Sections, Front-Cover Texts nor Back-Cover Texts.
           (dual licensed under the GPL v2)
:Original Reviewers:

- Written for: 6.15

Introduction
------------

Some memory devices support repair operations to address issues in their
memory media. Post Package Repair (PPR) and memory sparing are examples of
such features.

Post Package Repair (PPR)
~~~~~~~~~~~~~~~~~~~~~~~~~

Post Package Repair is a maintenance operation which requests the memory
device to perform a repair operation on its media. It is a memory
self-healing feature that fixes a failing memory location by replacing it
with a spare row in a DRAM device.

For example, a CXL memory device with DRAM components that support PPR
features may implement PPR maintenance operations. DRAM components may
support two types of PPR function:

- hard PPR, for a permanent row repair, and
- soft PPR, for a temporary row repair.

Soft PPR is much faster than hard PPR, but the repair is lost after a power
cycle.

The data may not be retained and memory requests may not be correctly
processed during a repair operation. In such a case, the repair operation
should not be executed at runtime.

For example, for CXL memory devices, see CXL spec rev 3.1 [1]_ sections
8.2.9.7.1.1 PPR Maintenance Operations, 8.2.9.7.1.2 sPPR Maintenance
Operation and 8.2.9.7.1.3 hPPR Maintenance Operation for more details.

Memory Sparing
~~~~~~~~~~~~~~

Memory sparing is a repair function that replaces a portion of memory with
a portion of functional memory at a particular granularity. Memory
sparing has cacheline/row/bank/rank sparing granularities. For example, in
rank memory-sparing mode, one memory rank serves as a spare for other ranks
on the same channel in case they fail.

The spare rank is held in reserve and not used as active memory until
a failure is indicated, with the reserved capacity subtracted from the
total available memory in the system.

After an error threshold is surpassed in a system protected by memory
sparing, the content of a failing rank of DIMMs is copied to the spare
rank. The failing rank is then taken offline and the spare rank placed
online for use as active memory in place of the failed rank.

For example, CXL memory devices can support various sparing subclasses
that vary in terms of the scope of the sparing being performed.

The cacheline sparing subclass refers to a sparing action that can replace
a full cacheline. Row sparing is provided as an alternative to PPR sparing
functions and its scope is that of a single DDR row. Bank sparing allows an
entire bank to be replaced. Rank sparing is defined as an operation in
which an entire DDR rank is replaced.

See CXL spec 3.1 [1]_ section 8.2.9.7.1.4 Memory Sparing Maintenance
Operations for more details.

.. [1] https://computeexpresslink.org/cxl-specification/

Use cases of generic memory repair features control
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

1. The soft PPR, hard PPR and memory-sparing features share similar control
   attributes. Therefore, there is a need for a standardized, generic sysfs
   repair control that is exposed to userspace and used by administrators,
   scripts and tools.

2. When a CXL device detects an error in a memory component, it informs the
   host of the need for a repair maintenance operation by using an event
   record where the "maintenance needed" flag is set. The event record
   specifies the device physical address (DPA) and attributes of the memory
   that requires repair. The kernel reports the corresponding CXL general
   media or DRAM trace event to userspace, and userspace tools (e.g.
   rasdaemon) initiate a repair maintenance operation in response to the
   device request using the sysfs repair control.

3. Userspace tools, such as rasdaemon, request a repair operation on a
   memory region when the "maintenance needed" flag is set, when an
   uncorrected memory error is reported, when the number of corrected
   memory errors exceeds a threshold, or when the "exceed corrected errors
   threshold" flag is set for that memory. A sketch of this flow follows
   the list below.

4. Multiple PPR/sparing instances may be present per memory device.

5. Drivers should enforce that live repair is safe. In systems where memory
   mapping functions can change between boots, one approach to this is to
   log memory errors seen on this boot against which to check live memory
   repair requests.
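A minimal userspace sketch of that flow, gating a live repair on
repair_safe_when_in_use before issuing it. The instance name
cxl_mem0/mem_repair0 is a placeholder and a numeric readback format is
assumed; the attribute names are from the sysfs ABI above::

	#include <stdio.h>

	int main(void)
	{
		const char *dir = "/sys/bus/edac/devices/cxl_mem0/mem_repair0";
		char path[256];
		int safe = 0;
		FILE *f;

		snprintf(path, sizeof(path), "%s/repair_safe_when_in_use", dir);
		f = fopen(path, "r");
		if (f) {
			if (fscanf(f, "%d", &safe) != 1)
				safe = 0;
			fclose(f);
		}
		if (!safe) {
			/* Memory must be taken offline before repairing. */
			fprintf(stderr, "live repair unsafe, not issuing\n");
			return 1;
		}
		snprintf(path, sizeof(path), "%s/repair", dir);
		f = fopen(path, "w");
		if (!f)
			return 1;
		fprintf(f, "1\n");	/* issue the repair operation */
		fclose(f);
		return 0;
	}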
The File System
---------------

The control attributes of a registered memory repair instance can be
accessed in the /sys/bus/edac/devices/<dev-name>/mem_repairX/ directory.

sysfs
-----

Sysfs files are documented in
`Documentation/ABI/testing/sysfs-edac-memory-repair`.
266  Documentation/edac/scrub.rst  Normal file
@@ -0,0 +1,266 @@
.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later

=============
Scrub Control
=============

Copyright (c) 2024-2025 HiSilicon Limited.

:Author:   Shiju Jose <shiju.jose@huawei.com>
:License:  The GNU Free Documentation License, Version 1.2 without
           Invariant Sections, Front-Cover Texts nor Back-Cover Texts.
           (dual licensed under the GPL v2)

- Written for: 6.15

Introduction
------------

Increasing DRAM size and cost have made memory subsystem reliability an
important concern. DRAM modules are used where potentially corrupted data
could cause expensive or fatal issues. Memory errors are among the top
hardware failures that cause server and workload crashes.

Memory scrubbing is a feature where an ECC (Error-Correcting Code) engine
reads data from each memory media location, corrects it if necessary and
writes the corrected data back to the same memory media location.

DIMMs can be scrubbed at a configurable rate to detect uncorrected memory
errors and attempt recovery from detected errors, providing the following
benefits:

1. Proactively scrubbing DIMMs reduces the chance of a correctable error
   becoming uncorrectable.

2. When detected, uncorrected errors caught in unallocated memory pages are
   isolated and prevented from being allocated to an application or the OS.

3. This reduces the likelihood of software or hardware products
   encountering memory errors.

4. The additional data on failures in memory may be used to build up
   statistics that are later used to decide whether to use memory repair
   technologies such as Post Package Repair or Sparing.

There are two types of memory scrubbing:

1. Background (patrol) scrubbing while the DRAM is otherwise idle.

2. On-demand scrubbing for a specific address range or region of memory.

Several types of interfaces to hardware memory scrubbers have been
identified, such as CXL memory device patrol scrub, CXL DDR5 ECS, ACPI
RAS2 memory scrubbing, and ACPI NVDIMM ARS (Address Range Scrub).

The control mechanisms vary across different memory scrubbers. To enable
standardized userspace tooling, there is a need to present these controls
through a standardized ABI.

A generic memory EDAC scrub control allows users to manage the underlying
scrubbers in the system through a standardized sysfs control interface. It
abstracts the management of various scrubbing functionalities into a
unified set of functions.

Use cases of common scrub control feature
-----------------------------------------

1. Several types of interfaces for hardware memory scrubbers have been
   identified, including the CXL memory device patrol scrub, CXL DDR5 ECS,
   ACPI RAS2 memory scrubbing features, ACPI NVDIMM ARS (Address Range
   Scrub), and software-based memory scrubbers.

   Of the identified interfaces to hardware memory scrubbers, some support
   control over patrol (background) scrubbing (e.g., ACPI RAS2, CXL) and/or
   on-demand scrubbing (e.g., ACPI RAS2, ACPI ARS). However, the scrub
   control interfaces vary between memory scrubbers, highlighting the need
   for a standardized, generic sysfs scrub control interface that is
   accessible to userspace for administration and use by scripts/tools.

2. User-space scrub controls allow users to disable scrubbing if necessary,
   for example, to disable background patrol scrubbing or adjust the scrub
   rate for performance-aware operations where background activities need
   to be minimized or disabled.

3. User-space tools enable on-demand scrubbing for specific address ranges,
   provided that the scrubber supports this functionality.

4. User-space tools can also control memory DIMM scrubbing at a
   configurable scrub rate via sysfs scrub controls. This approach offers
   several benefits:

   4.1. Detects uncorrectable memory errors early, before user access to
        affected memory, helping facilitate recovery.

   4.2. Reduces the likelihood of correctable errors developing into
        uncorrectable errors.

5. Policy control for hotplugged memory is necessary because there may not
   be a system-wide BIOS or similar control to manage scrub settings for a
   CXL device added after boot. Determining these settings is a policy
   decision, balancing reliability against performance, so userspace should
   control it. Therefore, a unified interface is recommended for handling
   this function in a way that aligns with other similar interfaces, rather
   than creating a separate one.

Scrubbing features
------------------

CXL Memory Scrubbing features
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

CXL spec r3.1 [1]_ section 8.2.9.9.11.1 describes the memory device patrol
scrub control feature. The device patrol scrub proactively locates and
corrects errors on a regular cycle. The patrol scrub control allows
userspace to request changes to the CXL patrol scrubber's configuration.

The patrol scrub control allows the requester to specify the number of
hours in which the patrol scrub cycles must be completed, provided that
the requested scrub rate is within the range supported by the device. In
the CXL driver, the number of seconds per scrub cycle, which the user
requests via sysfs, is rescaled to hours per scrub cycle.
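A sketch of the kind of rescaling involved; the rounding policy shown is an
assumption, and the actual CXL driver code is not part of this diff::

	/* Convert a requested cycle duration in seconds to whole hours. */
	static unsigned int scrub_secs_to_hours(unsigned long secs)
	{
		/* Round up so the requested scrub rate is never exceeded. */
		return (secs + 3600 - 1) / 3600;
	}

	/* e.g. a 43200 s (12 h) sysfs request becomes 12 h for the device. */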
In addition, the controls allow the host to disable the feature in case it
interferes with performance-aware operations which require the background
operations to be turned off.

Error Check Scrub (ECS)
~~~~~~~~~~~~~~~~~~~~~~~

CXL spec r3.1 [1]_ section 8.2.9.9.11.2 describes Error Check Scrub (ECS)
- a feature defined in the JEDEC DDR5 SDRAM Specification (JESD79-5) and
allowing DRAM to internally read, correct single-bit errors, and write back
corrected data bits to the DRAM array while providing transparency to error
counts.

A DDR5 device contains a number of memory media Field Replaceable Units
(FRUs). The DDR5 ECS feature, and thus the ECS control driver, supports
configuring the ECS parameters per FRU.

ACPI RAS2 Hardware-based Memory Scrubbing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

ACPI spec 6.5 [2]_ section 5.2.21 ACPI RAS2 describes an ACPI RAS2 table
which provides interfaces for platform RAS features and supports
independent RAS controls and capabilities for a given RAS feature for
multiple instances of the same component in a given system.

Memory RAS features apply to RAS capabilities, controls and operations
that are specific to memory. RAS2 PCC sub-spaces for memory-specific RAS
features have a Feature Type of 0x00 (Memory).

The platform can use the hardware-based memory scrubbing feature to expose
controls and capabilities associated with hardware-based memory scrub
engines. As per the spec, the RAS2 memory scrubbing feature supports:

1. Independent memory scrubbing controls for each NUMA domain, identified
   using its proximity domain.

2. Provision for background (patrol) scrubbing of the entire memory system,
   as well as on-demand scrubbing for a specific region of memory.

ACPI Address Range Scrubbing (ARS)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

ACPI spec 6.5 [2]_ section 9.19.7.2 describes Address Range Scrubbing
(ARS). ARS allows the platform to communicate memory errors to system
software. This capability allows system software to prevent accesses to
addresses with uncorrectable errors in memory. ARS functions manage all
NVDIMMs present in the system. Only one scrub can be in progress
system-wide at any given time.

The following functions are supported as per the specification:

1. Query ARS Capabilities for a given address range, which indicates
   whether the platform supports the ACPI NVDIMM Root Device Unconsumed
   Error Notification.

2. Start ARS triggers an Address Range Scrub for the given memory range.
   Address scrubbing can be done for volatile or persistent memory, or
   both.

3. Query ARS Status command allows software to get the status of ARS,
   including the progress of ARS and the ARS error record.

4. Clear Uncorrectable Error.

5. Translate SPA.

6. ARS Error Inject etc.

The kernel supports an existing control for ARS, and ARS is currently not
supported in EDAC.

.. [1] https://computeexpresslink.org/cxl-specification/

.. [2] https://uefi.org/specs/ACPI/6.5/

Comparison of various scrubbing features
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+--------------+-----------+-----------+-----------+-----------+
|              | ACPI      | CXL patrol| CXL ECS   | ARS       |
| Name         | RAS2      | scrub     |           |           |
+--------------+-----------+-----------+-----------+-----------+
|              |           |           |           |           |
| On-demand    | Supported | No        | No        | Supported |
| Scrubbing    |           |           |           |           |
|              |           |           |           |           |
+--------------+-----------+-----------+-----------+-----------+
|              |           |           |           |           |
| Background   | Supported | Supported | Supported | No        |
| scrubbing    |           |           |           |           |
|              |           |           |           |           |
+--------------+-----------+-----------+-----------+-----------+
|              |           |           |           |           |
| Mode of      | Scrub ctrl| per device| per memory| Unknown   |
| scrubbing    | per NUMA  |           | media     |           |
|              | domain.   |           |           |           |
+--------------+-----------+-----------+-----------+-----------+
|              |           |           |           |           |
| Query scrub  | Supported | Supported | Supported | Supported |
| capabilities |           |           |           |           |
|              |           |           |           |           |
+--------------+-----------+-----------+-----------+-----------+
|              |           |           |           |           |
| Setting      | Supported | No        | No        | Supported |
| address range|           |           |           |           |
|              |           |           |           |           |
+--------------+-----------+-----------+-----------+-----------+
|              |           |           |           |           |
| Setting      | Supported | Supported | No        | No        |
| scrub rate   |           |           |           |           |
|              |           |           |           |           |
+--------------+-----------+-----------+-----------+-----------+
|              |           |           |           |           |
| Unit for     | Not       | in hours  | No        | No        |
| scrub rate   | Defined   |           |           |           |
|              |           |           |           |           |
+--------------+-----------+-----------+-----------+-----------+
|              | Supported |           |           |           |
| Scrub        | on-demand | No        | No        | Supported |
| status/      | scrubbing |           |           |           |
| Completion   | only      |           |           |           |
+--------------+-----------+-----------+-----------+-----------+
| UC error     |           |CXL general|CXL general| ACPI UCE  |
| reporting    | Exception |media/DRAM |media/DRAM | notify and|
|              |           |event/media|event/media| query     |
|              |           |scan?      |scan?      | ARS status|
+--------------+-----------+-----------+-----------+-----------+
|              |           |           |           |           |
| Support for  | Supported | Supported | Supported | No        |
| EDAC control |           |           |           |           |
|              |           |           |           |           |
+--------------+-----------+-----------+-----------+-----------+

The File System
---------------

The control attributes of a registered scrubber instance can be
accessed in:

/sys/bus/edac/devices/<dev-name>/scrubX/

sysfs
-----

Sysfs files are documented in
`Documentation/ABI/testing/sysfs-edac-scrub`

`Documentation/ABI/testing/sysfs-edac-ecs`
@@ -8236,6 +8236,7 @@ F:	drivers/edac/aspeed_edac.c

 EDAC-BLUEFIELD
 M:	Shravan Kumar Ramani <shravankr@nvidia.com>
+M:	David Thompson <davthompson@nvidia.com>
 S:	Supported
 F:	drivers/edac/bluefield_edac.c
@@ -75,6 +75,34 @@ config EDAC_GHES

 	  In doubt, say 'Y'.

+config EDAC_SCRUB
+	bool "EDAC scrub feature"
+	help
+	  The EDAC scrub feature is optional and is designed to control the
+	  memory scrubbers in the system. The common sysfs scrub interface
+	  abstracts the control of various arbitrary scrubbing functionalities
+	  into a unified set of functions.
+	  Say 'y/n' to enable/disable EDAC scrub feature.
+
+config EDAC_ECS
+	bool "EDAC ECS (Error Check Scrub) feature"
+	help
+	  The EDAC ECS feature is optional and is designed to control on-die
+	  error check scrub (e.g., DDR5 ECS) in the system. The common sysfs
+	  ECS interface abstracts the control of various ECS functionalities
+	  into a unified set of functions.
+	  Say 'y/n' to enable/disable EDAC ECS feature.
+
+config EDAC_MEM_REPAIR
+	bool "EDAC memory repair feature"
+	help
+	  The EDAC memory repair feature is optional and is designed to control
+	  the memory devices with repair features, such as Post Package Repair
+	  (PPR), memory sparing etc. The common sysfs memory repair interface
+	  abstracts the control of various memory repair functionalities into
+	  a unified set of functions.
+	  Say 'y/n' to enable/disable EDAC memory repair feature.
+
 config EDAC_AMD64
 	tristate "AMD64 (Opteron, Athlon64)"
 	depends on AMD_NB && EDAC_DECODE_MCE
@@ -168,7 +196,7 @@ config EDAC_I3200

 config EDAC_IE31200
 	tristate "Intel e312xx"
-	depends on PCI && X86
+	depends on PCI && X86 && X86_MCE_INTEL
 	help
 	  Support for error detection and correction on the Intel
 	  E3-1200 based DRAM controllers.
@@ -12,6 +12,9 @@ edac_core-y := edac_mc.o edac_device.o edac_mc_sysfs.o
 edac_core-y += edac_module.o edac_device_sysfs.o wq.o

 edac_core-$(CONFIG_EDAC_DEBUG) += debugfs.o
+edac_core-$(CONFIG_EDAC_SCRUB) += scrub.o
+edac_core-$(CONFIG_EDAC_ECS) += ecs.o
+edac_core-$(CONFIG_EDAC_MEM_REPAIR) += mem_repair.o

 ifdef CONFIG_PCI
 edac_core-y += edac_pci.o edac_pci_sysfs.o
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <linux/ras.h>
+#include <linux/string_choices.h>
 #include "amd64_edac.h"
 #include <asm/amd_nb.h>
 #include <asm/amd_node.h>
@@ -1171,22 +1172,21 @@ static void debug_dump_dramcfg_low(struct amd64_pvt *pvt, u32 dclr, int chan)
 			edac_dbg(1, " LRDIMM %dx rank multiply\n", (dcsm & 0x3));
 	}

-	edac_dbg(1, "All DIMMs support ECC:%s\n",
-		 (dclr & BIT(19)) ? "yes" : "no");
+	edac_dbg(1, "All DIMMs support ECC: %s\n", str_yes_no(dclr & BIT(19)));

 	edac_dbg(1, "  PAR/ERR parity: %s\n",
-		 (dclr & BIT(8)) ? "enabled" : "disabled");
+		 str_enabled_disabled(dclr & BIT(8)));

 	if (pvt->fam == 0x10)
 		edac_dbg(1, "  DCT 128bit mode width: %s\n",
 			 (dclr & BIT(11)) ? "128b" : "64b");

 	edac_dbg(1, "  x4 logical DIMMs present: L0: %s L1: %s L2: %s L3: %s\n",
-		 (dclr & BIT(12)) ? "yes" : "no",
-		 (dclr & BIT(13)) ? "yes" : "no",
-		 (dclr & BIT(14)) ? "yes" : "no",
-		 (dclr & BIT(15)) ? "yes" : "no");
+		 str_yes_no(dclr & BIT(12)),
+		 str_yes_no(dclr & BIT(13)),
+		 str_yes_no(dclr & BIT(14)),
+		 str_yes_no(dclr & BIT(15)));
 }

 #define CS_EVEN_PRIMARY		BIT(0)
@@ -1353,14 +1353,14 @@ static void umc_dump_misc_regs(struct amd64_pvt *pvt)
 		edac_dbg(1, "UMC%d UMC cap high: 0x%x\n", i, umc->umc_cap_hi);

 		edac_dbg(1, "UMC%d ECC capable: %s, ChipKill ECC capable: %s\n",
-			 i, (umc->umc_cap_hi & BIT(30)) ? "yes" : "no",
-			 (umc->umc_cap_hi & BIT(31)) ? "yes" : "no");
+			 i, str_yes_no(umc->umc_cap_hi & BIT(30)),
+			 str_yes_no(umc->umc_cap_hi & BIT(31)));

 		edac_dbg(1, "UMC%d All DIMMs support ECC: %s\n",
-			 i, (umc->umc_cfg & BIT(12)) ? "yes" : "no");
+			 i, str_yes_no(umc->umc_cfg & BIT(12)));

 		edac_dbg(1, "UMC%d x4 DIMMs present: %s\n",
-			 i, (umc->dimm_cfg & BIT(6)) ? "yes" : "no");
+			 i, str_yes_no(umc->dimm_cfg & BIT(6)));

 		edac_dbg(1, "UMC%d x16 DIMMs present: %s\n",
-			 i, (umc->dimm_cfg & BIT(7)) ? "yes" : "no");
+			 i, str_yes_no(umc->dimm_cfg & BIT(7)));

 		umc_debug_display_dimm_sizes(pvt, i);
 	}
@@ -1371,11 +1371,11 @@ static void dct_dump_misc_regs(struct amd64_pvt *pvt)
 	edac_dbg(1, "F3xE8 (NB Cap): 0x%08x\n", pvt->nbcap);

 	edac_dbg(1, "  NB two channel DRAM capable: %s\n",
-		 (pvt->nbcap & NBCAP_DCT_DUAL) ? "yes" : "no");
+		 str_yes_no(pvt->nbcap & NBCAP_DCT_DUAL));

 	edac_dbg(1, "  ECC capable: %s, ChipKill ECC capable: %s\n",
-		 (pvt->nbcap & NBCAP_SECDED) ? "yes" : "no",
-		 (pvt->nbcap & NBCAP_CHIPKILL) ? "yes" : "no");
+		 str_yes_no(pvt->nbcap & NBCAP_SECDED),
+		 str_yes_no(pvt->nbcap & NBCAP_CHIPKILL));

 	debug_dump_dramcfg_low(pvt, pvt->dclr0, 0);

@@ -1398,7 +1398,7 @@ static void dct_dump_misc_regs(struct amd64_pvt *pvt)
 	if (!dct_ganging_enabled(pvt))
 		debug_dump_dramcfg_low(pvt, pvt->dclr1, 1);

-	edac_dbg(1, "  DramHoleValid: %s\n", dhar_valid(pvt) ? "yes" : "no");
+	edac_dbg(1, "  DramHoleValid: %s\n", str_yes_no(dhar_valid(pvt)));

 	amd64_info("using x%u syndromes.\n", pvt->ecc_sym_sz);
 }
@@ -2027,15 +2027,15 @@ static void read_dram_ctl_register(struct amd64_pvt *pvt)

 		if (!dct_ganging_enabled(pvt))
 			edac_dbg(0, "  Address range split per DCT: %s\n",
-				 (dct_high_range_enabled(pvt) ? "yes" : "no"));
+				 str_yes_no(dct_high_range_enabled(pvt)));

 		edac_dbg(0, "  data interleave for ECC: %s, DRAM cleared since last warm reset: %s\n",
-			 (dct_data_intlv_enabled(pvt) ? "enabled" : "disabled"),
-			 (dct_memory_cleared(pvt) ? "yes" : "no"));
+			 str_enabled_disabled(dct_data_intlv_enabled(pvt)),
+			 str_yes_no(dct_memory_cleared(pvt)));

 		edac_dbg(0, "  channel interleave: %s, "
 			 "interleave bits selector: 0x%x\n",
-			 (dct_interleave_enabled(pvt) ? "enabled" : "disabled"),
+			 str_enabled_disabled(dct_interleave_enabled(pvt)),
 			 dct_sel_interleave_addr(pvt));
 	}

@@ -3208,8 +3208,7 @@ static bool nb_mce_bank_enabled_on_node(u16 nid)
 		nbe = reg->l & MSR_MCGCTL_NBE;

 		edac_dbg(0, "core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n",
-			 cpu, reg->q,
-			 (nbe ? "enabled" : "disabled"));
+			 cpu, reg->q, str_enabled_disabled(nbe));

 		if (!nbe)
 			goto out;
@@ -3353,12 +3352,9 @@ static bool dct_ecc_enabled(struct amd64_pvt *pvt)
 		edac_dbg(0, "NB MCE bank disabled, set MSR 0x%08x[4] on node %d to enable.\n",
 			 MSR_IA32_MCG_CTL, nid);

-	edac_dbg(3, "Node %d: DRAM ECC %s.\n", nid, (ecc_en ? "enabled" : "disabled"));
+	edac_dbg(3, "Node %d: DRAM ECC %s.\n", nid, str_enabled_disabled(ecc_en));

-	if (!ecc_en || !nb_mce_en)
-		return false;
-	else
-		return true;
+	return ecc_en && nb_mce_en;
 }

 static bool umc_ecc_enabled(struct amd64_pvt *pvt)
@@ -3378,7 +3374,7 @@ static bool umc_ecc_enabled(struct amd64_pvt *pvt)
 		}
 	}

-	edac_dbg(3, "Node %d: DRAM ECC %s.\n", pvt->mc_node_id, (ecc_en ? "enabled" : "disabled"));
+	edac_dbg(3, "Node %d: DRAM ECC %s.\n", pvt->mc_node_id, str_enabled_disabled(ecc_en));

 	return ecc_en;
 }
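The string choice helpers used in the conversions above come from <linux/string_choices.h>. A simplified sketch of their semantics, as relied on by this diff::

	/* Simplified equivalents of the <linux/string_choices.h> helpers. */
	static inline const char *str_yes_no(bool v)
	{
		return v ? "yes" : "no";
	}

	static inline const char *str_enabled_disabled(bool v)
	{
		return v ? "enabled" : "disabled";
	}

	/* str_plural(num): "" for exactly one item, "s" otherwise. */
	static inline const char *str_plural(size_t num)
	{
		return num == 1 ? "" : "s";
	}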
@@ -1,4 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/string_choices.h>
+
 #include "edac_module.h"

 static struct dentry *edac_debugfs;
@@ -22,7 +25,7 @@ static ssize_t edac_fake_inject_write(struct file *file,
 		 "Generating %d %s fake error%s to %d.%d.%d to test core handling. NOTE: this won't test the driver-specific decoding logic.\n",
 		 errcount,
 		 (type == HW_EVENT_ERR_UNCORRECTED) ? "UE" : "CE",
-		 errcount > 1 ? "s" : "",
+		 str_plural(errcount),
 		 mci->fake_inject_layer[0],
 		 mci->fake_inject_layer[1],
 		 mci->fake_inject_layer[2]
205  drivers/edac/ecs.c  Executable file
@@ -0,0 +1,205 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * The generic ECS driver is designed to support control of on-die error
 * check scrub (e.g., DDR5 ECS). The common sysfs ECS interface abstracts
 * the control of various ECS functionalities into a unified set of functions.
 *
 * Copyright (c) 2024-2025 HiSilicon Limited.
 */

#include <linux/edac.h>

#define EDAC_ECS_FRU_NAME	"ecs_fru"

enum edac_ecs_attributes {
	ECS_LOG_ENTRY_TYPE,
	ECS_MODE,
	ECS_RESET,
	ECS_THRESHOLD,
	ECS_MAX_ATTRS
};

struct edac_ecs_dev_attr {
	struct device_attribute dev_attr;
	int fru_id;
};

struct edac_ecs_fru_context {
	char name[EDAC_FEAT_NAME_LEN];
	struct edac_ecs_dev_attr dev_attr[ECS_MAX_ATTRS];
	struct attribute *ecs_attrs[ECS_MAX_ATTRS + 1];
	struct attribute_group group;
};

struct edac_ecs_context {
	u16 num_media_frus;
	struct edac_ecs_fru_context *fru_ctxs;
};

#define TO_ECS_DEV_ATTR(_dev_attr)	\
	container_of(_dev_attr, struct edac_ecs_dev_attr, dev_attr)

#define EDAC_ECS_ATTR_SHOW(attrib, cb, type, format)			\
static ssize_t attrib##_show(struct device *ras_feat_dev,		\
			     struct device_attribute *attr, char *buf)	\
{									\
	struct edac_ecs_dev_attr *dev_attr = TO_ECS_DEV_ATTR(attr);	\
	struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);	\
	const struct edac_ecs_ops *ops = ctx->ecs.ecs_ops;		\
	type data;							\
	int ret;							\
									\
	ret = ops->cb(ras_feat_dev->parent, ctx->ecs.private,		\
		      dev_attr->fru_id, &data);				\
	if (ret)							\
		return ret;						\
									\
	return sysfs_emit(buf, format, data);				\
}

EDAC_ECS_ATTR_SHOW(log_entry_type, get_log_entry_type, u32, "%u\n")
EDAC_ECS_ATTR_SHOW(mode, get_mode, u32, "%u\n")
EDAC_ECS_ATTR_SHOW(threshold, get_threshold, u32, "%u\n")
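/*
 * For reference, EDAC_ECS_ATTR_SHOW(mode, get_mode, u32, "%u\n") above
 * expands to a mode_show() sysfs callback that forwards to the parent
 * driver's ops->get_mode(parent, private, fru_id, &data) and, on success,
 * formats the returned u32 with sysfs_emit(buf, "%u\n", data).
 */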
|
||||
|
||||
#define EDAC_ECS_ATTR_STORE(attrib, cb, type, conv_func) \
|
||||
static ssize_t attrib##_store(struct device *ras_feat_dev, \
|
||||
struct device_attribute *attr, \
|
||||
const char *buf, size_t len) \
|
||||
{ \
|
||||
struct edac_ecs_dev_attr *dev_attr = TO_ECS_DEV_ATTR(attr); \
|
||||
struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \
|
||||
const struct edac_ecs_ops *ops = ctx->ecs.ecs_ops; \
|
||||
type data; \
|
||||
int ret; \
|
||||
\
|
||||
ret = conv_func(buf, 0, &data); \
|
||||
if (ret < 0) \
|
||||
return ret; \
|
||||
\
|
||||
ret = ops->cb(ras_feat_dev->parent, ctx->ecs.private, \
|
||||
dev_attr->fru_id, data); \
|
||||
if (ret) \
|
||||
return ret; \
|
||||
\
|
||||
return len; \
|
||||
}
|
||||
|
||||
EDAC_ECS_ATTR_STORE(log_entry_type, set_log_entry_type, unsigned long, kstrtoul)
EDAC_ECS_ATTR_STORE(mode, set_mode, unsigned long, kstrtoul)
EDAC_ECS_ATTR_STORE(reset, reset, unsigned long, kstrtoul)
EDAC_ECS_ATTR_STORE(threshold, set_threshold, unsigned long, kstrtoul)

static umode_t ecs_attr_visible(struct kobject *kobj, struct attribute *a, int attr_id)
{
	struct device *ras_feat_dev = kobj_to_dev(kobj);
	struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
	const struct edac_ecs_ops *ops = ctx->ecs.ecs_ops;

	switch (attr_id) {
	case ECS_LOG_ENTRY_TYPE:
		if (ops->get_log_entry_type) {
			if (ops->set_log_entry_type)
				return a->mode;
			else
				return 0444;
		}
		break;
	case ECS_MODE:
		if (ops->get_mode) {
			if (ops->set_mode)
				return a->mode;
			else
				return 0444;
		}
		break;
	case ECS_RESET:
		if (ops->reset)
			return a->mode;
		break;
	case ECS_THRESHOLD:
		if (ops->get_threshold) {
			if (ops->set_threshold)
				return a->mode;
			else
				return 0444;
		}
		break;
	default:
		break;
	}

	return 0;
}

#define EDAC_ECS_ATTR_RO(_name, _fru_id)	\
	((struct edac_ecs_dev_attr) { .dev_attr = __ATTR_RO(_name), \
				      .fru_id = _fru_id })

#define EDAC_ECS_ATTR_WO(_name, _fru_id)	\
	((struct edac_ecs_dev_attr) { .dev_attr = __ATTR_WO(_name), \
				      .fru_id = _fru_id })

#define EDAC_ECS_ATTR_RW(_name, _fru_id)	\
	((struct edac_ecs_dev_attr) { .dev_attr = __ATTR_RW(_name), \
				      .fru_id = _fru_id })

static int ecs_create_desc(struct device *ecs_dev, const struct attribute_group **attr_groups,
			   u16 num_media_frus)
{
	struct edac_ecs_context *ecs_ctx;
	u32 fru;

	ecs_ctx = devm_kzalloc(ecs_dev, sizeof(*ecs_ctx), GFP_KERNEL);
	if (!ecs_ctx)
		return -ENOMEM;

	ecs_ctx->num_media_frus = num_media_frus;
	ecs_ctx->fru_ctxs = devm_kcalloc(ecs_dev, num_media_frus,
					 sizeof(*ecs_ctx->fru_ctxs),
					 GFP_KERNEL);
	if (!ecs_ctx->fru_ctxs)
		return -ENOMEM;

	for (fru = 0; fru < num_media_frus; fru++) {
		struct edac_ecs_fru_context *fru_ctx = &ecs_ctx->fru_ctxs[fru];
		struct attribute_group *group = &fru_ctx->group;
		int i;

		fru_ctx->dev_attr[ECS_LOG_ENTRY_TYPE] = EDAC_ECS_ATTR_RW(log_entry_type, fru);
		fru_ctx->dev_attr[ECS_MODE] = EDAC_ECS_ATTR_RW(mode, fru);
		fru_ctx->dev_attr[ECS_RESET] = EDAC_ECS_ATTR_WO(reset, fru);
		fru_ctx->dev_attr[ECS_THRESHOLD] = EDAC_ECS_ATTR_RW(threshold, fru);

		for (i = 0; i < ECS_MAX_ATTRS; i++)
			fru_ctx->ecs_attrs[i] = &fru_ctx->dev_attr[i].dev_attr.attr;

		sprintf(fru_ctx->name, "%s%d", EDAC_ECS_FRU_NAME, fru);
		group->name = fru_ctx->name;
		group->attrs = fru_ctx->ecs_attrs;
		group->is_visible = ecs_attr_visible;

		attr_groups[fru] = group;
	}

	return 0;
}

/**
 * edac_ecs_get_desc - get EDAC ECS descriptors
 * @ecs_dev: client device, supports ECS feature
 * @attr_groups: pointer to attribute group container
 * @num_media_frus: number of media FRUs in the device
 *
 * Return:
 * * %0	- Success.
 * * %-EINVAL	- Invalid parameters passed.
 * * %-ENOMEM	- Dynamic memory allocation failed.
 */
int edac_ecs_get_desc(struct device *ecs_dev,
		      const struct attribute_group **attr_groups, u16 num_media_frus)
{
	if (!ecs_dev || !attr_groups || !num_media_frus)
		return -EINVAL;

	return ecs_create_desc(ecs_dev, attr_groups, num_media_frus);
}
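
As a usage sketch (not part of this diff): an ECS-capable parent driver supplies an edac_ecs_ops instance and lets ecs_attr_visible() prune whatever callbacks it leaves NULL. The my_ecs_* names are hypothetical and the callback prototypes are inferred from the macros above; the authoritative definitions live in <linux/edac.h>.

/*
 * Hedged sketch: an ECS backend whose threshold is read/write but whose
 * mode is read-only.  Leaving .set_mode NULL makes ecs_attr_visible()
 * expose "mode" as 0444 in sysfs; leaving both getter and setter NULL
 * hides the attribute entirely.
 */
static int my_ecs_get_mode(struct device *dev, void *drv_data,
			   int fru_id, u32 *val)
{
	*val = 0;	/* query the per-FRU ECS mode from the device */
	return 0;
}

static int my_ecs_get_threshold(struct device *dev, void *drv_data,
				int fru_id, u32 *val)
{
	*val = 0;	/* read the per-FRU error threshold */
	return 0;
}

static int my_ecs_set_threshold(struct device *dev, void *drv_data,
				int fru_id, u32 val)
{
	return 0;	/* program the per-FRU error threshold */
}

static const struct edac_ecs_ops my_ecs_ops = {
	.get_mode	= my_ecs_get_mode,	/* no .set_mode: "mode" is 0444 */
	.get_threshold	= my_ecs_get_threshold,
	.set_threshold	= my_ecs_set_threshold,
};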

drivers/edac/edac_device.c
@@ -570,3 +570,188 @@ void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
		   block ? block->name : "N/A", count, msg);
}
EXPORT_SYMBOL_GPL(edac_device_handle_ue_count);

static void edac_dev_release(struct device *dev)
{
	struct edac_dev_feat_ctx *ctx = container_of(dev, struct edac_dev_feat_ctx, dev);

	kfree(ctx->mem_repair);
	kfree(ctx->scrub);
	kfree(ctx->dev.groups);
	kfree(ctx);
}

static const struct device_type edac_dev_type = {
	.name = "edac_dev",
	.release = edac_dev_release,
};

static void edac_dev_unreg(void *data)
{
	device_unregister(data);
}

/**
 * edac_dev_register - register device for RAS features with EDAC
 * @parent: parent device.
 * @name: name of the folder under /sys/bus/edac/devices/,
 *	derived from the parent device,
 *	e.g. /sys/bus/edac/devices/cxl_mem0/
 * @private: parent driver's data to store in the context if any.
 * @num_features: number of RAS features to register.
 * @ras_features: list of RAS features to register.
 *
 * Return:
 * * %0	- Success.
 * * %-EINVAL	- Invalid parameters passed.
 * * %-ENOMEM	- Dynamic memory allocation failed.
 *
 */
int edac_dev_register(struct device *parent, char *name,
		      void *private, int num_features,
		      const struct edac_dev_feature *ras_features)
{
	const struct attribute_group **ras_attr_groups;
	struct edac_dev_data *dev_data;
	struct edac_dev_feat_ctx *ctx;
	int mem_repair_cnt = 0;
	int attr_gcnt = 0;
	int ret = -ENOMEM;
	int scrub_cnt = 0;
	int feat;

	if (!parent || !name || !num_features || !ras_features)
		return -EINVAL;

	/* Double parse to make space for attributes */
	for (feat = 0; feat < num_features; feat++) {
		switch (ras_features[feat].ft_type) {
		case RAS_FEAT_SCRUB:
			attr_gcnt++;
			scrub_cnt++;
			break;
		case RAS_FEAT_ECS:
			attr_gcnt += ras_features[feat].ecs_info.num_media_frus;
			break;
		case RAS_FEAT_MEM_REPAIR:
			attr_gcnt++;
			mem_repair_cnt++;
			break;
		default:
			return -EINVAL;
		}
	}

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), GFP_KERNEL);
	if (!ras_attr_groups)
		goto ctx_free;

	if (scrub_cnt) {
		ctx->scrub = kcalloc(scrub_cnt, sizeof(*ctx->scrub), GFP_KERNEL);
		if (!ctx->scrub)
			goto groups_free;
	}

	if (mem_repair_cnt) {
		ctx->mem_repair = kcalloc(mem_repair_cnt, sizeof(*ctx->mem_repair), GFP_KERNEL);
		if (!ctx->mem_repair)
			goto data_mem_free;
	}

	attr_gcnt = 0;
	scrub_cnt = 0;
	mem_repair_cnt = 0;
	for (feat = 0; feat < num_features; feat++, ras_features++) {
		switch (ras_features->ft_type) {
		case RAS_FEAT_SCRUB:
			if (!ras_features->scrub_ops || scrub_cnt != ras_features->instance) {
				ret = -EINVAL;
				goto data_mem_free;
			}

			dev_data = &ctx->scrub[scrub_cnt];
			dev_data->instance = scrub_cnt;
			dev_data->scrub_ops = ras_features->scrub_ops;
			dev_data->private = ras_features->ctx;
			ret = edac_scrub_get_desc(parent, &ras_attr_groups[attr_gcnt],
						  ras_features->instance);
			if (ret)
				goto data_mem_free;

			scrub_cnt++;
			attr_gcnt++;
			break;
		case RAS_FEAT_ECS:
			if (!ras_features->ecs_ops) {
				ret = -EINVAL;
				goto data_mem_free;
			}

			dev_data = &ctx->ecs;
			dev_data->ecs_ops = ras_features->ecs_ops;
			dev_data->private = ras_features->ctx;
			ret = edac_ecs_get_desc(parent, &ras_attr_groups[attr_gcnt],
						ras_features->ecs_info.num_media_frus);
			if (ret)
				goto data_mem_free;

			attr_gcnt += ras_features->ecs_info.num_media_frus;
			break;
		case RAS_FEAT_MEM_REPAIR:
			if (!ras_features->mem_repair_ops ||
			    mem_repair_cnt != ras_features->instance) {
				ret = -EINVAL;
				goto data_mem_free;
			}

			dev_data = &ctx->mem_repair[mem_repair_cnt];
			dev_data->instance = mem_repair_cnt;
			dev_data->mem_repair_ops = ras_features->mem_repair_ops;
			dev_data->private = ras_features->ctx;
			ret = edac_mem_repair_get_desc(parent, &ras_attr_groups[attr_gcnt],
						       ras_features->instance);
			if (ret)
				goto data_mem_free;

			mem_repair_cnt++;
			attr_gcnt++;
			break;
		default:
			ret = -EINVAL;
			goto data_mem_free;
		}
	}

	ctx->dev.parent = parent;
	ctx->dev.bus = edac_get_sysfs_subsys();
	ctx->dev.type = &edac_dev_type;
	ctx->dev.groups = ras_attr_groups;
	ctx->private = private;
	dev_set_drvdata(&ctx->dev, ctx);

	ret = dev_set_name(&ctx->dev, "%s", name);
	if (ret)
		goto data_mem_free;

	ret = device_register(&ctx->dev);
	if (ret) {
		put_device(&ctx->dev);
		return ret;
	}

	return devm_add_action_or_reset(parent, edac_dev_unreg, &ctx->dev);

data_mem_free:
	kfree(ctx->mem_repair);
	kfree(ctx->scrub);
groups_free:
	kfree(ras_attr_groups);
ctx_free:
	kfree(ctx);
	return ret;
}
EXPORT_SYMBOL_GPL(edac_dev_register);
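
A hedged sketch of the call from a parent driver's probe path (my_probe and the my_* scrub callbacks are illustrative, not from this series; the feature and ops structures are the ones this patch adds, with prototypes inferred from the scrub.c macros below):

static int my_get_enabled_bg(struct device *dev, void *drv_data, bool *enabled)
{
	*enabled = false;	/* query the device's background-scrub state */
	return 0;
}

static int my_set_enabled_bg(struct device *dev, void *drv_data, bool enable)
{
	return 0;		/* enable/disable background scrubbing */
}

/* Sketch: register one background-scrub instance for "cxl_mem0". */
static int my_probe(struct device *parent)
{
	static const struct edac_scrub_ops my_scrub_ops = {
		.get_enabled_bg = my_get_enabled_bg,
		.set_enabled_bg = my_set_enabled_bg,	/* setter name assumed */
	};
	struct edac_dev_feature feat = {
		.ft_type   = RAS_FEAT_SCRUB,
		.instance  = 0,		/* must match registration order */
		.scrub_ops = &my_scrub_ops,
		.ctx	   = NULL,	/* driver-private data, if any */
	};
	char name[] = "cxl_mem0";

	return edac_dev_register(parent, name, NULL, 1, &feat);
}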

drivers/edac/i10nm_base.c
@@ -751,6 +751,8 @@ static int i10nm_get_ddr_munits(void)
				continue;
			} else {
				d->imc[lmc].mdev = mdev;
				if (res_cfg->type == SPR)
					skx_set_mc_mapping(d, i, lmc);
				lmc++;
			}
		}
drivers/edac/i5400_edac.c
@@ -31,6 +31,7 @@
#include <linux/slab.h>
#include <linux/edac.h>
#include <linux/mmzone.h>
#include <linux/string_choices.h>

#include "edac_module.h"

@@ -899,7 +900,7 @@ static void decode_mtr(int slot_row, u16 mtr)
	edac_dbg(2, "\t\tWIDTH: x%d\n", MTR_DRAM_WIDTH(mtr));

	edac_dbg(2, "\t\tELECTRICAL THROTTLING is %s\n",
		 MTR_DIMMS_ETHROTTLE(mtr) ? "enabled" : "disabled");
		 str_enabled_disabled(MTR_DIMMS_ETHROTTLE(mtr)));

	edac_dbg(2, "\t\tNUMBANK: %d bank(s)\n", MTR_DRAM_BANKS(mtr));
	edac_dbg(2, "\t\tNUMRANK: %s\n",

drivers/edac/i7300_edac.c
@@ -23,6 +23,7 @@
#include <linux/slab.h>
#include <linux/edac.h>
#include <linux/mmzone.h>
#include <linux/string_choices.h>

#include "edac_module.h"

@@ -620,7 +621,7 @@ static int decode_mtr(struct i7300_pvt *pvt,
	edac_dbg(2, "\t\tWIDTH: x%d\n", MTR_DRAM_WIDTH(mtr));

	edac_dbg(2, "\t\tELECTRICAL THROTTLING is %s\n",
		 MTR_DIMMS_ETHROTTLE(mtr) ? "enabled" : "disabled");
		 str_enabled_disabled(MTR_DIMMS_ETHROTTLE(mtr)));

	edac_dbg(2, "\t\tNUMBANK: %d bank(s)\n", MTR_DRAM_BANKS(mtr));
	edac_dbg(2, "\t\tNUMRANK: %s\n",
@@ -871,9 +872,9 @@ static int i7300_get_mc_regs(struct mem_ctl_info *mci)
		 IS_MIRRORED(pvt->mc_settings) ? "" : "non-");

	edac_dbg(0, "Error detection is %s\n",
		 IS_ECC_ENABLED(pvt->mc_settings) ? "enabled" : "disabled");
		 str_enabled_disabled(IS_ECC_ENABLED(pvt->mc_settings)));
	edac_dbg(0, "Retry is %s\n",
		 IS_RETRY_ENABLED(pvt->mc_settings) ? "enabled" : "disabled");
		 str_enabled_disabled(IS_RETRY_ENABLED(pvt->mc_settings)));

	/* Get Memory Interleave Range registers */
	pci_read_config_word(pvt->pci_dev_16_1_fsb_addr_map, MIR0,

drivers/edac/ie31200_edac.c
@@ -51,6 +51,7 @@
#include <linux/edac.h>

#include <linux/io-64-nonatomic-lo-hi.h>
#include <asm/mce.h>
#include "edac_module.h"

#define EDAC_MOD_STR "ie31200_edac"
@@ -84,44 +85,23 @@
#define PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_9 0x3ec6
#define PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_10 0x3eca

/* Test if HB is for Skylake or later. */
#define DEVICE_ID_SKYLAKE_OR_LATER(did)					\
	(((did) == PCI_DEVICE_ID_INTEL_IE31200_HB_8) ||			\
	 ((did) == PCI_DEVICE_ID_INTEL_IE31200_HB_9) ||			\
	 ((did) == PCI_DEVICE_ID_INTEL_IE31200_HB_10) ||		\
	 ((did) == PCI_DEVICE_ID_INTEL_IE31200_HB_11) ||		\
	 ((did) == PCI_DEVICE_ID_INTEL_IE31200_HB_12) ||		\
	 (((did) & PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_MASK) ==		\
	  PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_MASK))
/* Raptor Lake-S */
#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_1 0xa703
#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_2 0x4640
#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_3 0x4630

#define IE31200_DIMMS 4
#define IE31200_RANKS 8
#define IE31200_RANKS_PER_CHANNEL 4
#define IE31200_RANKS_PER_CHANNEL 8
#define IE31200_DIMMS_PER_CHANNEL 2
#define IE31200_CHANNELS 2
#define IE31200_IMC_NUM 2

/* Intel IE31200 register addresses - device 0 function 0 - DRAM Controller */
#define IE31200_MCHBAR_LOW 0x48
#define IE31200_MCHBAR_HIGH 0x4c
#define IE31200_MCHBAR_MASK GENMASK_ULL(38, 15)
#define IE31200_MMR_WINDOW_SIZE BIT(15)

/*
 * Error Status Register (16b)
 *
 * 15    reserved
 * 14    Isochronous TBWRR Run Behind FIFO Full
 *       (ITCV)
 * 13    Isochronous TBWRR Run Behind FIFO Put
 *       (ITSTV)
 * 12    reserved
 * 11    MCH Thermal Sensor Event
 *       for SMI/SCI/SERR (GTSE)
 * 10    reserved
 *  9    LOCK to non-DRAM Memory Flag (LCKF)
 *  8    reserved
 *  7    DRAM Throttle Flag (DTF)
 * 6:2   reserved
 *  1    Multi-bit DRAM ECC Error Flag (DMERR)
 *  0    Single-bit DRAM ECC Error Flag (DSERR)
 */
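
To make the layout concrete, a hedged sketch of decoding the only two ERRSTS bits the driver consumes (IE31200_ERRSTS_CE is defined just below; IE31200_ERRSTS_UE sits alongside it in the file above the quoted hunk):

static void decode_errsts(u16 errsts)
{
	if (errsts & IE31200_ERRSTS_CE)	/* BIT(0): single-bit, correctable */
		pr_debug("single-bit DRAM ECC error (DSERR)\n");
	if (errsts & IE31200_ERRSTS_UE)	/* BIT(1): multi-bit, uncorrectable */
		pr_debug("multi-bit DRAM ECC error (DMERR)\n");
}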

@@ -130,68 +110,60 @@
#define IE31200_ERRSTS_CE BIT(0)
#define IE31200_ERRSTS_BITS (IE31200_ERRSTS_UE | IE31200_ERRSTS_CE)

/*
 * Channel 0 ECC Error Log (64b)
 *
 * 63:48 Error Column Address (ERRCOL)
 * 47:32 Error Row Address (ERRROW)
 * 31:29 Error Bank Address (ERRBANK)
 * 28:27 Error Rank Address (ERRRANK)
 * 26:24 reserved
 * 23:16 Error Syndrome (ERRSYND)
 * 15: 2 reserved
 *    1  Multiple Bit Error Status (MERRSTS)
 *    0  Correctable Error Status (CERRSTS)
 */

#define IE31200_C0ECCERRLOG 0x40c8
#define IE31200_C1ECCERRLOG 0x44c8
#define IE31200_C0ECCERRLOG_SKL 0x4048
#define IE31200_C1ECCERRLOG_SKL 0x4448
#define IE31200_ECCERRLOG_CE BIT(0)
#define IE31200_ECCERRLOG_UE BIT(1)
#define IE31200_ECCERRLOG_RANK_BITS GENMASK_ULL(28, 27)
#define IE31200_ECCERRLOG_RANK_SHIFT 27
#define IE31200_ECCERRLOG_SYNDROME_BITS GENMASK_ULL(23, 16)
#define IE31200_ECCERRLOG_SYNDROME_SHIFT 16

#define IE31200_ECCERRLOG_SYNDROME(log)		\
	((log & IE31200_ECCERRLOG_SYNDROME_BITS) >> \
	 IE31200_ECCERRLOG_SYNDROME_SHIFT)

#define IE31200_CAPID0 0xe4
#define IE31200_CAPID0_PDCD BIT(4)
#define IE31200_CAPID0_DDPCD BIT(6)
#define IE31200_CAPID0_ECC BIT(1)

#define IE31200_MAD_DIMM_0_OFFSET 0x5004
#define IE31200_MAD_DIMM_0_OFFSET_SKL 0x500C
#define IE31200_MAD_DIMM_SIZE GENMASK_ULL(7, 0)
#define IE31200_MAD_DIMM_A_RANK BIT(17)
#define IE31200_MAD_DIMM_A_RANK_SHIFT 17
#define IE31200_MAD_DIMM_A_RANK_SKL BIT(10)
#define IE31200_MAD_DIMM_A_RANK_SKL_SHIFT 10
#define IE31200_MAD_DIMM_A_WIDTH BIT(19)
#define IE31200_MAD_DIMM_A_WIDTH_SHIFT 19
#define IE31200_MAD_DIMM_A_WIDTH_SKL GENMASK_ULL(9, 8)
#define IE31200_MAD_DIMM_A_WIDTH_SKL_SHIFT 8

/* Skylake reports 1GB increments, everything else is 256MB */
#define IE31200_PAGES(n, skl)	\
	(n << (28 + (2 * skl) - PAGE_SHIFT))
/* Non-constant mask variant of FIELD_GET() */
#define field_get(_mask, _reg)	(((_reg) & (_mask)) >> (ffs(_mask) - 1))
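
field_get() mirrors FIELD_GET() but accepts run-time masks, which is what lets one code path serve every per-SoC res_config. A hedged sketch of how it decodes a raw ECC error log value (the mask choices in the comments reflect the rpl_s_cfg table later in this diff):

/* Sketch: pull the rank and syndrome out of a raw eccerrlog value. */
static void decode_eccerrlog(u64 log, const struct res_config *cfg)
{
	u64 rank = field_get(cfg->reg_eccerrlog_rank_mask, log);	/* e.g. bits 28:27 */
	u64 synd = field_get(cfg->reg_eccerrlog_syndrome_mask, log);	/* e.g. bits 23:16 */

	pr_debug("rank %llu, syndrome 0x%llx\n", rank, synd);
}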

static int nr_channels;
static struct pci_dev *mci_pdev;
static int ie31200_registered = 1;

struct res_config {
	enum mem_type mtype;
	bool cmci;
	int imc_num;
	/* Host MMIO configuration register */
	u64 reg_mchbar_mask;
	u64 reg_mchbar_window_size;
	/* ECC error log register */
	u64 reg_eccerrlog_offset[IE31200_CHANNELS];
	u64 reg_eccerrlog_ce_mask;
	u64 reg_eccerrlog_ce_ovfl_mask;
	u64 reg_eccerrlog_ue_mask;
	u64 reg_eccerrlog_ue_ovfl_mask;
	u64 reg_eccerrlog_rank_mask;
	u64 reg_eccerrlog_syndrome_mask;
	/* MSR to clear ECC error log register */
	u32 msr_clear_eccerrlog_offset;
	/* DIMM characteristics register */
	u64 reg_mad_dimm_size_granularity;
	u64 reg_mad_dimm_offset[IE31200_CHANNELS];
	u32 reg_mad_dimm_size_mask[IE31200_DIMMS_PER_CHANNEL];
	u32 reg_mad_dimm_rank_mask[IE31200_DIMMS_PER_CHANNEL];
	u32 reg_mad_dimm_width_mask[IE31200_DIMMS_PER_CHANNEL];
};

struct ie31200_priv {
	void __iomem *window;
	void __iomem *c0errlog;
	void __iomem *c1errlog;
	struct res_config *cfg;
	struct mem_ctl_info *mci;
	struct pci_dev *pdev;
	struct device dev;
};

static struct ie31200_pvt {
	struct ie31200_priv *priv[IE31200_IMC_NUM];
} ie31200_pvt;

enum ie31200_chips {
	IE31200 = 0,
	IE31200_1 = 1,
};

struct ie31200_dev_info {
@@ -202,18 +174,22 @@ struct ie31200_error_info {
	u16 errsts;
	u16 errsts2;
	u64 eccerrlog[IE31200_CHANNELS];
	u64 erraddr;
};

static const struct ie31200_dev_info ie31200_devs[] = {
	[IE31200] = {
		.ctl_name = "IE31200"
	},
	[IE31200_1] = {
		.ctl_name = "IE31200_1"
	},
};

struct dimm_data {
	u8 size;	  /* in multiples of 256MB, except Skylake is 1GB */
	u8 dual_rank : 1,
	   x16_width : 2; /* 0 means x8 width */
	u64 size;	  /* in bytes */
	u8 ranks;
	enum dev_type dtype;
};

static int how_many_channels(struct pci_dev *pdev)
@@ -251,29 +227,54 @@ static bool ecc_capable(struct pci_dev *pdev)
	return true;
}

static int eccerrlog_row(u64 log)
{
	return ((log & IE31200_ECCERRLOG_RANK_BITS) >>
		IE31200_ECCERRLOG_RANK_SHIFT);
}
#define mci_to_pci_dev(mci)	(((struct ie31200_priv *)(mci)->pvt_info)->pdev)

static void ie31200_clear_error_info(struct mem_ctl_info *mci)
{
	struct ie31200_priv *priv = mci->pvt_info;
	struct res_config *cfg = priv->cfg;

	/*
	 * The PCI ERRSTS register is deprecated. Write the MSR to clear
	 * the ECC error log registers in all memory controllers.
	 */
	if (cfg->msr_clear_eccerrlog_offset) {
		if (wrmsr_safe(cfg->msr_clear_eccerrlog_offset,
			       cfg->reg_eccerrlog_ce_mask |
			       cfg->reg_eccerrlog_ce_ovfl_mask |
			       cfg->reg_eccerrlog_ue_mask |
			       cfg->reg_eccerrlog_ue_ovfl_mask, 0) < 0)
			ie31200_printk(KERN_ERR, "Failed to wrmsr.\n");

		return;
	}

	/*
	 * Clear any error bits.
	 * (Yes, we really clear bits by writing 1 to them.)
	 */
	pci_write_bits16(to_pci_dev(mci->pdev), IE31200_ERRSTS,
	pci_write_bits16(mci_to_pci_dev(mci), IE31200_ERRSTS,
			 IE31200_ERRSTS_BITS, IE31200_ERRSTS_BITS);
}

static void ie31200_get_and_clear_error_info(struct mem_ctl_info *mci,
					     struct ie31200_error_info *info)
{
	struct pci_dev *pdev;
	struct pci_dev *pdev = mci_to_pci_dev(mci);
	struct ie31200_priv *priv = mci->pvt_info;

	pdev = to_pci_dev(mci->pdev);
	/*
	 * The PCI ERRSTS register is deprecated, directly read the
	 * MMIO-mapped ECC error log registers.
	 */
	if (priv->cfg->msr_clear_eccerrlog_offset) {
		info->eccerrlog[0] = lo_hi_readq(priv->c0errlog);
		if (nr_channels == 2)
			info->eccerrlog[1] = lo_hi_readq(priv->c1errlog);

		ie31200_clear_error_info(mci);
		return;
	}

	/*
	 * This is a mess because there is no atomic way to read all the
@@ -309,46 +310,56 @@ static void ie31200_get_and_clear_error_info(struct mem_ctl_info *mci,
static void ie31200_process_error_info(struct mem_ctl_info *mci,
				       struct ie31200_error_info *info)
{
	struct ie31200_priv *priv = mci->pvt_info;
	struct res_config *cfg = priv->cfg;
	int channel;
	u64 log;

	if (!(info->errsts & IE31200_ERRSTS_BITS))
		return;
	if (!cfg->msr_clear_eccerrlog_offset) {
		if (!(info->errsts & IE31200_ERRSTS_BITS))
			return;

	if ((info->errsts ^ info->errsts2) & IE31200_ERRSTS_BITS) {
		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1, 0, 0, 0,
				     -1, -1, -1, "UE overwrote CE", "");
		info->errsts = info->errsts2;
		if ((info->errsts ^ info->errsts2) & IE31200_ERRSTS_BITS) {
			edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1, 0, 0, 0,
					     -1, -1, -1, "UE overwrote CE", "");
			info->errsts = info->errsts2;
		}
	}

	for (channel = 0; channel < nr_channels; channel++) {
		log = info->eccerrlog[channel];
		if (log & IE31200_ECCERRLOG_UE) {
		if (log & cfg->reg_eccerrlog_ue_mask) {
			edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
					     0, 0, 0,
					     eccerrlog_row(log),
					     info->erraddr >> PAGE_SHIFT, 0, 0,
					     field_get(cfg->reg_eccerrlog_rank_mask, log),
					     channel, -1,
					     "ie31200 UE", "");
		} else if (log & IE31200_ECCERRLOG_CE) {
		} else if (log & cfg->reg_eccerrlog_ce_mask) {
			edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
					     0, 0,
					     IE31200_ECCERRLOG_SYNDROME(log),
					     eccerrlog_row(log),
					     info->erraddr >> PAGE_SHIFT, 0,
					     field_get(cfg->reg_eccerrlog_syndrome_mask, log),
					     field_get(cfg->reg_eccerrlog_rank_mask, log),
					     channel, -1,
					     "ie31200 CE", "");
		}
	}
}

static void ie31200_check(struct mem_ctl_info *mci)
static void __ie31200_check(struct mem_ctl_info *mci, struct mce *mce)
{
	struct ie31200_error_info info;

	info.erraddr = mce ? mce->addr : 0;
	ie31200_get_and_clear_error_info(mci, &info);
	ie31200_process_error_info(mci, &info);
}

static void __iomem *ie31200_map_mchbar(struct pci_dev *pdev)
static void ie31200_check(struct mem_ctl_info *mci)
{
	__ie31200_check(mci, NULL);
}

static void __iomem *ie31200_map_mchbar(struct pci_dev *pdev, struct res_config *cfg, int mc)
{
	union {
		u64 mchbar;
@@ -361,7 +372,8 @@ static void __iomem *ie31200_map_mchbar(struct pci_dev *pdev)

	pci_read_config_dword(pdev, IE31200_MCHBAR_LOW, &u.mchbar_low);
	pci_read_config_dword(pdev, IE31200_MCHBAR_HIGH, &u.mchbar_high);
	u.mchbar &= IE31200_MCHBAR_MASK;
	u.mchbar &= cfg->reg_mchbar_mask;
	u.mchbar += cfg->reg_mchbar_window_size * mc;

	if (u.mchbar != (resource_size_t)u.mchbar) {
		ie31200_printk(KERN_ERR, "mmio space beyond accessible range (0x%llx)\n",
@@ -369,7 +381,7 @@ static void __iomem *ie31200_map_mchbar(struct pci_dev *pdev)
		return NULL;
	}

	window = ioremap(u.mchbar, IE31200_MMR_WINDOW_SIZE);
	window = ioremap(u.mchbar, cfg->reg_mchbar_window_size);
	if (!window)
		ie31200_printk(KERN_ERR, "Cannot map mmio space at 0x%llx\n",
			       (unsigned long long)u.mchbar);
@@ -377,155 +389,108 @@ static void __iomem *ie31200_map_mchbar(struct pci_dev *pdev)
	return window;
}

static void __skl_populate_dimm_info(struct dimm_data *dd, u32 addr_decode,
				     int chan)
static void populate_dimm_info(struct dimm_data *dd, u32 addr_decode, int dimm,
			       struct res_config *cfg)
{
	dd->size = (addr_decode >> (chan << 4)) & IE31200_MAD_DIMM_SIZE;
	dd->dual_rank = (addr_decode & (IE31200_MAD_DIMM_A_RANK_SKL << (chan << 4))) ? 1 : 0;
	dd->x16_width = ((addr_decode & (IE31200_MAD_DIMM_A_WIDTH_SKL << (chan << 4))) >>
			 (IE31200_MAD_DIMM_A_WIDTH_SKL_SHIFT + (chan << 4)));
	dd->size = field_get(cfg->reg_mad_dimm_size_mask[dimm], addr_decode) * cfg->reg_mad_dimm_size_granularity;
	dd->ranks = field_get(cfg->reg_mad_dimm_rank_mask[dimm], addr_decode) + 1;
	dd->dtype = field_get(cfg->reg_mad_dimm_width_mask[dimm], addr_decode) + DEV_X8;
}

static void __populate_dimm_info(struct dimm_data *dd, u32 addr_decode,
				 int chan)
static void ie31200_get_dimm_config(struct mem_ctl_info *mci, void __iomem *window,
				    struct res_config *cfg, int mc)
{
	dd->size = (addr_decode >> (chan << 3)) & IE31200_MAD_DIMM_SIZE;
	dd->dual_rank = (addr_decode & (IE31200_MAD_DIMM_A_RANK << chan)) ? 1 : 0;
	dd->x16_width = (addr_decode & (IE31200_MAD_DIMM_A_WIDTH << chan)) ? 1 : 0;
}
	struct dimm_data dimm_info;
	struct dimm_info *dimm;
	unsigned long nr_pages;
	u32 addr_decode;
	int i, j, k;

static void populate_dimm_info(struct dimm_data *dd, u32 addr_decode, int chan,
			       bool skl)
{
	if (skl)
		__skl_populate_dimm_info(dd, addr_decode, chan);
	else
		__populate_dimm_info(dd, addr_decode, chan);
}
	for (i = 0; i < IE31200_CHANNELS; i++) {
		addr_decode = readl(window + cfg->reg_mad_dimm_offset[i]);
		edac_dbg(0, "addr_decode: 0x%x\n", addr_decode);

		for (j = 0; j < IE31200_DIMMS_PER_CHANNEL; j++) {
			populate_dimm_info(&dimm_info, addr_decode, j, cfg);
			edac_dbg(0, "mc: %d, channel: %d, dimm: %d, size: %lld MiB, ranks: %d, DRAM chip type: %d\n",
				 mc, i, j, dimm_info.size >> 20,
				 dimm_info.ranks,
				 dimm_info.dtype);

static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
{
	int i, j, ret;
	struct mem_ctl_info *mci = NULL;
	struct edac_mc_layer layers[2];
	struct dimm_data dimm_info[IE31200_CHANNELS][IE31200_DIMMS_PER_CHANNEL];
	void __iomem *window;
	struct ie31200_priv *priv;
	u32 addr_decode, mad_offset;
			nr_pages = MiB_TO_PAGES(dimm_info.size >> 20);
			if (nr_pages == 0)
				continue;

	/*
	 * Kaby Lake, Coffee Lake seem to work like Skylake. Please re-visit
	 * this logic when adding new CPU support.
	 */
	bool skl = DEVICE_ID_SKYLAKE_OR_LATER(pdev->device);

	edac_dbg(0, "MC:\n");

	if (!ecc_capable(pdev)) {
		ie31200_printk(KERN_INFO, "No ECC support\n");
		return -ENODEV;
			nr_pages = nr_pages / dimm_info.ranks;
			for (k = 0; k < dimm_info.ranks; k++) {
				dimm = edac_get_dimm(mci, (j * dimm_info.ranks) + k, i, 0);
				dimm->nr_pages = nr_pages;
				edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages);
				dimm->grain = 8; /* just a guess */
				dimm->mtype = cfg->mtype;
				dimm->dtype = dimm_info.dtype;
				dimm->edac_mode = EDAC_UNKNOWN;
			}
		}
	}
}

static int ie31200_register_mci(struct pci_dev *pdev, struct res_config *cfg, int mc)
{
	struct edac_mc_layer layers[2];
	struct ie31200_priv *priv;
	struct mem_ctl_info *mci;
	void __iomem *window;
	int ret;

	nr_channels = how_many_channels(pdev);
	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
	layers[0].size = IE31200_DIMMS;
	layers[0].size = IE31200_RANKS_PER_CHANNEL;
	layers[0].is_virt_csrow = true;
	layers[1].type = EDAC_MC_LAYER_CHANNEL;
	layers[1].size = nr_channels;
	layers[1].is_virt_csrow = false;
	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers,
	mci = edac_mc_alloc(mc, ARRAY_SIZE(layers), layers,
			    sizeof(struct ie31200_priv));
	if (!mci)
		return -ENOMEM;

	window = ie31200_map_mchbar(pdev);
	window = ie31200_map_mchbar(pdev, cfg, mc);
	if (!window) {
		ret = -ENODEV;
		goto fail_free;
	}

	edac_dbg(3, "MC: init mci\n");
	mci->pdev = &pdev->dev;
	if (skl)
		mci->mtype_cap = MEM_FLAG_DDR4;
	else
		mci->mtype_cap = MEM_FLAG_DDR3;
	mci->mtype_cap = BIT(cfg->mtype);
	mci->edac_ctl_cap = EDAC_FLAG_SECDED;
	mci->edac_cap = EDAC_FLAG_SECDED;
	mci->mod_name = EDAC_MOD_STR;
	mci->ctl_name = ie31200_devs[dev_idx].ctl_name;
	mci->ctl_name = ie31200_devs[mc].ctl_name;
	mci->dev_name = pci_name(pdev);
	mci->edac_check = ie31200_check;
	mci->edac_check = cfg->cmci ? NULL : ie31200_check;
	mci->ctl_page_to_phys = NULL;
	priv = mci->pvt_info;
	priv->window = window;
	if (skl) {
		priv->c0errlog = window + IE31200_C0ECCERRLOG_SKL;
		priv->c1errlog = window + IE31200_C1ECCERRLOG_SKL;
		mad_offset = IE31200_MAD_DIMM_0_OFFSET_SKL;
	} else {
		priv->c0errlog = window + IE31200_C0ECCERRLOG;
		priv->c1errlog = window + IE31200_C1ECCERRLOG;
		mad_offset = IE31200_MAD_DIMM_0_OFFSET;
	}

	/* populate DIMM info */
	for (i = 0; i < IE31200_CHANNELS; i++) {
		addr_decode = readl(window + mad_offset +
				    (i * 4));
		edac_dbg(0, "addr_decode: 0x%x\n", addr_decode);
		for (j = 0; j < IE31200_DIMMS_PER_CHANNEL; j++) {
			populate_dimm_info(&dimm_info[i][j], addr_decode, j,
					   skl);
			edac_dbg(0, "size: 0x%x, rank: %d, width: %d\n",
				 dimm_info[i][j].size,
				 dimm_info[i][j].dual_rank,
				 dimm_info[i][j].x16_width);
		}
	}

	priv->c0errlog = window + cfg->reg_eccerrlog_offset[0];
	priv->c1errlog = window + cfg->reg_eccerrlog_offset[1];
	priv->cfg = cfg;
	priv->mci = mci;
	priv->pdev = pdev;
	device_initialize(&priv->dev);
	/*
	 * The dram rank boundary (DRB) reg values are boundary addresses
	 * for each DRAM rank with a granularity of 64MB. DRB regs are
	 * cumulative; the last one will contain the total memory
	 * contained in all ranks.
	 * The EDAC core uses mci->pdev (pointer to the structure device)
	 * as the memory controller ID. The SoCs attach one or more memory
	 * controllers to a single pci_dev (a single pci_dev->dev can
	 * correspond to multiple memory controllers).
	 *
	 * To make mci->pdev unique, assign pci_dev->dev to mci->pdev
	 * for the first memory controller and assign a unique priv->dev
	 * to mci->pdev for each additional memory controller.
	 */
	for (i = 0; i < IE31200_DIMMS_PER_CHANNEL; i++) {
		for (j = 0; j < IE31200_CHANNELS; j++) {
			struct dimm_info *dimm;
			unsigned long nr_pages;

			nr_pages = IE31200_PAGES(dimm_info[j][i].size, skl);
			if (nr_pages == 0)
				continue;

			if (dimm_info[j][i].dual_rank) {
				nr_pages = nr_pages / 2;
				dimm = edac_get_dimm(mci, (i * 2) + 1, j, 0);
				dimm->nr_pages = nr_pages;
				edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages);
				dimm->grain = 8; /* just a guess */
				if (skl)
					dimm->mtype = MEM_DDR4;
				else
					dimm->mtype = MEM_DDR3;
				dimm->dtype = DEV_UNKNOWN;
				dimm->edac_mode = EDAC_UNKNOWN;
			}
			dimm = edac_get_dimm(mci, i * 2, j, 0);
			dimm->nr_pages = nr_pages;
			edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages);
			dimm->grain = 8; /* same guess */
			if (skl)
				dimm->mtype = MEM_DDR4;
			else
				dimm->mtype = MEM_DDR3;
			dimm->dtype = DEV_UNKNOWN;
			dimm->edac_mode = EDAC_UNKNOWN;
		}
	}
	mci->pdev = mc ? &priv->dev : &pdev->dev;

	ie31200_get_dimm_config(mci, window, cfg, mc);
	ie31200_clear_error_info(mci);

	if (edac_mc_add_mc(mci)) {
@@ -534,16 +499,115 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
		goto fail_unmap;
	}

	/* get this far and it's successful */
	ie31200_pvt.priv[mc] = priv;
	return 0;
fail_unmap:
	iounmap(window);
fail_free:
	edac_mc_free(mci);
	return ret;
}

static void mce_check(struct mce *mce)
{
	struct ie31200_priv *priv;
	int i;

	for (i = 0; i < IE31200_IMC_NUM; i++) {
		priv = ie31200_pvt.priv[i];
		if (!priv)
			continue;

		__ie31200_check(priv->mci, mce);
	}
}

static int mce_handler(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *mce = (struct mce *)data;
	char *type;

	if (mce->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	/*
	 * Ignore unless this is a memory related error: the test matches
	 * MCACOD of the form 0000_0000_1MMM_CCCC (a memory controller
	 * error), with bit 12 (the filter bit) masked out.
	 * Don't check MCI_STATUS_ADDRV since it's not set on some CPUs.
	 */
	if ((mce->status & 0xefff) >> 7 != 1)
		return NOTIFY_DONE;

	type = mce->mcgstatus & MCG_STATUS_MCIP ? "Exception" : "Event";

	edac_dbg(0, "CPU %d: Machine Check %s: 0x%llx Bank %d: 0x%llx\n",
		 mce->extcpu, type, mce->mcgstatus,
		 mce->bank, mce->status);
	edac_dbg(0, "TSC 0x%llx\n", mce->tsc);
	edac_dbg(0, "ADDR 0x%llx\n", mce->addr);
	edac_dbg(0, "MISC 0x%llx\n", mce->misc);
	edac_dbg(0, "PROCESSOR %u:0x%x TIME %llu SOCKET %u APIC 0x%x\n",
		 mce->cpuvendor, mce->cpuid, mce->time,
		 mce->socketid, mce->apicid);

	mce_check(mce);
	mce->kflags |= MCE_HANDLED_EDAC;

	return NOTIFY_DONE;
}

static struct notifier_block ie31200_mce_dec = {
	.notifier_call	= mce_handler,
	.priority	= MCE_PRIO_EDAC,
};

static void ie31200_unregister_mcis(void)
{
	struct ie31200_priv *priv;
	struct mem_ctl_info *mci;
	int i;

	for (i = 0; i < IE31200_IMC_NUM; i++) {
		priv = ie31200_pvt.priv[i];
		if (!priv)
			continue;

		mci = priv->mci;
		edac_mc_del_mc(mci->pdev);
		iounmap(priv->window);
		edac_mc_free(mci);
	}
}

static int ie31200_probe1(struct pci_dev *pdev, struct res_config *cfg)
{
	int i, ret;

	edac_dbg(0, "MC:\n");

	if (!ecc_capable(pdev)) {
		ie31200_printk(KERN_INFO, "No ECC support\n");
		return -ENODEV;
	}

	for (i = 0; i < cfg->imc_num; i++) {
		ret = ie31200_register_mci(pdev, cfg, i);
		if (ret)
			goto fail_register;
	}

	if (cfg->cmci) {
		mce_register_decode_chain(&ie31200_mce_dec);
		edac_op_state = EDAC_OPSTATE_INT;
	} else {
		edac_op_state = EDAC_OPSTATE_POLL;
	}

	/* get this far and it's successful. */
	edac_dbg(3, "MC: success\n");
	return 0;

fail_unmap:
	iounmap(window);

fail_free:
	edac_mc_free(mci);

fail_register:
	ie31200_unregister_mcis();
	return ret;
}

@@ -555,7 +619,7 @@ static int ie31200_init_one(struct pci_dev *pdev,
	edac_dbg(0, "MC:\n");
	if (pci_enable_device(pdev) < 0)
		return -EIO;
	rc = ie31200_probe1(pdev, ent->driver_data);
	rc = ie31200_probe1(pdev, (struct res_config *)ent->driver_data);
	if (rc == 0 && !mci_pdev)
		mci_pdev = pci_dev_get(pdev);

@@ -564,43 +628,112 @@ static int ie31200_init_one(struct pci_dev *pdev,

static void ie31200_remove_one(struct pci_dev *pdev)
{
	struct mem_ctl_info *mci;
	struct ie31200_priv *priv;
	struct ie31200_priv *priv = ie31200_pvt.priv[0];

	edac_dbg(0, "\n");
	pci_dev_put(mci_pdev);
	mci_pdev = NULL;
	mci = edac_mc_del_mc(&pdev->dev);
	if (!mci)
		return;
	priv = mci->pvt_info;
	iounmap(priv->window);
	edac_mc_free(mci);
	if (priv->cfg->cmci)
		mce_unregister_decode_chain(&ie31200_mce_dec);
	ie31200_unregister_mcis();
}

static struct res_config snb_cfg = {
	.mtype = MEM_DDR3,
	.imc_num = 1,
	.reg_mchbar_mask = GENMASK_ULL(38, 15),
	.reg_mchbar_window_size = BIT_ULL(15),
	.reg_eccerrlog_offset[0] = 0x40c8,
	.reg_eccerrlog_offset[1] = 0x44c8,
	.reg_eccerrlog_ce_mask = BIT_ULL(0),
	.reg_eccerrlog_ue_mask = BIT_ULL(1),
	.reg_eccerrlog_rank_mask = GENMASK_ULL(28, 27),
	.reg_eccerrlog_syndrome_mask = GENMASK_ULL(23, 16),
	.reg_mad_dimm_size_granularity = BIT_ULL(28),
	.reg_mad_dimm_offset[0] = 0x5004,
	.reg_mad_dimm_offset[1] = 0x5008,
	.reg_mad_dimm_size_mask[0] = GENMASK(7, 0),
	.reg_mad_dimm_size_mask[1] = GENMASK(15, 8),
	.reg_mad_dimm_rank_mask[0] = BIT(17),
	.reg_mad_dimm_rank_mask[1] = BIT(18),
	.reg_mad_dimm_width_mask[0] = BIT(19),
	.reg_mad_dimm_width_mask[1] = BIT(20),
};

static struct res_config skl_cfg = {
	.mtype = MEM_DDR4,
	.imc_num = 1,
	.reg_mchbar_mask = GENMASK_ULL(38, 15),
	.reg_mchbar_window_size = BIT_ULL(15),
	.reg_eccerrlog_offset[0] = 0x4048,
	.reg_eccerrlog_offset[1] = 0x4448,
	.reg_eccerrlog_ce_mask = BIT_ULL(0),
	.reg_eccerrlog_ue_mask = BIT_ULL(1),
	.reg_eccerrlog_rank_mask = GENMASK_ULL(28, 27),
	.reg_eccerrlog_syndrome_mask = GENMASK_ULL(23, 16),
	.reg_mad_dimm_size_granularity = BIT_ULL(30),
	.reg_mad_dimm_offset[0] = 0x500c,
	.reg_mad_dimm_offset[1] = 0x5010,
	.reg_mad_dimm_size_mask[0] = GENMASK(5, 0),
	.reg_mad_dimm_size_mask[1] = GENMASK(21, 16),
	.reg_mad_dimm_rank_mask[0] = BIT(10),
	.reg_mad_dimm_rank_mask[1] = BIT(26),
	.reg_mad_dimm_width_mask[0] = GENMASK(9, 8),
	.reg_mad_dimm_width_mask[1] = GENMASK(25, 24),
};

struct res_config rpl_s_cfg = {
	.mtype = MEM_DDR5,
	.cmci = true,
	.imc_num = 2,
	.reg_mchbar_mask = GENMASK_ULL(41, 17),
	.reg_mchbar_window_size = BIT_ULL(16),
	.reg_eccerrlog_offset[0] = 0xe048,
	.reg_eccerrlog_offset[1] = 0xe848,
	.reg_eccerrlog_ce_mask = BIT_ULL(0),
	.reg_eccerrlog_ce_ovfl_mask = BIT_ULL(1),
	.reg_eccerrlog_ue_mask = BIT_ULL(2),
	.reg_eccerrlog_ue_ovfl_mask = BIT_ULL(3),
	.reg_eccerrlog_rank_mask = GENMASK_ULL(28, 27),
	.reg_eccerrlog_syndrome_mask = GENMASK_ULL(23, 16),
	.msr_clear_eccerrlog_offset = 0x791,
	.reg_mad_dimm_offset[0] = 0xd80c,
	.reg_mad_dimm_offset[1] = 0xd810,
	.reg_mad_dimm_size_granularity = BIT_ULL(29),
	.reg_mad_dimm_size_mask[0] = GENMASK(6, 0),
	.reg_mad_dimm_size_mask[1] = GENMASK(22, 16),
	.reg_mad_dimm_rank_mask[0] = GENMASK(10, 9),
	.reg_mad_dimm_rank_mask[1] = GENMASK(27, 26),
	.reg_mad_dimm_width_mask[0] = GENMASK(8, 7),
	.reg_mad_dimm_width_mask[1] = GENMASK(25, 24),
};
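
These three tables are now the whole per-SoC surface of the driver. As a hedged sketch (the device ID and every offset below are placeholders, not real hardware values), supporting a further SoC would mostly mean filling in another res_config and adding one ID-table entry; no new code paths are needed:

/* Hypothetical SoC: all values are illustrative placeholders. */
static struct res_config new_soc_cfg = {
	.mtype = MEM_DDR5,
	.imc_num = 2,
	.reg_mchbar_mask = GENMASK_ULL(41, 17),
	.reg_mchbar_window_size = BIT_ULL(16),
	.reg_eccerrlog_offset[0] = 0xe048,	/* placeholder */
	.reg_eccerrlog_offset[1] = 0xe848,	/* placeholder */
	/* ... remaining masks as documented for the SoC ... */
};

/* One entry in ie31200_pci_tbl then selects it by device ID, e.g.: */
/* { PCI_VDEVICE(INTEL, 0x1234), (kernel_ulong_t)&new_soc_cfg }, */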

static const struct pci_device_id ie31200_pci_tbl[] = {
	{ PCI_VEND_DEV(INTEL, IE31200_HB_1), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_2), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_3), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_4), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_5), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_6), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_7), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_8), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_9), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_10), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_11), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_12), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_CFL_1), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_CFL_2), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_CFL_3), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_CFL_4), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_CFL_5), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_CFL_6), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_CFL_7), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_CFL_8), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_CFL_9), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VEND_DEV(INTEL, IE31200_HB_CFL_10), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_1), (kernel_ulong_t)&snb_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_2), (kernel_ulong_t)&snb_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_3), (kernel_ulong_t)&snb_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_4), (kernel_ulong_t)&snb_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_5), (kernel_ulong_t)&snb_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_6), (kernel_ulong_t)&snb_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_7), (kernel_ulong_t)&snb_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_8), (kernel_ulong_t)&skl_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_9), (kernel_ulong_t)&skl_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_10), (kernel_ulong_t)&skl_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_11), (kernel_ulong_t)&skl_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_12), (kernel_ulong_t)&skl_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_1), (kernel_ulong_t)&skl_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_2), (kernel_ulong_t)&skl_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_3), (kernel_ulong_t)&skl_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_4), (kernel_ulong_t)&skl_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_5), (kernel_ulong_t)&skl_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_6), (kernel_ulong_t)&skl_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_7), (kernel_ulong_t)&skl_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_8), (kernel_ulong_t)&skl_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_9), (kernel_ulong_t)&skl_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_10), (kernel_ulong_t)&skl_cfg },
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_1), (kernel_ulong_t)&rpl_s_cfg},
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_2), (kernel_ulong_t)&rpl_s_cfg},
	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_3), (kernel_ulong_t)&rpl_s_cfg},
	{ 0, } /* 0 terminated list. */
};
MODULE_DEVICE_TABLE(pci, ie31200_pci_tbl);
@@ -617,12 +750,10 @@ static int __init ie31200_init(void)
	int pci_rc, i;

	edac_dbg(3, "MC:\n");
	/* Ensure that the OPSTATE is set correctly for POLL or NMI */
	opstate_init();

	pci_rc = pci_register_driver(&ie31200_driver);
	if (pci_rc < 0)
		goto fail0;
		return pci_rc;

	if (!mci_pdev) {
		ie31200_registered = 0;
@@ -633,11 +764,13 @@ static int __init ie31200_init(void)
			if (mci_pdev)
				break;
		}

		if (!mci_pdev) {
			edac_dbg(0, "ie31200 pci_get_device fail\n");
			pci_rc = -ENODEV;
			goto fail1;
			goto fail0;
		}

		pci_rc = ie31200_init_one(mci_pdev, &ie31200_pci_tbl[i]);
		if (pci_rc < 0) {
			edac_dbg(0, "ie31200 init fail\n");
@@ -645,12 +778,12 @@ static int __init ie31200_init(void)
			goto fail1;
		}
	}
	return 0;

	return 0;
fail1:
	pci_unregister_driver(&ie31200_driver);
fail0:
	pci_dev_put(mci_pdev);
fail0:
	pci_unregister_driver(&ie31200_driver);

	return pci_rc;
}
drivers/edac/igen6_edac.c
@@ -125,7 +125,7 @@
#define MEM_SLICE_HASH_MASK(v) (GET_BITFIELD(v, 6, 19) << 6)
#define MEM_SLICE_HASH_LSB_MASK_BIT(v) GET_BITFIELD(v, 24, 26)

static struct res_config {
static const struct res_config {
	bool machine_check;
	int num_imc;
	u32 imc_base;
@@ -472,7 +472,7 @@ static u64 rpl_p_err_addr(u64 ecclog)
	return ECC_ERROR_LOG_ADDR45(ecclog);
}

static struct res_config ehl_cfg = {
static const struct res_config ehl_cfg = {
	.num_imc = 1,
	.imc_base = 0x5000,
	.ibecc_base = 0xdc00,
@@ -482,7 +482,7 @@ static struct res_config ehl_cfg = {
	.err_addr_to_imc_addr = ehl_err_addr_to_imc_addr,
};

static struct res_config icl_cfg = {
static const struct res_config icl_cfg = {
	.num_imc = 1,
	.imc_base = 0x5000,
	.ibecc_base = 0xd800,
@@ -492,7 +492,7 @@ static struct res_config icl_cfg = {
	.err_addr_to_imc_addr = ehl_err_addr_to_imc_addr,
};

static struct res_config tgl_cfg = {
static const struct res_config tgl_cfg = {
	.machine_check = true,
	.num_imc = 2,
	.imc_base = 0x5000,
@@ -506,7 +506,7 @@ static struct res_config tgl_cfg = {
	.err_addr_to_imc_addr = tgl_err_addr_to_imc_addr,
};

static struct res_config adl_cfg = {
static const struct res_config adl_cfg = {
	.machine_check = true,
	.num_imc = 2,
	.imc_base = 0xd800,
@@ -517,7 +517,7 @@ static struct res_config adl_cfg = {
	.err_addr_to_imc_addr = adl_err_addr_to_imc_addr,
};

static struct res_config adl_n_cfg = {
static const struct res_config adl_n_cfg = {
	.machine_check = true,
	.num_imc = 1,
	.imc_base = 0xd800,
@@ -528,7 +528,7 @@ static struct res_config adl_n_cfg = {
	.err_addr_to_imc_addr = adl_err_addr_to_imc_addr,
};

static struct res_config rpl_p_cfg = {
static const struct res_config rpl_p_cfg = {
	.machine_check = true,
	.num_imc = 2,
	.imc_base = 0xd800,
@@ -540,7 +540,7 @@ static struct res_config rpl_p_cfg = {
	.err_addr_to_imc_addr = adl_err_addr_to_imc_addr,
};

static struct res_config mtl_ps_cfg = {
static const struct res_config mtl_ps_cfg = {
	.machine_check = true,
	.num_imc = 2,
	.imc_base = 0xd800,
@@ -551,7 +551,7 @@ static struct res_config mtl_ps_cfg = {
	.err_addr_to_imc_addr = adl_err_addr_to_imc_addr,
};

static struct res_config mtl_p_cfg = {
static const struct res_config mtl_p_cfg = {
	.machine_check = true,
	.num_imc = 2,
	.imc_base = 0xd800,
@@ -785,13 +785,22 @@ static u64 ecclog_read_and_clear(struct igen6_imc *imc)
{
	u64 ecclog = readq(imc->window + ECC_ERROR_LOG_OFFSET);

	if (ecclog & (ECC_ERROR_LOG_CE | ECC_ERROR_LOG_UE)) {
		/* Clear CE/UE bits by writing 1s */
		writeq(ecclog, imc->window + ECC_ERROR_LOG_OFFSET);
		return ecclog;
	}
	/*
	 * Quirk: The ECC_ERROR_LOG register of certain SoCs may contain
	 * the invalid value ~0. This will result in a flood of invalid
	 * error reports in polling mode. Skip it.
	 */
	if (ecclog == ~0)
		return 0;

	return 0;
	/* Neither a CE nor a UE. Skip it. */
	if (!(ecclog & (ECC_ERROR_LOG_CE | ECC_ERROR_LOG_UE)))
		return 0;

	/* Clear CE/UE bits by writing 1s */
	writeq(ecclog, imc->window + ECC_ERROR_LOG_OFFSET);

	return ecclog;
}

static void errsts_clear(struct igen6_imc *imc)
@@ -1374,7 +1383,7 @@ static void unregister_err_handler(void)
	unregister_nmi_handler(NMI_SERR, IGEN6_NMI_NAME);
}

static void opstate_set(struct res_config *cfg, const struct pci_device_id *ent)
static void opstate_set(const struct res_config *cfg, const struct pci_device_id *ent)
{
	/*
	 * Quirk: Certain SoCs' error reporting interrupts don't work.
drivers/edac/mem_repair.c (new file, executable, 359 lines)
@@ -0,0 +1,359 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* The generic EDAC memory repair driver is designed to control the memory
|
||||
* devices with memory repair features, such as Post Package Repair (PPR),
|
||||
* memory sparing etc. The common sysfs memory repair interface abstracts
|
||||
* the control of various arbitrary memory repair functionalities into a
|
||||
* unified set of functions.
|
||||
*
|
||||
* Copyright (c) 2024-2025 HiSilicon Limited.
|
||||
*/
|
||||
|
||||
#include <linux/edac.h>
|
||||
|
||||
enum edac_mem_repair_attributes {
|
||||
MR_TYPE,
|
||||
MR_PERSIST_MODE,
|
||||
MR_SAFE_IN_USE,
|
||||
MR_HPA,
|
||||
MR_MIN_HPA,
|
||||
MR_MAX_HPA,
|
||||
MR_DPA,
|
||||
MR_MIN_DPA,
|
||||
MR_MAX_DPA,
|
||||
MR_NIBBLE_MASK,
|
||||
MR_BANK_GROUP,
|
||||
MR_BANK,
|
||||
MR_RANK,
|
||||
MR_ROW,
|
||||
MR_COLUMN,
|
||||
MR_CHANNEL,
|
||||
MR_SUB_CHANNEL,
|
||||
MEM_DO_REPAIR,
|
||||
MR_MAX_ATTRS
|
||||
};
|
||||
|
||||
struct edac_mem_repair_dev_attr {
|
||||
struct device_attribute dev_attr;
|
||||
u8 instance;
|
||||
};
|
||||
|
||||
struct edac_mem_repair_context {
|
||||
char name[EDAC_FEAT_NAME_LEN];
|
||||
struct edac_mem_repair_dev_attr mem_repair_dev_attr[MR_MAX_ATTRS];
|
||||
struct attribute *mem_repair_attrs[MR_MAX_ATTRS + 1];
|
||||
struct attribute_group group;
|
||||
};
|
||||
|
||||
#define TO_MR_DEV_ATTR(_dev_attr) \
|
||||
container_of(_dev_attr, struct edac_mem_repair_dev_attr, dev_attr)
|
||||
|
||||
#define MR_ATTR_SHOW(attrib, cb, type, format) \
|
||||
static ssize_t attrib##_show(struct device *ras_feat_dev, \
|
||||
struct device_attribute *attr, char *buf) \
|
||||
{ \
|
||||
u8 inst = TO_MR_DEV_ATTR(attr)->instance; \
|
||||
struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \
|
||||
const struct edac_mem_repair_ops *ops = \
|
||||
ctx->mem_repair[inst].mem_repair_ops; \
|
||||
type data; \
|
||||
int ret; \
|
||||
\
|
||||
ret = ops->cb(ras_feat_dev->parent, ctx->mem_repair[inst].private, \
|
||||
&data); \
|
||||
if (ret) \
|
||||
return ret; \
|
||||
\
|
||||
return sysfs_emit(buf, format, data); \
|
||||
}
|
||||
|
||||
MR_ATTR_SHOW(repair_type, get_repair_type, const char *, "%s\n")
|
||||
MR_ATTR_SHOW(persist_mode, get_persist_mode, bool, "%u\n")
|
||||
MR_ATTR_SHOW(repair_safe_when_in_use, get_repair_safe_when_in_use, bool, "%u\n")
|
||||
MR_ATTR_SHOW(hpa, get_hpa, u64, "0x%llx\n")
|
||||
MR_ATTR_SHOW(min_hpa, get_min_hpa, u64, "0x%llx\n")
|
||||
MR_ATTR_SHOW(max_hpa, get_max_hpa, u64, "0x%llx\n")
|
||||
MR_ATTR_SHOW(dpa, get_dpa, u64, "0x%llx\n")
|
||||
MR_ATTR_SHOW(min_dpa, get_min_dpa, u64, "0x%llx\n")
|
||||
MR_ATTR_SHOW(max_dpa, get_max_dpa, u64, "0x%llx\n")
|
||||
MR_ATTR_SHOW(nibble_mask, get_nibble_mask, u32, "0x%x\n")
|
||||
MR_ATTR_SHOW(bank_group, get_bank_group, u32, "%u\n")
|
||||
MR_ATTR_SHOW(bank, get_bank, u32, "%u\n")
|
||||
MR_ATTR_SHOW(rank, get_rank, u32, "%u\n")
|
||||
MR_ATTR_SHOW(row, get_row, u32, "0x%x\n")
|
||||
MR_ATTR_SHOW(column, get_column, u32, "%u\n")
|
||||
MR_ATTR_SHOW(channel, get_channel, u32, "%u\n")
|
||||
MR_ATTR_SHOW(sub_channel, get_sub_channel, u32, "%u\n")
|
||||
|
||||
#define MR_ATTR_STORE(attrib, cb, type, conv_func) \
|
||||
static ssize_t attrib##_store(struct device *ras_feat_dev, \
|
||||
struct device_attribute *attr, \
|
||||
const char *buf, size_t len) \
|
||||
{ \
|
||||
u8 inst = TO_MR_DEV_ATTR(attr)->instance; \
|
||||
struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \
|
||||
const struct edac_mem_repair_ops *ops = \
|
||||
ctx->mem_repair[inst].mem_repair_ops; \
|
||||
type data; \
|
||||
int ret; \
|
||||
\
|
||||
ret = conv_func(buf, 0, &data); \
|
||||
if (ret < 0) \
|
||||
return ret; \
|
||||
\
|
||||
ret = ops->cb(ras_feat_dev->parent, ctx->mem_repair[inst].private, \
|
||||
data); \
|
||||
if (ret) \
|
||||
return ret; \
|
||||
\
|
||||
return len; \
|
||||
}
|
||||
|
||||
MR_ATTR_STORE(persist_mode, set_persist_mode, unsigned long, kstrtoul)
|
||||
MR_ATTR_STORE(hpa, set_hpa, u64, kstrtou64)
|
||||
MR_ATTR_STORE(dpa, set_dpa, u64, kstrtou64)
|
||||
MR_ATTR_STORE(nibble_mask, set_nibble_mask, unsigned long, kstrtoul)
|
||||
MR_ATTR_STORE(bank_group, set_bank_group, unsigned long, kstrtoul)
|
||||
MR_ATTR_STORE(bank, set_bank, unsigned long, kstrtoul)
|
||||
MR_ATTR_STORE(rank, set_rank, unsigned long, kstrtoul)
|
||||
MR_ATTR_STORE(row, set_row, unsigned long, kstrtoul)
|
||||
MR_ATTR_STORE(column, set_column, unsigned long, kstrtoul)
|
||||
MR_ATTR_STORE(channel, set_channel, unsigned long, kstrtoul)
|
||||
MR_ATTR_STORE(sub_channel, set_sub_channel, unsigned long, kstrtoul)
|
||||
|
||||
#define MR_DO_OP(attrib, cb) \
|
||||
static ssize_t attrib##_store(struct device *ras_feat_dev, \
|
||||
struct device_attribute *attr, \
|
||||
const char *buf, size_t len) \
|
||||
{ \
|
||||
u8 inst = TO_MR_DEV_ATTR(attr)->instance; \
|
||||
struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \
|
||||
const struct edac_mem_repair_ops *ops = ctx->mem_repair[inst].mem_repair_ops; \
|
||||
unsigned long data; \
|
||||
int ret; \
|
||||
\
|
||||
ret = kstrtoul(buf, 0, &data); \
|
||||
if (ret < 0) \
|
||||
return ret; \
|
||||
\
|
||||
ret = ops->cb(ras_feat_dev->parent, ctx->mem_repair[inst].private, data); \
|
||||
if (ret) \
|
||||
return ret; \
|
||||
\
|
||||
return len; \
|
||||
}
|
||||
|
||||
MR_DO_OP(repair, do_repair)
|
||||
|
||||
static umode_t mem_repair_attr_visible(struct kobject *kobj, struct attribute *a, int attr_id)
|
||||
{
|
||||
struct device *ras_feat_dev = kobj_to_dev(kobj);
|
||||
struct device_attribute *dev_attr = container_of(a, struct device_attribute, attr);
|
||||
struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
|
||||
u8 inst = TO_MR_DEV_ATTR(dev_attr)->instance;
|
||||
const struct edac_mem_repair_ops *ops = ctx->mem_repair[inst].mem_repair_ops;
|
||||
|
||||
switch (attr_id) {
|
||||
case MR_TYPE:
|
||||
if (ops->get_repair_type)
|
||||
return a->mode;
|
||||
break;
|
||||
case MR_PERSIST_MODE:
|
||||
if (ops->get_persist_mode) {
|
||||
if (ops->set_persist_mode)
|
||||
return a->mode;
|
||||
else
|
||||
return 0444;
|
||||
}
|
||||
break;
|
||||
case MR_SAFE_IN_USE:
|
||||
if (ops->get_repair_safe_when_in_use)
|
||||
return a->mode;
|
||||
break;
|
||||
case MR_HPA:
|
||||
if (ops->get_hpa) {
|
||||
if (ops->set_hpa)
|
||||
return a->mode;
|
||||
else
|
||||
return 0444;
|
||||
}
|
||||
break;
|
||||
case MR_MIN_HPA:
|
||||
if (ops->get_min_hpa)
|
||||
return a->mode;
|
||||
break;
|
||||
case MR_MAX_HPA:
|
||||
if (ops->get_max_hpa)
|
||||
return a->mode;
|
||||
break;
|
||||
case MR_DPA:
|
||||
if (ops->get_dpa) {
|
||||
if (ops->set_dpa)
|
||||
return a->mode;
|
||||
else
|
||||
return 0444;
|
||||
}
|
||||
break;
|
||||
case MR_MIN_DPA:
|
||||
if (ops->get_min_dpa)
|
||||
return a->mode;
|
||||
break;
|
||||
case MR_MAX_DPA:
|
||||
if (ops->get_max_dpa)
|
||||
return a->mode;
|
||||
break;
|
||||
case MR_NIBBLE_MASK:
|
||||
if (ops->get_nibble_mask) {
|
||||
if (ops->set_nibble_mask)
|
||||
return a->mode;
|
||||
else
|
||||
return 0444;
|
||||
}
|
||||
break;
|
||||
case MR_BANK_GROUP:
|
||||
if (ops->get_bank_group) {
|
||||
if (ops->set_bank_group)
|
||||
return a->mode;
|
||||
else
|
||||
return 0444;
|
||||
}
|
||||
break;
|
||||
case MR_BANK:
|
||||
if (ops->get_bank) {
|
||||
if (ops->set_bank)
|
||||
return a->mode;
|
||||
else
|
||||
return 0444;
|
||||
}
|
||||
break;
|
||||
case MR_RANK:
|
||||
if (ops->get_rank) {
|
||||
if (ops->set_rank)
|
||||
				return a->mode;
			else
				return 0444;
		}
		break;
	case MR_ROW:
		if (ops->get_row) {
			if (ops->set_row)
				return a->mode;
			else
				return 0444;
		}
		break;
	case MR_COLUMN:
		if (ops->get_column) {
			if (ops->set_column)
				return a->mode;
			else
				return 0444;
		}
		break;
	case MR_CHANNEL:
		if (ops->get_channel) {
			if (ops->set_channel)
				return a->mode;
			else
				return 0444;
		}
		break;
	case MR_SUB_CHANNEL:
		if (ops->get_sub_channel) {
			if (ops->set_sub_channel)
				return a->mode;
			else
				return 0444;
		}
		break;
	case MEM_DO_REPAIR:
		if (ops->do_repair)
			return a->mode;
		break;
	default:
		break;
	}

	return 0;
}

#define MR_ATTR_RO(_name, _instance) \
	((struct edac_mem_repair_dev_attr) { .dev_attr = __ATTR_RO(_name), \
					     .instance = _instance })

#define MR_ATTR_WO(_name, _instance) \
	((struct edac_mem_repair_dev_attr) { .dev_attr = __ATTR_WO(_name), \
					     .instance = _instance })

#define MR_ATTR_RW(_name, _instance) \
	((struct edac_mem_repair_dev_attr) { .dev_attr = __ATTR_RW(_name), \
					     .instance = _instance })

static int mem_repair_create_desc(struct device *dev,
				  const struct attribute_group **attr_groups,
				  u8 instance)
{
	struct edac_mem_repair_context *ctx;
	struct attribute_group *group;
	int i;
	struct edac_mem_repair_dev_attr dev_attr[] = {
		[MR_TYPE] = MR_ATTR_RO(repair_type, instance),
		[MR_PERSIST_MODE] = MR_ATTR_RW(persist_mode, instance),
		[MR_SAFE_IN_USE] = MR_ATTR_RO(repair_safe_when_in_use, instance),
		[MR_HPA] = MR_ATTR_RW(hpa, instance),
		[MR_MIN_HPA] = MR_ATTR_RO(min_hpa, instance),
		[MR_MAX_HPA] = MR_ATTR_RO(max_hpa, instance),
		[MR_DPA] = MR_ATTR_RW(dpa, instance),
		[MR_MIN_DPA] = MR_ATTR_RO(min_dpa, instance),
		[MR_MAX_DPA] = MR_ATTR_RO(max_dpa, instance),
		[MR_NIBBLE_MASK] = MR_ATTR_RW(nibble_mask, instance),
		[MR_BANK_GROUP] = MR_ATTR_RW(bank_group, instance),
		[MR_BANK] = MR_ATTR_RW(bank, instance),
		[MR_RANK] = MR_ATTR_RW(rank, instance),
		[MR_ROW] = MR_ATTR_RW(row, instance),
		[MR_COLUMN] = MR_ATTR_RW(column, instance),
		[MR_CHANNEL] = MR_ATTR_RW(channel, instance),
		[MR_SUB_CHANNEL] = MR_ATTR_RW(sub_channel, instance),
		[MEM_DO_REPAIR] = MR_ATTR_WO(repair, instance)
	};

	ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	for (i = 0; i < MR_MAX_ATTRS; i++) {
		memcpy(&ctx->mem_repair_dev_attr[i],
		       &dev_attr[i], sizeof(dev_attr[i]));
		ctx->mem_repair_attrs[i] =
			&ctx->mem_repair_dev_attr[i].dev_attr.attr;
	}

	sprintf(ctx->name, "%s%d", "mem_repair", instance);
	group = &ctx->group;
	group->name = ctx->name;
	group->attrs = ctx->mem_repair_attrs;
	group->is_visible = mem_repair_attr_visible;
	attr_groups[0] = group;

	return 0;
}

/**
 * edac_mem_repair_get_desc - get EDAC memory repair descriptors
 * @dev: client device with memory repair feature
 * @attr_groups: pointer to attribute group container
 * @instance: device's memory repair instance number.
 *
 * Return:
 * * %0	- Success.
 * * %-EINVAL	- Invalid parameters passed.
 * * %-ENOMEM	- Dynamic memory allocation failed.
 */
int edac_mem_repair_get_desc(struct device *dev,
			     const struct attribute_group **attr_groups, u8 instance)
{
	if (!dev || !attr_groups)
		return -EINVAL;

	return mem_repair_create_desc(dev, attr_groups, instance);
}
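For orientation, here is a hedged sketch of the provider side (illustrative only: the my_* names, the staging struct, and the comments are hypothetical, not from an in-tree driver). Supplying get_dpa/set_dpa makes the dpa attribute visible read-write, do_repair enables the write-only repair attribute, and attributes without a getter stay hidden:

struct my_repair_state {		/* hypothetical driver bookkeeping */
	u64 dpa;
};

static int my_get_dpa(struct device *dev, void *drv_data, u64 *dpa)
{
	struct my_repair_state *st = drv_data;

	*dpa = st->dpa;
	return 0;
}

static int my_set_dpa(struct device *dev, void *drv_data, u64 dpa)
{
	struct my_repair_state *st = drv_data;

	st->dpa = dpa;			/* stage the address to repair */
	return 0;
}

static int my_do_repair(struct device *dev, void *drv_data, u32 val)
{
	if (val != EDAC_DO_MEM_REPAIR)
		return -EINVAL;

	/* issue the device-specific repair command for the staged DPA */
	return 0;
}

static const struct edac_mem_repair_ops my_repair_ops = {
	.get_dpa   = my_get_dpa,
	.set_dpa   = my_set_dpa,
	.do_repair = my_do_repair,
};

static const struct edac_dev_feature my_repair_feat = {
	.ft_type	= RAS_FEAT_MEM_REPAIR,
	.instance	= 0,
	.mem_repair_ops	= &my_repair_ops,
	/* .ctx presumably carries the private pointer passed back as drv_data */
};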

drivers/edac/pnd2_edac.c:
@@ -372,7 +372,7 @@ static int gen_asym_mask(struct b_cr_slice_channel_hash *p,
			 struct b_cr_asym_mem_region1_mchbar *as1,
			 struct b_cr_asym_2way_mem_region_mchbar *as2way)
{
-	const int intlv[] = { 0x5, 0xA, 0x3, 0xC };
+	static const int intlv[] = { 0x5, 0xA, 0x3, 0xC };
	int mask = 0;

	if (as2way->asym_2way_interleave_enable)
@@ -489,7 +489,7 @@ static int dnv_get_registers(void)
 */
static int get_registers(void)
{
-	const int intlv[] = { 10, 11, 12, 12 };
+	static const int intlv[] = { 10, 11, 12, 12 };

	if (RD_REG(&tolud, b_cr_tolud_pci) ||
	    RD_REG(&touud_lo, b_cr_touud_lo_pci) ||

drivers/edac/scrub.c (new executable file, 209 lines):
@@ -0,0 +1,209 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * The generic EDAC scrub driver controls the memory scrubbers in the
 * system. The common sysfs scrub interface abstracts the control of
 * various arbitrary scrubbing functionalities into a unified set of
 * functions.
 *
 * Copyright (c) 2024-2025 HiSilicon Limited.
 */

#include <linux/edac.h>

enum edac_scrub_attributes {
	SCRUB_ADDRESS,
	SCRUB_SIZE,
	SCRUB_ENABLE_BACKGROUND,
	SCRUB_MIN_CYCLE_DURATION,
	SCRUB_MAX_CYCLE_DURATION,
	SCRUB_CUR_CYCLE_DURATION,
	SCRUB_MAX_ATTRS
};

struct edac_scrub_dev_attr {
	struct device_attribute dev_attr;
	u8 instance;
};

struct edac_scrub_context {
	char name[EDAC_FEAT_NAME_LEN];
	struct edac_scrub_dev_attr scrub_dev_attr[SCRUB_MAX_ATTRS];
	struct attribute *scrub_attrs[SCRUB_MAX_ATTRS + 1];
	struct attribute_group group;
};

#define TO_SCRUB_DEV_ATTR(_dev_attr) \
	container_of(_dev_attr, struct edac_scrub_dev_attr, dev_attr)

#define EDAC_SCRUB_ATTR_SHOW(attrib, cb, type, format)			\
static ssize_t attrib##_show(struct device *ras_feat_dev,		\
			     struct device_attribute *attr, char *buf)	\
{									\
	u8 inst = TO_SCRUB_DEV_ATTR(attr)->instance;			\
	struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);	\
	const struct edac_scrub_ops *ops = ctx->scrub[inst].scrub_ops;	\
	type data;							\
	int ret;							\
									\
	ret = ops->cb(ras_feat_dev->parent, ctx->scrub[inst].private, &data); \
	if (ret)							\
		return ret;						\
									\
	return sysfs_emit(buf, format, data);				\
}

EDAC_SCRUB_ATTR_SHOW(addr, read_addr, u64, "0x%llx\n")
EDAC_SCRUB_ATTR_SHOW(size, read_size, u64, "0x%llx\n")
EDAC_SCRUB_ATTR_SHOW(enable_background, get_enabled_bg, bool, "%u\n")
EDAC_SCRUB_ATTR_SHOW(min_cycle_duration, get_min_cycle, u32, "%u\n")
EDAC_SCRUB_ATTR_SHOW(max_cycle_duration, get_max_cycle, u32, "%u\n")
EDAC_SCRUB_ATTR_SHOW(current_cycle_duration, get_cycle_duration, u32, "%u\n")
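Since the token pasting can be hard to read, this is roughly what the first invocation above expands to (hand-expanded for illustration, not part of the file):

static ssize_t addr_show(struct device *ras_feat_dev,
			 struct device_attribute *attr, char *buf)
{
	u8 inst = TO_SCRUB_DEV_ATTR(attr)->instance;
	struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
	const struct edac_scrub_ops *ops = ctx->scrub[inst].scrub_ops;
	u64 data;
	int ret;

	/* forward the sysfs read to the driver's read_addr() callback */
	ret = ops->read_addr(ras_feat_dev->parent, ctx->scrub[inst].private, &data);
	if (ret)
		return ret;

	return sysfs_emit(buf, "0x%llx\n", data);
}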

#define EDAC_SCRUB_ATTR_STORE(attrib, cb, type, conv_func)		\
static ssize_t attrib##_store(struct device *ras_feat_dev,		\
			      struct device_attribute *attr,		\
			      const char *buf, size_t len)		\
{									\
	u8 inst = TO_SCRUB_DEV_ATTR(attr)->instance;			\
	struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);	\
	const struct edac_scrub_ops *ops = ctx->scrub[inst].scrub_ops;	\
	type data;							\
	int ret;							\
									\
	ret = conv_func(buf, 0, &data);					\
	if (ret < 0)							\
		return ret;						\
									\
	ret = ops->cb(ras_feat_dev->parent, ctx->scrub[inst].private, data); \
	if (ret)							\
		return ret;						\
									\
	return len;							\
}

EDAC_SCRUB_ATTR_STORE(addr, write_addr, u64, kstrtou64)
EDAC_SCRUB_ATTR_STORE(size, write_size, u64, kstrtou64)
EDAC_SCRUB_ATTR_STORE(enable_background, set_enabled_bg, unsigned long, kstrtoul)
EDAC_SCRUB_ATTR_STORE(current_cycle_duration, set_cycle_duration, unsigned long, kstrtoul)

static umode_t scrub_attr_visible(struct kobject *kobj, struct attribute *a, int attr_id)
{
	struct device *ras_feat_dev = kobj_to_dev(kobj);
	struct device_attribute *dev_attr = container_of(a, struct device_attribute, attr);
	u8 inst = TO_SCRUB_DEV_ATTR(dev_attr)->instance;
	struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
	const struct edac_scrub_ops *ops = ctx->scrub[inst].scrub_ops;

	switch (attr_id) {
	case SCRUB_ADDRESS:
		if (ops->read_addr) {
			if (ops->write_addr)
				return a->mode;
			else
				return 0444;
		}
		break;
	case SCRUB_SIZE:
		if (ops->read_size) {
			if (ops->write_size)
				return a->mode;
			else
				return 0444;
		}
		break;
	case SCRUB_ENABLE_BACKGROUND:
		if (ops->get_enabled_bg) {
			if (ops->set_enabled_bg)
				return a->mode;
			else
				return 0444;
		}
		break;
	case SCRUB_MIN_CYCLE_DURATION:
		if (ops->get_min_cycle)
			return a->mode;
		break;
	case SCRUB_MAX_CYCLE_DURATION:
		if (ops->get_max_cycle)
			return a->mode;
		break;
	case SCRUB_CUR_CYCLE_DURATION:
		if (ops->get_cycle_duration) {
			if (ops->set_cycle_duration)
				return a->mode;
			else
				return 0444;
		}
		break;
	default:
		break;
	}

	return 0;
}

#define EDAC_SCRUB_ATTR_RO(_name, _instance) \
	((struct edac_scrub_dev_attr) { .dev_attr = __ATTR_RO(_name), \
					.instance = _instance })

#define EDAC_SCRUB_ATTR_WO(_name, _instance) \
	((struct edac_scrub_dev_attr) { .dev_attr = __ATTR_WO(_name), \
					.instance = _instance })

#define EDAC_SCRUB_ATTR_RW(_name, _instance) \
	((struct edac_scrub_dev_attr) { .dev_attr = __ATTR_RW(_name), \
					.instance = _instance })

static int scrub_create_desc(struct device *scrub_dev,
			     const struct attribute_group **attr_groups, u8 instance)
{
	struct edac_scrub_context *scrub_ctx;
	struct attribute_group *group;
	int i;
	struct edac_scrub_dev_attr dev_attr[] = {
		[SCRUB_ADDRESS] = EDAC_SCRUB_ATTR_RW(addr, instance),
		[SCRUB_SIZE] = EDAC_SCRUB_ATTR_RW(size, instance),
		[SCRUB_ENABLE_BACKGROUND] = EDAC_SCRUB_ATTR_RW(enable_background, instance),
		[SCRUB_MIN_CYCLE_DURATION] = EDAC_SCRUB_ATTR_RO(min_cycle_duration, instance),
		[SCRUB_MAX_CYCLE_DURATION] = EDAC_SCRUB_ATTR_RO(max_cycle_duration, instance),
		[SCRUB_CUR_CYCLE_DURATION] = EDAC_SCRUB_ATTR_RW(current_cycle_duration, instance)
	};

	scrub_ctx = devm_kzalloc(scrub_dev, sizeof(*scrub_ctx), GFP_KERNEL);
	if (!scrub_ctx)
		return -ENOMEM;

	group = &scrub_ctx->group;
	for (i = 0; i < SCRUB_MAX_ATTRS; i++) {
		memcpy(&scrub_ctx->scrub_dev_attr[i], &dev_attr[i], sizeof(dev_attr[i]));
		scrub_ctx->scrub_attrs[i] = &scrub_ctx->scrub_dev_attr[i].dev_attr.attr;
	}
	sprintf(scrub_ctx->name, "%s%d", "scrub", instance);
	group->name = scrub_ctx->name;
	group->attrs = scrub_ctx->scrub_attrs;
	group->is_visible = scrub_attr_visible;

	attr_groups[0] = group;

	return 0;
}

/**
 * edac_scrub_get_desc - get EDAC scrub descriptors
 * @scrub_dev: client device, with scrub support
 * @attr_groups: pointer to attribute group container
 * @instance: device's scrub instance number.
 *
 * Return:
 * * %0	- Success.
 * * %-EINVAL	- Invalid parameters passed.
 * * %-ENOMEM	- Dynamic memory allocation failed.
 */
int edac_scrub_get_desc(struct device *scrub_dev,
			const struct attribute_group **attr_groups, u8 instance)
{
	if (!scrub_dev || !attr_groups)
		return -EINVAL;

	return scrub_create_desc(scrub_dev, attr_groups, instance);
}
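As a registration sketch (hypothetical names and values, not an in-tree driver): only get_cycle_duration is provided here, so scrub_attr_visible() above will expose current_cycle_duration read-only in the resulting "scrub0" attribute group, and the other scrub attributes stay hidden:

static int my_get_cycle(struct device *dev, void *drv_data, u32 *cycle)
{
	*cycle = 3600;	/* illustrative fixed cycle, in seconds */
	return 0;
}

static const struct edac_scrub_ops my_scrub_ops = {
	.get_cycle_duration = my_get_cycle,
};

static const struct edac_dev_feature my_scrub_feat = {
	.ft_type	= RAS_FEAT_SCRUB,
	.instance	= 0,
	.scrub_ops	= &my_scrub_ops,
};

/* in probe: edac_dev_register(parent, "my_ras_dev", NULL, 1, &my_scrub_feat); */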

drivers/edac/skx_common.c:
@@ -121,6 +121,35 @@ void skx_adxl_put(void)
}
EXPORT_SYMBOL_GPL(skx_adxl_put);

+static void skx_init_mc_mapping(struct skx_dev *d)
+{
+	/*
+	 * By default, the BIOS presents all memory controllers within each
+	 * socket to the EDAC driver. The physical indices are the same as
+	 * the logical indices of the memory controllers enumerated by the
+	 * EDAC driver.
+	 */
+	for (int i = 0; i < NUM_IMC; i++)
+		d->mc_mapping[i] = i;
+}
+
+void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc)
+{
+	edac_dbg(0, "Set the mapping of mc phy idx to logical idx: %02d -> %02d\n",
+		 pmc, lmc);
+
+	d->mc_mapping[pmc] = lmc;
+}
+EXPORT_SYMBOL_GPL(skx_set_mc_mapping);
+
+static u8 skx_get_mc_mapping(struct skx_dev *d, u8 pmc)
+{
+	edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n",
+		 pmc, d->mc_mapping[pmc]);
+
+	return d->mc_mapping[pmc];
+}
+
static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
{
	struct skx_dev *d;
@@ -188,6 +217,8 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
		return false;
	}

+	res->imc = skx_get_mc_mapping(d, res->imc);
+
	for (i = 0; i < adxl_component_count; i++) {
		if (adxl_values[i] == ~0x0ull)
			continue;
@@ -326,6 +357,8 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list)
			 d->bus[0], d->bus[1], d->bus[2], d->bus[3]);
		list_add_tail(&d->list, &dev_edac_list);
		prev = pdev;
+
+		skx_init_mc_mapping(d);
	}

	if (list)
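A worked example of the mapping (hypothetical numbers, not from the commit): if the BIOS hides physical memory controller 1 on a socket, the driver enumerates physical controller 2 as logical controller 1 and records that with skx_set_mc_mapping(d, 2, 1). When ADXL later decodes an error address against physical index 2, skx_get_mc_mapping() translates it back to logical index 1, so the error is attributed to the mem_ctl_info that is actually registered.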

drivers/edac/skx_common.h:
@@ -93,6 +93,16 @@ struct skx_dev {
	struct pci_dev *uracu;		/* for i10nm CPU */
	struct pci_dev *pcu_cr3;	/* for HBM memory detection */
	u32 mcroute;
+	/*
+	 * Some server BIOS may hide certain memory controllers, and the
+	 * EDAC driver skips those hidden memory controllers. However, the
+	 * ADXL still decodes memory error addresses using physical memory
+	 * controller indices. The mapping table is used to convert the
+	 * physical indices (reported by ADXL) to the logical indices
+	 * (used by the EDAC driver) of present memory controllers during
+	 * the error handling process.
+	 */
+	u8 mc_mapping[NUM_IMC];
	struct skx_imc {
		struct mem_ctl_info *mci;
		struct pci_dev *mdev;	/* for i10nm CPU */
@@ -242,6 +252,7 @@ void skx_adxl_put(void);
void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log);
void skx_set_mem_cfg(bool mem_cfg_2lm);
void skx_set_res_cfg(struct res_config *cfg);
+void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc);

int skx_get_src_id(struct skx_dev *d, int off, u8 *id);

drivers/edac/xgene_edac.c:
@@ -15,6 +15,7 @@
#include <linux/of.h>
#include <linux/of_address.h>
#include <linux/regmap.h>
+#include <linux/string_choices.h>

#include "edac_module.h"

@@ -1407,7 +1408,7 @@ static void xgene_edac_iob_gic_report(struct edac_device_ctl_info *edac_dev)
		dev_err(edac_dev->dev, "Multiple XGIC write size error\n");
	info = readl(ctx->dev_csr + XGICTRANSERRREQINFO);
	dev_err(edac_dev->dev, "XGIC %s access @ 0x%08X (0x%08X)\n",
-		info & REQTYPE_MASK ? "read" : "write", ERRADDR_RD(info),
+		str_read_write(info & REQTYPE_MASK), ERRADDR_RD(info),
		info);
	writel(reg, ctx->dev_csr + XGICTRANSERRINTSTS);

@@ -1489,19 +1490,19 @@ static void xgene_edac_rb_report(struct edac_device_ctl_info *edac_dev)
	if (reg & AGENT_OFFLINE_ERR_MASK)
		dev_err(edac_dev->dev,
			"IOB bus %s access to offline agent error\n",
-			write ? "write" : "read");
+			str_write_read(write));
	if (reg & UNIMPL_RBPAGE_ERR_MASK)
		dev_err(edac_dev->dev,
			"IOB bus %s access to unimplemented page error\n",
-			write ? "write" : "read");
+			str_write_read(write));
	if (reg & WORD_ALIGNED_ERR_MASK)
		dev_err(edac_dev->dev,
			"IOB bus %s word aligned access error\n",
-			write ? "write" : "read");
+			str_write_read(write));
	if (reg & PAGE_ACCESS_ERR_MASK)
		dev_err(edac_dev->dev,
			"IOB bus %s to page out of range access error\n",
-			write ? "write" : "read");
+			str_write_read(write));
	if (regmap_write(ctx->edac->rb_map, RBEIR, 0))
		return;
	if (regmap_write(ctx->edac->rb_map, RBCSR, 0))
@@ -1560,7 +1561,7 @@ rb_skip:
	err_addr_lo = readl(ctx->dev_csr + IOBBATRANSERRREQINFOL);
	err_addr_hi = readl(ctx->dev_csr + IOBBATRANSERRREQINFOH);
	dev_err(edac_dev->dev, "IOB BA %s access at 0x%02X.%08X (0x%08X)\n",
-		REQTYPE_F2_RD(err_addr_hi) ? "read" : "write",
+		str_read_write(REQTYPE_F2_RD(err_addr_hi)),
		ERRADDRH_F2_RD(err_addr_hi), err_addr_lo, err_addr_hi);
	if (reg & WRERR_RESP_MASK)
		dev_err(edac_dev->dev, "IOB BA requestor ID 0x%08X\n",
@@ -1611,7 +1612,7 @@ chk_iob_axi0:
	dev_err(edac_dev->dev,
		"%sAXI slave 0 illegal %s access @ 0x%02X.%08X (0x%08X)\n",
		reg & IOBAXIS0_M_ILLEGAL_ACCESS_MASK ? "Multiple " : "",
-		REQTYPE_RD(err_addr_hi) ? "read" : "write",
+		str_read_write(REQTYPE_RD(err_addr_hi)),
		ERRADDRH_RD(err_addr_hi), err_addr_lo, err_addr_hi);
	writel(reg, ctx->dev_csr + IOBAXIS0TRANSERRINTSTS);

@@ -1625,7 +1626,7 @@ chk_iob_axi1:
	dev_err(edac_dev->dev,
		"%sAXI slave 1 illegal %s access @ 0x%02X.%08X (0x%08X)\n",
		reg & IOBAXIS0_M_ILLEGAL_ACCESS_MASK ? "Multiple " : "",
-		REQTYPE_RD(err_addr_hi) ? "read" : "write",
+		str_read_write(REQTYPE_RD(err_addr_hi)),
		ERRADDRH_RD(err_addr_hi), err_addr_lo, err_addr_hi);
	writel(reg, ctx->dev_csr + IOBAXIS1TRANSERRINTSTS);
}
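For reference on the conversions above: the helpers come from <linux/string_choices.h>, where str_read_write(v) yields "read" for a true value and "write" otherwise, and str_write_read(v) is the inverted variant, so each converted dev_err() keeps the wording it had before.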

include/linux/edac.h:
@@ -661,4 +661,219 @@ static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci,

	return mci->dimms[index];
}

#define EDAC_FEAT_NAME_LEN	128

/* RAS feature type */
enum edac_dev_feat {
	RAS_FEAT_SCRUB,
	RAS_FEAT_ECS,
	RAS_FEAT_MEM_REPAIR,
	RAS_FEAT_MAX
};

/**
 * struct edac_scrub_ops - scrub device operations (all elements optional)
 * @read_addr: read base address of the scrubbing range.
 * @read_size: read size of the scrubbing range.
 * @write_addr: set base address of the scrubbing range.
 * @write_size: set size of the scrubbing range.
 * @get_enabled_bg: check if currently performing background scrub.
 * @set_enabled_bg: start or stop a bg-scrub.
 * @get_min_cycle: get minimum supported scrub cycle duration in seconds.
 * @get_max_cycle: get maximum supported scrub cycle duration in seconds.
 * @get_cycle_duration: get current scrub cycle duration in seconds.
 * @set_cycle_duration: set current scrub cycle duration in seconds.
 */
struct edac_scrub_ops {
	int (*read_addr)(struct device *dev, void *drv_data, u64 *base);
	int (*read_size)(struct device *dev, void *drv_data, u64 *size);
	int (*write_addr)(struct device *dev, void *drv_data, u64 base);
	int (*write_size)(struct device *dev, void *drv_data, u64 size);
	int (*get_enabled_bg)(struct device *dev, void *drv_data, bool *enable);
	int (*set_enabled_bg)(struct device *dev, void *drv_data, bool enable);
	int (*get_min_cycle)(struct device *dev, void *drv_data, u32 *min);
	int (*get_max_cycle)(struct device *dev, void *drv_data, u32 *max);
	int (*get_cycle_duration)(struct device *dev, void *drv_data, u32 *cycle);
	int (*set_cycle_duration)(struct device *dev, void *drv_data, u32 cycle);
};

#if IS_ENABLED(CONFIG_EDAC_SCRUB)
int edac_scrub_get_desc(struct device *scrub_dev,
			const struct attribute_group **attr_groups,
			u8 instance);
#else
static inline int edac_scrub_get_desc(struct device *scrub_dev,
				      const struct attribute_group **attr_groups,
				      u8 instance)
{ return -EOPNOTSUPP; }
#endif /* CONFIG_EDAC_SCRUB */

/**
 * struct edac_ecs_ops - ECS device operations (all elements optional)
 * @get_log_entry_type: read the log entry type value.
 * @set_log_entry_type: set the log entry type value.
 * @get_mode: read the mode value.
 * @set_mode: set the mode value.
 * @reset: reset the ECS counter.
 * @get_threshold: read the threshold count per gigabits of memory cells.
 * @set_threshold: set the threshold count per gigabits of memory cells.
 */
struct edac_ecs_ops {
	int (*get_log_entry_type)(struct device *dev, void *drv_data, int fru_id, u32 *val);
	int (*set_log_entry_type)(struct device *dev, void *drv_data, int fru_id, u32 val);
	int (*get_mode)(struct device *dev, void *drv_data, int fru_id, u32 *val);
	int (*set_mode)(struct device *dev, void *drv_data, int fru_id, u32 val);
	int (*reset)(struct device *dev, void *drv_data, int fru_id, u32 val);
	int (*get_threshold)(struct device *dev, void *drv_data, int fru_id, u32 *threshold);
	int (*set_threshold)(struct device *dev, void *drv_data, int fru_id, u32 threshold);
};

struct edac_ecs_ex_info {
	u16 num_media_frus;
};

#if IS_ENABLED(CONFIG_EDAC_ECS)
int edac_ecs_get_desc(struct device *ecs_dev,
		      const struct attribute_group **attr_groups,
		      u16 num_media_frus);
#else
static inline int edac_ecs_get_desc(struct device *ecs_dev,
				    const struct attribute_group **attr_groups,
				    u16 num_media_frus)
{ return -EOPNOTSUPP; }
#endif /* CONFIG_EDAC_ECS */
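The ECS feature follows the same registration pattern; a hedged sketch (hypothetical names, only one callback filled in), where ecs_info tells the EDAC core how many media FRUs to create per-FRU attributes for:

static int my_ecs_get_mode(struct device *dev, void *drv_data, int fru_id, u32 *val)
{
	*val = 0;	/* illustrative: report mode 0 for every FRU */
	return 0;
}

static const struct edac_ecs_ops my_ecs_ops = {
	.get_mode = my_ecs_get_mode,
};

static const struct edac_dev_feature my_ecs_feat = {
	.ft_type	= RAS_FEAT_ECS,
	.ecs_ops	= &my_ecs_ops,
	.ecs_info	= { .num_media_frus = 2 },	/* hypothetical: two FRUs */
};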

enum edac_mem_repair_type {
	EDAC_REPAIR_MAX
};

enum edac_mem_repair_cmd {
	EDAC_DO_MEM_REPAIR = 1,
};

/**
 * struct edac_mem_repair_ops - memory repair operations
 * (all elements are optional except do_repair and at least one of set_hpa/set_dpa)
 * @get_repair_type: get the memory repair type, listed in
 *		     enum edac_mem_repair_type.
 * @get_persist_mode: get the current persist mode.
 *		      false - Soft repair type (temporary repair).
 *		      true - Hard memory repair type (permanent repair).
 * @set_persist_mode: set the persist mode of the memory repair instance.
 * @get_repair_safe_when_in_use: get whether memory media is accessible and
 *				 data is retained during repair operation.
 * @get_hpa: get current host physical address (HPA) of memory to repair.
 * @set_hpa: set host physical address (HPA) of memory to repair.
 * @get_min_hpa: get the minimum supported host physical address (HPA).
 * @get_max_hpa: get the maximum supported host physical address (HPA).
 * @get_dpa: get current device physical address (DPA) of memory to repair.
 * @set_dpa: set device physical address (DPA) of memory to repair.
 *	     In some states of system configuration (e.g. before address decoders
 *	     have been configured), memory devices (e.g. CXL) may not have an
 *	     active mapping in the host physical address map. As such, the memory
 *	     to repair must be identified by a device specific physical addressing
 *	     scheme using a device physical address (DPA). The DPA and other
 *	     control attributes to use for the repair operation will be presented
 *	     in related error records.
 * @get_min_dpa: get the minimum supported device physical address (DPA).
 * @get_max_dpa: get the maximum supported device physical address (DPA).
 * @get_nibble_mask: get current nibble mask of memory to repair.
 * @set_nibble_mask: set nibble mask of memory to repair.
 * @get_bank_group: get current bank group of memory to repair.
 * @set_bank_group: set bank group of memory to repair.
 * @get_bank: get current bank of memory to repair.
 * @set_bank: set bank of memory to repair.
 * @get_rank: get current rank of memory to repair.
 * @set_rank: set rank of memory to repair.
 * @get_row: get current row of memory to repair.
 * @set_row: set row of memory to repair.
 * @get_column: get current column of memory to repair.
 * @set_column: set column of memory to repair.
 * @get_channel: get current channel of memory to repair.
 * @set_channel: set channel of memory to repair.
 * @get_sub_channel: get current subchannel of memory to repair.
 * @set_sub_channel: set subchannel of memory to repair.
 * @do_repair: issue the memory repair operation for the HPA/DPA and
 *	       other control attributes set for the memory to repair.
 *
 * All elements are optional except do_repair and at least one of set_hpa/set_dpa.
 */
struct edac_mem_repair_ops {
	int (*get_repair_type)(struct device *dev, void *drv_data, const char **type);
	int (*get_persist_mode)(struct device *dev, void *drv_data, bool *persist);
	int (*set_persist_mode)(struct device *dev, void *drv_data, bool persist);
	int (*get_repair_safe_when_in_use)(struct device *dev, void *drv_data, bool *safe);
	int (*get_hpa)(struct device *dev, void *drv_data, u64 *hpa);
	int (*set_hpa)(struct device *dev, void *drv_data, u64 hpa);
	int (*get_min_hpa)(struct device *dev, void *drv_data, u64 *hpa);
	int (*get_max_hpa)(struct device *dev, void *drv_data, u64 *hpa);
	int (*get_dpa)(struct device *dev, void *drv_data, u64 *dpa);
	int (*set_dpa)(struct device *dev, void *drv_data, u64 dpa);
	int (*get_min_dpa)(struct device *dev, void *drv_data, u64 *dpa);
	int (*get_max_dpa)(struct device *dev, void *drv_data, u64 *dpa);
	int (*get_nibble_mask)(struct device *dev, void *drv_data, u32 *val);
	int (*set_nibble_mask)(struct device *dev, void *drv_data, u32 val);
	int (*get_bank_group)(struct device *dev, void *drv_data, u32 *val);
	int (*set_bank_group)(struct device *dev, void *drv_data, u32 val);
	int (*get_bank)(struct device *dev, void *drv_data, u32 *val);
	int (*set_bank)(struct device *dev, void *drv_data, u32 val);
	int (*get_rank)(struct device *dev, void *drv_data, u32 *val);
	int (*set_rank)(struct device *dev, void *drv_data, u32 val);
	int (*get_row)(struct device *dev, void *drv_data, u32 *val);
	int (*set_row)(struct device *dev, void *drv_data, u32 val);
	int (*get_column)(struct device *dev, void *drv_data, u32 *val);
	int (*set_column)(struct device *dev, void *drv_data, u32 val);
	int (*get_channel)(struct device *dev, void *drv_data, u32 *val);
	int (*set_channel)(struct device *dev, void *drv_data, u32 val);
	int (*get_sub_channel)(struct device *dev, void *drv_data, u32 *val);
	int (*set_sub_channel)(struct device *dev, void *drv_data, u32 val);
	int (*do_repair)(struct device *dev, void *drv_data, u32 val);
};

#if IS_ENABLED(CONFIG_EDAC_MEM_REPAIR)
int edac_mem_repair_get_desc(struct device *dev,
			     const struct attribute_group **attr_groups,
			     u8 instance);
#else
static inline int edac_mem_repair_get_desc(struct device *dev,
					   const struct attribute_group **attr_groups,
					   u8 instance)
{ return -EOPNOTSUPP; }
#endif /* CONFIG_EDAC_MEM_REPAIR */

/* EDAC device feature information structure */
struct edac_dev_data {
	union {
		const struct edac_scrub_ops *scrub_ops;
		const struct edac_ecs_ops *ecs_ops;
		const struct edac_mem_repair_ops *mem_repair_ops;
	};
	u8 instance;
	void *private;
};

struct edac_dev_feat_ctx {
	struct device dev;
	void *private;
	struct edac_dev_data *scrub;
	struct edac_dev_data ecs;
	struct edac_dev_data *mem_repair;
};

struct edac_dev_feature {
	enum edac_dev_feat ft_type;
	u8 instance;
	union {
		const struct edac_scrub_ops *scrub_ops;
		const struct edac_ecs_ops *ecs_ops;
		const struct edac_mem_repair_ops *mem_repair_ops;
	};
	void *ctx;
	struct edac_ecs_ex_info ecs_info;
};

int edac_dev_register(struct device *parent, char *dev_name,
		      void *parent_pvt_data, int num_features,
		      const struct edac_dev_feature *ras_features);
#endif /* _LINUX_EDAC_H_ */