/*
* Copyright 2018 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*
*/
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"
/*
 * Interrupt-handling state kept for one RAS object: the deferred work item,
 * the IP-provided callback and the buffer that holds queued entries.
 */
struct ras_ih_data {
	struct work_struct ih_work;	/* bottom half of the interrupt */
	int inuse;

	ras_ih_cb cb;			/* callback supplied by the IP */

	unsigned char *ring;		/* storage filled with entries */
	unsigned int ring_size;
	unsigned int element_size;
	unsigned int aligned_element_size;
	unsigned int rptr;
	unsigned int wptr;
};
/* Fixed-size buffers for the sysfs and debugfs node names of a RAS object. */
struct ras_fs_data {
	char sysfs_name[32];	/* node name used under sysfs */
	char debugfs_name[32];	/* node name used under debugfs */
};
/* Error counters; reported as "ue" and "ce" by the debugfs read handler. */
struct ras_err_data {
	unsigned long ue_count;	/* "ue" counter */
	unsigned long ce_count;	/* "ce" counter */
};
/* Bookkeeping for the bad pages array and how much of it is in use. */
struct ras_err_handler_data {
	/* bad pages array; each entry pairs a page value with a buffer object */
	struct {
		unsigned long bp;
		struct amdgpu_bo *bo;
	} *bps;

	int count;		/* number of entries */
	int space_left;		/* room remaining for new entries */
	int last_reserved;	/* index of the last reserved entry, plus one */
};
/*
 * One RAS object. The debugfs read handler receives it through the inode's
 * private pointer and uses ->head and ->adev to query error counts.
 */
struct ras_manager {
	struct ras_common_if head;

	int use;				/* reference count */
	struct list_head node;			/* link in the ras block list */
	struct amdgpu_device *adev;		/* owning device */

	struct dentry *ent;			/* debugfs entry */

	struct device_attribute sysfs_attr;	/* sysfs attribute */
	int attr_inuse;

	struct ras_fs_data fs_data;		/* fs node names */
	struct ras_ih_data ih_data;		/* interrupt handler data */
	struct ras_err_data err_data;
};
/*
 * Error type names. ras_err_str() selects a slot with ffs() of the error
 * type value, so slot 0 is the name used when no bit is set.
 */
const char *ras_error_string[] = {
	[0] = "none",
	[1] = "parity",
	[2] = "single_correctable",
	[3] = "multi_uncorrectable",
	[4] = "poison",
};
/*
 * Block names, indexed by block id. These are also the names the control
 * interface accepts; a name's position in this table is the id it maps to.
 */
const char *ras_block_string[] = {
	[0]  = "umc",
	[1]  = "sdma",
	[2]  = "gfx",
	[3]  = "mmhub",
	[4]  = "athub",
	[5]  = "pcie_bif",
	[6]  = "hdp",
	[7]  = "xgmi_wafl",
	[8]  = "df",
	[9]  = "smn",
	[10] = "sem",
	[11] = "mp0",
	[12] = "mp1",
	[13] = "fuse",
};
/* Name of an error type: the table slot is ffs() of the type value. */
#define ras_err_str(err_type)		(ras_error_string[ffs(err_type)])
/* Name of a block: the table slot is the block id itself (not range-checked). */
#define ras_block_str(block_idx)	(ras_block_string[block_idx])

/* RAS flag values and the default flag set. */
#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS		1
#define AMDGPU_RAS_FLAG_INIT_NEED_RESET		2
#define RAS_DEFAULT_FLAGS			(AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
/*
 * amdgpu_ras_self_test - placeholder for a RAS self test
 * @adev: device the test would run on (currently unused)
 *
 * Not implemented yet; the body is intentionally empty.
 */
static void amdgpu_ras_self_test(struct amdgpu_device *adev)
{
	/* TODO */
}
/*
 * amdgpu_ras_debugfs_read - report the error counters of one RAS object
 * @f:    debugfs file; its inode's private pointer is the ras_manager
 * @buf:  user buffer receiving the text
 * @size: capacity of @buf
 * @pos:  file offset, advanced by the number of bytes returned
 *
 * Queries the counters and formats them as "ue: N\nce: N\n", then hands out
 * the part of that text starting at *@pos.
 *
 * Returns the number of bytes copied, 0 once *@pos is at or past the end of
 * the text, or -EINVAL if the query or the copy to user space fails.
 */
static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
					size_t size, loff_t *pos)
{
	struct ras_manager *obj = file_inode(f)->i_private;
	struct ras_query_if info = { .head = obj->head };
	char text[128];
	ssize_t total;
	ssize_t chunk;

	if (amdgpu_ras_error_query(obj->adev, &info) != 0)
		return -EINVAL;

	total = snprintf(text, sizeof(text), "%s: %lu\n%s: %lu\n",
			 "ue", info.ue_count,
			 "ce", info.ce_count);

	/* The whole report has already been consumed. */
	if (*pos >= total)
		return 0;

	/* Hand out what is left, limited to what the caller asked for. */
	chunk = total - *pos;
	chunk = min_t(u64, chunk, size);

	if (copy_to_user(buf, &text[*pos], chunk) != 0)
		return -EINVAL;

	*pos += chunk;

	return chunk;
}
/*
 * File operations backed by amdgpu_ras_debugfs_read(): readable and
 * seekable, with no write handler.
 */
static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner	= THIS_MODULE,
	.llseek	= default_llseek,
	.read	= amdgpu_ras_debugfs_read,
	.write	= NULL,
};
/*
 * amdgpu_ras_find_block_id_by_name - look up a block name in ras_block_string[]
 * @name:     NUL-terminated block name to search for
 * @block_id: receives the index of the matching entry
 *
 * Returns 0 when @name matches an entry, -EINVAL otherwise. *@block_id is
 * written on every probe, so after a failed lookup it holds the last index
 * tried and must not be used.
 */
static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
	int idx = 0;

	while (idx < ARRAY_SIZE(ras_block_string)) {
		*block_id = idx;
		if (!strcmp(name, ras_block_str(idx)))
			return 0;
		idx++;
	}

	return -EINVAL;
}
/*
 * amdgpu_ras_debugfs_ctrl_parse_data - decode one write to the RAS control node
 * @f:    file being written (unused here)
 * @buf:  user buffer holding either a text command or a raw struct ras_debug_if
 * @size: number of bytes the caller wrote
 * @pos:  file offset; must be 0 on entry and is set to @size
 * @data: filled in with the decoded request (zeroed first)
 *
 * Text form, parsed from at most the first 64 bytes of @buf:
 *   disable <block>
 *   enable  <block> <ue|ce>
 *   inject  <block> <ue|ce> <address> <value>
 * <address> and <value> are both decimal or both 0x-prefixed hexadecimal.
 *
 * If no text command matches and the input does not start with four
 * non-zero bytes, @buf is treated as a binary struct ras_debug_if and copied
 * as-is.
 *
 * Returns 0 on success. Returns -EINVAL for a non-zero offset, a failed copy
 * from user space, an unrecognised command, an unknown block name, an error
 * type other than "ue"/"ce", missing inject operands, or a binary write
 * shorter than the structure.
 */
static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";	/* default error type; "disable" never overrides it */
	int op = -1;
	int block_id;
	u64 address, value;

	/* Only a single write starting at offset 0 is accepted. */
	if (*pos)
		return -EINVAL;
	*pos = size;

	/* str is one byte larger than the copy, so it stays NUL-terminated. */
	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched. */
		return -EINVAL;

	if (op == -1) {
		/* Binary form: the caller wrote a struct ras_debug_if directly. */
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;

		return 0;
	}

	if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
		return -EINVAL;

	data->head.block = block_id;

	/*
	 * Accept only the two documented error types. Treating every other
	 * token as "ce" would let a typo act on the wrong error type.
	 */
	if (!strcmp(err, "ue"))
		data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
	else if (!strcmp(err, "ce"))
		data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
	else
		return -EINVAL;

	data->op = op;

	if (op == 2) {
		/* Try decimal operands first, then 0x-prefixed hexadecimal. */
		if (sscanf(str, "%*s %*s %*s %llu %llu",
			   &address, &value) != 2 &&
		    sscanf(str, "%*s %*s %*s 0x%llx 0x%llx",
			   &address, &value) != 2)
			return -EINVAL;

		data->inject.address = address;
		data->inject.value = value;
	}

	return 0;
}
/*
* DOC: ras debugfs control interface
*
* It accepts struct ras_debug_if, which has two members.
*
* First member: ras_debug_if::head or ras_debug_if::inject.
*
* head is used to indicate which IP block will be under control.
*
* head has four members, they are block, type, sub_block_index, name.
* block: which IP will be under control.
* type: what kind of error will be enabled/disabled/injected.
* sub_block_index: some IPs have subcomponents, e.g. GFX and SDMA.
* name: the name of IP.
*
* inject has two more members than head, they are address, value.
* As their names indicate, inject operation will write the
* value to the address.
*
* Second member: struct ras_debug_if::op.
* It has three kinds of operations.
* 0: disable RAS on the block. Take ::head as its data.
* 1: enable RAS on the block. Take ::head as its data.
* 2: inject errors on the block. Take ::inject as its data.
*
* How to use the interface?
* programs:
* copy the struct ras_debug_if in your codes and initialize it.
* write the struct to the control node.
*
* bash:
* echo op block [error [address value]] > .../ras/ras_ctrl
* op: disable, enable, inject
* disable: only block is needed
* enable: block and error are needed
* inject: block, error, address, value are needed
* block: umc, sdma, gfx, .........
* see ras_block_string[] for details
* error: ue, ce
* ue: multi_uncorrectable
* ce: single_correctable
*
* here are some examples for bash commands,
|