From 52d5e5469526216bcc418f26a2796d5af6226023 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Mon, 19 Sep 2022 18:51:59 +0300 Subject: habanalabs: refactor razwi event notification This event notification was compatible only with gaudi, where razwi and page fault happens together. To make it compatible with all ASICs, this refactor contains: 1. Razwi notification will only notify about razwi info. New notification will be added in future patch, to retrieve data about page fault error. 2. Changed razwi info structure to support all ASICs. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 45 +++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 16 deletions(-) (limited to 'include/uapi/misc') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index e00ebe05097d..d6f84cb35e3d 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -1071,31 +1071,44 @@ struct hl_info_cs_timeout_event { __u64 seq; }; -#define HL_RAZWI_PAGE_FAULT 0 -#define HL_RAZWI_MMU_ACCESS_ERROR 1 +#define HL_RAZWI_NA_ENG_ID U16_MAX +#define HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR 128 +#define HL_RAZWI_READ BIT(0) +#define HL_RAZWI_WRITE BIT(1) +#define HL_RAZWI_LBW BIT(2) +#define HL_RAZWI_HBW BIT(3) +#define HL_RAZWI_RR BIT(4) +#define HL_RAZWI_ADDR_DEC BIT(5) /** * struct hl_info_razwi_event - razwi information. * @timestamp: timestamp of razwi. * @addr: address which accessing it caused razwi. - * @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does not - * have engine id it will be set to U16_MAX. - * @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible - * engines which one them caused the razwi. In that case, it will contain the - * second possible engine id, otherwise it will be set to U16_MAX. - * @no_engine_id: if razwi initiator does not have engine id, this field will be set to 1, - * otherwise 0. - * @error_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX. - * @pad: padding to 64 bit. + * @engine_id: engine id of the razwi initiator, if it was initiated by engine that does not + * have engine id it will be set to HL_RAZWI_NA_ENG_ID. If there are several possible + * engines which caused the razwi, it will hold all of them. + * @num_of_possible_engines: contains number of possible engine ids. In some asics, razwi indication + * might be common for several engines and there is no way to get the + * exact engine. In this way, engine_id array will be filled with all + * possible engines caused this razwi. Also, there might be possibility + * in gaudi, where we don't indication on specific engine, in that case + * the value of this parameter will be zero. + * @flags: bitmask for additional data: HL_RAZWI_READ - razwi caused by read operation + * HL_RAZWI_WRITE - razwi caused by write operation + * HL_RAZWI_LBW - razwi caused by lbw fabric transaction + * HL_RAZWI_HBW - razwi caused by hbw fabric transaction + * HL_RAZWI_RR - razwi caused by range register + * HL_RAZWI_ADDR_DEC - razwi caused by address decode error + * Note: this data is not supported by all asics, in that case the relevant bits will not + * be set. */ struct hl_info_razwi_event { __s64 timestamp; __u64 addr; - __u16 engine_id_1; - __u16 engine_id_2; - __u8 no_engine_id; - __u8 error_type; - __u8 pad[2]; + __u16 engine_id[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR]; + __u16 num_of_possible_engines; + __u8 flags; + __u8 pad[5]; }; #define MAX_QMAN_STREAMS_INFO 4 -- cgit v1.2.3 From dd600db47ba60c3c69d4d24c73c43133c6040118 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Sun, 18 Sep 2022 21:37:31 +0300 Subject: habanalabs: add page fault info uapi Only the first page fault will be saved. Besides the address which caused the page fault, the driver captures all of the mmu user mappings. User can retrieve this data via the new uapi (new opcode in INFO ioctl). Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/uapi/misc') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index d6f84cb35e3d..2b794f54e2ed 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -778,6 +778,9 @@ enum hl_server_type { * HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd * HL_INFO_GET_EVENTS - Retrieve the last occurred events * HL_INFO_UNDEFINED_OPCODE_EVENT - Retrieve last undefined opcode error information. + * HL_INFO_ENGINE_STATUS - Retrieve the status of all the h/w engines in the asic. + * HL_INFO_PAGE_FAULT_EVENT - Retrieve parameters of captured page fault. + * HL_INFO_USER_MAPPINGS - Retrieve user mappings, captured after page fault event. */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -809,6 +812,8 @@ enum hl_server_type { #define HL_INFO_GET_EVENTS 30 #define HL_INFO_UNDEFINED_OPCODE_EVENT 31 #define HL_INFO_ENGINE_STATUS 32 +#define HL_INFO_PAGE_FAULT_EVENT 33 +#define HL_INFO_USER_MAPPINGS 34 #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 @@ -1187,6 +1192,29 @@ struct hl_info_sec_attest { __u8 pad0[2]; }; +/** + * struct hl_page_fault_info - page fault information. + * @timestamp: timestamp of page fault. + * @addr: address which accessing it caused page fault. + * @engine_id: engine id which caused the page fault, supported only in gaudi3. + */ +struct hl_page_fault_info { + __s64 timestamp; + __u64 addr; + __u16 engine_id; + __u8 pad[6]; +}; + +/** + * struct hl_user_mapping - user mapping information. + * @dev_va: device virtual address. + * @size: virtual address mapping size. + */ +struct hl_user_mapping { + __u64 dev_va; + __u64 size; +}; + enum gaudi_dcores { HL_GAUDI_WS_DCORE, HL_GAUDI_WN_DCORE, @@ -1213,6 +1241,8 @@ enum gaudi_dcores { * needed, hence updating this variable so user will know the exact amount * of bytes copied by the kernel to the buffer. * @sec_attest_nonce: Nonce number used for attestation report. + * @array_size: Number of array members copied to user buffer. + * Relevant for HL_INFO_USER_MAPPINGS info ioctl. * @pad: Padding to 64 bit. */ struct hl_info_args { @@ -1228,6 +1258,7 @@ struct hl_info_args { __u32 eventfd; __u32 user_buffer_actual_size; __u32 sec_attest_nonce; + __u32 array_size; }; __u32 pad; -- cgit v1.2.3 From 15ac503cdc0d9a1275d82a926c673359cf69ebef Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Wed, 28 Sep 2022 22:14:55 +0300 Subject: habanalabs/gaudi2: capture RAZWI information Added function to calculate possible engines which caused RAZWI (read-only zero, write ignored), from a given router id or module index. When getting RAZWI via PSOC IP, first the router id is calculated and then the possible engines that caused the RAZWI are calculated. There is a possibility that the RAZWI initiator is not an engine. In that case, it will not be included in possible engines as it doesn't have an engine id. RAZWI information is captured when receiving event from engine or via PSOC IP. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/misc') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 2b794f54e2ed..a4ceee681898 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -597,6 +597,10 @@ enum gaudi2_engine_id { GAUDI2_ENGINE_ID_NIC10_1, GAUDI2_ENGINE_ID_NIC11_0, GAUDI2_ENGINE_ID_NIC11_1, + GAUDI2_ENGINE_ID_PCIE, + GAUDI2_ENGINE_ID_PSOC, + GAUDI2_ENGINE_ID_ARC_FARM, + GAUDI2_ENGINE_ID_KDMA, GAUDI2_ENGINE_ID_SIZE }; -- cgit v1.2.3 From 841cd2d7658d92e09354640c1887797f0da3d444 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 26 Oct 2022 16:20:45 +0300 Subject: habanalabs/gaudi2: add PCI revision 2 support Add support for Gaudi2 Device with PCI revision 2. Functionality is exactly the same as revision 1, the only difference is device name exposed to user. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi/misc') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index a4ceee681898..58343998bd63 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -868,6 +868,7 @@ enum hl_server_type { * @number_of_user_interrupts: The number of interrupts that are available to the userspace * application to use. Relevant for Gaudi2 and later. * @device_mem_alloc_default_page_size: default page size used in device memory allocation. + * @revision_id: PCI revision ID of the ASIC. */ struct hl_info_hw_ip_info { __u64 sram_base_address; @@ -898,6 +899,12 @@ struct hl_info_hw_ip_info { __u16 pad2; __u64 reserved4; __u64 device_mem_alloc_default_page_size; + __u64 reserved5; + __u64 reserved6; + __u32 reserved7; + __u8 reserved8; + __u8 revision_id; + __u8 pad[2]; }; struct hl_info_dram_usage { -- cgit v1.2.3 From cb5fb665f30388cf8cb9becae86dcb84ace0ca88 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Sun, 30 Oct 2022 13:08:37 +0200 Subject: habanalabs/gaudi: add razwi notify event Each time razwi (read-only zero, write ignore) happens, besides capturing its data, also notify the user about it. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/misc') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 58343998bd63..7747e19e81fe 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -721,6 +721,7 @@ enum hl_server_type { * HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE - Indicates device is unavailable * HL_NOTIFIER_EVENT_USER_ENGINE_ERR - Indicates device engine in error state * HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error + * HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened */ #define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0) #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1) @@ -729,6 +730,7 @@ enum hl_server_type { #define HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE (1ULL << 4) #define HL_NOTIFIER_EVENT_USER_ENGINE_ERR (1ULL << 5) #define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6) +#define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7) /* Opcode for management ioctl * -- cgit v1.2.3 From aff6354afd1f9eae1e10658c157c26e316806f56 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Mon, 31 Oct 2022 11:44:45 +0200 Subject: habanalabs/gaudi: add page fault notify event Each time page fault happens, besides capturing its data, also notify the user about it. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/misc') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 7747e19e81fe..e50cb71df081 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -722,6 +722,7 @@ enum hl_server_type { * HL_NOTIFIER_EVENT_USER_ENGINE_ERR - Indicates device engine in error state * HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error * HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened + * HL_NOTIFIER_EVENT_PAGE_FAULT - Indicates page fault happened */ #define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0) #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1) @@ -731,6 +732,7 @@ enum hl_server_type { #define HL_NOTIFIER_EVENT_USER_ENGINE_ERR (1ULL << 5) #define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6) #define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7) +#define HL_NOTIFIER_EVENT_PAGE_FAULT (1ULL << 8) /* Opcode for management ioctl * -- cgit v1.2.3 From 01907ba5252164ca6bf0de670660cd94d77c378c Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Sun, 23 Oct 2022 12:55:21 +0300 Subject: habanalabs: increase the size of busy engines mask Increase the size of the busy engines mask in 'struct hl_info_hw_idle', for future ASICs with more than 128 engines. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/misc') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index e50cb71df081..3b995e841eb8 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -916,7 +916,7 @@ struct hl_info_dram_usage { __u64 ctx_dram_mem; }; -#define HL_BUSY_ENGINES_MASK_EXT_SIZE 2 +#define HL_BUSY_ENGINES_MASK_EXT_SIZE 4 struct hl_info_hw_idle { __u32 is_idle; -- cgit v1.2.3