summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDave Airlie <airlied@redhat.com>2023-06-09 11:30:37 +1000
committerDave Airlie <airlied@redhat.com>2023-06-09 11:30:44 +1000
commit7f4f4adb9ba1d9b292e4b3ade0235be2e5ad5da7 (patch)
tree9f272d74e1adfa3bca4e466f8e0fa6890be1fb8d
parentc9b685df2d2138aa31399b0d146ba095a91c7846 (diff)
parente6f49e96bc57d34fc0f617f37bfdf62a9b58d2c2 (diff)
downloadlinux-7f4f4adb9ba1d9b292e4b3ade0235be2e5ad5da7.tar.gz
linux-7f4f4adb9ba1d9b292e4b3ade0235be2e5ad5da7.tar.bz2
linux-7f4f4adb9ba1d9b292e4b3ade0235be2e5ad5da7.zip
Merge tag 'drm-habanalabs-next-2023-06-08' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into drm-next
This tag contains additional habanalabs driver changes for v6.5: - uAPI changes: - Return 0 when user queries if there was a h/w or f/w error and no such error happened. Previously we returned an error in such case. - New features and improvements: - Add pci health check when we lose connection with the firmware. This can be used to distinguish between pci link down and firmware getting stuck. - Add more info to the error print when TPC interrupt occur. - Reduce amount of code under mutex in the command submission of signal event. - Firmware related fixes: - Fixes to the handshake protocol during f/w initialization. - Display information that the f/w sends us when encountering a DMA error. - Do soft-reset using a message sent to firmware instead of writing to MMIO. - Prepare generic code to extract f/w version numbers. - Bug fixes and code cleanups. Notable fixes are: - Unsecure certain TPC registers that the user should access. - Fix handling of QMAN errors - Fix memory leak when recording errors (to later pass them to the user) - Multiple fixes to razwi interrupt handling code Signed-off-by: Dave Airlie <airlied@redhat.com> From: Oded Gabbay <ogabbay@kernel.org> Link: https://patchwork.freedesktop.org/patch/msgid/20230608103043.GA2699019@ogabbay-vm-u20.habana-labs.com
-rw-r--r--drivers/accel/habanalabs/common/command_buffer.c6
-rw-r--r--drivers/accel/habanalabs/common/command_submission.c61
-rw-r--r--drivers/accel/habanalabs/common/debugfs.c60
-rw-r--r--drivers/accel/habanalabs/common/device.c112
-rw-r--r--drivers/accel/habanalabs/common/firmware_if.c212
-rw-r--r--drivers/accel/habanalabs/common/habanalabs.h77
-rw-r--r--drivers/accel/habanalabs/common/habanalabs_drv.c9
-rw-r--r--drivers/accel/habanalabs/common/habanalabs_ioctl.c35
-rw-r--r--drivers/accel/habanalabs/common/irq.c2
-rw-r--r--drivers/accel/habanalabs/common/memory.c104
-rw-r--r--drivers/accel/habanalabs/common/mmu/mmu.c56
-rw-r--r--drivers/accel/habanalabs/common/security.c57
-rw-r--r--drivers/accel/habanalabs/gaudi/gaudi.c13
-rw-r--r--drivers/accel/habanalabs/gaudi2/gaudi2.c334
-rw-r--r--drivers/accel/habanalabs/gaudi2/gaudi2P.h2
-rw-r--r--drivers/accel/habanalabs/gaudi2/gaudi2_security.c15
-rw-r--r--drivers/accel/habanalabs/goya/goya.c3
-rw-r--r--drivers/accel/habanalabs/goya/goya_coresight.c9
-rw-r--r--drivers/accel/habanalabs/include/common/cpucp_if.h22
-rw-r--r--drivers/accel/habanalabs/include/common/hl_boot_if.h41
-rw-r--r--drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h11
-rw-r--r--drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h2
-rw-r--r--include/uapi/drm/habanalabs_accel.h10
23 files changed, 557 insertions, 696 deletions
diff --git a/drivers/accel/habanalabs/common/command_buffer.c b/drivers/accel/habanalabs/common/command_buffer.c
index 6e09f48750a0..08f7aee42624 100644
--- a/drivers/accel/habanalabs/common/command_buffer.c
+++ b/drivers/accel/habanalabs/common/command_buffer.c
@@ -27,12 +27,6 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb)
return -EINVAL;
}
- if (!hdev->mmu_enable) {
- dev_err_ratelimited(hdev->dev,
- "Cannot map CB because MMU is disabled\n");
- return -EINVAL;
- }
-
if (cb->is_mmu_mapped)
return 0;
diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c
index af9d2e22c6e7..c23829dab97a 100644
--- a/drivers/accel/habanalabs/common/command_submission.c
+++ b/drivers/accel/habanalabs/common/command_submission.c
@@ -280,14 +280,8 @@ bool cs_needs_timeout(struct hl_cs *cs)
static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
{
- /*
- * Patched CB is created for external queues jobs, and for H/W queues
- * jobs if the user CB was allocated by driver and MMU is disabled.
- */
- return (job->queue_type == QUEUE_TYPE_EXT ||
- (job->queue_type == QUEUE_TYPE_HW &&
- job->is_kernel_allocated_cb &&
- !hdev->mmu_enable));
+ /* Patched CB is created for external queues jobs */
+ return (job->queue_type == QUEUE_TYPE_EXT);
}
/*
@@ -363,14 +357,13 @@ static void hl_complete_job(struct hl_device *hdev, struct hl_cs_job *job)
}
}
- /* For H/W queue jobs, if a user CB was allocated by driver and MMU is
- * enabled, the user CB isn't released in cs_parser() and thus should be
+ /* For H/W queue jobs, if a user CB was allocated by driver,
+ * the user CB isn't released in cs_parser() and thus should be
* released here. This is also true for INT queues jobs which were
* allocated by driver.
*/
- if ((job->is_kernel_allocated_cb &&
- ((job->queue_type == QUEUE_TYPE_HW && hdev->mmu_enable) ||
- job->queue_type == QUEUE_TYPE_INT))) {
+ if (job->is_kernel_allocated_cb &&
+ (job->queue_type == QUEUE_TYPE_HW || job->queue_type == QUEUE_TYPE_INT)) {
atomic_dec(&job->user_cb->cs_cnt);
hl_cb_put(job->user_cb);
}
@@ -804,12 +797,14 @@ out:
static void cs_timedout(struct work_struct *work)
{
+ struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work);
+ bool skip_reset_on_timeout, device_reset = false;
struct hl_device *hdev;
u64 event_mask = 0x0;
+ uint timeout_sec;
int rc;
- struct hl_cs *cs = container_of(work, struct hl_cs,
- work_tdr.work);
- bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false;
+
+ skip_reset_on_timeout = cs->skip_reset_on_timeout;
rc = cs_get_unless_zero(cs);
if (!rc)
@@ -840,29 +835,31 @@ static void cs_timedout(struct work_struct *work)
event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
}
+ timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000;
+
switch (cs->type) {
case CS_TYPE_SIGNAL:
dev_err(hdev->dev,
- "Signal command submission %llu has not finished in time!\n",
- cs->sequence);
+ "Signal command submission %llu has not finished in %u seconds!\n",
+ cs->sequence, timeout_sec);
break;
case CS_TYPE_WAIT:
dev_err(hdev->dev,
- "Wait command submission %llu has not finished in time!\n",
- cs->sequence);
+ "Wait command submission %llu has not finished in %u seconds!\n",
+ cs->sequence, timeout_sec);
break;
case CS_TYPE_COLLECTIVE_WAIT:
dev_err(hdev->dev,
- "Collective Wait command submission %llu has not finished in time!\n",
- cs->sequence);
+ "Collective Wait command submission %llu has not finished in %u seconds!\n",
+ cs->sequence, timeout_sec);
break;
default:
dev_err(hdev->dev,
- "Command submission %llu has not finished in time!\n",
- cs->sequence);
+ "Command submission %llu has not finished in %u seconds!\n",
+ cs->sequence, timeout_sec);
break;
}
@@ -1139,11 +1136,10 @@ static void force_complete_cs(struct hl_device *hdev)
spin_unlock(&hdev->cs_mirror_lock);
}
-void hl_abort_waitings_for_completion(struct hl_device *hdev)
+void hl_abort_waiting_for_cs_completions(struct hl_device *hdev)
{
force_complete_cs(hdev);
force_complete_multi_cs(hdev);
- hl_release_pending_user_interrupts(hdev);
}
static void job_wq_completion(struct work_struct *work)
@@ -1948,8 +1944,7 @@ static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
else
cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
- cb = hl_cb_kernel_create(hdev, cb_size,
- q_type == QUEUE_TYPE_HW && hdev->mmu_enable);
+ cb = hl_cb_kernel_create(hdev, cb_size, q_type == QUEUE_TYPE_HW);
if (!cb) {
atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
atomic64_inc(&cntr->out_of_mem_drop_cnt);
@@ -2152,7 +2147,7 @@ static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id)
hdev->asic_funcs->hw_queues_unlock(hdev);
rc = -EINVAL;
- goto out;
+ goto out_unlock;
}
/*
@@ -2167,15 +2162,21 @@ static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id)
/* Release the id and free allocated memory of the handle */
idr_remove(&mgr->handles, handle_id);
+
+ /* unlock before calling ctx_put, where we might sleep */
+ spin_unlock(&mgr->lock);
hl_ctx_put(encaps_sig_hdl->ctx);
kfree(encaps_sig_hdl);
+ goto out;
} else {
rc = -EINVAL;
dev_err(hdev->dev, "failed to unreserve signals, cannot find handler\n");
}
-out:
+
+out_unlock:
spin_unlock(&mgr->lock);
+out:
return rc;
}
diff --git a/drivers/accel/habanalabs/common/debugfs.c b/drivers/accel/habanalabs/common/debugfs.c
index 22dd17c077c0..9e84a47a21dc 100644
--- a/drivers/accel/habanalabs/common/debugfs.c
+++ b/drivers/accel/habanalabs/common/debugfs.c
@@ -255,9 +255,6 @@ static int vm_show(struct seq_file *s, void *data)
u64 j;
int i;
- if (!dev_entry->hdev->mmu_enable)
- return 0;
-
mutex_lock(&dev_entry->ctx_mem_hash_mutex);
list_for_each_entry(ctx, &dev_entry->ctx_mem_hash_list, debugfs_list) {
@@ -436,9 +433,6 @@ static int mmu_show(struct seq_file *s, void *data)
u64 virt_addr = dev_entry->mmu_addr, phys_addr;
int i;
- if (!hdev->mmu_enable)
- return 0;
-
if (dev_entry->mmu_asid == HL_KERNEL_ASID_ID)
ctx = hdev->kernel_ctx;
else
@@ -496,9 +490,6 @@ static ssize_t mmu_asid_va_write(struct file *file, const char __user *buf,
char *c;
ssize_t rc;
- if (!hdev->mmu_enable)
- return count;
-
if (count > sizeof(kbuf) - 1)
goto err;
if (copy_from_user(kbuf, buf, count))
@@ -535,9 +526,6 @@ static int mmu_ack_error(struct seq_file *s, void *data)
struct hl_device *hdev = dev_entry->hdev;
int rc;
- if (!hdev->mmu_enable)
- return 0;
-
if (!dev_entry->mmu_cap_mask) {
dev_err(hdev->dev, "mmu_cap_mask is not set\n");
goto err;
@@ -563,9 +551,6 @@ static ssize_t mmu_ack_error_value_write(struct file *file,
char kbuf[MMU_KBUF_SIZE];
ssize_t rc;
- if (!hdev->mmu_enable)
- return count;
-
if (count > sizeof(kbuf) - 1)
goto err;
@@ -661,9 +646,6 @@ static bool hl_is_device_va(struct hl_device *hdev, u64 addr)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
- if (!hdev->mmu_enable)
- goto out;
-
if (prop->dram_supports_virtual_memory &&
(addr >= prop->dmmu.start_addr && addr < prop->dmmu.end_addr))
return true;
@@ -675,7 +657,7 @@ static bool hl_is_device_va(struct hl_device *hdev, u64 addr)
if (addr >= prop->pmmu_huge.start_addr &&
addr < prop->pmmu_huge.end_addr)
return true;
-out:
+
return false;
}
@@ -685,9 +667,6 @@ static bool hl_is_device_internal_memory_va(struct hl_device *hdev, u64 addr,
struct asic_fixed_properties *prop = &hdev->asic_prop;
u64 dram_start_addr, dram_end_addr;
- if (!hdev->mmu_enable)
- return false;
-
if (prop->dram_supports_virtual_memory) {
dram_start_addr = prop->dmmu.start_addr;
dram_end_addr = prop->dmmu.end_addr;
@@ -1756,17 +1735,15 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
}
}
-void hl_debugfs_add_device(struct hl_device *hdev)
+int hl_debugfs_device_init(struct hl_device *hdev)
{
struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs;
int count = ARRAY_SIZE(hl_debugfs_list);
dev_entry->hdev = hdev;
- dev_entry->entry_arr = kmalloc_array(count,
- sizeof(struct hl_debugfs_entry),
- GFP_KERNEL);
+ dev_entry->entry_arr = kmalloc_array(count, sizeof(struct hl_debugfs_entry), GFP_KERNEL);
if (!dev_entry->entry_arr)
- return;
+ return -ENOMEM;
dev_entry->data_dma_blob_desc.size = 0;
dev_entry->data_dma_blob_desc.data = NULL;
@@ -1787,21 +1764,14 @@ void hl_debugfs_add_device(struct hl_device *hdev)
spin_lock_init(&dev_entry->userptr_spinlock);
mutex_init(&dev_entry->ctx_mem_hash_mutex);
- dev_entry->root = debugfs_create_dir(dev_name(hdev->dev),
- hl_debug_root);
-
- add_files_to_device(hdev, dev_entry, dev_entry->root);
- if (!hdev->asic_prop.fw_security_enabled)
- add_secured_nodes(dev_entry, dev_entry->root);
+ return 0;
}
-void hl_debugfs_remove_device(struct hl_device *hdev)
+void hl_debugfs_device_fini(struct hl_device *hdev)
{
struct hl_dbg_device_entry *entry = &hdev->hl_debugfs;
int i;
- debugfs_remove_recursive(entry->root);
-
mutex_destroy(&entry->ctx_mem_hash_mutex);
mutex_destroy(&entry->file_mutex);
@@ -1814,6 +1784,24 @@ void hl_debugfs_remove_device(struct hl_device *hdev)
kfree(entry->entry_arr);
}
+void hl_debugfs_add_device(struct hl_device *hdev)
+{
+ struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs;
+
+ dev_entry->root = debugfs_create_dir(dev_name(hdev->dev), hl_debug_root);
+
+ add_files_to_device(hdev, dev_entry, dev_entry->root);
+ if (!hdev->asic_prop.fw_security_enabled)
+ add_secured_nodes(dev_entry, dev_entry->root);
+}
+
+void hl_debugfs_remove_device(struct hl_device *hdev)
+{
+ struct hl_dbg_device_entry *entry = &hdev->hl_debugfs;
+
+ debugfs_remove_recursive(entry->root);
+}
+
void hl_debugfs_add_file(struct hl_fpriv *hpriv)
{
struct hl_dbg_device_entry *dev_entry = &hpriv->hdev->hl_debugfs;
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index fabfc501ef54..b97339d1f7c6 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -674,7 +674,7 @@ static int device_init_cdev(struct hl_device *hdev, struct class *class,
return 0;
}
-static int device_cdev_sysfs_add(struct hl_device *hdev)
+static int cdev_sysfs_debugfs_add(struct hl_device *hdev)
{
int rc;
@@ -699,7 +699,9 @@ static int device_cdev_sysfs_add(struct hl_device *hdev)
goto delete_ctrl_cdev_device;
}
- hdev->cdev_sysfs_created = true;
+ hl_debugfs_add_device(hdev);
+
+ hdev->cdev_sysfs_debugfs_created = true;
return 0;
@@ -710,11 +712,12 @@ delete_cdev_device:
return rc;
}
-static void device_cdev_sysfs_del(struct hl_device *hdev)
+static void cdev_sysfs_debugfs_remove(struct hl_device *hdev)
{
- if (!hdev->cdev_sysfs_created)
+ if (!hdev->cdev_sysfs_debugfs_created)
goto put_devices;
+ hl_debugfs_remove_device(hdev);
hl_sysfs_fini(hdev);
cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
cdev_device_del(&hdev->cdev, hdev->dev);
@@ -981,6 +984,18 @@ static void device_early_fini(struct hl_device *hdev)
hdev->asic_funcs->early_fini(hdev);
}
+static bool is_pci_link_healthy(struct hl_device *hdev)
+{
+ u16 vendor_id;
+
+ if (!hdev->pdev)
+ return false;
+
+ pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id);
+
+ return (vendor_id == PCI_VENDOR_ID_HABANALABS);
+}
+
static void hl_device_heartbeat(struct work_struct *work)
{
struct hl_device *hdev = container_of(work, struct hl_device,
@@ -995,7 +1010,8 @@ static void hl_device_heartbeat(struct work_struct *work)
goto reschedule;
if (hl_device_operational(hdev, NULL))
- dev_err(hdev->dev, "Device heartbeat failed!\n");
+ dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n",
+ is_pci_link_healthy(hdev) ? "healthy" : "broken");
info.err_type = HL_INFO_FW_HEARTBEAT_ERR;
info.event_mask = &event_mask;
@@ -1157,6 +1173,16 @@ static void take_release_locks(struct hl_device *hdev)
mutex_unlock(&hdev->fpriv_ctrl_list_lock);
}
+static void hl_abort_waiting_for_completions(struct hl_device *hdev)
+{
+ hl_abort_waiting_for_cs_completions(hdev);
+
+ /* Release all pending user interrupts, each pending user interrupt
+ * holds a reference to a user context.
+ */
+ hl_release_pending_user_interrupts(hdev);
+}
+
static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset,
bool skip_wq_flush)
{
@@ -1176,10 +1202,7 @@ static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_r
/* flush the MMU prefetch workqueue */
flush_workqueue(hdev->prefetch_wq);
- /* Release all pending user interrupts, each pending user interrupt
- * holds a reference to user context
- */
- hl_release_pending_user_interrupts(hdev);
+ hl_abort_waiting_for_completions(hdev);
}
/*
@@ -1921,7 +1944,7 @@ out:
hl_ctx_put(ctx);
- hl_abort_waitings_for_completion(hdev);
+ hl_abort_waiting_for_completions(hdev);
return 0;
@@ -2034,7 +2057,7 @@ out_err:
int hl_device_init(struct hl_device *hdev)
{
int i, rc, cq_cnt, user_interrupt_cnt, cq_ready_cnt;
- bool add_cdev_sysfs_on_err = false;
+ bool expose_interfaces_on_err = false;
rc = create_cdev(hdev);
if (rc)
@@ -2150,16 +2173,22 @@ int hl_device_init(struct hl_device *hdev)
hdev->device_release_watchdog_timeout_sec = HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC;
hdev->memory_scrub_val = MEM_SCRUB_DEFAULT_VAL;
- hl_debugfs_add_device(hdev);
- /* debugfs nodes are created in hl_ctx_init so it must be called after
- * hl_debugfs_add_device.
+ rc = hl_debugfs_device_init(hdev);
+ if (rc) {
+ dev_err(hdev->dev, "failed to initialize debugfs entry structure\n");
+ kfree(hdev->kernel_ctx);
+ goto mmu_fini;
+ }
+
+ /* The debugfs entry structure is accessed in hl_ctx_init(), so it must be called after
+ * hl_debugfs_device_init().
*/
rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
if (rc) {
dev_err(hdev->dev, "failed to initialize kernel context\n");
kfree(hdev->kernel_ctx);
- goto remove_device_from_debugfs;
+ goto debugfs_device_fini;
}
rc = hl_cb_pool_init(hdev);
@@ -2175,11 +2204,10 @@ int hl_device_init(struct hl_device *hdev)
}
/*
- * From this point, override rc (=0) in case of an error to allow
- * debugging (by adding char devices and create sysfs nodes as part of
- * the error flow).
+ * From this point, override rc (=0) in case of an error to allow debugging
+ * (by adding char devices and creating sysfs/debugfs files as part of the error flow).
*/
- add_cdev_sysfs_on_err = true;
+ expose_interfaces_on_err = true;
/* Device is now enabled as part of the initialization requires
* communication with the device firmware to get information that
@@ -2221,15 +2249,13 @@ int hl_device_init(struct hl_device *hdev)
}
/*
- * Expose devices and sysfs nodes to user.
- * From here there is no need to add char devices and create sysfs nodes
- * in case of an error.
+ * Expose devices and sysfs/debugfs files to user.
+ * From here there is no need to expose them in case of an error.
*/
- add_cdev_sysfs_on_err = false;
- rc = device_cdev_sysfs_add(hdev);
+ expose_interfaces_on_err = false;
+ rc = cdev_sysfs_debugfs_add(hdev);
if (rc) {
- dev_err(hdev->dev,
- "Failed to add char devices and sysfs nodes\n");
+ dev_err(hdev->dev, "Failed to add char devices and sysfs/debugfs files\n");
rc = 0;
goto out_disabled;
}
@@ -2275,8 +2301,8 @@ release_ctx:
if (hl_ctx_put(hdev->kernel_ctx) != 1)
dev_err(hdev->dev,
"kernel ctx is still alive on initialization failure\n");
-remove_device_from_debugfs:
- hl_debugfs_remove_device(hdev);
+debugfs_device_fini:
+ hl_debugfs_device_fini(hdev);
mmu_fini:
hl_mmu_fini(hdev);
eq_fini:
@@ -2300,15 +2326,11 @@ free_dev:
put_device(hdev->dev);
out_disabled:
hdev->disabled = true;
- if (add_cdev_sysfs_on_err)
- device_cdev_sysfs_add(hdev);
- if (hdev->pdev)
- dev_err(&hdev->pdev->dev,
- "Failed to initialize hl%d. Device %s is NOT usable !\n",
- hdev->cdev_idx, dev_name(&(hdev)->pdev->dev));
- else
- pr_err("Failed to initialize hl%d. Device %s is NOT usable !\n",
- hdev->cdev_idx, dev_name(&(hdev)->pdev->dev));
+ if (expose_interfaces_on_err)
+ cdev_sysfs_debugfs_add(hdev);
+ dev_err(&hdev->pdev->dev,
+ "Failed to initialize hl%d. Device %s is NOT usable !\n",
+ hdev->cdev_idx, dev_name(&hdev->pdev->dev));
return rc;
}
@@ -2427,8 +2449,6 @@ void hl_device_fini(struct hl_device *hdev)
if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
dev_err(hdev->dev, "kernel ctx is still alive\n");
- hl_debugfs_remove_device(hdev);
-
hl_dec_fini(hdev);
hl_vm_fini(hdev);
@@ -2453,8 +2473,10 @@ void hl_device_fini(struct hl_device *hdev)
device_early_fini(hdev);
- /* Hide devices and sysfs nodes from user */
- device_cdev_sysfs_del(hdev);
+ /* Hide devices and sysfs/debugfs files from user */
+ cdev_sysfs_debugfs_remove(hdev);
+
+ hl_debugfs_device_fini(hdev);
pr_info("removed device successfully\n");
}
@@ -2667,3 +2689,11 @@ void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info)
if (info->event_mask)
*info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR;
}
+
+void hl_enable_err_info_capture(struct hl_error_info *captured_err_info)
+{
+ vfree(captured_err_info->page_fault_info.user_mappings);
+ memset(captured_err_info, 0, sizeof(struct hl_error_info));
+ atomic_set(&captured_err_info->cs_timeout.write_enable, 1);
+ captured_err_info->undef_opcode.write_enable = true;
+}
diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index 59f61ec66445..acbc1a6b5cb1 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -71,38 +71,124 @@ free_fw_ver:
return NULL;
}
-static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver)
+/**
+ * extract_u32_until_given_char() - given a string of the format "<u32><char>*", extract the u32.
+ * @str: the given string
+ * @ver_num: the pointer to the extracted u32 to be returned to the caller.
+ * @given_char: the given char at the end of the u32 in the string
+ *
+ * Return: Upon success, return a pointer to the given_char in the string. Upon failure, return NULL
+ */
+static char *extract_u32_until_given_char(char *str, u32 *ver_num, char given_char)
{
- char major[8], minor[8], *first_dot, *second_dot;
- int rc;
+ char num_str[8] = {}, *ch;
- first_dot = strnstr(preboot_ver, ".", 10);
- if (first_dot) {
- strscpy(major, preboot_ver, first_dot - preboot_ver + 1);
- rc = kstrtou32(major, 10, &hdev->fw_major_version);
- } else {
- rc = -EINVAL;
+ ch = strchrnul(str, given_char);
+ if (*ch == '\0' || ch == str || ch - str >= sizeof(num_str))
+ return NULL;
+
+ memcpy(num_str, str, ch - str);
+ if (kstrtou32(num_str, 10, ver_num))
+ return NULL;
+ return ch;
+}
+
+/**
+ * hl_get_sw_major_minor_subminor() - extract the FW's SW version major, minor, sub-minor
+ * from the version string
+ * @hdev: pointer to the hl_device
+ * @fw_str: the FW's version string
+ *
+ * The extracted version is set in the hdev fields: fw_sw_{major/minor/sub_minor}_ver.
+ *
+ * fw_str is expected to have one of two possible formats, examples:
+ * 1) 'Preboot version hl-gaudi2-1.9.0-fw-42.0.1-sec-3'
+ * 2) 'Preboot version hl-gaudi2-1.9.0-rc-fw-42.0.1-sec-3'
+ * In those examples, the SW major,minor,subminor are correspondingly: 1,9,0.
+ *
+ * Return: 0 for success or a negative error code for failure.
+ */
+static int hl_get_sw_major_minor_subminor(struct hl_device *hdev, const char *fw_str)
+{
+ char *end, *start;
+
+ end = strnstr(fw_str, "-rc-", VERSION_MAX_LEN);
+ if (end == fw_str)
+ return -EINVAL;
+
+ if (!end)
+ end = strnstr(fw_str, "-fw-", VERSION_MAX_LEN);
+
+ if (end == fw_str)
+ return -EINVAL;
+
+ if (!end)
+ return -EINVAL;
+
+ for (start = end - 1; start != fw_str; start--) {
+ if (*start == '-')
+ break;
}
- if (rc) {
- dev_err(hdev->dev, "Error %d parsing preboot major version\n", rc);
- return rc;
+ if (start == fw_str)
+ return -EINVAL;
+
+ /* start/end point each to the starting and ending hyphen of the sw version e.g. -1.9.0- */
+ start++;
+ start = extract_u32_until_given_char(start, &hdev->fw_sw_major_ver, '.');
+ if (!start)
+ goto err_zero_ver;
+
+ start++;
+ start = extract_u32_until_given_char(start, &hdev->fw_sw_minor_ver, '.');
+ if (!start)
+ goto err_zero_ver;
+
+ start++;
+ start = extract_u32_until_given_char(start, &hdev->fw_sw_sub_minor_ver, '-');
+ if (!start)
+ goto err_zero_ver;
+
+ return 0;
+
+err_zero_ver:
+ hdev->fw_sw_major_ver = 0;
+ hdev->fw_sw_minor_ver = 0;
+ hdev->fw_sw_sub_minor_ver = 0;
+ return -EINVAL;
+}
+
+/**
+ * hl_get_preboot_major_minor() - extract the FW's version major, minor from the version string.
+ * @hdev: pointer to the hl_device
+ * @preboot_ver: the FW's version string
+ *
+ * preboot_ver is expected to be the format of <major>.<minor>.<sub minor>*, e.g: 42.0.1-sec-3
+ * The extracted version is set in the hdev fields: fw_inner_{major/minor}_ver.
+ *
+ * Return: 0 on success, negative error code for failure.
+ */
+static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver)
+{
+ preboot_ver = extract_u32_until_given_char(preboot_ver, &hdev->fw_inner_major_ver, '.');
+ if (!preboot_ver) {
+ dev_err(hdev->dev, "Error parsing preboot major version\n");
+ goto err_zero_ver;
}
- /* skip the first dot */
- first_dot++;
+ preboot_ver++;
- second_dot = strnstr(first_dot, ".", 10);
- if (second_dot) {
- strscpy(minor, first_dot, second_dot - first_dot + 1);
- rc = kstrtou32(minor, 10, &hdev->fw_minor_version);
- } else {
- rc = -EINVAL;
+ preboot_ver = extract_u32_until_given_char(preboot_ver, &hdev->fw_inner_minor_ver, '.');
+ if (!preboot_ver) {
+ dev_err(hdev->dev, "Error parsing preboot minor version\n");
+ goto err_zero_ver;
}
+ return 0;
- if (rc)
- dev_err(hdev->dev, "Error %d parsing preboot minor version\n", rc);
- return rc;
+err_zero_ver:
+ hdev->fw_inner_major_ver = 0;
+ hdev->fw_inner_minor_ver = 0;
+ return -EINVAL;
}
static int hl_request_fw(struct hl_device *hdev,
@@ -505,6 +591,20 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
size);
}
+int hl_fw_send_soft_reset(struct hl_device *hdev)
+{
+ struct cpucp_packet pkt;
+ int rc;
+
+ memset(&pkt, 0, sizeof(pkt));
+ pkt.ctl = cpu_to_le32(CPUCP_PACKET_SOFT_RESET << CPUCP_PKT_CTL_OPCODE_SHIFT);
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
+ if (rc)
+ dev_err(hdev->dev, "failed to send soft-reset msg (err = %d)\n", rc);
+
+ return rc;
+}
+
int hl_fw_send_device_activity(struct hl_device *hdev, bool open)
{
struct cpucp_packet pkt;
@@ -1268,8 +1368,10 @@ void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev)
void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev)
{
- struct static_fw_load_mgr *static_loader =
- &hdev->fw_loader.static_loader;
+ struct fw_load_mgr *fw_loader = &hdev->fw_loader;
+ u32 status, cpu_boot_status_reg, cpu_timeout;
+ struct static_fw_load_mgr *static_loader;
+ struct pre_fw_load_props *pre_fw_load;
int rc;
if (hdev->device_cpu_is_halted)
@@ -1277,12 +1379,28 @@ void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev)
/* Stop device CPU to make sure nothing bad happens */
if (hdev->asic_prop.dynamic_fw_load) {
+ pre_fw_load = &fw_loader->pre_fw_load;
+ cpu_timeout = fw_loader->cpu_timeout;
+ cpu_boot_status_reg = pre_fw_load->cpu_boot_status_reg;
+
rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader,
- COMMS_GOTO_WFE, 0, false,
- hdev->fw_loader.cpu_timeout);
- if (rc)
+ COMMS_GOTO_WFE, 0, false, cpu_timeout);
+ if (rc) {
dev_err(hdev->dev, "Failed sending COMMS_GOTO_WFE\n");
+ } else {
+ rc = hl_poll_timeout(
+ hdev,
+ cpu_boot_status_reg,
+ status,
+ status == CPU_BOOT_STATUS_IN_WFE,
+ hdev->fw_poll_interval_usec,
+ cpu_timeout);
+ if (rc)
+ dev_err(hdev->dev, "Current status=%u. Timed-out updating to WFE\n",
+ status);
+ }
} else {
+ static_loader = &hdev->fw_loader.static_loader;
WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE);
msleep(static_loader->cpu_reset_wait_msec);
@@ -2151,6 +2269,7 @@ static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev,
struct asic_fixed_properties *prop = &hdev->asic_prop;
char *preboot_ver, *boot_ver;
char btl_ver[32];
+ int rc;
switch (fwc) {
case FW_COMP_BOOT_FIT:
@@ -2164,20 +2283,20 @@ static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev,
break;
case FW_COMP_PREBOOT:
strscpy(prop->preboot_ver, fw_version, VERSION_MAX_LEN);
- preboot_ver = strnstr(prop->preboot_ver, "Preboot",
- VERSION_MAX_LEN);
+ preboot_ver = strnstr(prop->preboot_ver, "Preboot", VERSION_MAX_LEN);
+ dev_info(hdev->dev, "preboot full version: '%s'\n", preboot_ver);
+
if (preboot_ver && preboot_ver != prop->preboot_ver) {
strscpy(btl_ver, prop->preboot_ver,
min((int) (preboot_ver - prop->preboot_ver), 31));
dev_info(hdev->dev, "%s\n", btl_ver);
}
+ rc = hl_get_sw_major_minor_subminor(hdev, preboot_ver);
+ if (rc)
+ return rc;
preboot_ver = extract_fw_ver_from_str(prop->preboot_ver);
if (preboot_ver) {
- int rc;
-
- dev_info(hdev->dev, "preboot version %s\n", preboot_ver);
-
rc = hl_get_preboot_major_minor(hdev, preboot_ver);
kfree(preboot_ver);
if (rc)
@@ -2367,16 +2486,6 @@ static int hl_fw_dynamic_load_image(struct hl_device *hdev,
if (rc)
goto release_fw;
- /* update state according to boot stage */
- if (cur_fwc == FW_COMP_BOOT_FIT) {
- struct cpu_dyn_regs *dyn_regs;
-
- dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs;
- hl_fw_boot_fit_update_state(hdev,
- le32_to_cpu(dyn_regs->cpu_boot_dev_sts0),
- le32_to_cpu(dyn_regs->cpu_boot_dev_sts1));
- }
-
/* copy boot fit to space allocated by FW */