diff options
author | Dave Airlie <airlied@redhat.com> | 2023-06-09 11:30:37 +1000 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2023-06-09 11:30:44 +1000 |
commit | 7f4f4adb9ba1d9b292e4b3ade0235be2e5ad5da7 (patch) | |
tree | 9f272d74e1adfa3bca4e466f8e0fa6890be1fb8d | |
parent | c9b685df2d2138aa31399b0d146ba095a91c7846 (diff) | |
parent | e6f49e96bc57d34fc0f617f37bfdf62a9b58d2c2 (diff) | |
download | linux-7f4f4adb9ba1d9b292e4b3ade0235be2e5ad5da7.tar.gz linux-7f4f4adb9ba1d9b292e4b3ade0235be2e5ad5da7.tar.bz2 linux-7f4f4adb9ba1d9b292e4b3ade0235be2e5ad5da7.zip |
Merge tag 'drm-habanalabs-next-2023-06-08' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into drm-next
This tag contains additional habanalabs driver changes for v6.5:
- uAPI changes:
- Return 0 when user queries if there was a h/w or f/w error and no such error happened.
Previously we returned an error in such case.
- New features and improvements:
- Add pci health check when we lose connection with the firmware. This can be used to
distinguish between pci link down and firmware getting stuck.
- Add more info to the error print when TPC interrupt occur.
- Reduce amount of code under mutex in the command submission of signal event.
- Firmware related fixes:
- Fixes to the handshake protocol during f/w initialization.
- Display information that the f/w sends us when encountering a DMA error.
- Do soft-reset using a message sent to firmware instead of writing to MMIO.
- Prepare generic code to extract f/w version numbers.
- Bug fixes and code cleanups. Notable fixes are:
- Unsecure certain TPC registers that the user should access.
- Fix handling of QMAN errors
- Fix memory leak when recording errors (to later pass them to the user)
- Multiple fixes to razwi interrupt handling code
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Oded Gabbay <ogabbay@kernel.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20230608103043.GA2699019@ogabbay-vm-u20.habana-labs.com
23 files changed, 557 insertions, 696 deletions
diff --git a/drivers/accel/habanalabs/common/command_buffer.c b/drivers/accel/habanalabs/common/command_buffer.c index 6e09f48750a0..08f7aee42624 100644 --- a/drivers/accel/habanalabs/common/command_buffer.c +++ b/drivers/accel/habanalabs/common/command_buffer.c @@ -27,12 +27,6 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb) return -EINVAL; } - if (!hdev->mmu_enable) { - dev_err_ratelimited(hdev->dev, - "Cannot map CB because MMU is disabled\n"); - return -EINVAL; - } - if (cb->is_mmu_mapped) return 0; diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c index af9d2e22c6e7..c23829dab97a 100644 --- a/drivers/accel/habanalabs/common/command_submission.c +++ b/drivers/accel/habanalabs/common/command_submission.c @@ -280,14 +280,8 @@ bool cs_needs_timeout(struct hl_cs *cs) static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job) { - /* - * Patched CB is created for external queues jobs, and for H/W queues - * jobs if the user CB was allocated by driver and MMU is disabled. - */ - return (job->queue_type == QUEUE_TYPE_EXT || - (job->queue_type == QUEUE_TYPE_HW && - job->is_kernel_allocated_cb && - !hdev->mmu_enable)); + /* Patched CB is created for external queues jobs */ + return (job->queue_type == QUEUE_TYPE_EXT); } /* @@ -363,14 +357,13 @@ static void hl_complete_job(struct hl_device *hdev, struct hl_cs_job *job) } } - /* For H/W queue jobs, if a user CB was allocated by driver and MMU is - * enabled, the user CB isn't released in cs_parser() and thus should be + /* For H/W queue jobs, if a user CB was allocated by driver, + * the user CB isn't released in cs_parser() and thus should be * released here. This is also true for INT queues jobs which were * allocated by driver. */ - if ((job->is_kernel_allocated_cb && - ((job->queue_type == QUEUE_TYPE_HW && hdev->mmu_enable) || - job->queue_type == QUEUE_TYPE_INT))) { + if (job->is_kernel_allocated_cb && + (job->queue_type == QUEUE_TYPE_HW || job->queue_type == QUEUE_TYPE_INT)) { atomic_dec(&job->user_cb->cs_cnt); hl_cb_put(job->user_cb); } @@ -804,12 +797,14 @@ out: static void cs_timedout(struct work_struct *work) { + struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work); + bool skip_reset_on_timeout, device_reset = false; struct hl_device *hdev; u64 event_mask = 0x0; + uint timeout_sec; int rc; - struct hl_cs *cs = container_of(work, struct hl_cs, - work_tdr.work); - bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false; + + skip_reset_on_timeout = cs->skip_reset_on_timeout; rc = cs_get_unless_zero(cs); if (!rc) @@ -840,29 +835,31 @@ static void cs_timedout(struct work_struct *work) event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT; } + timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000; + switch (cs->type) { case CS_TYPE_SIGNAL: dev_err(hdev->dev, - "Signal command submission %llu has not finished in time!\n", - cs->sequence); + "Signal command submission %llu has not finished in %u seconds!\n", + cs->sequence, timeout_sec); break; case CS_TYPE_WAIT: dev_err(hdev->dev, - "Wait command submission %llu has not finished in time!\n", - cs->sequence); + "Wait command submission %llu has not finished in %u seconds!\n", + cs->sequence, timeout_sec); break; case CS_TYPE_COLLECTIVE_WAIT: dev_err(hdev->dev, - "Collective Wait command submission %llu has not finished in time!\n", - cs->sequence); + "Collective Wait command submission %llu has not finished in %u seconds!\n", + cs->sequence, timeout_sec); break; default: dev_err(hdev->dev, - "Command submission %llu has not finished in time!\n", - cs->sequence); + "Command submission %llu has not finished in %u seconds!\n", + cs->sequence, timeout_sec); break; } @@ -1139,11 +1136,10 @@ static void force_complete_cs(struct hl_device *hdev) spin_unlock(&hdev->cs_mirror_lock); } -void hl_abort_waitings_for_completion(struct hl_device *hdev) +void hl_abort_waiting_for_cs_completions(struct hl_device *hdev) { force_complete_cs(hdev); force_complete_multi_cs(hdev); - hl_release_pending_user_interrupts(hdev); } static void job_wq_completion(struct work_struct *work) @@ -1948,8 +1944,7 @@ static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev, else cb_size = hdev->asic_funcs->get_signal_cb_size(hdev); - cb = hl_cb_kernel_create(hdev, cb_size, - q_type == QUEUE_TYPE_HW && hdev->mmu_enable); + cb = hl_cb_kernel_create(hdev, cb_size, q_type == QUEUE_TYPE_HW); if (!cb) { atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); atomic64_inc(&cntr->out_of_mem_drop_cnt); @@ -2152,7 +2147,7 @@ static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id) hdev->asic_funcs->hw_queues_unlock(hdev); rc = -EINVAL; - goto out; + goto out_unlock; } /* @@ -2167,15 +2162,21 @@ static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id) /* Release the id and free allocated memory of the handle */ idr_remove(&mgr->handles, handle_id); + + /* unlock before calling ctx_put, where we might sleep */ + spin_unlock(&mgr->lock); hl_ctx_put(encaps_sig_hdl->ctx); kfree(encaps_sig_hdl); + goto out; } else { rc = -EINVAL; dev_err(hdev->dev, "failed to unreserve signals, cannot find handler\n"); } -out: + +out_unlock: spin_unlock(&mgr->lock); +out: return rc; } diff --git a/drivers/accel/habanalabs/common/debugfs.c b/drivers/accel/habanalabs/common/debugfs.c index 22dd17c077c0..9e84a47a21dc 100644 --- a/drivers/accel/habanalabs/common/debugfs.c +++ b/drivers/accel/habanalabs/common/debugfs.c @@ -255,9 +255,6 @@ static int vm_show(struct seq_file *s, void *data) u64 j; int i; - if (!dev_entry->hdev->mmu_enable) - return 0; - mutex_lock(&dev_entry->ctx_mem_hash_mutex); list_for_each_entry(ctx, &dev_entry->ctx_mem_hash_list, debugfs_list) { @@ -436,9 +433,6 @@ static int mmu_show(struct seq_file *s, void *data) u64 virt_addr = dev_entry->mmu_addr, phys_addr; int i; - if (!hdev->mmu_enable) - return 0; - if (dev_entry->mmu_asid == HL_KERNEL_ASID_ID) ctx = hdev->kernel_ctx; else @@ -496,9 +490,6 @@ static ssize_t mmu_asid_va_write(struct file *file, const char __user *buf, char *c; ssize_t rc; - if (!hdev->mmu_enable) - return count; - if (count > sizeof(kbuf) - 1) goto err; if (copy_from_user(kbuf, buf, count)) @@ -535,9 +526,6 @@ static int mmu_ack_error(struct seq_file *s, void *data) struct hl_device *hdev = dev_entry->hdev; int rc; - if (!hdev->mmu_enable) - return 0; - if (!dev_entry->mmu_cap_mask) { dev_err(hdev->dev, "mmu_cap_mask is not set\n"); goto err; @@ -563,9 +551,6 @@ static ssize_t mmu_ack_error_value_write(struct file *file, char kbuf[MMU_KBUF_SIZE]; ssize_t rc; - if (!hdev->mmu_enable) - return count; - if (count > sizeof(kbuf) - 1) goto err; @@ -661,9 +646,6 @@ static bool hl_is_device_va(struct hl_device *hdev, u64 addr) { struct asic_fixed_properties *prop = &hdev->asic_prop; - if (!hdev->mmu_enable) - goto out; - if (prop->dram_supports_virtual_memory && (addr >= prop->dmmu.start_addr && addr < prop->dmmu.end_addr)) return true; @@ -675,7 +657,7 @@ static bool hl_is_device_va(struct hl_device *hdev, u64 addr) if (addr >= prop->pmmu_huge.start_addr && addr < prop->pmmu_huge.end_addr) return true; -out: + return false; } @@ -685,9 +667,6 @@ static bool hl_is_device_internal_memory_va(struct hl_device *hdev, u64 addr, struct asic_fixed_properties *prop = &hdev->asic_prop; u64 dram_start_addr, dram_end_addr; - if (!hdev->mmu_enable) - return false; - if (prop->dram_supports_virtual_memory) { dram_start_addr = prop->dmmu.start_addr; dram_end_addr = prop->dmmu.end_addr; @@ -1756,17 +1735,15 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent } } -void hl_debugfs_add_device(struct hl_device *hdev) +int hl_debugfs_device_init(struct hl_device *hdev) { struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; int count = ARRAY_SIZE(hl_debugfs_list); dev_entry->hdev = hdev; - dev_entry->entry_arr = kmalloc_array(count, - sizeof(struct hl_debugfs_entry), - GFP_KERNEL); + dev_entry->entry_arr = kmalloc_array(count, sizeof(struct hl_debugfs_entry), GFP_KERNEL); if (!dev_entry->entry_arr) - return; + return -ENOMEM; dev_entry->data_dma_blob_desc.size = 0; dev_entry->data_dma_blob_desc.data = NULL; @@ -1787,21 +1764,14 @@ void hl_debugfs_add_device(struct hl_device *hdev) spin_lock_init(&dev_entry->userptr_spinlock); mutex_init(&dev_entry->ctx_mem_hash_mutex); - dev_entry->root = debugfs_create_dir(dev_name(hdev->dev), - hl_debug_root); - - add_files_to_device(hdev, dev_entry, dev_entry->root); - if (!hdev->asic_prop.fw_security_enabled) - add_secured_nodes(dev_entry, dev_entry->root); + return 0; } -void hl_debugfs_remove_device(struct hl_device *hdev) +void hl_debugfs_device_fini(struct hl_device *hdev) { struct hl_dbg_device_entry *entry = &hdev->hl_debugfs; int i; - debugfs_remove_recursive(entry->root); - mutex_destroy(&entry->ctx_mem_hash_mutex); mutex_destroy(&entry->file_mutex); @@ -1814,6 +1784,24 @@ void hl_debugfs_remove_device(struct hl_device *hdev) kfree(entry->entry_arr); } +void hl_debugfs_add_device(struct hl_device *hdev) +{ + struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; + + dev_entry->root = debugfs_create_dir(dev_name(hdev->dev), hl_debug_root); + + add_files_to_device(hdev, dev_entry, dev_entry->root); + if (!hdev->asic_prop.fw_security_enabled) + add_secured_nodes(dev_entry, dev_entry->root); +} + +void hl_debugfs_remove_device(struct hl_device *hdev) +{ + struct hl_dbg_device_entry *entry = &hdev->hl_debugfs; + + debugfs_remove_recursive(entry->root); +} + void hl_debugfs_add_file(struct hl_fpriv *hpriv) { struct hl_dbg_device_entry *dev_entry = &hpriv->hdev->hl_debugfs; diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index fabfc501ef54..b97339d1f7c6 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -674,7 +674,7 @@ static int device_init_cdev(struct hl_device *hdev, struct class *class, return 0; } -static int device_cdev_sysfs_add(struct hl_device *hdev) +static int cdev_sysfs_debugfs_add(struct hl_device *hdev) { int rc; @@ -699,7 +699,9 @@ static int device_cdev_sysfs_add(struct hl_device *hdev) goto delete_ctrl_cdev_device; } - hdev->cdev_sysfs_created = true; + hl_debugfs_add_device(hdev); + + hdev->cdev_sysfs_debugfs_created = true; return 0; @@ -710,11 +712,12 @@ delete_cdev_device: return rc; } -static void device_cdev_sysfs_del(struct hl_device *hdev) +static void cdev_sysfs_debugfs_remove(struct hl_device *hdev) { - if (!hdev->cdev_sysfs_created) + if (!hdev->cdev_sysfs_debugfs_created) goto put_devices; + hl_debugfs_remove_device(hdev); hl_sysfs_fini(hdev); cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl); cdev_device_del(&hdev->cdev, hdev->dev); @@ -981,6 +984,18 @@ static void device_early_fini(struct hl_device *hdev) hdev->asic_funcs->early_fini(hdev); } +static bool is_pci_link_healthy(struct hl_device *hdev) +{ + u16 vendor_id; + + if (!hdev->pdev) + return false; + + pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id); + + return (vendor_id == PCI_VENDOR_ID_HABANALABS); +} + static void hl_device_heartbeat(struct work_struct *work) { struct hl_device *hdev = container_of(work, struct hl_device, @@ -995,7 +1010,8 @@ static void hl_device_heartbeat(struct work_struct *work) goto reschedule; if (hl_device_operational(hdev, NULL)) - dev_err(hdev->dev, "Device heartbeat failed!\n"); + dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n", + is_pci_link_healthy(hdev) ? "healthy" : "broken"); info.err_type = HL_INFO_FW_HEARTBEAT_ERR; info.event_mask = &event_mask; @@ -1157,6 +1173,16 @@ static void take_release_locks(struct hl_device *hdev) mutex_unlock(&hdev->fpriv_ctrl_list_lock); } +static void hl_abort_waiting_for_completions(struct hl_device *hdev) +{ + hl_abort_waiting_for_cs_completions(hdev); + + /* Release all pending user interrupts, each pending user interrupt + * holds a reference to a user context. + */ + hl_release_pending_user_interrupts(hdev); +} + static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset, bool skip_wq_flush) { @@ -1176,10 +1202,7 @@ static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_r /* flush the MMU prefetch workqueue */ flush_workqueue(hdev->prefetch_wq); - /* Release all pending user interrupts, each pending user interrupt - * holds a reference to user context - */ - hl_release_pending_user_interrupts(hdev); + hl_abort_waiting_for_completions(hdev); } /* @@ -1921,7 +1944,7 @@ out: hl_ctx_put(ctx); - hl_abort_waitings_for_completion(hdev); + hl_abort_waiting_for_completions(hdev); return 0; @@ -2034,7 +2057,7 @@ out_err: int hl_device_init(struct hl_device *hdev) { int i, rc, cq_cnt, user_interrupt_cnt, cq_ready_cnt; - bool add_cdev_sysfs_on_err = false; + bool expose_interfaces_on_err = false; rc = create_cdev(hdev); if (rc) @@ -2150,16 +2173,22 @@ int hl_device_init(struct hl_device *hdev) hdev->device_release_watchdog_timeout_sec = HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC; hdev->memory_scrub_val = MEM_SCRUB_DEFAULT_VAL; - hl_debugfs_add_device(hdev); - /* debugfs nodes are created in hl_ctx_init so it must be called after - * hl_debugfs_add_device. + rc = hl_debugfs_device_init(hdev); + if (rc) { + dev_err(hdev->dev, "failed to initialize debugfs entry structure\n"); + kfree(hdev->kernel_ctx); + goto mmu_fini; + } + + /* The debugfs entry structure is accessed in hl_ctx_init(), so it must be called after + * hl_debugfs_device_init(). */ rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); if (rc) { dev_err(hdev->dev, "failed to initialize kernel context\n"); kfree(hdev->kernel_ctx); - goto remove_device_from_debugfs; + goto debugfs_device_fini; } rc = hl_cb_pool_init(hdev); @@ -2175,11 +2204,10 @@ int hl_device_init(struct hl_device *hdev) } /* - * From this point, override rc (=0) in case of an error to allow - * debugging (by adding char devices and create sysfs nodes as part of - * the error flow). + * From this point, override rc (=0) in case of an error to allow debugging + * (by adding char devices and creating sysfs/debugfs files as part of the error flow). */ - add_cdev_sysfs_on_err = true; + expose_interfaces_on_err = true; /* Device is now enabled as part of the initialization requires * communication with the device firmware to get information that @@ -2221,15 +2249,13 @@ int hl_device_init(struct hl_device *hdev) } /* - * Expose devices and sysfs nodes to user. - * From here there is no need to add char devices and create sysfs nodes - * in case of an error. + * Expose devices and sysfs/debugfs files to user. + * From here there is no need to expose them in case of an error. */ - add_cdev_sysfs_on_err = false; - rc = device_cdev_sysfs_add(hdev); + expose_interfaces_on_err = false; + rc = cdev_sysfs_debugfs_add(hdev); if (rc) { - dev_err(hdev->dev, - "Failed to add char devices and sysfs nodes\n"); + dev_err(hdev->dev, "Failed to add char devices and sysfs/debugfs files\n"); rc = 0; goto out_disabled; } @@ -2275,8 +2301,8 @@ release_ctx: if (hl_ctx_put(hdev->kernel_ctx) != 1) dev_err(hdev->dev, "kernel ctx is still alive on initialization failure\n"); -remove_device_from_debugfs: - hl_debugfs_remove_device(hdev); +debugfs_device_fini: + hl_debugfs_device_fini(hdev); mmu_fini: hl_mmu_fini(hdev); eq_fini: @@ -2300,15 +2326,11 @@ free_dev: put_device(hdev->dev); out_disabled: hdev->disabled = true; - if (add_cdev_sysfs_on_err) - device_cdev_sysfs_add(hdev); - if (hdev->pdev) - dev_err(&hdev->pdev->dev, - "Failed to initialize hl%d. Device %s is NOT usable !\n", - hdev->cdev_idx, dev_name(&(hdev)->pdev->dev)); - else - pr_err("Failed to initialize hl%d. Device %s is NOT usable !\n", - hdev->cdev_idx, dev_name(&(hdev)->pdev->dev)); + if (expose_interfaces_on_err) + cdev_sysfs_debugfs_add(hdev); + dev_err(&hdev->pdev->dev, + "Failed to initialize hl%d. Device %s is NOT usable !\n", + hdev->cdev_idx, dev_name(&hdev->pdev->dev)); return rc; } @@ -2427,8 +2449,6 @@ void hl_device_fini(struct hl_device *hdev) if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1)) dev_err(hdev->dev, "kernel ctx is still alive\n"); - hl_debugfs_remove_device(hdev); - hl_dec_fini(hdev); hl_vm_fini(hdev); @@ -2453,8 +2473,10 @@ void hl_device_fini(struct hl_device *hdev) device_early_fini(hdev); - /* Hide devices and sysfs nodes from user */ - device_cdev_sysfs_del(hdev); + /* Hide devices and sysfs/debugfs files from user */ + cdev_sysfs_debugfs_remove(hdev); + + hl_debugfs_device_fini(hdev); pr_info("removed device successfully\n"); } @@ -2667,3 +2689,11 @@ void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info) if (info->event_mask) *info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR; } + +void hl_enable_err_info_capture(struct hl_error_info *captured_err_info) +{ + vfree(captured_err_info->page_fault_info.user_mappings); + memset(captured_err_info, 0, sizeof(struct hl_error_info)); + atomic_set(&captured_err_info->cs_timeout.write_enable, 1); + captured_err_info->undef_opcode.write_enable = true; +} diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 59f61ec66445..acbc1a6b5cb1 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -71,38 +71,124 @@ free_fw_ver: return NULL; } -static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver) +/** + * extract_u32_until_given_char() - given a string of the format "<u32><char>*", extract the u32. + * @str: the given string + * @ver_num: the pointer to the extracted u32 to be returned to the caller. + * @given_char: the given char at the end of the u32 in the string + * + * Return: Upon success, return a pointer to the given_char in the string. Upon failure, return NULL + */ +static char *extract_u32_until_given_char(char *str, u32 *ver_num, char given_char) { - char major[8], minor[8], *first_dot, *second_dot; - int rc; + char num_str[8] = {}, *ch; - first_dot = strnstr(preboot_ver, ".", 10); - if (first_dot) { - strscpy(major, preboot_ver, first_dot - preboot_ver + 1); - rc = kstrtou32(major, 10, &hdev->fw_major_version); - } else { - rc = -EINVAL; + ch = strchrnul(str, given_char); + if (*ch == '\0' || ch == str || ch - str >= sizeof(num_str)) + return NULL; + + memcpy(num_str, str, ch - str); + if (kstrtou32(num_str, 10, ver_num)) + return NULL; + return ch; +} + +/** + * hl_get_sw_major_minor_subminor() - extract the FW's SW version major, minor, sub-minor + * from the version string + * @hdev: pointer to the hl_device + * @fw_str: the FW's version string + * + * The extracted version is set in the hdev fields: fw_sw_{major/minor/sub_minor}_ver. + * + * fw_str is expected to have one of two possible formats, examples: + * 1) 'Preboot version hl-gaudi2-1.9.0-fw-42.0.1-sec-3' + * 2) 'Preboot version hl-gaudi2-1.9.0-rc-fw-42.0.1-sec-3' + * In those examples, the SW major,minor,subminor are correspondingly: 1,9,0. + * + * Return: 0 for success or a negative error code for failure. + */ +static int hl_get_sw_major_minor_subminor(struct hl_device *hdev, const char *fw_str) +{ + char *end, *start; + + end = strnstr(fw_str, "-rc-", VERSION_MAX_LEN); + if (end == fw_str) + return -EINVAL; + + if (!end) + end = strnstr(fw_str, "-fw-", VERSION_MAX_LEN); + + if (end == fw_str) + return -EINVAL; + + if (!end) + return -EINVAL; + + for (start = end - 1; start != fw_str; start--) { + if (*start == '-') + break; } - if (rc) { - dev_err(hdev->dev, "Error %d parsing preboot major version\n", rc); - return rc; + if (start == fw_str) + return -EINVAL; + + /* start/end point each to the starting and ending hyphen of the sw version e.g. -1.9.0- */ + start++; + start = extract_u32_until_given_char(start, &hdev->fw_sw_major_ver, '.'); + if (!start) + goto err_zero_ver; + + start++; + start = extract_u32_until_given_char(start, &hdev->fw_sw_minor_ver, '.'); + if (!start) + goto err_zero_ver; + + start++; + start = extract_u32_until_given_char(start, &hdev->fw_sw_sub_minor_ver, '-'); + if (!start) + goto err_zero_ver; + + return 0; + +err_zero_ver: + hdev->fw_sw_major_ver = 0; + hdev->fw_sw_minor_ver = 0; + hdev->fw_sw_sub_minor_ver = 0; + return -EINVAL; +} + +/** + * hl_get_preboot_major_minor() - extract the FW's version major, minor from the version string. + * @hdev: pointer to the hl_device + * @preboot_ver: the FW's version string + * + * preboot_ver is expected to be the format of <major>.<minor>.<sub minor>*, e.g: 42.0.1-sec-3 + * The extracted version is set in the hdev fields: fw_inner_{major/minor}_ver. + * + * Return: 0 on success, negative error code for failure. + */ +static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver) +{ + preboot_ver = extract_u32_until_given_char(preboot_ver, &hdev->fw_inner_major_ver, '.'); + if (!preboot_ver) { + dev_err(hdev->dev, "Error parsing preboot major version\n"); + goto err_zero_ver; } - /* skip the first dot */ - first_dot++; + preboot_ver++; - second_dot = strnstr(first_dot, ".", 10); - if (second_dot) { - strscpy(minor, first_dot, second_dot - first_dot + 1); - rc = kstrtou32(minor, 10, &hdev->fw_minor_version); - } else { - rc = -EINVAL; + preboot_ver = extract_u32_until_given_char(preboot_ver, &hdev->fw_inner_minor_ver, '.'); + if (!preboot_ver) { + dev_err(hdev->dev, "Error parsing preboot minor version\n"); + goto err_zero_ver; } + return 0; - if (rc) - dev_err(hdev->dev, "Error %d parsing preboot minor version\n", rc); - return rc; +err_zero_ver: + hdev->fw_inner_major_ver = 0; + hdev->fw_inner_minor_ver = 0; + return -EINVAL; } static int hl_request_fw(struct hl_device *hdev, @@ -505,6 +591,20 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, size); } +int hl_fw_send_soft_reset(struct hl_device *hdev) +{ + struct cpucp_packet pkt; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + pkt.ctl = cpu_to_le32(CPUCP_PACKET_SOFT_RESET << CPUCP_PKT_CTL_OPCODE_SHIFT); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); + if (rc) + dev_err(hdev->dev, "failed to send soft-reset msg (err = %d)\n", rc); + + return rc; +} + int hl_fw_send_device_activity(struct hl_device *hdev, bool open) { struct cpucp_packet pkt; @@ -1268,8 +1368,10 @@ void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev) void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev) { - struct static_fw_load_mgr *static_loader = - &hdev->fw_loader.static_loader; + struct fw_load_mgr *fw_loader = &hdev->fw_loader; + u32 status, cpu_boot_status_reg, cpu_timeout; + struct static_fw_load_mgr *static_loader; + struct pre_fw_load_props *pre_fw_load; int rc; if (hdev->device_cpu_is_halted) @@ -1277,12 +1379,28 @@ void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev) /* Stop device CPU to make sure nothing bad happens */ if (hdev->asic_prop.dynamic_fw_load) { + pre_fw_load = &fw_loader->pre_fw_load; + cpu_timeout = fw_loader->cpu_timeout; + cpu_boot_status_reg = pre_fw_load->cpu_boot_status_reg; + rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader, - COMMS_GOTO_WFE, 0, false, - hdev->fw_loader.cpu_timeout); - if (rc) + COMMS_GOTO_WFE, 0, false, cpu_timeout); + if (rc) { dev_err(hdev->dev, "Failed sending COMMS_GOTO_WFE\n"); + } else { + rc = hl_poll_timeout( + hdev, + cpu_boot_status_reg, + status, + status == CPU_BOOT_STATUS_IN_WFE, + hdev->fw_poll_interval_usec, + cpu_timeout); + if (rc) + dev_err(hdev->dev, "Current status=%u. Timed-out updating to WFE\n", + status); + } } else { + static_loader = &hdev->fw_loader.static_loader; WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE); msleep(static_loader->cpu_reset_wait_msec); @@ -2151,6 +2269,7 @@ static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev, struct asic_fixed_properties *prop = &hdev->asic_prop; char *preboot_ver, *boot_ver; char btl_ver[32]; + int rc; switch (fwc) { case FW_COMP_BOOT_FIT: @@ -2164,20 +2283,20 @@ static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev, break; case FW_COMP_PREBOOT: strscpy(prop->preboot_ver, fw_version, VERSION_MAX_LEN); - preboot_ver = strnstr(prop->preboot_ver, "Preboot", - VERSION_MAX_LEN); + preboot_ver = strnstr(prop->preboot_ver, "Preboot", VERSION_MAX_LEN); + dev_info(hdev->dev, "preboot full version: '%s'\n", preboot_ver); + if (preboot_ver && preboot_ver != prop->preboot_ver) { strscpy(btl_ver, prop->preboot_ver, min((int) (preboot_ver - prop->preboot_ver), 31)); dev_info(hdev->dev, "%s\n", btl_ver); } + rc = hl_get_sw_major_minor_subminor(hdev, preboot_ver); + if (rc) + return rc; preboot_ver = extract_fw_ver_from_str(prop->preboot_ver); if (preboot_ver) { - int rc; - - dev_info(hdev->dev, "preboot version %s\n", preboot_ver); - rc = hl_get_preboot_major_minor(hdev, preboot_ver); kfree(preboot_ver); if (rc) @@ -2367,16 +2486,6 @@ static int hl_fw_dynamic_load_image(struct hl_device *hdev, if (rc) goto release_fw; - /* update state according to boot stage */ - if (cur_fwc == FW_COMP_BOOT_FIT) { - struct cpu_dyn_regs *dyn_regs; - - dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs; - hl_fw_boot_fit_update_state(hdev, - le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), - le32_to_cpu(dyn_regs->cpu_boot_dev_sts1)); - } - /* copy boot fit to space allocated by FW */ |