Merge tag 'misc-habanalabs-next-2019-07-04' of git://people.freedesktop.org/~gabbayo/linux into char-misc-next

Oded writes: This tag contains the following changes for kernel 5.3: - Change the way the device's CPU accesses the host memory. This allows the driver to use the kernel API of setting DMA mask in a standard way (call it once). - Add a new debugfs entry to show the status of the internal DMA and compute engines. This is very helpful for debugging in case a command submission gets stuck. - Return to the user a mask of the internal engines indicating their busy state. - Make sure to restore registers that can be modified by the user to their default values. Only applies to registers that are initialized by the driver. - Elimination of redundant and dead code. - Support memset of the device's memory with size larger than 4GB - Force the user to set the device to debug mode before configuring the device's coresight infrastructure - Improve error printing in case of interrupts from the device * tag 'misc-habanalabs-next-2019-07-04' of git://people.freedesktop.org/~gabbayo/linux: (31 commits) habanalabs: Add busy engines bitmask to HW idle IOCTL habanalabs: Add debugfs node for engines status habanalabs: Update the device idle check habanalabs: Allow accessing host mapped addresses via debugfs habanalabs: add WARN in case of bad MMU mapping habanalabs: remove DMA mask hack for Goya habanalabs: set Goya CPU to use ASIC MMU habanalabs: add MMU mappings for Goya CPU habanalabs: initialize MMU context for driver habanalabs: de-couple MMU and VM module initialization habanalabs: initialize device CPU queues after MMU init docs/habanalabs: update text for some entries in sysfs habanalabs: add rate-limit to an error message habanalabs: remove simulator dedicated code habanalabs: restore unsecured registers default values habanalabs: clear sobs and monitors in context switch habanalabs: make tpc registers secured habanalabs: don't limit packet size for device CPU habanalabs: support device memory memset > 4GB habanalabs: print event name for fatal and non-RAZWI events ...
author: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2019-07-04 10:20:48 +0200
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2019-07-04 10:20:48 +0200
commit: a94de2e7a380064a6e8a4c3e5e10d97e8aa711f4 (patch)
tree: 1026bbbd6ccc8241fe58bd3742353c647b459c5c /drivers
parent: 60e8523e2ea18dc0c0cea69d6c1d69a065019062 (diff)
parent: e8960ca06bb22d0d84edf246b0bf395e8322e127 (diff)
download: linux-a94de2e7a380064a6e8a4c3e5e10d97e8aa711f4.tar.gz
linux-a94de2e7a380064a6e8a4c3e5e10d97e8aa711f4.tar.bz2
linux-a94de2e7a380064a6e8a4c3e5e10d97e8aa711f4.zip
19 files changed, 1181 insertions, 441 deletions
diff --git a/drivers/misc/habanalabs/asid.c b/drivers/misc/habanalabs/asid.c
index f54e7971a762..2c01461701a3 100644
--- a/drivers/misc/habanalabs/asid.c
+++ b/drivers/misc/habanalabs/asid.c
@@ -18,7 +18,7 @@ int hl_asid_init(struct hl_device *hdev)
 
 	mutex_init(&hdev->asid_mutex);
 
-	/* ASID 0 is reserved for KMD */
+	/* ASID 0 is reserved for KMD and device CPU */
 	set_bit(0, hdev->asid_bitmap);
 
 	return 0;
diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index 6fe785e26859..6ad83d5ef4b0 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -682,14 +682,12 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 		u32 tmp;
 
 		rc = hl_poll_timeout_memory(hdev,
-			(u64) (uintptr_t) &ctx->thread_ctx_switch_wait_token,
-			jiffies_to_usecs(hdev->timeout_jiffies),
-			&tmp);
+			&ctx->thread_ctx_switch_wait_token, tmp, (tmp == 1),
+			100, jiffies_to_usecs(hdev->timeout_jiffies));
 
-		if (rc || !tmp) {
+		if (rc == -ETIMEDOUT) {
 			dev_err(hdev->dev,
-				"context switch phase didn't finish in time\n");
-			rc = -ETIMEDOUT;
+				"context switch phase timeout (%d)\n", tmp);
 			goto out;
 		}
 	}
diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c
index f4c92f110a72..8682590e3f6e 100644
--- a/drivers/misc/habanalabs/context.c
+++ b/drivers/misc/habanalabs/context.c
@@ -31,9 +31,13 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 		 * Coresight might be still working by accessing addresses
 		 * related to the stopped engines. Hence stop it explicitly.
 		 */
-		hdev->asic_funcs->halt_coresight(hdev);
+		if (hdev->in_debug)
+			hl_device_set_debug_mode(hdev, false);
+
 		hl_vm_ctx_fini(ctx);
 		hl_asid_free(hdev, ctx->asid);
+	} else {
+		hl_mmu_ctx_fini(ctx);
 	}
 }
 
@@ -117,6 +121,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 
 	if (is_kernel_ctx) {
 		ctx->asid = HL_KERNEL_ASID_ID; /* KMD gets ASID 0 */
+		rc = hl_mmu_ctx_init(ctx);
+		if (rc) {
+			dev_err(hdev->dev, "Failed to init mmu ctx module\n");
+			goto mem_ctx_err;
+		}
 	} else {
 		ctx->asid = hl_asid_alloc(hdev);
 		if (!ctx->asid) {
diff --git a/drivers/misc/habanalabs/debugfs.c b/drivers/misc/habanalabs/debugfs.c
index ba418aaa404c..18e499c900c7 100644
--- a/drivers/misc/habanalabs/debugfs.c
+++ b/drivers/misc/habanalabs/debugfs.c
@@ -355,7 +355,7 @@ static int mmu_show(struct seq_file *s, void *data)
 	struct hl_debugfs_entry *entry = s->private;
 	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
 	struct hl_device *hdev = dev_entry->hdev;
-	struct hl_ctx *ctx = hdev->user_ctx;
+	struct hl_ctx *ctx;
 
 	u64 hop0_addr = 0, hop0_pte_addr = 0, hop0_pte = 0,
 		hop1_addr = 0, hop1_pte_addr = 0, hop1_pte = 0,
@@ -367,6 +367,11 @@ static int mmu_show(struct seq_file *s, void *data)
 	if (!hdev->mmu_enable)
 		return 0;
 
+	if (dev_entry->mmu_asid == HL_KERNEL_ASID_ID)
+		ctx = hdev->kernel_ctx;
+	else
+		ctx = hdev->user_ctx;
+
 	if (!ctx) {
 		dev_err(hdev->dev, "no ctx available\n");
 		return 0;
@@ -495,6 +500,36 @@ err:
 	return -EINVAL;
 }
 
+static int engines_show(struct seq_file *s, void *data)
+{
+	struct hl_debugfs_entry *entry = s->private;
+	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
+	struct hl_device *hdev = dev_entry->hdev;
+
+	hdev->asic_funcs->is_device_idle(hdev, NULL, s);
+
+	return 0;
+}
+
+static bool hl_is_device_va(struct hl_device *hdev, u64 addr)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+
+	if (!hdev->mmu_enable)
+		goto out;
+
+	if (hdev->dram_supports_virtual_memory &&
+			addr >= prop->va_space_dram_start_address &&
+			addr < prop->va_space_dram_end_address)
+		return true;
+
+	if (addr >= prop->va_space_host_start_address &&
+			addr < prop->va_space_host_end_address)
+		return true;
+out:
+	return false;
+}
+
 static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr,
 				u64 *phys_addr)
 {
@@ -568,7 +603,6 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf,
 {
 	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
 	struct hl_device *hdev = entry->hdev;
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	char tmp_buf[32];
 	u64 addr = entry->addr;
 	u32 val;
@@ -577,11 +611,8 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf,
 	if (*ppos)
 		return 0;
 
-	if (addr >= prop->va_space_dram_start_address &&
-			addr < prop->va_space_dram_end_address &&
-			hdev->mmu_enable &&
-			hdev->dram_supports_virtual_memory) {
-		rc = device_va_to_pa(hdev, entry->addr, &addr);
+	if (hl_is_device_va(hdev, addr)) {
+		rc = device_va_to_pa(hdev, addr, &addr);
 		if (rc)
 			return rc;
 	}
@@ -602,7 +633,6 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
 {
 	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
 	struct hl_device *hdev = entry->hdev;
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	u64 addr = entry->addr;
 	u32 value;
 	ssize_t rc;
@@ -611,11 +641,8 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
 	if (rc)
 		return rc;
 
-	if (addr >= prop->va_space_dram_start_address &&
-			addr < prop->va_space_dram_end_address &&
-			hdev->mmu_enable &&
-			hdev->dram_supports_virtual_memory) {
-		rc = device_va_to_pa(hdev, entry->addr, &addr);
+	if (hl_is_device_va(hdev, addr)) {
+		rc = device_va_to_pa(hdev, addr, &addr);
 		if (rc)
 			return rc;
 	}
@@ -877,6 +904,7 @@ static const struct hl_info_list hl_debugfs_list[] = {
 	{"userptr", userptr_show, NULL},
 	{"vm", vm_show, NULL},
 	{"mmu", mmu_show, mmu_write},
+	{"engines", engines_show, NULL}
 };
 
 static int hl_debugfs_open(struct inode *inode, struct file *file)
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 0b19d3eefb98..0c4894dd9c02 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -231,6 +231,7 @@ static int device_early_init(struct hl_device *hdev)
 
 	mutex_init(&hdev->fd_open_cnt_lock);
 	mutex_init(&hdev->send_cpu_message_lock);
+	mutex_init(&hdev->debug_lock);
 	mutex_init(&hdev->mmu_cache_lock);
 	INIT_LIST_HEAD(&hdev->hw_queues_mirror_list);
 	spin_lock_init(&hdev->hw_queues_mirror_lock);
@@ -262,6 +263,7 @@ early_fini:
 static void device_early_fini(struct hl_device *hdev)
 {
 	mutex_destroy(&hdev->mmu_cache_lock);
+	mutex_destroy(&hdev->debug_lock);
 	mutex_destroy(&hdev->send_cpu_message_lock);
 
 	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
@@ -324,7 +326,15 @@ static int device_late_init(struct hl_device *hdev)
 {
 	int rc;
 
-	INIT_DELAYED_WORK(&hdev->work_freq, set_freq_to_low_job);
+	if (hdev->asic_funcs->late_init) {
+		rc = hdev->asic_funcs->late_init(hdev);
+		if (rc) {
+			dev_err(hdev->dev,
+				"failed late initialization for the H/W\n");
+			return rc;
+		}
+	}
+
 	hdev->high_pll = hdev->asic_prop.high_pll;
 
 	/* force setting to low frequency */
@@ -335,17 +345,9 @@ static int device_late_init(struct hl_device *hdev)
 	else
 		hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST);
 
-	if (hdev->asic_funcs->late_init) {
-		rc = hdev->asic_funcs->late_init(hdev);
-		if (rc) {
-			dev_err(hdev->dev,
-				"failed late initialization for the H/W\n");
-			return rc;
-		}
-	}
-
+	INIT_DELAYED_WORK(&hdev->work_freq, set_freq_to_low_job);
 	schedule_delayed_work(&hdev->work_freq,
-			usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
+	usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
 
 	if (hdev->heartbeat) {
 		INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
@@ -420,6 +422,52 @@ int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq)
 	return 1;
 }
 
+int hl_device_set_debug_mode(struct hl_device *hdev, bool enable)
+{
+	int rc = 0;
+
+	mutex_lock(&hdev->debug_lock);
+
+	if (!enable) {
+		if (!hdev->in_debug) {
+			dev_err(hdev->dev,
+				"Failed to disable debug mode because device was not in debug mode\n");
+			rc = -EFAULT;
+			goto out;
+		}
+
+		hdev->asic_funcs->halt_coresight(hdev);
+		hdev->in_debug = 0;
+
+		goto out;
+	}
+
+	if (hdev->in_debug) {
+		dev_err(hdev->dev,
+			"Failed to enable debug mode because device is already in debug mode\n");
+		rc = -EFAULT;
+		goto out;
+	}
+
+	mutex_lock(&hdev->fd_open_cnt_lock);
+
+	if (atomic_read(&hdev->fd_open_cnt) > 1) {
+		dev_err(hdev->dev,
+			"Failed to enable debug mode. More then a single user is using the device\n");
+		rc = -EPERM;
+		goto unlock_fd_open_lock;
+	}
+
+	hdev->in_debug = 1;
+
+unlock_fd_open_lock:
+	mutex_unlock(&hdev->fd_open_cnt_lock);
+out:
+	mutex_unlock(&hdev->debug_lock);
+
+	return rc;
+}
+
 /*
  * hl_device_suspend - initiate device suspend
  *
@@ -647,13 +695,6 @@ again:
 
 		hdev->hard_reset_pending = true;
 
-		if (!hdev->pdev) {
-			dev_err(hdev->dev,
-				"Reset action is NOT supported in simulator\n");
-			rc = -EINVAL;
-			goto out_err;
-		}
-
 		device_reset_work = kzalloc(sizeof(*device_reset_work),
 						GFP_ATOMIC);
 		if (!device_reset_work) {
@@ -704,6 +745,7 @@ again:
 
 	if (hard_reset) {
 		hl_vm_fini(hdev);
+		hl_mmu_fini(hdev);
 		hl_eq_reset(hdev, &hdev->event_queue);
 	}
 
@@ -731,6 +773,13 @@ again:
 			goto out_err;
 		}
 
+		rc = hl_mmu_init(hdev);
+		if (rc) {
+			dev_err(hdev->dev,
+				"Failed to initialize MMU S/W after hard reset\n");
+			goto out_err;
+		}
+
 		/* Allocate the kernel context */
 		hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
 						GFP_KERNEL);
@@ -902,11 +951,18 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 		goto cq_fini;
 	}
 
+	/* MMU S/W must be initialized before kernel context is created */
+	rc = hl_mmu_init(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n");
+		goto eq_fini;
+	}
+
 	/* Allocate the kernel context */
 	hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
 	if (!hdev->kernel_ctx) {
 		rc = -ENOMEM;
-		goto eq_fini;
+		goto mmu_fini;
 	}
 
 	hdev->user_ctx = NULL;
@@ -954,8 +1010,6 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 		goto out_disabled;
 	}
 
-	/* After test_queues, KMD can start sending messages to device CPU */
-
 	rc = device_late_init(hdev);
 	if (rc) {
 		dev_err(hdev->dev, "Failed late initialization\n");
@@ -1001,6 +1055,8 @@ release_ctx:
 			"kernel ctx is still alive on initialization failure\n");
 free_ctx:
 	kfree(hdev->kernel_ctx);
+mmu_fini:
+	hl_mmu_fini(hdev);
 eq_fini:
 	hl_eq_fini(hdev, &hdev->event_queue);
 cq_fini:
@@ -1105,6 +1161,8 @@ void hl_device_fini(struct hl_device *hdev)
 
 	hl_vm_fini(hdev);
 
+	hl_mmu_fini(hdev);
+
 	hl_eq_fini(hdev, &hdev->event_queue);
 
 	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
@@ -1126,95 +1184,6 @@ void hl_device_fini(struct hl_device *hdev)
 }
 
 /*
- * hl_poll_timeout_memory - Periodically poll a host memory address
- *                              until it is not zero or a timeout occurs
- * @hdev: pointer to habanalabs device structure
- * @addr: Address to poll
- * @timeout_us: timeout in us
- * @val: Variable to read the value into
- *
- * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
- * case, the last read value at @addr is stored in @val. Must not
- * be called from atomic context if sleep_us or timeout_us are used.
- *
- * The function sleeps for 100us with timeout value of
- * timeout_us
- */
-int hl_poll_timeout_memory(struct hl_device *hdev, u64 addr,
-				u32 timeout_us, u32 *val)
-{
-	/*
-	 * address in this function points always to a memory location in the
-	 * host's (server's) memory. That location is updated asynchronously
-	 * either by the direct access of the device or by another core
-	 */
-	u32 *paddr = (u32 *) (uintptr_t) addr;
-	ktime_t timeout;
-
-	/* timeout should be longer when working with simulator */
-	if (!hdev->pdev)
-		timeout_us *= 10;
-
-	timeout = ktime_add_us(ktime_get(), timeout_us);
-
-	might_sleep();
-
-	for (;;) {
-		/*
-		 * Flush CPU read/write buffers to make sure we read updates
-		 * done by other cores or by the device
-		 */
-		mb();
-		*val = *paddr;
-		if (*val)
-			break;
-		if (ktime_compare(ktime_get(), timeout) > 0) {
-			*val = *paddr;
-			break;
-		}
-		usleep_range((100 >> 2) + 1, 100);
-	}
-
-	return *val ? 0 : -ETIMEDOUT;
-}
-
-/*
- * hl_poll_timeout_devicememory - Periodically poll a device memory address
- *                                until it is not zero or a timeout occurs
- * @hdev: pointer to habanalabs device structure
- * @addr: Device address to poll
- * @timeout_us: timeout in us
- * @val: Variable to read the value into
- *
- * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
- * case, the last read value at @addr is stored in @val. Must not
- * be called from atomic context if sleep_us or timeout_us are used.
- *
- * The function sleeps for 100us with timeout value of
- * timeout_us
- */
-int hl_poll_timeout_device_memory(struct hl_device *hdev, void __iomem *addr,
-				u32 timeout_us, u32 *val)
-{
-	ktime_t timeout = ktime_add_us(ktime_get(), timeout_us);
-
-	might_sleep();
-
-	for (;;) {
-		*val = readl(addr);
-		if (*val)
-			break;
-		if (ktime_compare(ktime_get(), timeout) > 0) {
-			*val = readl(addr);
-			break;
-		}
-		usleep_range((100 >> 2) + 1, 100);
-	}
-
-	return *val ? 0 : -ETIMEDOUT;
-}
-
-/*
  * MMIO register access helper functions.
  */
 
diff --git a/drivers/misc/habanalabs/firmware_if.c b/drivers/misc/habanalabs/firmware_if.c
index eda5d7fcb79f..cc8168bacb24 100644
--- a/drivers/misc/habanalabs/firmware_if.c
+++ b/drivers/misc/habanalabs/firmware_if.c
@@ -29,13 +29,13 @@ int hl_fw_push_fw_to_device(struct hl_device *hdev, const char *fw_name,
 
 	rc = request_firmware(&fw, fw_name, hdev->dev);
 	if (rc) {
-		dev_err(hdev->dev, "Failed to request %s\n", fw_name);
+		dev_err(hdev->dev, "Firmware file %s is not found!\n", fw_name);
 		goto out;
 	}
 
 	fw_size = fw->size;
 	if ((fw_size % 4) != 0) {
-		dev_err(hdev->dev, "illegal %s firmware size %zu\n",
+		dev_err(hdev->dev, "Illegal %s firmware size %zu\n",
 			fw_name, fw_size);
 		rc = -EINVAL;
 		goto out;
@@ -85,12 +85,6 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 	u32 tmp;
 	int rc = 0;
 
-	if (len > HL_CPU_CB_SIZE) {
-		dev_err(hdev->dev, "Invalid CPU message size of %d bytes\n",
-			len);
-		return -ENOMEM;
-	}
-
 	pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len,
 								&pkt_dma_addr);
 	if (!pkt) {
@@ -117,33 +111,28 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 		goto out;
 	}
 
-	rc = hl_poll_timeout_memory(hdev, (u64) (uintptr_t) &pkt->fence,
-					timeout, &tmp);
+	rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
+				(tmp == ARMCP_PACKET_FENCE_VAL), 1000, timeout);
 
 	hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
 
 	if (rc == -ETIMEDOUT) {
-		dev_err(hdev->dev, "Timeout while waiting for device CPU\n");
+		dev_err(hdev->dev, "Device CPU packet timeout (0x%x)\n", tmp);
 		hdev->device_cpu_disabled = true;
 		goto out;
 	}
 
-	if (tmp == ARMCP_PACKET_FENCE_VAL) {
-		u32 ctl = le32_to_cpu(pkt->ctl);
+	tmp = le32_to_cpu(pkt->ctl);
 
-		rc = (ctl & ARMCP_PKT_CTL_RC_MASK) >> ARMCP_PKT_CTL_RC_SHIFT;
-		if (rc) {
-			dev_err(hdev->dev,
-				"F/W ERROR %d for CPU packet %d\n",
-				rc, (ctl & ARMCP_PKT_CTL_OPCODE_MASK)
+	rc = (tmp & ARMCP_PKT_CTL_RC_MASK) >> ARMCP_PKT_CTL_RC_SHIFT;
+	if (rc) {
+		dev_err(hdev->dev, "F/W ERROR %d for CPU packet %d\n",
+			rc,
+			(tmp & ARMCP_PKT_CTL_OPCODE_MASK)
 						>> ARMCP_PKT_CTL_OPCODE_SHIFT);
-			rc = -EINVAL;
-		} else if (result) {
-			*result = (long) le64_to_cpu(pkt->result);
-		}
-	} else {
-		dev_err(hdev->dev, "CPU packet wrong fence value\n");
-		rc = -EINVAL;
+		rc = -EIO;
+	} else if (result) {
+		*result = (long) le64_to_cpu(pkt->result);
 	}
 
 out:
@@ -186,9 +175,6 @@ void *hl_fw_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
 {
 	u64 kernel_addr;
 
-	/* roundup to HL_CPU_PKT_SIZE */
-	size = (size + (HL_CPU_PKT_SIZE - 1)) & HL_CPU_PKT_MASK;
-
 	kernel_addr = gen_pool_alloc(hdev->cpu_accessible_dma_pool, size);
 
 	*dma_handle = hdev->cpu_accessible_dma_address +
@@ -200,9 +186,6 @@ void *hl_fw_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
 void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
 					void *vaddr)
 {
-	/* roundup to HL_CPU_PKT_SIZE */
-	size = (size + (HL_CPU_PKT_SIZE - 1)) & HL_CPU_PKT_MASK;
-
 	gen_pool_free(hdev->cpu_accessible_dma_pool, (u64) (uintptr_t) vaddr,
 			size);
 }
@@ -256,7 +239,7 @@ int hl_fw_armcp_info_get(struct hl_device *hdev)
 					HL_ARMCP_INFO_TIMEOUT_USEC, &result);
 	if (rc) {
 		dev_err(hdev->dev,
-			"Failed to send armcp info pkt, error %d\n", rc);
+			"Failed to send ArmCP info pkt, error %d\n", rc);
 		goto out;
 	}
 
@@ -291,7 +274,7 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
 					max_size, &eeprom_info_dma_addr);
 	if (!eeprom_info_cpu_addr) {
 		dev_err(hdev->dev,
-			"Failed to allocate DMA memory for EEPROM info packet\n");
+			"Failed to allocate DMA memory for ArmCP EEPROM packet\n");
 		return -ENOMEM;
 	}
 
@@ -307,7 +290,7 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
 
 	if (rc) {
 		dev_err(hdev->dev,
-			"Failed to send armcp EEPROM pkt, error %d\n", rc);
+			"Failed to send ArmCP EEPROM packet, error %d\n", rc);
 		goto out;
 	}
 
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 02d116b01a1a..75294ec65257 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -14,6 +14,8 @@
 #include <linux/genalloc.h>
 #include <linux/hwmon.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/iommu.h>
+#include <linux/seq_file.h>
 
 /*
  * GOYA security scheme:
@@ -89,6 +91,30 @@
 #define GOYA_CB_POOL_CB_CNT		512
 #define GOYA_CB_POOL_CB_SIZE		0x20000		/* 128KB */
 
+#define IS_QM_IDLE(engine, qm_glbl_sts0) \
+	(((qm_glbl_sts0) & engine##_QM_IDLE_MASK) == engine##_QM_IDLE_MASK)
+#define IS_DMA_QM_IDLE(qm_glbl_sts0)	IS_QM_IDLE(DMA, qm_glbl_sts0)
+#define IS_TPC_QM_IDLE(qm_glbl_sts0)	IS_QM_IDLE(TPC, qm_glbl_sts0)
+#define IS_MME_QM_IDLE(qm_glbl_sts0)	IS_QM_IDLE(MME, qm_glbl_sts0)
+
+#define IS_CMDQ_IDLE(engine, cmdq_glbl_sts0) \
+	(((cmdq_glbl_sts0) & engine##_CMDQ_IDLE_MASK) == \
+			engine##_CMDQ_IDLE_MASK)
+#define IS_TPC_CMDQ_IDLE(cmdq_glbl_sts0) \
+	IS_CMDQ_IDLE(TPC, cmdq_glbl_sts0)
+#define IS_MME_CMDQ_IDLE(cmdq_glbl_sts0) \
+	IS_CMDQ_IDLE(MME, cmdq_glbl_sts0)
+
+#define IS_DMA_IDLE(dma_core_sts0) \
+	!((dma_core_sts0) & DMA_CH_0_STS0_DMA_BUSY_MASK)
+
+#define IS_TPC_IDLE(tpc_cfg_sts) \
+	(((tpc_cfg_sts) & TPC_CFG_IDLE_MASK) == TPC_CFG_IDLE_MASK)
+
+#define IS_MME_IDLE(mme_arch_sts) \
+	(((mme_arch_sts) & MME_ARCH_IDLE_MASK) == MME_ARCH_IDLE_MASK)
+
+
 static const char goya_irq_name[GOYA_MSIX_ENTRIES][GOYA_MAX_STRING_LEN] = {
 		"goya cq 0", "goya cq 1", "goya cq 2", "goya cq 3",
 		"goya cq 4", "goya cpu eq"
@@ -297,6 +323,11 @@ static u32 goya_all_events[] = {
 	GOYA_ASYNC_EVENT_ID_DMA_BM_CH4
 };
 
+static int goya_mmu_clear_pgt_range(struct hl_device *hdev);
+static int goya_mmu_set_dram_default_page(struct hl_device *hdev);
+static int goya_mmu_add_mappings_for_device_cpu(struct hl_device *hdev);
+static void goya_mmu_prepare(struct hl_device *hdev, u32 asid);
+
 void goya_get_fixed_properties(struct hl_device *hdev)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
@@ -467,7 +498,7 @@ static int goya_early_init(struct hl_device *hdev)
 
 	prop->dram_pci_bar_size = pci_resource_len(pdev, DDR_BAR_ID);
 
-	rc = hl_pci_init(hdev, 39);
+	rc = hl_pci_init(hdev, 48);
 	if (rc)
 		return rc;
 
@@ -539,9 +570,36 @@ int goya_late_init(struct hl_device *hdev)
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	int rc;
 
+	goya_fetch_psoc_frequency(hdev);
+
+	rc = goya_mmu_clear_pgt_range(hdev);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to clear MMU page tables range %d\n", rc);
+		return rc;
+	}
+
+	rc = goya_mmu_set_dram_default_page(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to set DRAM default page %d\n", rc);
+		return rc;
+	}
+
+	rc = goya_mmu_add_mappings_for_device_cpu(hdev);
+	if (rc)
+		return rc;
+
+	rc = goya_init_cpu_queues(hdev);
+	if (rc)
+		return rc;
+
+	rc = goya_test_cpu_queue(hdev);
+	if (rc)
+		return rc;
+
 	rc = goya_armcp_info_get(hdev);
 	if (rc) {
-		dev_err(hdev->dev, "Failed to get armcp info\n");
+		dev_err(hdev->dev, "Failed to get armcp info %d\n", rc);
 		return rc;
 	}
 
@@ -553,33 +611,15 @@ int goya_late_init(struct hl_device *hdev)
 
 	rc = hl_fw_send_pci_access_msg(hdev, ARMCP_PACKET_ENABLE_PCI_ACCESS);
 	if (rc) {
-		dev_err(hdev->dev, "Failed to enable PCI access from CPU\n");
+		dev_err(hdev->dev,
+			"Failed to enable PCI access from CPU %d\n", rc);
 		return rc;
 	}
 
 	WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
 			GOYA_ASYNC_EVENT_ID_INTS_REGISTER);
 
-	goya_fetch_psoc_frequency(hdev);
-
-	rc = goya_mmu_clear_pgt_range(hdev);
-	if (rc) {
-		dev_err(hdev->dev, "Failed to clear MMU page tables range\n");
-		goto disable_pci_access;
-	}
-
-	rc = goya_mmu_set_dram_default_page(hdev);
-	if (rc) {
-		dev_err(hdev->dev, "Failed to set DRAM default page\n");
-		goto disable_pci_access;
-	}
-
 	return 0;
-
-disable_pci_access:
-	hl_fw_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS);
-
-	return rc;
 }
 
 /*
@@ -655,7 +695,10 @@ static int goya_sw_init(struct hl_device *hdev)
 		goto free_dma_pool;
 	}
 
-	hdev->cpu_accessible_dma_pool = gen_pool_create(HL_CPU_PKT_SHIFT, -1);
+	dev_dbg(hdev->dev, "cpu accessible memory at bus address 0x%llx\n",
+		hdev->cpu_accessible_dma_address);
+
+	hdev->cpu_accessible_dma_pool = gen_pool_create(ilog2(32), -1);
 	if (!hdev->cpu_accessible_dma_pool) {
 		dev_err(hdev->dev,
 			"Failed to create CPU accessible DMA pool\n");
@@ -786,7 +829,6 @@ static void goya_init_dma_ch(struct hl_device *hdev, int dma_id)
 	else
 		sob_addr = CFG_BASE + mmSYNC_MNGR_SOB_OBJ_1007;
 
-	WREG32(mmDMA_CH_0_WR_COMP_ADDR_LO + reg_off, lower_32_bits(sob_addr));
 	WREG32(mmDMA_CH_0_WR_COMP_ADDR_HI + reg_off, upper_32_bits(sob_addr));
 	WREG32(mmDMA_CH_0_WR_COMP_WDATA + reg_off, 0x80000001);
 }
@@ -973,9 +1015,9 @@ int goya_init_cpu_queues(struct hl_device *hdev)
 	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_3, upper_32_bits(eq->bus_address));
 
 	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_8,
-			lower_32_bits(hdev->cpu_accessible_dma_address));
+			lower_32_bits(VA_CPU_ACCESSIBLE_MEM_ADDR));
 	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_9,
-			upper_32_bits(hdev->cpu_accessible_dma_address));
+			upper_32_bits(VA_CPU_ACCESSIBLE_MEM_ADDR));
 
 	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_5, HL_QUEUE_SIZE_IN_BYTES);
 	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_4, HL_EQ_SIZE_IN_BYTES);
@@ -1001,7 +1043,7 @@ int goya_init_cpu_queues(struct hl_device *hdev)
 
 	if (err) {
 		dev_err(hdev->dev,
-			"Failed to communicate with ARM CPU (ArmCP timeout)\n");
+			"Failed to setup communication with device CPU\n");
 		return -EIO;
 	}
 
@@ -2061,10 +2103,12 @@ static void goya_halt_engines(struct hl_device *hdev, bool hard_reset)
 	goya_disable_external_queues(hdev);
 	goya_disable_internal_queues(hdev);
 
-	if (hard_reset)
+	if (hard_reset) {
 		goya_disable_msix(hdev);
-	else
+		goya_mmu_remove_device_cpu_mappings(hdev);
+	} else {
 		goya_sync_irqs(hdev);
+	}
 }
 
 /*
@@ -2277,14 +2321,14 @@ static int goya_init_cpu(struct hl_device *hdev, u32 cpu_timeout)
 	goya_read_device_fw_version(hdev, FW_COMP_UBOOT);
 	goya_read_device_fw_version(hdev, FW_COMP_PREBOOT);
 
-	if (status == CPU_BOOT_STATUS_SRAM_AVAIL)
-		goto out;
-
 	if (!hdev->fw_loading) {
 		dev_info(hdev->dev, "Skip loading FW\n");
 		goto out;
 	}
 
+	if (status == CPU_BOOT_STATUS_SRAM_AVAIL)
+		goto out;
+
 	rc = goya_push_linux_to_device(hdev);
 	if (rc)
 		return rc;
@@ -2466,34 +2510,11 @@ static int goya_hw_init(struct hl_device *hdev)
 	if (rc)
 		goto disable_queues;
 
-	rc = goya_init_cpu_queues(hdev);
-	if (rc) {
-		dev_err(hdev->dev, "failed to initialize CPU H/W queues %d\n",
-			rc);
-		goto disable_msix;
-	}
-
-	/*
-	 * Check if we managed to set the DMA mask to more then 32 bits. If so,
-	 * let's try to increase it again because in Goya we set the initial
-	 * dma mask to less then 39 bits so that the allocation of the memory
-	 * area for the device's cpu will be under 39 bits
-	 */
-	if (hdev->dma_mask > 32) {
-		rc = hl_pci_set_dma_mask(hdev, 48);
-		if (rc)
-			goto disable_pci_access;
-	}
-
 	/* Perform read from the device to flush all MSI-X configuration */
 	val = RREG32(mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG);
 
 	return 0;
 
-disable_pci_access:
-	hl_fw_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS);
-disable_msix:
-	goya_disable_msix(hdev);
 disable_queues:
 	goya_disable_internal_queues(hdev);
 	goya_disable_external_queues(hdev);
@@ -2629,7 +2650,6 @@ static int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
 void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
 {
 	u32 db_reg_offset, db_value;
-	bool invalid_queue = false;
 
 	switch (hw_queue_id) {
 	case GOYA_QUEUE_ID_DMA_0:
@@ -2653,10 +2673,7 @@ void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
 		break;
 
 	case GOYA_QUEUE_ID_CPU_PQ:
-		if (hdev->cpu_queues_enable)
-			db_reg_offset = mmCPU_IF_PF_PQ_PI;
-		else
-			invalid_queue = true;
+		db_reg_offset = mmCPU_IF_PF_PQ_PI;
 		break;
 
 	case GOYA_QUEUE_ID_MME:
@@ -2696,12 +2713,8 @@ void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
 		break;
 
 	default:
-		invalid_queue = true;
-	}
-
-	if (invalid_queue) {
 		/* Should never get here */
-		dev_err(hdev->dev, "h/w queue %d is invalid. Can
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2019-07-04 10:20:48 +0200
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2019-07-04 10:20:48 +0200
commit	a94de2e7a380064a6e8a4c3e5e10d97e8aa711f4 (patch)
tree	1026bbbd6ccc8241fe58bd3742353c647b459c5c /drivers
parent	60e8523e2ea18dc0c0cea69d6c1d69a065019062 (diff)
parent	e8960ca06bb22d0d84edf246b0bf395e8322e127 (diff)
download	linux-a94de2e7a380064a6e8a4c3e5e10d97e8aa711f4.tar.gz linux-a94de2e7a380064a6e8a4c3e5e10d97e8aa711f4.tar.bz2 linux-a94de2e7a380064a6e8a4c3e5e10d97e8aa711f4.zip