drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a new context, from inside the softirq region of a forced preemption. To do so requires us to break up the monolithic error capture to provide new entry points with finer control; in particular focusing on one engine/gt, and being able to compose an error state from little pieces of HW capture. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Andi Shyti <andi.shyti@intel.com> Acked-by: Andi Shyti <andi.shyti@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk
author: Chris Wilson <chris@chris-wilson.co.uk> 2020-01-10 12:30:56 +0000
committer: Chris Wilson <chris@chris-wilson.co.uk> 2020-01-10 15:34:33 +0000
commit: 742379c0c4001fd2a6e02005c1ffa1ff611b28fa (patch)
tree: 6b553daa422f74ff42de00abc3b08710a886f19b /drivers
parent: 8ccfc20a7d56d7e16510e6e068ffb7b43c3ac100 (diff)
download: linux-742379c0c4001fd2a6e02005c1ffa1ff611b28fa.tar.gz
linux-742379c0c4001fd2a6e02005c1ffa1ff611b28fa.tar.bz2
linux-742379c0c4001fd2a6e02005c1ffa1ff611b28fa.zip
11 files changed, 877 insertions, 673 deletions
diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
index 71f1bcdfc92f..5df003061e44 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine.h
@@ -202,7 +202,7 @@ void intel_engine_set_hwsp_writemask(struct intel_engine_cs *engine, u32 mask);
 u64 intel_engine_get_active_head(const struct intel_engine_cs *engine);
 u64 intel_engine_get_last_batch_head(const struct intel_engine_cs *engine);
 
-void intel_engine_get_instdone(struct intel_engine_cs *engine,
+void intel_engine_get_instdone(const struct intel_engine_cs *engine,
 			       struct intel_instdone *instdone);
 
 void intel_engine_init_execlists(struct intel_engine_cs *engine);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 825c94e7ca0b..f451ef376548 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -914,8 +914,8 @@ const char *i915_cache_level_str(struct drm_i915_private *i915, int type)
 }
 
 static u32
-read_subslice_reg(struct intel_engine_cs *engine, int slice, int subslice,
-		  i915_reg_t reg)
+read_subslice_reg(const struct intel_engine_cs *engine,
+		  int slice, int subslice, i915_reg_t reg)
 {
 	struct drm_i915_private *i915 = engine->i915;
 	struct intel_uncore *uncore = engine->uncore;
@@ -959,7 +959,7 @@ read_subslice_reg(struct intel_engine_cs *engine, int slice, int subslice,
 }
 
 /* NB: please notice the memset */
-void intel_engine_get_instdone(struct intel_engine_cs *engine,
+void intel_engine_get_instdone(const struct intel_engine_cs *engine,
 			       struct intel_instdone *instdone)
 {
 	struct drm_i915_private *i915 = engine->i915;
diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index eb9365741ff8..c8ba38e7cda7 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -495,6 +495,7 @@ static void cleanup_init_ggtt(struct i915_ggtt *ggtt)
 	ggtt_release_guc_top(ggtt);
 	if (drm_mm_node_allocated(&ggtt->error_capture))
 		drm_mm_remove_node(&ggtt->error_capture);
+	mutex_destroy(&ggtt->error_mutex);
 }
 
 static int init_ggtt(struct i915_ggtt *ggtt)
@@ -526,6 +527,7 @@ static int init_ggtt(struct i915_ggtt *ggtt)
 	if (ret)
 		return ret;
 
+	mutex_init(&ggtt->error_mutex);
 	if (ggtt->mappable_end) {
 		/* Reserve a mappable slot for our lockless error capture */
 		ret = drm_mm_insert_node_in_range(&ggtt->vm.mm,
@@ -716,6 +718,7 @@ static void ggtt_cleanup_hw(struct i915_ggtt *ggtt)
 
 	if (drm_mm_node_allocated(&ggtt->error_capture))
 		drm_mm_remove_node(&ggtt->error_capture);
+	mutex_destroy(&ggtt->error_mutex);
 
 	ggtt_release_guc_top(ggtt);
 	intel_vgt_deballoon(ggtt);
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h
index 029363cbdf49..7da7681c20b1 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.h
@@ -345,6 +345,7 @@ struct i915_ggtt {
 	/* Manual runtime pm autosuspend delay for user GGTT mmaps */
 	struct intel_wakeref_auto userfault_wakeref;
 
+	struct mutex error_mutex;
 	struct drm_mm_node error_capture;
 	struct drm_mm_node uc_fw;
 };
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index 76de33ae9efe..beee0cf89bce 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -1230,7 +1230,7 @@ void intel_gt_handle_error(struct intel_gt *gt,
 	engine_mask &= INTEL_INFO(gt->i915)->engine_mask;
 
 	if (flags & I915_ERROR_CAPTURE) {
-		i915_capture_error_state(gt->i915, engine_mask, msg);
+		i915_capture_error_state(gt->i915);
 		intel_gt_clear_error_registers(gt, engine_mask);
 	}
 
diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
index 7c824c26b705..3e5e6c86e843 100644
--- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
+++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
@@ -1498,7 +1498,7 @@ static int igt_handle_error(void *arg)
 	struct intel_engine_cs *engine = gt->engine[RCS0];
 	struct hang h;
 	struct i915_request *rq;
-	struct i915_gpu_state *error;
+	struct i915_gpu_coredump *error;
 	int err;
 
 	/* Check that we can issue a global GPU and engine reset */
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 8f01c2bc7355..61e547308d2e 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -685,7 +685,7 @@ static int i915_gem_fence_regs_info(struct seq_file *m, void *data)
 static ssize_t gpu_state_read(struct file *file, char __user *ubuf,
 			      size_t count, loff_t *pos)
 {
-	struct i915_gpu_state *error;
+	struct i915_gpu_coredump *error;
 	ssize_t ret;
 	void *buf;
 
@@ -698,7 +698,7 @@ static ssize_t gpu_state_read(struct file *file, char __user *ubuf,
 	if (!buf)
 		return -ENOMEM;
 
-	ret = i915_gpu_state_copy_to_buffer(error, buf, *pos, count);
+	ret = i915_gpu_coredump_copy_to_buffer(error, buf, *pos, count);
 	if (ret <= 0)
 		goto out;
 
@@ -714,19 +714,19 @@ out:
 
 static int gpu_state_release(struct inode *inode, struct file *file)
 {
-	i915_gpu_state_put(file->private_data);
+	i915_gpu_coredump_put(file->private_data);
 	return 0;
 }
 
 static int i915_gpu_info_open(struct inode *inode, struct file *file)
 {
 	struct drm_i915_private *i915 = inode->i_private;
-	struct i915_gpu_state *gpu;
+	struct i915_gpu_coredump *gpu;
 	intel_wakeref_t wakeref;
 
 	gpu = NULL;
 	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
-		gpu = i915_capture_gpu_state(i915);
+		gpu = i915_gpu_coredump(i915);
 	if (IS_ERR(gpu))
 		return PTR_ERR(gpu);
 
@@ -748,7 +748,7 @@ i915_error_state_write(struct file *filp,
 		       size_t cnt,
 		       loff_t *ppos)
 {
-	struct i915_gpu_state *error = filp->private_data;
+	struct i915_gpu_coredump *error = filp->private_data;
 
 	if (!error)
 		return 0;
@@ -761,7 +761,7 @@ i915_error_state_write(struct file *filp,
 
 static int i915_error_state_open(struct inode *inode, struct file *file)
 {
-	struct i915_gpu_state *error;
+	struct i915_gpu_coredump *error;
 
 	error = i915_first_error_state(inode->i_private);
 	if (IS_ERR(error))
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 1025d783f494..e7be4c3e43c6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1874,7 +1874,7 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
 }
 
 static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
-					  struct intel_engine_cs *engine)
+					  const struct intel_engine_cs *engine)
 {
 	return atomic_read(&error->reset_engine_count[engine->uabi_class]);
 }
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index fda0977d2059..730129ca4c17 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -41,6 +41,7 @@
 
 #include "gem/i915_gem_context.h"
 #include "gem/i915_gem_lmem.h"
+#include "gt/intel_gt_pm.h"
 
 #include "i915_drv.h"
 #include "i915_gpu_error.h"
@@ -232,14 +233,13 @@ static void pool_free(struct pagevec *pv, void *addr)
 
 #ifdef CONFIG_DRM_I915_COMPRESS_ERROR
 
-struct compress {
+struct i915_vma_compress {
 	struct pagevec pool;
 	struct z_stream_s zstream;
 	void *tmp;
-	bool wc;
 };
 
-static bool compress_init(struct compress *c)
+static bool compress_init(struct i915_vma_compress *c)
 {
 	struct z_stream_s *zstream = &c->zstream;
 
@@ -261,7 +261,7 @@ static bool compress_init(struct compress *c)
 	return true;
 }
 
-static bool compress_start(struct compress *c)
+static bool compress_start(struct i915_vma_compress *c)
 {
 	struct z_stream_s *zstream = &c->zstream;
 	void *workspace = zstream->workspace;
@@ -272,8 +272,8 @@ static bool compress_start(struct compress *c)
 	return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;
 }
 
-static void *compress_next_page(struct compress *c,
-				struct drm_i915_error_object *dst)
+static void *compress_next_page(struct i915_vma_compress *c,
+				struct i915_vma_coredump *dst)
 {
 	void *page;
 
@@ -287,14 +287,15 @@ static void *compress_next_page(struct compress *c,
 	return dst->pages[dst->page_count++] = page;
 }
 
-static int compress_page(struct compress *c,
+static int compress_page(struct i915_vma_compress *c,
 			 void *src,
-			 struct drm_i915_error_object *dst)
+			 struct i915_vma_coredump *dst,
+			 bool wc)
 {
 	struct z_stream_s *zstream = &c->zstream;
 
 	zstream->next_in = src;
-	if (c->wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
+	if (wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
 		zstream->next_in = c->tmp;
 	zstream->avail_in = PAGE_SIZE;
 
@@ -318,8 +319,8 @@ static int compress_page(struct compress *c,
 	return 0;
 }
 
-static int compress_flush(struct compress *c,
-			  struct drm_i915_error_object *dst)
+static int compress_flush(struct i915_vma_compress *c,
+			  struct i915_vma_coredump *dst)
 {
 	struct z_stream_s *zstream = &c->zstream;
 
@@ -347,12 +348,12 @@ end:
 	return 0;
 }
 
-static void compress_finish(struct compress *c)
+static void compress_finish(struct i915_vma_compress *c)
 {
 	zlib_deflateEnd(&c->zstream);
 }
 
-static void compress_fini(struct compress *c)
+static void compress_fini(struct i915_vma_compress *c)
 {
 	kfree(c->zstream.workspace);
 	if (c->tmp)
@@ -367,24 +368,24 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)
 
 #else
 
-struct compress {
+struct i915_vma_compress {
 	struct pagevec pool;
-	bool wc;
 };
 
-static bool compress_init(struct compress *c)
+static bool compress_init(struct i915_vma_compress *c)
 {
 	return pool_init(&c->pool, ALLOW_FAIL) == 0;
 }
 
-static bool compress_start(struct compress *c)
+static bool compress_start(struct i915_vma_compress *c)
 {
 	return true;
 }
 
-static int compress_page(struct compress *c,
+static int compress_page(struct i915_vma_compress *c,
 			 void *src,
-			 struct drm_i915_error_object *dst)
+			 struct i915_vma_coredump *dst,
+			 bool wc)
 {
 	void *ptr;
 
@@ -392,24 +393,24 @@ static int compress_page(struct compress *c,
 	if (!ptr)
 		return -ENOMEM;
 
-	if (!(c->wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE)))
+	if (!(wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE)))
 		memcpy(ptr, src, PAGE_SIZE);
 	dst->pages[dst->page_count++] = ptr;
 
 	return 0;
 }
 
-static int compress_flush(struct compress *c,
-			  struct drm_i915_error_object *dst)
+static int compress_flush(struct i915_vma_compress *c,
+			  struct i915_vma_coredump *dst)
 {
 	return 0;
 }
 
-static void compress_finish(struct compress *c)
+static void compress_finish(struct i915_vma_compress *c)
 {
 }
 
-static void compress_fini(struct compress *c)
+static void compress_fini(struct i915_vma_compress *c)
 {
 	pool_fini(&c->pool);
 }
@@ -422,7 +423,7 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)
 #endif
 
 static void error_print_instdone(struct drm_i915_error_state_buf *m,
-				 const struct drm_i915_error_engine *ee)
+				 const struct intel_engine_coredump *ee)
 {
 	const struct sseu_dev_info *sseu = &RUNTIME_INFO(m->i915)->sseu;
 	int slice;
@@ -453,40 +454,56 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
 
 static void error_print_request(struct drm_i915_error_state_buf *m,
 				const char *prefix,
-				const struct drm_i915_error_request *erq,
-				const unsigned long epoch)
+				const struct i915_request_coredump *erq)
 {
 	if (!erq->seqno)
 		return;
 
-	err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
+	err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, start %08x, head %08x, tail %08x\n",
 		   prefix, erq->pid, erq->context, erq->seqno,
 		   test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
 			    &erq->flags) ? "!" : "",
 		   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
 			    &erq->flags) ? "+" : "",
 		   erq->sched_attr.priority,
-		   jiffies_to_msecs(erq->jiffies - epoch),
 		   erq->start, erq->head, erq->tail);
 }
 
 static void error_print_context(struct drm_i915_error_state_buf *m,
 				const char *header,
-				const struct drm_i915_error_context *ctx)
+				const struct i915_gem_context_coredump *ctx)
 {
 	err_printf(m, "%s%s[%d] prio %d, guilty %d active %d\n",
 		   header, ctx->comm, ctx->pid, ctx->sched_attr.priority,
 		   ctx->guilty, ctx->active);
 }
 
+static struct i915_vma_coredump *
+__find_vma(struct i915_vma_coredump *vma, const char *name)
+{
+	while (vma) {
+		if (strcmp(vma->name, name) == 0)
+			return vma;
+		vma = vma->next;
+	}
+
+	return NULL;
+}
+
+static struct i915_vma_coredump *
+find_batch(const struct intel_engine_coredump *ee)
+{
+	return __find_vma(ee->vma, "batch");
+}
+
 static void error_print_engine(struct drm_i915_error_state_buf *m,
-			       const struct drm_i915_error_engine *ee,
-			       const unsigned long epoch)
+			       const struct intel_engine_coredump *ee)
 {
+	struct i915_vma_coredump *batch;
 	int n;
 
 	err_printf(m, "%s command stream:\n", ee->engine->name);
-	err_printf(m, "  IDLE?: %s\n", yesno(ee->idle));
+	err_printf(m, "  CCID:  0x%08x\n", ee->ccid);
 	err_printf(m, "  START: 0x%08x\n", ee->start);
 	err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);
 	err_printf(m, "  TAIL:  0x%08x [0x%08x, 0x%08x]\n",
@@ -501,9 +518,10 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 
 	error_print_instdone(m, ee);
 
-	if (ee->batchbuffer) {
-		u64 start = ee->batchbuffer->gtt_offset;
-		u64 end = start + ee->batchbuffer->gtt_size;
+	batch = find_batch(ee);
+	if (batch) {
+		u64 start = batch->gtt_offset;
+		u64 end = start + batch->gtt_size;
 
 		err_printf(m, "  batch: [0x%08x_%08x, 0x%08x_%08x]\n",
 			   upper_32_bits(start), lower_32_bits(start),
@@ -541,7 +559,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 
 	for (n = 0; n < ee->num_ports; n++) {
 		err_printf(m, "  ELSP[%d]:", n);
-		error_print_request(m, " ", &ee->execlist[n], epoch);
+		error_print_request(m, " ", &ee->execlist[n]);
 	}
 
 	error_print_context(m, "  Active context: ", &ee->context);
@@ -556,38 +574,35 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
 	va_end(args);
 }
 
-static void print_error_obj(struct drm_i915_error_state_buf *m,
+static void print_error_vma(struct drm_i915_error_state_buf *m,
 			    const struct intel_engine_cs *engine,
-			    const char *name,
-			    const struct drm_i915_error_object *obj)
+			    const struct i915_vma_coredump *vma)
 {
 	char out[ASCII85_BUFSZ];
 	int page;
 
-	if (!obj)
+	if (!vma)
 		return;
 
-	if (name) {
-		err_printf(m, "%s --- %s = 0x%08x %08x\n",
-			   engine ? engine->name : "global", name,
-			   upper_32_bits(obj->gtt_offset),
-			   lower_32_bits(obj->gtt_offset));
-	}
+	err_printf(m, "%s --- %s = 0x%08x %08x\n",
+		   engine ? engine->name : "global", vma->name,
+		   upper_32_bits(vma->gtt_offset),
+		   lower_32_bits(vma->gtt_offset));
 
-	if (obj->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K)
-		err_printf(m, "gtt_page_sizes = 0x%08x\n", obj->gtt_page_sizes);
+	if (vma->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K)
+		err_printf(m, "gtt_page_sizes = 0x%08x\n", vma->gtt_page_sizes);
 
 	err_compression_marker(m);
-	for (page = 0; page < obj->page_count; page++) {
+	for (page = 0; page < vma->page_count; page++) {
 		int i, len;
 
 		len = PAGE_SIZE;
-		if (page == obj->page_count - 1)
-			len -= obj->unused;
+		if (page == vma->page_count - 1)
+			len -= vma->unused;
 		len = ascii85_encode_len(len);
 
 		for (i = 0; i < len; i++)
-			err_puts(m, ascii85_encode(obj->pages[page][i], out));
+			err_puts(m, ascii85_encode(vma->pages[page][i], out));
 	}
 	err_puts(m, "\n");
 }
@@ -626,18 +641,13 @@ static void err_print_pciid(struct drm_i915_error_state_buf *m,
 }
 
 static void err_print_uc(struct drm_i915_error_state_buf *m,
-			 const struct i915_error_uc *error_uc)
+			 const struct intel_uc_coredump *error_uc)
 {
 	struct drm_printer p = i915_error_printer(m);
-	const struct i915_gpu_state *error =
-		container_of(error_uc, typeof(*error), uc);
-
-	if (!error->device_info.has_gt_uc)
-		return;
 
 	intel_uc_fw_dump(&error_uc->guc_fw, &p);
 	intel_uc_fw_dump(&error_uc->huc_fw, &p);
-	print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log);
+	print_error_vma(m, NULL, error_uc->guc_log);
 }
 
 static void err_free_sgl(struct scatterlist *sgl)
@@ -657,12 +667,78 @@ static void err_free_sgl(struct scatterlist *sgl)
 	}
 }
 
+static void err_print_gt(struct drm_i915_error_state_buf *m,
+			 struct intel_gt_coredump *gt)
+{
+	const struct intel_engine_coredump *ee;
+	int i, j;
+
+	err_printf(m, "GT awake: %s\n", yesno(gt->awake));
+	err_printf(m, "EIR: 0x%08x\n", gt->eir);
+	err_printf(m, "IER: 0x%08x\n", gt->ier);
+	for (i = 0; i < gt->ngtier; i++)
+		err_printf(m, "GTIER[%d]: 0x%08x\n", i, gt->gtier[i]);
+	err_printf(m, "PGTBL_ER: 0x%08x\n", gt->pgtbl_er);
+	err_printf(m, "FORCEWAKE: 0x%08x\n", gt->forcewake);
+	err_printf(m, "DERRMR: 0x%08x\n", gt->derrmr);
+
+	for (i = 0; i < gt->nfence; i++)
+		err_printf(m, "  fence[%d] = %08llx\n", i, gt->fence[i]);
+
+	if (IS_GEN_RANGE(m->i915, 6, 11)) {
+		err_printf(m, "ERROR: 0x%08x\n", gt->error);
+		err_printf(m, "DONE_REG: 0x%08x\n", gt->done_reg);
+	}
+
+	if (INTEL_GEN(m->i915) >= 8)
+		err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
+			   gt->fault_data1, gt->fault_data0);
+
+	if (IS_GEN(m->i915, 7))
+		err_printf(m, "ERR_INT: 0x%08x\n", gt->err_int);
+
+	if (IS_GEN_RANGE(m->i915, 8, 11))
+		err_printf(m, "GTT_CACHE_EN: 0x%08x\n", gt->gtt_cache);
+
+	if (IS_GEN(m->i915, 12))
+		err_printf(m, "AUX_ERR_DBG: 0x%08x\n", gt->aux_err);
+
+	if (INTEL_GEN(m->i915) >= 12) {
+		int i;
+
+		for (i = 0; i < GEN12_SFC_DONE_MAX; i++)
+			err_printf(m, "  SFC_DONE[%d]: 0x%08x\n", i,
+				   gt->sfc_done[i]);
+
+		err_printf(m, "  GAM_DONE: 0x%08x\n", gt->gam_done);
+	}
+
+	for (ee = gt->engine; ee; ee = ee->next) {
+		const struct i915_vma_coredump *vma;
+
+		error_print_engine(m, ee);
+
+		for (vma = ee->vma; vma; vma = vma->next)
+			print_error_vma(m, ee->engine, vma);
+
+		if (ee->num_requests) {
+			err_printf(m, "%s --- %d requests\n",
+				   ee->engine->name,
+				   ee->num_requests);
+			for (j = 0; j < ee->num_requests; j++)
+				error_print_request(m, " ", &ee->requests[j]);
+		}
+	}
+
+	if (gt->uc)
+		err_print_uc(m, gt->uc);
+}
+
 static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
-			       struct i915_gpu_state *error)
+			       struct i915_gpu_coredump *error)
 {
-	const struct drm_i915_error_engine *ee;
+	const struct intel_engine_coredump *ee;
 	struct timespec64 ts;
-	int i, j;
 
 	if (*error->error_msg)
 		err_printf(m, "%s\n", error->error_msg);
@@ -682,7 +758,7 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 	err_printf(m, "Capture: %lu jiffies; %d ms ago\n",
 		   error->capture, jiffies_to_msecs(jiffies - error->capture));
 
-	for (ee = error->engine; ee; ee = ee->next)
+	for (ee = error->gt ? error->gt->engine : NULL; ee; ee = ee->next)
 		err_printf(m, "Active process (on ring %s): %s [%d]\n",
 			   ee->engine->name,
 			   ee->context.comm,
@@ -708,90 +784,11 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 			   CSR_VERSION_MINOR(csr->version));
 	}
 
-	err_printf(m, "GT awake: %s\n", yesno(error->awake));
 	err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
 	err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
-	err_printf(m, "EIR: 0x%08x\n", error->eir);
-	err_printf(m, "IER: 0x%08x\n", error->ier);
-	for (i = 0; i < error->ngtier; i++)
-		err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]);
-	err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
-	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
-	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
-	err_printf(m, "CCID: 0x%08x\n", error->ccid);
-
-	for (i = 0; i < error->nfence; i++)
-		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);
-
-	if (IS_GEN_RANGE(m->i915, 6, 11)) {
-		err_printf(m, "ERROR: 0x%08x\n", error->error);
-		err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
-	}
-
-	if (INTEL_GEN(m->i915) >= 8)
-		err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
-			   error->fault_data1, error->fault_data0);
-
-	if (IS_GEN(m->i915, 7))
-		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);
-
-	if (IS_GEN_RANGE(m->i915, 8, 11))
-		err_printf(m, "GTT_CACHE_EN: 0x%08x\n", error->gtt_cache);
-
-	if (IS_GEN(m->i915, 12))
-		err_printf(m, "AUX_ERR_DBG: 0x%08x\n", error->aux_err);
-
-	if (INTEL_GEN(m->i915) >= 12) {
-		int i;
-
-		for (i = 0; i < GEN12_SFC_DONE_MAX; i++)
-			err_printf(m, "  SFC_DONE[%d]: 0x%08x\n", i,
-				   error->sfc_done[i]);
-
-		err_printf(m, "  GAM_DONE: 0x%08x\n", error->gam_done);
-	}
-
-	for (ee = error->engine; ee; ee = ee->next)
-		error_print_engine(m, ee, error->capture);
-
-	for (ee = error->engine; ee; ee = ee->next) {
-		const struct drm_i915_error_object *obj;
-
-		obj = ee->batchbuffer;
-		if (obj) {
-			err_puts(m, ee->engine->name);
-			if (ee->context.pid)
-				err_printf(m, " (submitted by %s [%d])",
-					   ee->context.comm,
-					   ee->context.pid);
-			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
-				   upper_32_bits(obj->gtt_offset),
-				   lower_32_bits(obj->gtt_offset));
-			print_error_obj(m, ee->engine, NULL, obj);
-		}
-
-		for (j = 0; j < ee->user_bo_count; j++)
-			print_error_obj(m, ee->engine, "user", ee->user_bo[j]);
-
-		if (ee->num_requests) {
-			err_printf(m, "%s --- %d requests\n",
-				   ee->engine->name,
-				   ee->num_requests);
-			for (j = 0; j < ee->num_requests; j++)
-				error_print_request(m, " ",
-						    &ee->requests[j],
-						    error->capture);
-		}
 
-		print_error_obj(m, ee->engine, "ringbuffer", ee->ringbuffer);
-		print_error_obj(m, ee->engine, "HW Status", ee->hws_page);
-		print_error_obj(m, ee->engine, "HW context", ee->ctx);
-		print_error_obj(m, ee->engine, "WA context", ee->wa_ctx);
-		print_error_obj(m, ee->engine,
-				"WA batchbuffer", ee->wa_batchbuffer);
-		print_error_obj(m, ee->engine,
-				"NULL context", ee->default_state);
-	}
+	if (error->gt)
+		err_print_gt(m, error->gt);
 
 	if (error->overlay)
 		intel_overlay_print_error_state(m, error->overlay);
@@ -802,10 +799,9 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 	err_print_capabilities(m, &error->device_info, &error->runtime_info,
 			       &error->driver_caps);
 	err_print_params(m, &error->params);
-	err_print_uc(m, &error->uc);
 }
 
-static int err_print_to_sgl(struct i915_gpu_state *error)
+static int err_print_to_sgl(struct i915_gpu_coredump *error)
 {
 	struct drm_i915_error_state_buf m;
 
@@ -842,8 +838,8 @@ static int err_print_to_sgl(struct i915_gpu_state *error)
 	return 0;
 }
 
-ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
-				      char *buf, loff_t off, size_t rem)
+ssize_t i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
+					 char *buf, loff_t off, size_t rem)
 {
 	struct scatterlist *sg;
 	size_t count;
@@ -906,85 +902,89 @@ ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
 	return count;
 }
 
-static void i915_error_object_free(struct drm_i915_error_object *obj)
+static void i915_vma_coredump_free(struct i915_vma_coredump *vma)
 {
-	int page;
-
-	if (obj == NULL)
-		return;
+	while (vma) {
+		struct i915_vma_coredump *next = vma->next;
+		int page;
 
-	for (page = 0; page < obj->page_count; page++)
-		free_page((unsigned long)obj->pages[page]);
+		for (page = 0; page < vma->page_count; page++)
+			free_page((unsigned long)vma->pages[page]);
 
-	kfree(obj);
+		kfree(vma);
+		vma = next;
+	}
 }
 
-
-static void cleanup_params(struct i915_gpu_state *error)
+static void cleanup_params(struct i915_gpu_coredump *error)
 {
 	i915_params_free(&error->params);
 }
 
-static void cleanup_uc_state(struct i915_gpu_state *error)
+static void cleanup_uc(struct intel_uc_coredump *uc)
 {
-	struct i915_error_uc *error_uc = &error->uc;
+	kfree(uc->guc_fw.path);
+	kfree(uc->huc_fw.path);
+	i915_vma_coredump_free(uc->guc_log);
 
-	kfree(error_uc->guc_fw.path);
-	kfree(error_uc->huc_fw.path);
-	i915_error_object_free(error_uc->guc_log);
+	kfree(uc);
 }
 
-void __i915_gpu_state_free(struct kref *error_ref)
+static void cleanup_gt(struct intel_gt_coredump *gt)
 {
-	struct i915_gpu_state *error =
-		container_of(error_ref, typeof(*error), ref);
-	long i;
+	while (gt->engine) {
+		struct intel_engine_coredump *ee = gt->engine;
+
+		gt->engine = ee->next;
 
-	while (error->engine) {
-		struct drm_i915_error_engine *ee = error->engine;
+		i915_vma_coredump_free(ee->vma);
+		kfree(ee->requests);
+		kfree(ee);
+	}
 
-		error->engine = ee->next;
+	if (gt->uc)
+		cleanup_uc(gt->uc);
 
-		for (i = 0; i < ee->user_bo_count; i++)
-			i915_error_object_free(ee->user_bo[i]);
-		kfree(ee->user_bo);
+	kfree(gt);
+}
 
-		i915_error_object_free(ee->batchbuffer);
-		i915_error_object_free(ee->wa_batchbuffer);
-		i915_error_object_free(ee->ringbuffer);
-		i915_error_object_free(ee->hws_page);
-		i915_error_object_free(ee->ctx);
-		i915_error_object_free(ee->wa_ctx);
+void __i915_gpu_coredump_free(struct kref *error_ref)
+{
+	struct i915_gpu_coredump *error =
+		container_of(error_ref, typeof(*error), ref);
 
-		kfree(ee->requests);
-		kfree(ee);
+	while (error->gt) {
+		struct intel_gt_coredump *gt = error->gt;
+
+		error->gt = gt->next;
+		cleanup_gt(gt);
 	}
 
 	kfree(error->overlay);
 	kfree(error->display);
 
 	cleanup_params(error);
-	cleanup_uc_state(error);
 
 	err_free_sgl(error->sgl);
 	kfree(error);
 }
 
-static struct drm_i915_error_object *
-i915_error_object_create(struct drm_i915_private *i915,
-			 struct i915_vma *vma,
-			 struct compress *compress)
+static struct i915_vma_coredump *
+i915_vma_coredump_create(const struct intel_gt *gt,
+			 const struct i915_vma *vma,
+			 const char *name,
+			 struct i915_vma_compress *compress)
 {
-	struct i915_ggtt *ggtt = &i915->ggtt;
+	struct i915_ggtt *ggtt = gt->ggtt;
 	const u64 slot = ggtt->error_capture.start;
-	struct drm_i915_error_object *dst;
+	struct i915_vma_coredump *dst;
 	unsigned long num_pages;
 	struct sgt_iter iter;
 	int ret;
 
 	might_sleep();
 
-	if (!vma || !vma->pages)
+	if (!vma || !vma->pages || !compress)
 		return NULL;
 
 	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
@@ -998,6 +998,9 @@ i915_error_object_create(struct drm_i915_private *i915,
 		return NULL;
 	}
 
+	strcpy(dst->name, name);
+	dst->next = NULL;
+
 	dst->gtt_offset = vma->node.start;
 	dst->gtt_size = vma->node.size;
 	dst->gtt_page_sizes = vma->page_sizes.gtt;
@@ -1005,9 +1008,6 @@ i915_error_object_create(struct drm_i915_private *i915,
 	dst->page_count = 0;
 	dst->unused = 0;
 
-	compress->wc = i915_gem_object_is_lmem(vma->obj) ||
-		       drm_mm_node_allocated(&ggtt->error_capture);
-
 	ret = -EINVAL;
 	if (drm_mm_node_allocated(&ggtt->error_capture)) {
 		void __iomem *s;
@@ -1016,9 +1016,12 @@ i915_error_object_create(struct drm_i915_private *i915,
 		for_each_sgt_daddr(dma, iter, vma->pages) {
 			ggtt->vm.insert_page(&ggtt->vm, dma, slot,
 					     I915_CACHE_NONE, 0);
+			mb();
 
 			s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE);
-			ret = compress_page(compress, (void  __force *)s, dst);
+			ret = compress_page(compress,
+					    (void  __force *)s, dst,
+					    true);
 			io_mapping_unmap(s);
 			if (ret)
 				break;
@@ -1031,7 +1034,9 @@ i915_error_object_create(struct drm_i915_private *i915,
 			void __iomem *s;
 
 			s = io_mapping_map_wc(&mem->iomap, dma, PAGE_SIZE);
-			ret = compress_page(compress, (void __force *)s, dst);
+			ret = compress_page(compress,
+					    (void __force *)s, dst,
+					    true);
 			io_mapping_unmap(s);
 			if (ret)
 				break;
@@ -1045,7 +1050,7 @@ i915_error_object_create(struct drm_i915_private *i915,
 			drm_clflush_pages(&page, 1);
 
 			s = kmap(page);
-			ret = compress_page(compress, s, dst);
+			ret = compress_page(compress, s, dst, false);
 			kunmap(page);
 
 			drm_clflush_pages(&page, 1);
@@ -1066,77 +1071,56 @@ i915_error_object_create(struct drm_i915_private *i915,
 	return dst;
 }
 
-/*
- * Generate a semi-unique error code. The code is not meant to have meaning, The
- * code's only purpose is to try to prevent false duplicated bug reports by
- * grossly estimating a GPU error state.
- *
- * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
- * the hang if we could strip the GTT offset information from it.
- *
- * It's only a small step better than a random number in its current form.
- */
-static u32 i915_error_generate_code(struct i915_gpu_state *error)
+static void gt_record_fences(struct intel_gt_coredump *gt)
 {
-	const struct drm_i915_error_engine *ee = error->engine;
-
-	/*
-	 * IPEHR would be an ideal way to detect errors, as it's the gross
-	 * measure of "the command that hung." However, has some very common
-	 * synchronization commands which almost always appear in the case
-	 * strictly a client bug. Use instdone to differentiate those some.
-	 */
-	return ee ? ee->ipehr ^ ee->instdone.instdone : 0;
-}
-
-static void gem_record_fences(struct i915_gpu_state *error)
-{
-	struct drm_i915_private *dev_priv = error->i915;
-	struct intel_uncore *uncore = &dev_priv->uncore;
+	struct i915_ggtt *ggtt = gt->_gt->ggtt;
+	struct intel_uncore *uncore = gt->_gt->uncore;
 	int i;
 
-	if (INTEL_GEN(dev_priv) >= 6) {
-		for (i = 0; i < dev_priv->ggtt.num_fences; i++)
-			error->fence
author	Chris Wilson <chris@chris-wilson.co.uk>	2020-01-10 12:30:56 +0000
committer	Chris Wilson <chris@chris-wilson.co.uk>	2020-01-10 15:34:33 +0000
commit	742379c0c4001fd2a6e02005c1ffa1ff611b28fa (patch)
tree	6b553daa422f74ff42de00abc3b08710a886f19b /drivers
parent	8ccfc20a7d56d7e16510e6e068ffb7b43c3ac100 (diff)
download	linux-742379c0c4001fd2a6e02005c1ffa1ff611b28fa.tar.gz linux-742379c0c4001fd2a6e02005c1ffa1ff611b28fa.tar.bz2 linux-742379c0c4001fd2a6e02005c1ffa1ff611b28fa.zip