 drivers/gpu/drm/i915/i915_drv.h                 |    2
 drivers/gpu/drm/i915/i915_gem_evict.c           |   92
 drivers/gpu/drm/i915/i915_gem_execbuffer.c      | 2038
 drivers/gpu/drm/i915/i915_vma.c                 |    2
 drivers/gpu/drm/i915/i915_vma.h                 |    1
 drivers/gpu/drm/i915/selftests/i915_gem_evict.c |    4
 drivers/gpu/drm/i915/selftests/i915_vma.c       |   16
 7 files changed, 1239 insertions(+), 916 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index af2a54672396..7e182dd7e356 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3581,7 +3581,7 @@ int __must_check i915_gem_evict_something(struct i915_address_space *vm,
int __must_check i915_gem_evict_for_node(struct i915_address_space *vm,
struct drm_mm_node *node,
unsigned int flags);
-int i915_gem_evict_vm(struct i915_address_space *vm, bool do_idle);
+int i915_gem_evict_vm(struct i915_address_space *vm);
/* belongs in i915_gem_gtt.h */
static inline void i915_gem_chipset_flush(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index 204a2d9288ae..a193f1b36c67 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -50,6 +50,29 @@ static bool ggtt_is_idle(struct drm_i915_private *dev_priv)
return true;
}
+static int ggtt_flush(struct drm_i915_private *i915)
+{
+ int err;
+
+ /* Not everything in the GGTT is tracked via vma (otherwise we
+ * could evict as required with minimal stalling) so we are forced
+ * to idle the GPU and explicitly retire outstanding requests in
+ * the hopes that we can then remove contexts and the like only
+ * bound by their active reference.
+ */
+ err = i915_gem_switch_to_kernel_context(i915);
+ if (err)
+ return err;
+
+ err = i915_gem_wait_for_idle(i915,
+ I915_WAIT_INTERRUPTIBLE |
+ I915_WAIT_LOCKED);
+ if (err)
+ return err;
+
+ return 0;
+}
+
static bool
mark_free(struct drm_mm_scan *scan,
struct i915_vma *vma,
@@ -175,19 +198,7 @@ search_again:
return intel_has_pending_fb_unpin(dev_priv) ? -EAGAIN : -ENOSPC;
}
- /* Not everything in the GGTT is tracked via vma (otherwise we
- * could evict as required with minimal stalling) so we are forced
- * to idle the GPU and explicitly retire outstanding requests in
- * the hopes that we can then remove contexts and the like only
- * bound by their active reference.
- */
- ret = i915_gem_switch_to_kernel_context(dev_priv);
- if (ret)
- return ret;
-
- ret = i915_gem_wait_for_idle(dev_priv,
- I915_WAIT_INTERRUPTIBLE |
- I915_WAIT_LOCKED);
+ ret = ggtt_flush(dev_priv);
if (ret)
return ret;
@@ -337,10 +348,8 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
/**
* i915_gem_evict_vm - Evict all idle vmas from a vm
* @vm: Address space to cleanse
- * @do_idle: Boolean directing whether to idle first.
*
- * This function evicts all idles vmas from a vm. If all unpinned vmas should be
- * evicted the @do_idle needs to be set to true.
+ * This function evicts all vmas from a vm.
*
* This is used by the execbuf code as a last-ditch effort to defragment the
* address space.
@@ -348,37 +357,50 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
* To clarify: This is for freeing up virtual address space, not for freeing
* memory in e.g. the shrinker.
*/
-int i915_gem_evict_vm(struct i915_address_space *vm, bool do_idle)
+int i915_gem_evict_vm(struct i915_address_space *vm)
{
+ struct list_head *phases[] = {
+ &vm->inactive_list,
+ &vm->active_list,
+ NULL
+ }, **phase;
+ struct list_head eviction_list;
struct i915_vma *vma, *next;
int ret;
lockdep_assert_held(&vm->i915->drm.struct_mutex);
trace_i915_gem_evict_vm(vm);
- if (do_idle) {
- struct drm_i915_private *dev_priv = vm->i915;
-
- if (i915_is_ggtt(vm)) {
- ret = i915_gem_switch_to_kernel_context(dev_priv);
- if (ret)
- return ret;
- }
-
- ret = i915_gem_wait_for_idle(dev_priv,
- I915_WAIT_INTERRUPTIBLE |
- I915_WAIT_LOCKED);
+ /* Switch back to the default context in order to unpin
+ * the existing context objects. However, such objects only
+ * pin themselves inside the global GTT and performing the
+ * switch otherwise is ineffective.
+ */
+ if (i915_is_ggtt(vm)) {
+ ret = ggtt_flush(vm->i915);
if (ret)
return ret;
-
- WARN_ON(!list_empty(&vm->active_list));
}
- list_for_each_entry_safe(vma, next, &vm->inactive_list, vm_link)
- if (!i915_vma_is_pinned(vma))
- WARN_ON(i915_vma_unbind(vma));
+ INIT_LIST_HEAD(&eviction_list);
+ phase = phases;
+ do {
+ list_for_each_entry(vma, *phase, vm_link) {
+ if (i915_vma_is_pinned(vma))
+ continue;
- return 0;
+ __i915_vma_pin(vma);
+ list_add(&vma->evict_link, &eviction_list);
+ }
+ } while (*++phase);
+
+ ret = 0;
+ list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
+ __i915_vma_unpin(vma);
+ if (ret == 0)
+ ret = i915_vma_unbind(vma);
+ }
+ return ret;
}
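
The rewrite above walks both vm lists, parks every unpinned vma on a private eviction_list (holding a temporary pin), and only then calls i915_vma_unbind(), because unbinding can rearrange the very lists being iterated. A minimal standalone sketch of that collect-then-unbind pattern follows; struct fake_vma, fake_unbind() and evict_vm() are invented for illustration, not i915 code.

/*
 * Toy illustration of the two-pass eviction above. The point is only that
 * the source lists are never walked while "unbind" may be mutating them.
 */
#include <stdio.h>

struct fake_vma {
	int id;
	int pinned;
	struct fake_vma *next;
};

static int fake_unbind(struct fake_vma *vma)	/* i915_vma_unbind() stand-in */
{
	printf("unbinding vma %d\n", vma->id);
	return 0;
}

static int evict_vm(struct fake_vma *inactive, struct fake_vma *active)
{
	struct fake_vma *phases[] = { inactive, active };
	struct fake_vma *eviction_list[64];
	unsigned int count = 0, i;
	struct fake_vma *vma;
	int err = 0;

	/* Pass 1: collect (and, in the real code, temporarily pin) candidates. */
	for (i = 0; i < 2; i++)
		for (vma = phases[i]; vma; vma = vma->next)
			if (!vma->pinned && count < 64)
				eviction_list[count++] = vma;

	/* Pass 2: unpin and unbind; the vm lists may now change freely. */
	for (i = 0; i < count; i++)
		if (err == 0)
			err = fake_unbind(eviction_list[i]);

	return err;
}

int main(void)
{
	struct fake_vma c = { .id = 3, .pinned = 0, .next = NULL };
	struct fake_vma b = { .id = 2, .pinned = 1, .next = &c };
	struct fake_vma a = { .id = 1, .pinned = 0, .next = &b };

	return evict_vm(&a, NULL);
}
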
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 9c3f6c40270f..a052072fe8b3 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -42,41 +42,195 @@
#define DBG_USE_CPU_RELOC 0 /* -1 force GTT relocs; 1 force CPU relocs */
-#define __EXEC_OBJECT_HAS_PIN (1<<31)
-#define __EXEC_OBJECT_HAS_FENCE (1<<30)
-#define __EXEC_OBJECT_NEEDS_MAP (1<<29)
-#define __EXEC_OBJECT_NEEDS_BIAS (1<<28)
-#define __EXEC_OBJECT_INTERNAL_FLAGS (0xf<<28) /* all of the above */
+#define __EXEC_OBJECT_HAS_PIN BIT(31)
+#define __EXEC_OBJECT_HAS_FENCE BIT(30)
+#define __EXEC_OBJECT_NEEDS_MAP BIT(29)
+#define __EXEC_OBJECT_NEEDS_BIAS BIT(28)
+#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 28) /* all of the above */
+#define __EXEC_OBJECT_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)
+
+#define __EXEC_HAS_RELOC BIT(31)
+#define __EXEC_VALIDATED BIT(30)
+#define UPDATE PIN_OFFSET_FIXED
#define BATCH_OFFSET_BIAS (256*1024)
#define __I915_EXEC_ILLEGAL_FLAGS \
(__I915_EXEC_UNKNOWN_FLAGS | I915_EXEC_CONSTANTS_MASK)
+/**
+ * DOC: User command execution
+ *
+ * Userspace submits commands to be executed on the GPU as an instruction
+ * stream within a GEM object we call a batchbuffer. These instructions may
+ * refer to other GEM objects containing auxiliary state such as kernels,
+ * samplers, render targets and even secondary batchbuffers. Userspace does
+ * not know where in the GPU memory these objects reside and so before the
+ * batchbuffer is passed to the GPU for execution, those addresses in the
+ * batchbuffer and auxiliary objects are updated. This is known as relocation,
+ * or patching. To try and avoid having to relocate each object on the next
+ * execution, userspace is told the location of those objects in this pass,
+ * but this remains just a hint as the kernel may choose a new location for
+ * any object in the future.
+ *
+ * Processing an execbuf ioctl is conceptually split up into a few phases.
+ *
+ * 1. Validation - Ensure all the pointers, handles and flags are valid.
+ * 2. Reservation - Assign GPU address space for every object
+ * 3. Relocation - Update any addresses to point to the final locations
+ * 4. Serialisation - Order the request with respect to its dependencies
+ * 5. Construction - Construct a request to execute the batchbuffer
+ * 6. Submission (at some point in the future execution)
+ *
+ * Reserving resources for the execbuf is the most complicated phase. We
+ * neither want to have to migrate the object in the address space, nor do
+ * we want to have to update any relocations pointing to this object. Ideally,
+ * we want to leave the object where it is and for all the existing relocations
+ * to match. If the object is given a new address, or if userspace thinks the
+ * object is elsewhere, we have to parse all the relocation entries and update
+ * the addresses. Userspace can set the I915_EXEC_NO_RELOC flag to hint that
+ * all the target addresses in all of its objects match the value in the
+ * relocation entries and that they all match the presumed offsets given by the
+ * list of execbuffer objects. Using this knowledge, we know that if we haven't
+ * moved any buffers, all the relocation entries are valid and we can skip
+ * the update. (If userspace is wrong, the likely outcome is an impromptu GPU
+ * hang.) The requirements for using I915_EXEC_NO_RELOC are:
+ *
+ * The addresses written in the objects must match the corresponding
+ * reloc.presumed_offset which in turn must match the corresponding
+ * execobject.offset.
+ *
+ * Any render targets written to in the batch must be flagged with
+ * EXEC_OBJECT_WRITE.
+ *
+ * To avoid stalling, execobject.offset should match the current
+ * address of that object within the active context.
+ *
+ * The reservation is done in multiple phases. First we try and keep any
+ * object already bound in its current location - so long as it meets the
+ * constraints imposed by the new execbuffer. Any object left unbound after the
+ * first pass is then fitted into any available idle space. If an object does
+ * not fit, all objects are removed from the reservation and the process rerun
+ * after sorting the objects into a priority order (more difficult to fit
+ * objects are tried first). Failing that, the entire VM is cleared and we try
+ * to fit the execbuf one last time before concluding that it simply will not
+ * fit.
+ *
+ * A small complication to all of this is that we allow userspace not only to
+ * specify an alignment and a size for the object in the address space, but
+ * we also allow userspace to specify the exact offset. Such objects are
+ * simpler to place (the location is known a priori); all we have to do is make
+ * sure the space is available.
+ *
+ * Once all the objects are in place, patching up the buried pointers to point
+ * to the final locations is a fairly simple job of walking over the relocation
+ * entry arrays, looking up the right address and rewriting the value into
+ * the object. Simple! ... The relocation entries are stored in user memory
+ * and so to access them we have to copy them into a local buffer. That copy
+ * has to avoid taking any pagefaults as they may lead back to a GEM object
+ * requiring the struct_mutex (i.e. recursive deadlock). So once again we split
+ * the relocation into multiple passes. First we try to do everything within an
+ * atomic context (avoid the pagefaults) which requires that we never wait. If
+ * we detect that we may wait, or if we need to fault, then we have to fall back
+ * to a slower path. The slowpath has to drop the mutex. (Can you hear alarm
+ * bells yet?) Dropping the mutex means that we lose all the state we have
+ * built up so far for the execbuf and we must reset any global data. However,
+ * we do leave the objects pinned in their final locations - which is a
+ * potential issue for concurrent execbufs. Once we have left the mutex, we can
+ * allocate and copy all the relocation entries into a large array at our
+ * leisure, reacquire the mutex, reclaim all the objects and other state and
+ * then proceed to update any incorrect addresses within the objects.
+ *
+ * As we process the relocation entries, we maintain a record of whether the
+ * object is being written to. Using NO_RELOC, we expect userspace to provide
+ * this information instead. We also check whether we can skip the relocation
+ * by comparing the expected value inside the relocation entry with the target's
+ * final address. If they differ, we have to map the current object and rewrite
+ * the 4 or 8 byte pointer within.
+ *
+ * Serialising an execbuf is quite simple according to the rules of the GEM
+ * ABI. Execution within each context is ordered by the order of submission.
+ * Writes to any GEM object are in order of submission and are exclusive. Reads
+ * from a GEM object are unordered with respect to other reads, but ordered by
+ * writes. A write submitted after a read cannot occur before the read, and
+ * similarly any read submitted after a write cannot occur before the write.
+ * Writes are ordered between engines such that only one write occurs at any
+ * time (completing any reads beforehand) - using semaphores where available
+ * and CPU serialisation otherwise. Other GEM accesses obey the same rules: any
+ * write (either via mmaps using set-domain, or via pwrite) must flush all GPU
+ * reads before starting, and any read (either using set-domain or pread) must
+ * flush all GPU writes before starting. (Note we only employ a barrier before,
+ * we currently rely on userspace not concurrently starting a new execution
+ * whilst reading or writing to an object. This may be an advantage or not
+ * depending on how much you trust userspace not to shoot themselves in the
+ * foot.) Serialisation may just result in the request being inserted into
+ * a DAG awaiting its turn, but the simplest is to wait on the CPU until
+ * all dependencies are resolved.
+ *
+ * After all of that, it is just a matter of closing the request and handing it to
+ * the hardware (well, leaving it in a queue to be executed). However, we also
+ * offer the ability for batchbuffers to be run with elevated privileges so
+ * that they access otherwise hidden registers. (Used to adjust L3 cache etc.)
+ * Before any batch is given extra privileges we first must check that it
+ * contains no nefarious instructions: we check that each instruction is from
+ * our whitelist and all registers are also from an allowed list. We first
+ * copy the user's batchbuffer to a shadow (so that the user doesn't have
+ * access to it, either by the CPU or GPU as we scan it) and then parse each
+ * instruction. If everything is ok, we set a flag telling the hardware to run
+ * the batchbuffer in trusted mode, otherwise the ioctl is rejected.
+ */
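
As a concrete, purely illustrative picture of the NO_RELOC contract spelled out above, the userspace side might look roughly like the sketch below. The fd, GEM handles and presumed offsets are assumed to come from earlier GEM calls; error handling and the usual drmIoctl() restart-on-signal loop are omitted, and nothing here is the driver's own code — it only mirrors the published uapi.

/*
 * Hypothetical userspace sketch of an execbuf honouring the NO_RELOC rules:
 * every execobject.offset matches the addresses written into the batch, the
 * written target is flagged EXEC_OBJECT_WRITE, and no relocation entries
 * are passed.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int submit_batch(int fd, uint32_t target_handle, uint64_t target_offset,
			uint32_t batch_handle, uint64_t batch_offset,
			uint32_t batch_len)
{
	struct drm_i915_gem_exec_object2 obj[2];
	struct drm_i915_gem_execbuffer2 execbuf;

	memset(obj, 0, sizeof(obj));
	obj[0].handle = target_handle;
	obj[0].offset = target_offset;		/* must match what the batch encodes */
	obj[0].flags = EXEC_OBJECT_WRITE;	/* the batch writes this object */

	obj[1].handle = batch_handle;		/* batch buffer last, by convention */
	obj[1].offset = batch_offset;
	obj[1].relocation_count = 0;		/* offsets trusted: no reloc array */

	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = (uintptr_t)obj;
	execbuf.buffer_count = 2;
	execbuf.batch_len = batch_len;
	execbuf.flags = I915_EXEC_RENDER | I915_EXEC_NO_RELOC |
			I915_EXEC_HANDLE_LUT;

	return ioctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
}
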
+
struct i915_execbuffer {
- struct drm_i915_private *i915;
- struct drm_file *file;
- struct drm_i915_gem_execbuffer2 *args;
- struct drm_i915_gem_exec_object2 *exec;
- struct intel_engine_cs *engine;
- struct i915_gem_context *ctx;
- struct i915_address_space *vm;
- struct i915_vma *batch;
- struct drm_i915_gem_request *request;
- u32 batch_start_offset;
- u32 batch_len;
- unsigned int dispatch_flags;
- struct drm_i915_gem_exec_object2 shadow_exec_entry;
- bool need_relocs;
- struct list_head vmas;
+ struct drm_i915_private *i915; /** i915 backpointer */
+ struct drm_file *file; /** per-file lookup tables and limits */
+ struct drm_i915_gem_execbuffer2 *args; /** ioctl parameters */
+ struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
+
+ struct intel_engine_cs *engine; /** engine to queue the request to */
+ struct i915_gem_context *ctx; /** context for building the request */
+ struct i915_address_space *vm; /** GTT and vma for the request */
+
+ struct drm_i915_gem_request *request; /** our request to build */
+ struct i915_vma *batch; /** identity of the batch obj/vma */
+
+ /** actual size of execobj[] as we may extend it for the cmdparser */
+ unsigned int buffer_count;
+
+ /** list of vma not yet bound during reservation phase */
+ struct list_head unbound;
+
+ /** list of vma that have execobj.relocation_count */
+ struct list_head relocs;
+
+ /**
+ * Track the most recently used object for relocations, as we
+ * frequently have to perform multiple relocations within the same
+ * obj/page
+ */
struct reloc_cache {
- struct drm_mm_node node;
- unsigned long vaddr;
- unsigned int page;
+ struct drm_mm_node node; /** temporary GTT binding */
+ unsigned long vaddr; /** Current kmap address */
+ unsigned long page; /** Currently mapped page index */
bool use_64bit_reloc : 1;
+ bool has_llc : 1;
+ bool has_fence : 1;
+ bool needs_unfenced : 1;
} reloc_cache;
- int lut_mask;
- struct hlist_head *buckets;
+
+ u64 invalid_flags; /** Set of execobj.flags that are invalid */
+ u32 context_flags; /** Set of execobj.flags to insert from the ctx */
+
+ u32 batch_start_offset; /** Location within object of batch */
+ u32 batch_len; /** Length of batch within object */
+ u32 batch_flags; /** Flags composed for emit_bb_start() */
+
+ /**
+ * Indicate either the size of the hashtable used to resolve
+ * relocation handles, or if negative that we are using a direct
+ * index into the execobj[].
+ */
+ int lut_size;
+ struct hlist_head *buckets; /** ht for relocation handles */
};
/*
@@ -87,11 +241,41 @@ struct i915_execbuffer {
#define __exec_to_vma(ee) (ee)->rsvd2
#define exec_to_vma(ee) u64_to_ptr(struct i915_vma, __exec_to_vma(ee))
+/*
+ * Used to convert any address to canonical form.
+ * Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS,
+ * MI_LOAD_REGISTER_MEM and others, see Broadwell PRM Vol2a) require the
+ * addresses to be in a canonical form:
+ * "GraphicsAddress[63:48] are ignored by the HW and assumed to be in correct
+ * canonical form [63:48] == [47]."
+ */
+#define GEN8_HIGH_ADDRESS_BIT 47
+static inline u64 gen8_canonical_addr(u64 address)
+{
+ return sign_extend64(address, GEN8_HIGH_ADDRESS_BIT);
+}
+
+static inline u64 gen8_noncanonical_addr(u64 address)
+{
+ return address & GENMASK_ULL(GEN8_HIGH_ADDRESS_BIT, 0);
+}
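
A quick worked example of the canonical-form rule quoted above, written as a standalone program since sign_extend64() and GENMASK_ULL() are kernel-internal helpers; the two local functions re-implement their behaviour for a 48-bit address space and are not the kernel's.

/*
 * Standalone illustration of the [63:48] == [47] rule for gen8+ addresses.
 */
#include <assert.h>
#include <stdint.h>

#define HIGH_BIT 47

static uint64_t canonical(uint64_t addr)
{
	const int shift = 63 - HIGH_BIT;

	/* unsigned shift up, arithmetic shift down: copies bit 47 into [63:48] */
	return (uint64_t)((int64_t)(addr << shift) >> shift);
}

static uint64_t noncanonical(uint64_t addr)
{
	return addr & ((1ull << (HIGH_BIT + 1)) - 1);	/* keep bits [47:0] */
}

int main(void)
{
	/* Bit 47 clear: the address is already in canonical form. */
	assert(canonical(0x0000001234567000ull) == 0x0000001234567000ull);

	/* Bit 47 set: canonical form sign-extends it through bits [63:48]. */
	assert(canonical(0x0000ffffdeadf000ull) == 0xffffffffdeadf000ull);

	/* drm_mm sees a flat space, so the driver strips that back off. */
	assert(noncanonical(0xffffffffdeadf000ull) == 0x0000ffffdeadf000ull);

	return 0;
}
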
+
static int eb_create(struct i915_execbuffer *eb)
{
- if ((eb->args->flags & I915_EXEC_HANDLE_LUT) == 0) {
- unsigned int size = 1 + ilog2(eb->args->buffer_count);
+ if (!(eb->args->flags & I915_EXEC_HANDLE_LUT)) {
+ unsigned int size = 1 + ilog2(eb->buffer_count);
+ /*
+ * Without a 1:1 association between relocation handles and
+ * the execobject[] index, we instead create a hashtable.
+ * We size it dynamically based on available memory, starting
+ * first with 1:1 associative hash and scaling back until
+ * the allocation succeeds.
+ *
+ * Later on we use a positive lut_size to indicate we are
+ * using this hashtable, and a negative value to indicate a
+ * direct lookup.
+ */
do {
eb->buckets = kzalloc(sizeof(struct hlist_head) << size,
GFP_TEMPORARY |
@@ -108,112 +292,411 @@ static int eb_create(struct i915_execbuffer *eb)
return -ENOMEM;
}
- eb->lut_mask = size;
+ eb->lut_size = size;
} else {
- eb->lut_mask = -eb->args->buffer_count;
+ eb->lut_size = -eb->buffer_count;
}
return 0;
}
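
The sign convention described in the comment above (positive lut_size = hashtable bits, negative = direct index into execobj[]) later drives handle lookup. Below is a hedged sketch of that dual-mode lookup with invented types; the single-slot "chain" stands in for the real hlist walk and none of these names belong to the driver.

/*
 * Illustrative only: how a lookup might branch on the sign of lut_size.
 */
#include <stdint.h>

struct fake_eb {
	int lut_size;		/* > 0: hash bits; < 0: -buffer_count */
	void **direct;		/* direct table indexed by handle */
	void **buckets;		/* hashtable heads (chains elided here) */
};

static uint32_t hash_32(uint32_t val, unsigned int bits)
{
	return (val * 0x61C88647u) >> (32 - bits);	/* golden-ratio hash */
}

static void *lookup(const struct fake_eb *eb, uint32_t handle)
{
	if (eb->lut_size < 0) {
		/* I915_EXEC_HANDLE_LUT: handle is already an execobj[] index */
		if (handle >= (uint32_t)-eb->lut_size)
			return NULL;
		return eb->direct[handle];
	}

	/* Otherwise hash the handle into one of 1 << lut_size buckets. */
	return eb->buckets[hash_32(handle, eb->lut_size)];
}
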
+static bool
+eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry,
+ const struct i915_vma *vma)
+{
+ if (!(entry->flags & __EXEC_OBJECT_HAS_PIN))
+ return true;
+
+ if (vma->node.size < entry->pad_to_size)
+ return true;
+
+ if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment))
+ return true;
+
+ if (entry->flags & EXEC_OBJECT_PINNED &&
+ vma->node.start != entry->offset)
+ return true;
+
+ if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS &&
+ vma->node.start < BATCH_OFFSET_BIAS)
+ return true;
+
+ if (!(entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) &&
+ (vma->node.start + vma->node.size - 1) >> 32)
+ return true;
+
+ return false;
+}
+
+static inline void
+eb_pin_vma(struct i915_execbuffer *eb,
+ struct drm_i915_gem_exec_object2 *entry,
+ struct i915_vma *vma)
+{
+ u64 flags;
+
+ flags = vma->node.start;
+ flags |= PIN_USER | PIN_NONBLOCK | PIN_OFFSET_FIXED;
+ if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_GTT))
+ flags |= PIN_GLOBAL;
+ if (unlikely(i915_vma_pin(vma, 0, 0, flags)))
+ return;
+
+ if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
+ if (unlikely(i915_vma_get_fence(vma))) {
+ i915_vma_unpin(vma);
+ return;
+ }
+
+ if (i915_vma_pin_fence(vma))
+ entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+ }
+
+ entry->flags |= __EXEC_OBJECT_HAS_PIN;
+}
+
static inline void
__eb_unreserve_vma(struct i915_vma *vma,
const struct drm_i915_gem_exec_object2 *entry)
{
+ GEM_BUG_ON(!(entry->flags & __EXEC_OBJECT_HAS_PIN));
+
if (unlikely(entry->flags & __EXEC_OBJECT_HAS_FENCE))
i915_vma_unpin_fence(vma);
- if (entry->flags & __EXEC_OBJECT_HAS_PIN)
- __i915_vma_unpin(vma);
+ __i915_vma_unpin(vma);
}
-static void
-eb_unreserve_vma(struct i915_vma *vma)
+static inline void
+eb_unreserve_vma(struct i915_vma *vma,
+ struct drm_i915_gem_exec_object2 *entry)
{
- struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+ if (!(entry->flags & __EXEC_OBJECT_HAS_PIN))
+ return;
__eb_unreserve_vma(vma, entry);
- entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
+ entry->flags &= ~__EXEC_OBJECT_RESERVED;
}
-static void
-eb_reset(struct i915_execbuffer *eb)
+static int
+eb_validate_vma(struct i915_execbuffer *eb,
+ struct drm_i915_gem_exec_object2 *entry,
+ struct i915_vma *vma)
{
- struct i915_vma *vma;
+ if (unlikely(entry->flags & eb->invalid_flags))
+ return -EINVAL;
- list_for_each_entry(vma, &eb->vmas, exec_link) {
- eb_unreserve_vma(vma);
- i915_vma_put(vma);
- vma->exec_entry = NULL;
+ if (unlikely(entry->alignment && !is_power_of_2(entry->alignment)))
+ return -EINVAL;
+
+ /*
+ * Offset can be used as input (EXEC_OBJECT_PINNED), reject
+ * any non-page-aligned or non-canonical addresses.
+ */
+ if (unlikely(entry->flags & EXEC_OBJECT_PINNED &&
+ entry->offset != gen8_canonical_addr(entry->offset & PAGE_MASK)))
+ return -EINVAL;
+
+ /* pad_to_size was once a reserved field, so sanitize it */
+ if (entry->flags & EXEC_OBJECT_PAD_TO_SIZE) {
+ if (unlikely(offset_in_page(entry->pad_to_size)))
+ return -EINVAL;
+ } else {
+ entry->pad_to_size = 0;
}
- if (eb->lut_mask >= 0)
- memset(eb->buckets, 0,
- sizeof(struct hlist_head) << eb->lut_mask);
+ if (unlikely(vma->exec_entry)) {
+ DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
+ entry->handle, (int)(entry - eb->exec));
+ return -EINVAL;
+ }
+
+ /*
+ * From drm_mm perspective address space is continuous,
+ * so from this point we're always using non-canonical
+ * form internally.
+ */
+ entry->offset = gen8_noncanonical_addr(entry->offset);
+
+ return 0;
}
-static bool
-eb_add_vma(struct i915_execbuffer *eb, struct i915_vma *vma, int i)
+static int
+eb_add_vma(struct i915_execbuffer *eb,
+ struct drm_i915_gem_exec_object2 *entry,
+ struct i915_vma *vma)
{
- if (unlikely(vma->exec_entry)) {
- DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
- eb->exec[i].handle, i);
- return false;
+ int err;
+
+ GEM_BUG_ON(i915_vma_is_closed(vma));
+
+ if (!(eb->args->flags & __EXEC_VALIDATED)) {
+ err = eb_validate_vma(eb, entry, vma);
+ if (unlikely(err))
+ return err;
}
- list_add_tail(&vma->exec_link, &eb->vmas);
- vma->exec_entry = &eb->exec[i];
- if (eb->lut_mask >= 0) {
- vma->exec_handle = eb->exec[i].handle;
+ if (eb->lut_size >= 0) {
+ vma->exec_handle = entry->handle;
hlist_add_head(&vma->exec_node,
- &eb->buckets[hash_32(vma->exec_handle,
- eb->lut_mask)]);
+ &eb->buckets[hash_32(entry->handle,
+ eb->lut_size)]);
}
- i915_vma_get(vma);
- __exec_to_vma(&eb->exec[i]) = (uintptr_t)vma;
- return true;
+ if (entry->relocation_count)
+ list_add_tail(&vma->reloc_link, &eb->relocs);
+
+ if (!eb->reloc_cache.has_fence) {
+ entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
+ } else {
+ if ((entry->flags & EXEC_OBJECT_NEEDS_FENCE ||
+ eb->reloc_cache.needs_unfenced) &&
+ i915_gem_object_is_tiled(vma->obj))
+ entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP;
+ }
+
+ if (!(entry->flags & EXEC_OBJECT_PINNED))
+ entry->flags |= eb->context_flags;
+
+ /*
+ * Stash a pointer from the vma to execobj, so we can query its flags,
+ * size, alignment etc as provided by the user. Also we stash a pointer
+ * to the vma inside the execobj so that we can use a direct lookup
+ * to find the right target VMA when doing relocations.
+ */
+ vma->exec_entry = entry;
+ __exec_to_vma(entry) = (uintptr_t)i915_vma_get(vma);
+
+ err = 0;
+ if (vma->node.size)
+ eb_pin_vma(eb, entry, vma);
+ if (eb_vma_misplaced(entry, vma)) {
+ eb_unreserve_vma(vma, entry);
+
+ list_add_tail(&vma->exec_link, &eb->unbound);
+ if (drm_mm_node_allocated(&vma->node))
+ err = i915_vma_unbind(vma);
+ } else {
+ if (entry->offset != vma->node.start) {
+ entry->offset = vma->node.start | UPDATE;
+ eb->args->flags |= __EXEC_HAS_RELOC;
+ }
+ }
+ return err;
+}
+
+static inline int use_cpu_reloc(const struct reloc_cache *cache,
+ const struct drm_i915_gem_object *obj)
+{
+ if (!i915_gem_object_has_struct_page(obj))
+ return false;
+
+ if (DBG_USE_CPU_RELOC)
+ return DBG_USE_CPU_RELOC > 0;
+
+ return (cache->has_llc ||
+ obj->cache_dirty ||
+ obj->cache_level != I915_CACHE_NONE);
+}
+
+static int eb_reserve_vma(const struct i915_execbuffer *eb,
+ struct i915_vma *vma)
+{
+ struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+ u64 flags;
+ int err;
+
+ flags = PIN_USER | PIN_NONBLOCK;
+ if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
+ flags |= PIN_GLOBAL;
+
+ /*
+ * Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
+ * limit address to the first 4GBs for unflagged objects.
+ */
+ if (!(entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS))
+ flags |= PIN_ZONE_4G;
+
+ if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
+ flags |= PIN_MAPPABLE;
+
+ if (entry->flags & EXEC_OBJECT_PINNED) {
+ flags |= entry->offset | PIN_OFFSET_FIXED;
+ flags &= ~PIN_NONBLOCK; /* force overlapping PINNED checks */
+ } else if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS) {
+ flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
+ }
+
+ err = i915_vma_pin(vma, entry->pad_to_size, entry->alignment, flags);
+ if (err)
+ return err;
+
+ if (entry->offset != vma->node.start) {
+ entry->offset = vma->node.start | UPDATE;
+ eb->args->flags |= __EXEC_HAS_RELOC;
+ }
+
+ entry->flags |= __EXEC_OBJECT_HAS_PIN;
+ GEM_BUG_ON(eb_vma_misplaced(entry, vma));
+
+ if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
+ err = i915_vma_get_fence(vma);
+ if (unlikely(err)) {
+ i915_vma_unpin(vma);
+ return err;
+ }
+
+ if (i915_vma_pin_fence(vma))
+ entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+ }
+
+ return 0;
+}
+
+static int eb_reserve(struct i915_execbuffer *eb)
+{
+ const unsigned int count = eb->buffer_count;
+ struct list_head last;
+ struct i915_vma *vma;
+ unsigned int i, pass;
+ int err;
+
+ /*
+ * Attempt to pin all of the buffers into the GTT.
+ * This is done in 3 phases:
+ *
+ * 1a. Unbind all objects that do not match the GTT constraints for
+ * the execbuffer (fenceable, mappable, alignment etc).
+ * 1b. Increment pin count for already bound objects.
+ * 2. Bind new objects.
+ * 3. Decrement pin count.
+ *
+ * This avoids unnecessary unbinding of later objects in order to make
+ * room for the earlier objects *unless* we need to defragment.
+ */
+
+ pass = 0;
+ err = 0;
+ do {
+ list_for_each_entry(vma, &eb->unbound, exec_link) {
+ err = eb_reserve_vma(eb, vma);
+ if (err)
+ break;
+ }
+ if (err != -ENOSPC)
+ return err;
+
+ /* Resort *all* the objects into priority order */
+ INIT_LIST_HEAD(&eb->unbound);
+ INIT_LIST_HEAD(&last);
+ for (i = 0; i < count; i++) {
+ struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+
+ if (entry->flags & EXEC_OBJECT_PINNED &&
+ entry->flags & __EXEC_OBJECT_HAS_PIN)
+ continue;
+
+ vma = exec_to_vma(entry);
+ eb_unreserve_vma(vma, entry);
+
+ if (entry->flags & EXEC_OBJECT_PINNED)
+ list_add(&vma->exec_link, &eb->unbound);
+ else if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
+ list_add_tail(&vma->exec_link, &eb->unbound);
+ else
+ list_add_tail(&vma->exec_link, &last);
+ }
+ list_splice_tail(&last, &eb->unbound);
+
+ switch (pass++) {
+ case 0:
+ break;
+
+ case 1:
+ /* Too fragmented, unbind everything and retry */
+ err = i915_gem_evict_vm(eb->vm);
+ if (err)
+ return err;
+ break;
+
+ default:
+ return -ENOSPC;
+ }
+ } while (1);
}
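
The retry escalation in eb_reserve() (re-sort, then evict the whole VM via the new i915_gem_evict_vm(), then give up) boils down to the shape sketched below; try_bind_all(), resort_by_difficulty() and evict_whole_vm() are stubs invented for illustration, not the driver's functions.

/*
 * Shape of the eb_reserve() escalation only.
 */
#include <errno.h>

static int try_bind_all(void) { return 0; }		/* 0, -ENOSPC or error */
static void resort_by_difficulty(void) { }		/* pinned, NEEDS_MAP, rest */
static int evict_whole_vm(void) { return 0; }		/* i915_gem_evict_vm() role */

static int reserve(void)
{
	unsigned int pass = 0;
	int err;

	do {
		err = try_bind_all();
		if (err != -ENOSPC)
			return err;		/* bound everything, or hard error */

		resort_by_difficulty();

		switch (pass++) {
		case 0:
			break;			/* retry: new order may be enough */
		case 1:
			err = evict_whole_vm();	/* defragment the address space */
			if (err)
				return err;
			break;
		default:
			return -ENOSPC;		/* it genuinely does not fit */
		}
	} while (1);
}

int main(void)
{
	return reserve();
}
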
static inline struct hlist_head *
-ht_head(const struct i915_gem_context *ctx, u32 handle)
+ht_head(const struct i915_gem_context_vma_lut *lut, u32 handle)
{
- return &ctx->vma_lut.ht[hash_32(handle, ctx->vma_lut.ht_bits)];
+ return &lut->ht[hash_32(handle, lut->ht_bits)];
}
static inline bool
-ht_needs_resize(const struct i915_gem_context *ctx)
+ht_needs_resize(const struct i915_gem_context_vma_lut *lut)
{
- return (4*ctx->vma_lut.ht_count > 3*ctx->vma_lut.ht_size ||
- 4*ctx->vma_lut.ht_count + 1 < ctx->vma_lut.ht_size);
+ return (4*lut->ht_count > 3*lut->ht_size ||
+ 4*lut->ht_count + 1 < lut->ht_size);
}
-static int
-eb_lookup_vmas(struct i915_execbuffer *eb)
+static unsigned int eb_batch_index(const struct i915_execbuffer *eb)
+{
+ return eb->buffer_count - 1;
+}
+
+static int eb_select_context(struct i915_execbuffer *eb)
+{
+ struct i915_gem_context *ctx;
+
+ ctx = i915_gem_context_lookup(eb->file->driver_priv, eb->args->rsvd1);
+ if (unlikely(IS_ERR(ctx)))
+ return PTR_ERR(ctx);
+
+ if (unlikely(i915_gem_context_is_banned(ctx))) {
+ DRM_DEBUG("Context %u tried to submit while banned\n",
+ ctx->user_handle);
+ return -EIO;
+ }
+
+ eb->ctx = i915_gem_context_get(ctx);
+ eb->vm = ctx->ppgtt ? &ctx->ppgtt->base : &eb->i915->ggtt.base;
+
+ eb->context_flags = 0;
+ if (ctx->flags & CONTEXT_NO_ZEROMAP)
+ eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS;
+
+ return 0;
+}
+
+static int eb_lookup_vmas(struct i915_execbuffer *eb)
{
#define INTERMEDIATE BIT(0)
- const int count = eb->args->buffer_count;
+ const unsigned int count = eb->buffer_count;
+ struct i915_gem_context_vma_lut *lut = &eb->ctx->vma_lut;
struct i915_vma *vma;
+ struct idr *idr;
+ unsigned int i;
int slow_pass = -1;
- int i;
+ int err;
- INIT_LIST_HEAD(&eb->vmas);
+ INIT_LIST_HEAD(&eb->relocs);
+ INIT_LIST_HEAD(&eb->unbound);
- if (unlikely(eb->ctx->vma_lut.ht_size & I915_CTX_RESIZE_IN_PROGRESS))
- flush_work(&eb->ctx->vma_lut.resize);
- GEM_BUG_ON(eb->ctx->vma_lut.ht_size & I915_CTX_RESIZE_IN_PROGRESS);
+ if (unlikely(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS))
+ flush_work(&lut->resize);
+ GEM_BUG_ON(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS);
for (i = 0; i < count; i++) {
__exec_to_vma(&eb->exec[i]) = 0;
hlist_for_each_entry(vma,
- ht_head(eb->ctx, eb->exec[i].handle),
+ ht_head(lut, eb->exec[i].handle),
ctx_node) {
if (vma->ctx_handle != eb->exec[i].handle)
continue;
- if (!eb_add_vma(eb, vma, i))
- return -EINVAL;
+ err = eb_add_vma(eb, &eb->exec[i], vma);
+ if (unlikely(err))
+ return err;
goto next_vma;
}
@@ -224,24 +707,27 @@ next_vma: ;
}
if (slow_pass < 0)
- return 0;
+ goto out;
spin_lock(&eb->file->table_lock);
- /* Grab a reference to the object and release the lock so we can lookup
- * or create the VMA without using GFP_ATOMIC */
+ /*
+ * Grab a reference to the object and release the lock so we can lookup
+ * or create the VMA without using GFP_ATOMIC
+ */
+ idr = &eb->file->object_idr;
for (i = slow_pass; i < count; i++) {
struct drm_i915_gem_object *obj;
if (__exec_to_vma(&eb->exec[i]))
continue;
- obj = to_intel_bo(idr_find(&eb->file->object_idr,
- eb->exec[i].handle));
+ obj = to_intel_bo(idr_find(idr, eb->exec[i].handle));
if (unlikely(!obj)) {
spin_unlock(&eb->file->table_lock);
DRM_DEBUG("Invalid object handle %d at index %d\n",
eb->exec[i].handle, i);
- return -ENOENT;
+ err = -ENOENT;
+ goto err;
}
__exec_to_vma(&eb->exec[i]) = INTERMEDIATE | (uintptr_t)obj;
@@ -251,7 +737,7 @@ next_vma: ;
for (i = slow_pass; i < count; i++) {
struct drm_i915_gem_object *obj;
- if ((__exec_to_vma(&eb->exec[i]) & INTERMEDIATE) == 0)
+ if (!(__exec_to_vma(&eb->exec[i]) & INTERMEDIATE))
continue;
/*
@@ -262,12 +748,13 @@ next_vma: ;
* from the (obj, vm) we don't run the risk of creating
* duplicated vmas for the same vm.
*/
- obj = u64_to_ptr(struct drm_i915_gem_object,
+ obj = u64_to_ptr(typeof(*obj),
__exec_to_vma(&eb->exec[i]) & ~INTERMEDIATE);
vma = i915_vma_instance(obj, eb->vm, NULL);
if (unlikely(IS_ERR(vma))) {
DRM_DEBUG("Failed to lookup VMA\n");
- return PTR_ERR(vma);
+ err = PTR_ERR(vma);
+ goto err;
}
/* First come, first served */
@@ -275,32 +762,31 @@ next_vma: ;
vma->ctx = eb->ctx;
vma->ctx_handle = eb->exec[i].handle;
hlist_add_head(&vma->ctx_node,
- ht_head(eb->ctx, eb->exec[i].handle));
- eb->ctx->vma_lut.ht_count++;
+ ht_head(lut, eb->exec[i].handle));
+ lut->ht_count++;
+ lut->ht_size |= I915_CTX_RESIZE_IN_PROGRESS;
if (i915_vma_is_ggtt(vma)) {
GEM_BUG_ON(obj->vma_hashed);
obj->vma_hashed = vma;
}
}
- if (!eb_add_vma(eb, vma, i))
- return -EINVAL;
+ err = eb_add_vma(eb, &eb->exec[i], vma);
+ if (unlikely(err))
+ goto err;
}
- if (ht_needs_resize(eb->ctx)) {
- eb->ctx->vma_lut.ht_size |= I915_CTX_RESIZE_IN_PROGRESS;
- queue_work(system_highpri_wq, &eb->ctx->vma_lut.resize);
+ if (lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS) {
+ if (ht_needs_resize(lut))
+ queue_work(system_highpri_wq, &lut->resize);
+ else
+ lut->ht_size &= ~I915_CTX_RESIZE_IN_PROGRESS;
}
- return 0;
-#undef INTERMEDIATE
-}
-
-static struct i915_vma *
-eb_get_batch(struct i915_execbuffer *eb)
-{
- struct i915_vma *vma =
- exec_to_vma(&eb->exec[eb->args->buffer_count - 1]);
+out:
+ /* take note of the batch buffer before we might reorder the lists */
+ i = eb_batch_index(eb);
+ eb->batch = exec_to_vma(&eb->exec[i]);
/*
* SNA is doing fancy tricks with compressing batch buffers, which leads
@@ -311,24 +797,36 @@ eb_get_batch(struct i915_execbuffer *eb)
* Note that actual hangs have only been observed on gen7, but for
* paranoia do it everywhere.