 drivers/gpu/drm/i915/i915_drv.h                  |    2
 drivers/gpu/drm/i915/i915_gem_evict.c            |   92
 drivers/gpu/drm/i915/i915_gem_execbuffer.c       | 2038
 drivers/gpu/drm/i915/i915_vma.c                  |    2
 drivers/gpu/drm/i915/i915_vma.h                  |    1
 drivers/gpu/drm/i915/selftests/i915_gem_evict.c  |    4
 drivers/gpu/drm/i915/selftests/i915_vma.c        |   16
 7 files changed, 1239 insertions(+), 916 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index af2a54672396..7e182dd7e356 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3581,7 +3581,7 @@ int __must_check i915_gem_evict_something(struct i915_address_space *vm,
 int __must_check i915_gem_evict_for_node(struct i915_address_space *vm,
                                          struct drm_mm_node *node,
                                          unsigned int flags);
-int i915_gem_evict_vm(struct i915_address_space *vm, bool do_idle);
+int i915_gem_evict_vm(struct i915_address_space *vm);
 
 /* belongs in i915_gem_gtt.h */
 static inline void i915_gem_chipset_flush(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index 204a2d9288ae..a193f1b36c67 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -50,6 +50,29 @@
         return true;
 }
 
+static int ggtt_flush(struct drm_i915_private *i915)
+{
+        int err;
+
+        /* Not everything in the GGTT is tracked via vma (otherwise we
+         * could evict as required with minimal stalling) so we are forced
+         * to idle the GPU and explicitly retire outstanding requests in
+         * the hopes that we can then remove contexts and the like only
+         * bound by their active reference.
+         */
+        err = i915_gem_switch_to_kernel_context(i915);
+        if (err)
+                return err;
+
+        err = i915_gem_wait_for_idle(i915,
+                                     I915_WAIT_INTERRUPTIBLE |
+                                     I915_WAIT_LOCKED);
+        if (err)
+                return err;
+
+        return 0;
+}
+
 static bool
 mark_free(struct drm_mm_scan *scan,
           struct i915_vma *vma,
@@ -175,19 +198,7 @@ search_again:
                 return intel_has_pending_fb_unpin(dev_priv) ? -EAGAIN : -ENOSPC;
         }
 
-        /* Not everything in the GGTT is tracked via vma (otherwise we
-         * could evict as required with minimal stalling) so we are forced
-         * to idle the GPU and explicitly retire outstanding requests in
-         * the hopes that we can then remove contexts and the like only
-         * bound by their active reference.
-         */
-        ret = i915_gem_switch_to_kernel_context(dev_priv);
-        if (ret)
-                return ret;
-
-        ret = i915_gem_wait_for_idle(dev_priv,
-                                     I915_WAIT_INTERRUPTIBLE |
-                                     I915_WAIT_LOCKED);
+        ret = ggtt_flush(dev_priv);
         if (ret)
                 return ret;
 
@@ -337,10 +348,8 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
 /**
  * i915_gem_evict_vm - Evict all idle vmas from a vm
  * @vm: Address space to cleanse
- * @do_idle: Boolean directing whether to idle first.
  *
- * This function evicts all idles vmas from a vm. If all unpinned vmas should be
- * evicted the @do_idle needs to be set to true.
+ * This function evicts all vmas from a vm.
 *
  * This is used by the execbuf code as a last-ditch effort to defragment the
  * address space.
 *
 * To clarify: This is for freeing up virtual address space, not for freeing
 * memory in e.g. the shrinker.
  */
-int i915_gem_evict_vm(struct i915_address_space *vm, bool do_idle)
+int i915_gem_evict_vm(struct i915_address_space *vm)
 {
+        struct list_head *phases[] = {
+                &vm->inactive_list,
+                &vm->active_list,
+                NULL
+        }, **phase;
+        struct list_head eviction_list;
         struct i915_vma *vma, *next;
         int ret;
 
         lockdep_assert_held(&vm->i915->drm.struct_mutex);
         trace_i915_gem_evict_vm(vm);
 
-        if (do_idle) {
-                struct drm_i915_private *dev_priv = vm->i915;
-
-                if (i915_is_ggtt(vm)) {
-                        ret = i915_gem_switch_to_kernel_context(dev_priv);
-                        if (ret)
-                                return ret;
-                }
-
-                ret = i915_gem_wait_for_idle(dev_priv,
-                                             I915_WAIT_INTERRUPTIBLE |
-                                             I915_WAIT_LOCKED);
+        /* Switch back to the default context in order to unpin
+         * the existing context objects. However, such objects only
+         * pin themselves inside the global GTT and performing the
+         * switch otherwise is ineffective.
+         */
+        if (i915_is_ggtt(vm)) {
+                ret = ggtt_flush(vm->i915);
                 if (ret)
                         return ret;
-
-                WARN_ON(!list_empty(&vm->active_list));
         }
 
-        list_for_each_entry_safe(vma, next, &vm->inactive_list, vm_link)
-                if (!i915_vma_is_pinned(vma))
-                        WARN_ON(i915_vma_unbind(vma));
+        INIT_LIST_HEAD(&eviction_list);
+        phase = phases;
+        do {
+                list_for_each_entry(vma, *phase, vm_link) {
+                        if (i915_vma_is_pinned(vma))
+                                continue;
 
-        return 0;
+                        __i915_vma_pin(vma);
+                        list_add(&vma->evict_link, &eviction_list);
+                }
+        } while (*++phase);
+
+        ret = 0;
+        list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
+                __i915_vma_unpin(vma);
+                if (ret == 0)
+                        ret = i915_vma_unbind(vma);
+        }
+        return ret;
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
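The rewritten i915_gem_evict_vm() above walks the inactive and active lists through a NULL-terminated array of list heads, pinning candidates in a first pass and unbinding them in a second. The phase-array idiom itself is easy to miss; the sketch below reproduces just that control flow in standalone C, with arrays of strings standing in for the kernel's list_heads (all names here are illustrative, nothing is from the i915 API).

#include <stdio.h>

/* Minimal sketch of the NULL-terminated "phases" idiom used above:
 * walk several collections with a single loop body. */
int main(void)
{
        const char *inactive[] = { "vma-a", "vma-b", NULL };
        const char *active[] = { "vma-c", NULL };
        const char **phases[] = { inactive, active, NULL }, ***phase;

        phase = phases;
        do {
                for (const char **vma = *phase; *vma; vma++)
                        printf("evicting %s\n", *vma);
        } while (*++phase);

        return 0;
}

The do/while shape matters: the first phase is processed unconditionally, and the loop stops as soon as the sentinel NULL entry is reached.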
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 9c3f6c40270f..a052072fe8b3 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -42,41 +42,195 @@
 
 #define DBG_USE_CPU_RELOC 0 /* -1 force GTT relocs; 1 force CPU relocs */
 
-#define __EXEC_OBJECT_HAS_PIN (1<<31)
-#define __EXEC_OBJECT_HAS_FENCE (1<<30)
-#define __EXEC_OBJECT_NEEDS_MAP (1<<29)
-#define __EXEC_OBJECT_NEEDS_BIAS (1<<28)
-#define __EXEC_OBJECT_INTERNAL_FLAGS (0xf<<28) /* all of the above */
+#define __EXEC_OBJECT_HAS_PIN BIT(31)
+#define __EXEC_OBJECT_HAS_FENCE BIT(30)
+#define __EXEC_OBJECT_NEEDS_MAP BIT(29)
+#define __EXEC_OBJECT_NEEDS_BIAS BIT(28)
+#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 28) /* all of the above */
+#define __EXEC_OBJECT_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)
+
+#define __EXEC_HAS_RELOC BIT(31)
+#define __EXEC_VALIDATED BIT(30)
+#define UPDATE PIN_OFFSET_FIXED
 
 #define BATCH_OFFSET_BIAS (256*1024)
 
 #define __I915_EXEC_ILLEGAL_FLAGS \
         (__I915_EXEC_UNKNOWN_FLAGS | I915_EXEC_CONSTANTS_MASK)
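Note the switch from (0xf << 28) to (~0u << 28) for the internal-flag mask: the new form covers every bit from the shift position upwards, so claiming another internal flag only means lowering the shift count rather than recounting the 0xf. A standalone sanity check, with BIT() re-defined locally since it is a kernel macro:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BIT(n) (1u << (n))

#define __EXEC_OBJECT_HAS_PIN           BIT(31)
#define __EXEC_OBJECT_HAS_FENCE         BIT(30)
#define __EXEC_OBJECT_NEEDS_MAP         BIT(29)
#define __EXEC_OBJECT_NEEDS_BIAS        BIT(28)
#define __EXEC_OBJECT_INTERNAL_FLAGS    (~0u << 28)

int main(void)
{
        /* (~0u << 28) is exactly the union of the four flags above */
        uint32_t all = __EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE |
                       __EXEC_OBJECT_NEEDS_MAP | __EXEC_OBJECT_NEEDS_BIAS;

        assert(__EXEC_OBJECT_INTERNAL_FLAGS == all);
        printf("internal flags mask: %#x\n", __EXEC_OBJECT_INTERNAL_FLAGS);
        return 0;
}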
+/**
+ * DOC: User command execution
+ *
+ * Userspace submits commands to be executed on the GPU as an instruction
+ * stream within a GEM object we call a batchbuffer. These instructions may
+ * refer to other GEM objects containing auxiliary state such as kernels,
+ * samplers, render targets and even secondary batchbuffers. Userspace does
+ * not know where in the GPU memory these objects reside and so before the
+ * batchbuffer is passed to the GPU for execution, those addresses in the
+ * batchbuffer and auxiliary objects are updated. This is known as relocation,
+ * or patching. To try and avoid having to relocate each object on the next
+ * execution, userspace is told the location of those objects in this pass,
+ * but this remains just a hint as the kernel may choose a new location for
+ * any object in the future.
+ *
+ * Processing an execbuf ioctl is conceptually split up into a few phases.
+ *
+ * 1. Validation - Ensure all the pointers, handles and flags are valid.
+ * 2. Reservation - Assign GPU address space for every object
+ * 3. Relocation - Update any addresses to point to the final locations
+ * 4. Serialisation - Order the request with respect to its dependencies
+ * 5. Construction - Construct a request to execute the batchbuffer
+ * 6. Submission (at some point in the future execution)
+ *
+ * Reserving resources for the execbuf is the most complicated phase. We
+ * neither want to have to migrate the object in the address space, nor do
+ * we want to have to update any relocations pointing to this object. Ideally,
+ * we want to leave the object where it is and for all the existing relocations
+ * to match. If the object is given a new address, or if userspace thinks the
+ * object is elsewhere, we have to parse all the relocation entries and update
+ * the addresses. Userspace can set the I915_EXEC_NO_RELOC flag to hint that
+ * all the target addresses in all of its objects match the value in the
+ * relocation entries and that they all match the presumed offsets given by the
+ * list of execbuffer objects. Using this knowledge, we know that if we haven't
+ * moved any buffers, all the relocation entries are valid and we can skip
+ * the update. (If userspace is wrong, the likely outcome is an impromptu GPU
+ * hang.) The requirements for using I915_EXEC_NO_RELOC are:
+ *
+ * The addresses written in the objects must match the corresponding
+ * reloc.presumed_offset which in turn must match the corresponding
+ * execobject.offset.
+ *
+ * Any render targets written to in the batch must be flagged with
+ * EXEC_OBJECT_WRITE.
+ *
+ * To avoid stalling, execobject.offset should match the current
+ * address of that object within the active context.
+ *
+ * The reservation is done in multiple phases. First we try and keep any
+ * object already bound in its current location - so long as it meets the
+ * constraints imposed by the new execbuffer. Any object left unbound after the
+ * first pass is then fitted into any available idle space. If an object does
+ * not fit, all objects are removed from the reservation and the process rerun
+ * after sorting the objects into a priority order (more difficult to fit
+ * objects are tried first). Failing that, the entire VM is cleared and we try
+ * to fit the execbuf one last time before concluding that it simply will not
+ * fit.
+ *
+ * A small complication to all of this is that we allow userspace not only to
+ * specify an alignment and a size for the object in the address space, but
+ * we also allow userspace to specify the exact offset. These objects are
+ * simpler to place (the location is known a priori); all we have to do is make
+ * sure the space is available.
+ *
+ * Once all the objects are in place, patching up the buried pointers to point
+ * to the final locations is a fairly simple job of walking over the relocation
+ * entry arrays, looking up the right address and rewriting the value into
+ * the object. Simple! ... The relocation entries are stored in user memory
+ * and so to access them we have to copy them into a local buffer. That copy
+ * has to avoid taking any pagefaults as they may lead back to a GEM object
+ * requiring the struct_mutex (i.e. recursive deadlock). So once again we split
+ * the relocation into multiple passes. First we try to do everything within an
+ * atomic context (avoid the pagefaults) which requires that we never wait. If
+ * we detect that we may wait, or if we need to fault, then we have to fallback
+ * to a slower path. The slowpath has to drop the mutex. (Can you hear alarm
+ * bells yet?) Dropping the mutex means that we lose all the state we have
+ * built up so far for the execbuf and we must reset any global data. However,
+ * we do leave the objects pinned in their final locations - which is a
+ * potential issue for concurrent execbufs. Once we have left the mutex, we can
+ * allocate and copy all the relocation entries into a large array at our
+ * leisure, reacquire the mutex, reclaim all the objects and other state and
+ * then proceed to update any incorrect addresses with the objects.
+ *
+ * As we process the relocation entries, we maintain a record of whether the
+ * object is being written to. Using NO_RELOC, we expect userspace to provide
+ * this information instead. We also check whether we can skip the relocation
+ * by comparing the expected value inside the relocation entry with the target's
+ * final address. If they differ, we have to map the current object and rewrite
+ * the 4 or 8 byte pointer within.
+ *
+ * Serialising an execbuf is quite simple according to the rules of the GEM
+ * ABI. Execution within each context is ordered by the order of submission.
+ * Writes to any GEM object are in order of submission and are exclusive. Reads
+ * from a GEM object are unordered with respect to other reads, but ordered by
+ * writes. A write submitted after a read cannot occur before the read, and
+ * similarly any read submitted after a write cannot occur before the write.
+ * Writes are ordered between engines such that only one write occurs at any
+ * time (completing any reads beforehand) - using semaphores where available
+ * and CPU serialisation otherwise. Other GEM accesses obey the same rules, any
+ * write (either via mmaps using set-domain, or via pwrite) must flush all GPU
+ * reads before starting, and any read (either using set-domain or pread) must
+ * flush all GPU writes before starting. (Note we only employ a barrier before,
+ * we currently rely on userspace not concurrently starting a new execution
+ * whilst reading or writing to an object. This may be an advantage or not
+ * depending on how much you trust userspace not to shoot themselves in the
+ * foot.) Serialisation may just result in the request being inserted into
+ * a DAG awaiting its turn, but the simplest is to wait on the CPU until
+ * all dependencies are resolved.
+ *
+ * After all of that, it is just a matter of closing the request and handing it
+ * to the hardware (well, leaving it in a queue to be executed). However, we
+ * also offer the ability for batchbuffers to be run with elevated privileges
+ * so that they access otherwise hidden registers. (Used to adjust L3 cache
+ * etc.) Before any batch is given extra privileges we first must check that it
+ * contains no nefarious instructions, we check that each instruction is from
+ * our whitelist and all registers are also from an allowed list. We first
+ * copy the user's batchbuffer to a shadow (so that the user doesn't have
+ * access to it, either by the CPU or GPU as we scan it) and then parse each
+ * instruction. If everything is ok, we set a flag telling the hardware to run
+ * the batchbuffer in trusted mode, otherwise the ioctl is rejected.
+ */
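For the NO_RELOC contract spelled out above, a minimal userspace sketch may help. It assumes a DRM fd and GEM handles obtained elsewhere, uses only the standard drm/i915_drm.h uapi, and omits error handling; the function name and the offset parameters are hypothetical.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Submit a two-object batch under the NO_RELOC rules: execobj.offset
 * matches both the addresses written in the batch and each
 * reloc.presumed_offset, and written targets carry EXEC_OBJECT_WRITE. */
static int submit_noreloc(int fd, uint32_t batch, uint32_t target,
                          uint64_t batch_offset, uint64_t target_offset)
{
        struct drm_i915_gem_exec_object2 obj[2];
        struct drm_i915_gem_execbuffer2 execbuf;

        memset(obj, 0, sizeof(obj));
        obj[0].handle = target;
        obj[0].offset = target_offset;    /* must match presumed_offset */
        obj[0].flags = EXEC_OBJECT_WRITE; /* render target is written */
        obj[1].handle = batch;            /* batch goes last by ABI */
        obj[1].offset = batch_offset;

        memset(&execbuf, 0, sizeof(execbuf));
        execbuf.buffers_ptr = (uintptr_t)obj;
        execbuf.buffer_count = 2;
        execbuf.flags = I915_EXEC_RENDER | I915_EXEC_NO_RELOC;

        return ioctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
}

If any offset is stale, the kernel simply falls back to processing the relocation entries, so the hint is safe as long as presumed_offset is kept in sync.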
+
 struct i915_execbuffer {
-        struct drm_i915_private *i915;
-        struct drm_file *file;
-        struct drm_i915_gem_execbuffer2 *args;
-        struct drm_i915_gem_exec_object2 *exec;
-        struct intel_engine_cs *engine;
-        struct i915_gem_context *ctx;
-        struct i915_address_space *vm;
-        struct i915_vma *batch;
-        struct drm_i915_gem_request *request;
-        u32 batch_start_offset;
-        u32 batch_len;
-        unsigned int dispatch_flags;
-        struct drm_i915_gem_exec_object2 shadow_exec_entry;
-        bool need_relocs;
-        struct list_head vmas;
+        struct drm_i915_private *i915; /** i915 backpointer */
+        struct drm_file *file; /** per-file lookup tables and limits */
+        struct drm_i915_gem_execbuffer2 *args; /** ioctl parameters */
+        struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
+
+        struct intel_engine_cs *engine; /** engine to queue the request to */
+        struct i915_gem_context *ctx; /** context for building the request */
+        struct i915_address_space *vm; /** GTT and vma for the request */
+
+        struct drm_i915_gem_request *request; /** our request to build */
+        struct i915_vma *batch; /** identity of the batch obj/vma */
+
+        /** actual size of execobj[] as we may extend it for the cmdparser */
+        unsigned int buffer_count;
+
+        /** list of vma not yet bound during reservation phase */
+        struct list_head unbound;
+
+        /** list of vma that have execobj.relocation_count */
+        struct list_head relocs;
+
+        /**
+         * Track the most recently used object for relocations, as we
+         * frequently have to perform multiple relocations within the same
+         * obj/page
+         */
         struct reloc_cache {
-                struct drm_mm_node node;
-                unsigned long vaddr;
-                unsigned int page;
+                struct drm_mm_node node; /** temporary GTT binding */
+                unsigned long vaddr; /** Current kmap address */
+                unsigned long page; /** Currently mapped page index */
                 bool use_64bit_reloc : 1;
+                bool has_llc : 1;
+                bool has_fence : 1;
+                bool needs_unfenced : 1;
         } reloc_cache;
-        int lut_mask;
-        struct hlist_head *buckets;
+
+        u64 invalid_flags; /** Set of execobj.flags that are invalid */
+        u32 context_flags; /** Set of execobj.flags to insert from the ctx */
+
+        u32 batch_start_offset; /** Location within object of batch */
+        u32 batch_len; /** Length of batch within object */
+        u32 batch_flags; /** Flags composed for emit_bb_start() */
+
+        /**
+         * Indicate either the size of the hashtable used to resolve
+         * relocation handles, or if negative that we are using a direct
+         * index into the execobj[].
+         */
+        int lut_size;
+        struct hlist_head *buckets; /** ht for relocation handles */
 };
 
 /*
@@ -87,11 +241,41 @@ struct i915_execbuffer {
 #define __exec_to_vma(ee) (ee)->rsvd2
 #define exec_to_vma(ee) u64_to_ptr(struct i915_vma, __exec_to_vma(ee))
 
+/*
+ * Used to convert any address to canonical form.
+ * Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS,
+ * MI_LOAD_REGISTER_MEM and others, see Broadwell PRM Vol2a) require the
+ * addresses to be in a canonical form:
+ * "GraphicsAddress[63:48] are ignored by the HW and assumed to be in correct
+ * canonical form [63:48] == [47]."
+ */
+#define GEN8_HIGH_ADDRESS_BIT 47
+static inline u64 gen8_canonical_addr(u64 address)
+{
+        return sign_extend64(address, GEN8_HIGH_ADDRESS_BIT);
+}
+
+static inline u64 gen8_noncanonical_addr(u64 address)
+{
+        return address & GENMASK_ULL(GEN8_HIGH_ADDRESS_BIT, 0);
+}
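The two helpers above are the whole canonical-form story: sign-extend bit 47 upwards, or mask it back off. A userspace re-implementation (sign_extend64() and GENMASK_ULL() are kernel macros, so they are open-coded here for illustration) shows the round trip on a sample address:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define HIGH_BIT 47

static uint64_t canonical(uint64_t addr)
{
        /* replicate bit 47 into bits 63:48 via an arithmetic shift */
        return (uint64_t)((int64_t)(addr << (63 - HIGH_BIT)) >> (63 - HIGH_BIT));
}

static uint64_t noncanonical(uint64_t addr)
{
        return addr & ((1ull << (HIGH_BIT + 1)) - 1); /* keep bits 47:0 */
}

int main(void)
{
        uint64_t addr = 0x0000800000001000ull; /* bit 47 set */

        printf("canonical:    %#" PRIx64 "\n", canonical(addr));
        /* -> 0xffff800000001000: bits 63:48 copy bit 47 */
        printf("noncanonical: %#" PRIx64 "\n", noncanonical(canonical(addr)));
        /* -> back to 0x0000800000001000 */
        return 0;
}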
 
 static int eb_create(struct i915_execbuffer *eb)
 {
-        if ((eb->args->flags & I915_EXEC_HANDLE_LUT) == 0) {
-                unsigned int size = 1 + ilog2(eb->args->buffer_count);
+        if (!(eb->args->flags & I915_EXEC_HANDLE_LUT)) {
+                unsigned int size = 1 + ilog2(eb->buffer_count);
 
+                /*
+                 * Without a 1:1 association between relocation handles and
+                 * the execobject[] index, we instead create a hashtable.
+                 * We size it dynamically based on available memory, starting
+                 * first with a 1:1 associative hash and scaling back until
+                 * the allocation succeeds.
+                 *
+                 * Later on we use a positive lut_size to indicate we are
+                 * using this hashtable, and a negative value to indicate a
+                 * direct lookup.
+                 */
                 do {
                         eb->buckets = kzalloc(sizeof(struct hlist_head) << size,
                                               GFP_TEMPORARY |
@@ -108,112 +292,411 @@ static int eb_create(struct i915_execbuffer *eb)
                         return -ENOMEM;
                 }
 
-                eb->lut_mask = size;
+                eb->lut_size = size;
         } else {
-                eb->lut_mask = -eb->args->buffer_count;
+                eb->lut_size = -eb->buffer_count;
         }
 
         return 0;
 }
 
+static bool
+eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry,
+                 const struct i915_vma *vma)
+{
+        if (!(entry->flags & __EXEC_OBJECT_HAS_PIN))
+                return true;
+
+        if (vma->node.size < entry->pad_to_size)
+                return true;
+
+        if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment))
+                return true;
+
+        if (entry->flags & EXEC_OBJECT_PINNED &&
+            vma->node.start != entry->offset)
+                return true;
+
+        if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS &&
+            vma->node.start < BATCH_OFFSET_BIAS)
+                return true;
+
+        if (!(entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) &&
+            (vma->node.start + vma->node.size - 1) >> 32)
+                return true;
+
+        return false;
+}
+
+static inline void
+eb_pin_vma(struct i915_execbuffer *eb,
+           struct drm_i915_gem_exec_object2 *entry,
+           struct i915_vma *vma)
+{
+        u64 flags;
+
+        flags = vma->node.start;
+        flags |= PIN_USER | PIN_NONBLOCK | PIN_OFFSET_FIXED;
+        if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_GTT))
+                flags |= PIN_GLOBAL;
+        if (unlikely(i915_vma_pin(vma, 0, 0, flags)))
+                return;
+
+        if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
+                if (unlikely(i915_vma_get_fence(vma))) {
+                        i915_vma_unpin(vma);
+                        return;
+                }
+
+                if (i915_vma_pin_fence(vma))
+                        entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+        }
+
+        entry->flags |= __EXEC_OBJECT_HAS_PIN;
+}
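eb_create() sizes its hashtable by starting at 1 + ilog2(buffer_count) bits and halving until the allocation succeeds. The same scale-back loop, sketched with calloc() standing in for kzalloc(GFP_TEMPORARY | __GFP_NOWARN) and with invented names throughout:

#include <stdlib.h>

struct hlist_head { void *first; };

/* Returns the number of hash bits used (positive, mirroring lut_size),
 * or -1 if even a 2-bucket table could not be allocated. */
static int alloc_buckets(struct hlist_head **buckets, unsigned int count)
{
        unsigned int size = 1;

        while (count >>= 1)     /* 1 + ilog2(count), open-coded */
                size++;

        do {
                *buckets = calloc(1u << size, sizeof(struct hlist_head));
                if (*buckets)
                        return size;    /* positive: hashtable in use */
        } while (--size);

        return -1;
}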
 
 static inline void
 __eb_unreserve_vma(struct i915_vma *vma,
                    const struct drm_i915_gem_exec_object2 *entry)
 {
+        GEM_BUG_ON(!(entry->flags & __EXEC_OBJECT_HAS_PIN));
+
         if (unlikely(entry->flags & __EXEC_OBJECT_HAS_FENCE))
                 i915_vma_unpin_fence(vma);
 
-        if (entry->flags & __EXEC_OBJECT_HAS_PIN)
-                __i915_vma_unpin(vma);
+        __i915_vma_unpin(vma);
 }
 
-static void
-eb_unreserve_vma(struct i915_vma *vma)
+static inline void
+eb_unreserve_vma(struct i915_vma *vma,
+                 struct drm_i915_gem_exec_object2 *entry)
 {
-        struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+        if (!(entry->flags & __EXEC_OBJECT_HAS_PIN))
+                return;
 
         __eb_unreserve_vma(vma, entry);
-        entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
+        entry->flags &= ~__EXEC_OBJECT_RESERVED;
 }
 
-static void
-eb_reset(struct i915_execbuffer *eb)
+static int
+eb_validate_vma(struct i915_execbuffer *eb,
+                struct drm_i915_gem_exec_object2 *entry,
+                struct i915_vma *vma)
 {
-        struct i915_vma *vma;
+        if (unlikely(entry->flags & eb->invalid_flags))
+                return -EINVAL;
 
-        list_for_each_entry(vma, &eb->vmas, exec_link) {
-                eb_unreserve_vma(vma);
-                i915_vma_put(vma);
-                vma->exec_entry = NULL;
+        if (unlikely(entry->alignment && !is_power_of_2(entry->alignment)))
+                return -EINVAL;
+
+        /*
+         * Offset can be used as input (EXEC_OBJECT_PINNED), reject
+         * any non-page-aligned or non-canonical addresses.
+         */
+        if (unlikely(entry->flags & EXEC_OBJECT_PINNED &&
+                     entry->offset != gen8_canonical_addr(entry->offset & PAGE_MASK)))
+                return -EINVAL;
+
+        /* pad_to_size was once a reserved field, so sanitize it */
+        if (entry->flags & EXEC_OBJECT_PAD_TO_SIZE) {
+                if (unlikely(offset_in_page(entry->pad_to_size)))
+                        return -EINVAL;
+        } else {
+                entry->pad_to_size = 0;
         }
 
-        if (eb->lut_mask >= 0)
-                memset(eb->buckets, 0,
-                       sizeof(struct hlist_head) << eb->lut_mask);
+        if (unlikely(vma->exec_entry)) {
+                DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
+                          entry->handle, (int)(entry - eb->exec));
+                return -EINVAL;
+        }
+
+        /*
+         * From drm_mm perspective address space is continuous,
+         * so from this point we're always using non-canonical
+         * form internally.
+         */
+        entry->offset = gen8_noncanonical_addr(entry->offset);
+
+        return 0;
 }
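The EXEC_OBJECT_PINNED test in eb_validate_vma() folds two requirements into one comparison: masking the offset to a page boundary and re-canonicalising it must be a no-op, i.e. the offset was page-aligned and canonical to begin with. A small standalone demonstration, reusing the canonical() helper from the earlier sketch and assuming 4KiB pages:

#include <stdint.h>
#include <stdio.h>

#define PAGE_MASK (~0xfffull)   /* assumes 4KiB pages */

static uint64_t canonical(uint64_t a)
{
        return (uint64_t)((int64_t)(a << 16) >> 16);
}

static int pinned_offset_ok(uint64_t offset)
{
        return offset == canonical(offset & PAGE_MASK);
}

int main(void)
{
        printf("%d\n", pinned_offset_ok(0x0000800000001000ull)); /* 0: not canonical */
        printf("%d\n", pinned_offset_ok(0xffff800000001000ull)); /* 1: canonical + aligned */
        printf("%d\n", pinned_offset_ok(0x0000000000001234ull)); /* 0: not page aligned */
        return 0;
}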
 
-static bool
-eb_add_vma(struct i915_execbuffer *eb, struct i915_vma *vma, int i)
+static int
+eb_add_vma(struct i915_execbuffer *eb,
+           struct drm_i915_gem_exec_object2 *entry,
+           struct i915_vma *vma)
 {
-        if (unlikely(vma->exec_entry)) {
-                DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
-                          eb->exec[i].handle, i);
-                return false;
+        int err;
+
+        GEM_BUG_ON(i915_vma_is_closed(vma));
+
+        if (!(eb->args->flags & __EXEC_VALIDATED)) {
+                err = eb_validate_vma(eb, entry, vma);
+                if (unlikely(err))
+                        return err;
         }
 
-        list_add_tail(&vma->exec_link, &eb->vmas);
-        vma->exec_entry = &eb->exec[i];
-        if (eb->lut_mask >= 0) {
-                vma->exec_handle = eb->exec[i].handle;
+        if (eb->lut_size >= 0) {
+                vma->exec_handle = entry->handle;
                 hlist_add_head(&vma->exec_node,
-                               &eb->buckets[hash_32(vma->exec_handle,
-                                                    eb->lut_mask)]);
+                               &eb->buckets[hash_32(entry->handle,
+                                                    eb->lut_size)]);
         }
 
-        i915_vma_get(vma);
-        __exec_to_vma(&eb->exec[i]) = (uintptr_t)vma;
-        return true;
+        if (entry->relocation_count)
+                list_add_tail(&vma->reloc_link, &eb->relocs);
+
+        if (!eb->reloc_cache.has_fence) {
+                entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
+        } else {
+                if ((entry->flags & EXEC_OBJECT_NEEDS_FENCE ||
+                     eb->reloc_cache.needs_unfenced) &&
+                    i915_gem_object_is_tiled(vma->obj))
+                        entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP;
+        }
+
+        if (!(entry->flags & EXEC_OBJECT_PINNED))
+                entry->flags |= eb->context_flags;
+
+        /*
+         * Stash a pointer from the vma to execobj, so we can query its flags,
+         * size, alignment etc as provided by the user. Also we stash a pointer
+         * to the vma inside the execobj so that we can use a direct lookup
+         * to find the right target VMA when doing relocations.
+         */
+        vma->exec_entry = entry;
+        __exec_to_vma(entry) = (uintptr_t)i915_vma_get(vma);
+
+        err = 0;
+        if (vma->node.size)
+                eb_pin_vma(eb, entry, vma);
+        if (eb_vma_misplaced(entry, vma)) {
+                eb_unreserve_vma(vma, entry);
+
+                list_add_tail(&vma->exec_link, &eb->unbound);
+                if (drm_mm_node_allocated(&vma->node))
+                        err = i915_vma_unbind(vma);
+        } else {
+                if (entry->offset != vma->node.start) {
+                        entry->offset = vma->node.start | UPDATE;
+                        eb->args->flags |= __EXEC_HAS_RELOC;
+                }
+        }
+        return err;
+}
+
+static inline int use_cpu_reloc(const struct reloc_cache *cache,
+                                const struct drm_i915_gem_object *obj)
+{
+        if (!i915_gem_object_has_struct_page(obj))
+                return false;
+
+        if (DBG_USE_CPU_RELOC)
+                return DBG_USE_CPU_RELOC > 0;
+
+        return (cache->has_llc ||
+                obj->cache_dirty ||
+                obj->cache_level != I915_CACHE_NONE);
+}
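use_cpu_reloc() amounts to: objects without struct pages (e.g. stolen memory) must be patched through the GTT, and otherwise the CPU path is preferred whenever writes through the CPU cache are coherent (LLC), the cache is already dirty, or the object is snooped. An illustrative restatement with invented types, where 'snooped' stands in for cache_level != I915_CACHE_NONE:

#include <stdbool.h>
#include <stdio.h>

struct fake_cache { bool has_llc; };
struct fake_obj { bool has_struct_page, cache_dirty, snooped; };

static bool cpu_reloc(const struct fake_cache *c, const struct fake_obj *o)
{
        if (!o->has_struct_page)        /* e.g. stolen memory: GTT only */
                return false;
        /* CPU writes are cheap when coherent or already dirty */
        return c->has_llc || o->cache_dirty || o->snooped;
}

int main(void)
{
        struct fake_cache llc = { .has_llc = true };
        struct fake_obj uncached = { .has_struct_page = true };

        printf("%d\n", cpu_reloc(&llc, &uncached)); /* 1 on LLC parts */
        return 0;
}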
 
+static int eb_reserve_vma(const struct i915_execbuffer *eb,
+                          struct i915_vma *vma)
+{
+        struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+        u64 flags;
+        int err;
+
+        flags = PIN_USER | PIN_NONBLOCK;
+        if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
+                flags |= PIN_GLOBAL;
+
+        /*
+         * Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
+         * limit address to the first 4GBs for unflagged objects.
+         */
+        if (!(entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS))
+                flags |= PIN_ZONE_4G;
+
+        if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
+                flags |= PIN_MAPPABLE;
+
+        if (entry->flags & EXEC_OBJECT_PINNED) {
+                flags |= entry->offset | PIN_OFFSET_FIXED;
+                flags &= ~PIN_NONBLOCK; /* force overlapping PINNED checks */
+        } else if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS) {
+                flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
+        }
+
+        err = i915_vma_pin(vma, entry->pad_to_size, entry->alignment, flags);
+        if (err)
+                return err;
+
+        if (entry->offset != vma->node.start) {
+                entry->offset = vma->node.start | UPDATE;
+                eb->args->flags |= __EXEC_HAS_RELOC;
+        }
+
+        entry->flags |= __EXEC_OBJECT_HAS_PIN;
+        GEM_BUG_ON(eb_vma_misplaced(entry, vma));
+
+        if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
+                err = i915_vma_get_fence(vma);
+                if (unlikely(err)) {
+                        i915_vma_unpin(vma);
+                        return err;
+                }
+
+                if (i915_vma_pin_fence(vma))
+                        entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+        }
+
+        return 0;
+}
+
+static int eb_reserve(struct i915_execbuffer *eb)
+{
+        const unsigned int count = eb->buffer_count;
+        struct list_head last;
+        struct i915_vma *vma;
+        unsigned int i, pass;
+        int err;
+
+        /*
+         * Attempt to pin all of the buffers into the GTT.
+         * This is done in 3 phases:
+         *
+         * 1a. Unbind all objects that do not match the GTT constraints for
+         *     the execbuffer (fenceable, mappable, alignment etc).
+         * 1b. Increment pin count for already bound objects.
+         * 2.  Bind new objects.
+         * 3.  Decrement pin count.
+         *
+         * This avoids unnecessary unbinding of later objects in order to make
+         * room for the earlier objects *unless* we need to defragment.
+         */
+
+        pass = 0;
+        err = 0;
+        do {
+                list_for_each_entry(vma, &eb->unbound, exec_link) {
+                        err = eb_reserve_vma(eb, vma);
+                        if (err)
+                                break;
+                }
+                if (err != -ENOSPC)
+                        return err;
+
+                /* Resort *all* the objects into priority order */
+                INIT_LIST_HEAD(&eb->unbound);
+                INIT_LIST_HEAD(&last);
+                for (i = 0; i < count; i++) {
+                        struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+
+                        if (entry->flags & EXEC_OBJECT_PINNED &&
+                            entry->flags & __EXEC_OBJECT_HAS_PIN)
+                                continue;
+
+                        vma = exec_to_vma(entry);
+                        eb_unreserve_vma(vma, entry);
+
+                        if (entry->flags & EXEC_OBJECT_PINNED)
+                                list_add(&vma->exec_link, &eb->unbound);
+                        else if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
+                                list_add_tail(&vma->exec_link, &eb->unbound);
+                        else
+                                list_add_tail(&vma->exec_link, &last);
+                }
+                list_splice_tail(&last, &eb->unbound);
+
+                switch (pass++) {
+                case 0:
+                        break;
+
+                case 1:
+                        /* Too fragmented, unbind everything and retry */
+                        err = i915_gem_evict_vm(eb->vm);
+                        if (err)
+                                return err;
+                        break;
+
+                default:
+                        return -ENOSPC;
+                }
+        } while (1);
+}
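eb_reserve()'s escalation ladder is worth restating: try everything in place, then resort hardest-first and retry, then evict the whole VM and retry, then give up. A toy, self-contained version of just that control flow (the three helpers are stand-ins for the driver logic, rigged here so the demo succeeds only after the eviction pass):

#include <errno.h>
#include <stdio.h>

static int attempts;

static int try_place_all(void)      { return ++attempts < 3 ? -ENOSPC : 0; }
static void sort_hardest_first(void) { /* pinned, then fence/map, then rest */ }
static int evict_everything(void)   { return 0; }

static int reserve(void)
{
        int pass, err;

        for (pass = 0;; pass++) {
                err = try_place_all();
                if (err != -ENOSPC)
                        return err;     /* success, or a real error */

                sort_hardest_first();

                if (pass == 1 && (err = evict_everything()))
                        return err;
                if (pass >= 2)
                        return -ENOSPC; /* it simply will not fit */
        }
}

int main(void)
{
        printf("reserve() = %d after %d attempts\n", reserve(), attempts);
        return 0;
}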
 
 static inline struct hlist_head *
-ht_head(const struct i915_gem_context *ctx, u32 handle)
+ht_head(const struct i915_gem_context_vma_lut *lut, u32 handle)
 {
-        return &ctx->vma_lut.ht[hash_32(handle, ctx->vma_lut.ht_bits)];
+        return &lut->ht[hash_32(handle, lut->ht_bits)];
 }
 
 static inline bool
-ht_needs_resize(const struct i915_gem_context *ctx)
+ht_needs_resize(const struct i915_gem_context_vma_lut *lut)
 {
-        return (4*ctx->vma_lut.ht_count > 3*ctx->vma_lut.ht_size ||
-                4*ctx->vma_lut.ht_count + 1 < ctx->vma_lut.ht_size);
+        return (4*lut->ht_count > 3*lut->ht_size ||
+                4*lut->ht_count + 1 < lut->ht_size);
 }
 
-static int
-eb_lookup_vmas(struct i915_execbuffer *eb)
+static unsigned int eb_batch_index(const struct i915_execbuffer *eb)
+{
+        return eb->buffer_count - 1;
+}
+
+static int eb_select_context(struct i915_execbuffer *eb)
+{
+        struct i915_gem_context *ctx;
+
+        ctx = i915_gem_context_lookup(eb->file->driver_priv, eb->args->rsvd1);
+        if (unlikely(IS_ERR(ctx)))
+                return PTR_ERR(ctx);
+
+        if (unlikely(i915_gem_context_is_banned(ctx))) {
+                DRM_DEBUG("Context %u tried to submit while banned\n",
+                          ctx->user_handle);
+                return -EIO;
+        }
+
+        eb->ctx = i915_gem_context_get(ctx);
+        eb->vm = ctx->ppgtt ? &ctx->ppgtt->base : &eb->i915->ggtt.base;
+
+        eb->context_flags = 0;
+        if (ctx->flags & CONTEXT_NO_ZEROMAP)
+                eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS;
+
+        return 0;
+}
+
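ht_needs_resize() keeps the table's load factor between roughly 1/4 and 3/4 using only integer arithmetic (the "+ 1" keeps a table with count exactly size/4 from oscillating). A worked check of the boundaries:

#include <stdbool.h>
#include <stdio.h>

/* grow when count/size > 3/4, shrink when count/size < 1/4 */
static bool needs_resize(unsigned int count, unsigned int size)
{
        return 4 * count > 3 * size || 4 * count + 1 < size;
}

int main(void)
{
        printf("%d\n", needs_resize(97, 128)); /* 1: 388 > 384, too full */
        printf("%d\n", needs_resize(96, 128)); /* 0: 384 > 384 is false */
        printf("%d\n", needs_resize(31, 128)); /* 1: 125 < 128, too empty */
        printf("%d\n", needs_resize(32, 128)); /* 0: 129 < 128 is false */
        return 0;
}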
+static int eb_lookup_vmas(struct i915_execbuffer *eb)
 {
 #define INTERMEDIATE BIT(0)
-        const int count = eb->args->buffer_count;
+        const unsigned int count = eb->buffer_count;
+        struct i915_gem_context_vma_lut *lut = &eb->ctx->vma_lut;
         struct i915_vma *vma;
+        struct idr *idr;
+        unsigned int i;
         int slow_pass = -1;
-        int i;
+        int err;
 
-        INIT_LIST_HEAD(&eb->vmas);
+        INIT_LIST_HEAD(&eb->relocs);
+        INIT_LIST_HEAD(&eb->unbound);
 
-        if (unlikely(eb->ctx->vma_lut.ht_size & I915_CTX_RESIZE_IN_PROGRESS))
-                flush_work(&eb->ctx->vma_lut.resize);
-        GEM_BUG_ON(eb->ctx->vma_lut.ht_size & I915_CTX_RESIZE_IN_PROGRESS);
+        if (unlikely(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS))
+                flush_work(&lut->resize);
+        GEM_BUG_ON(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS);
 
         for (i = 0; i < count; i++) {
                 __exec_to_vma(&eb->exec[i]) = 0;
 
                 hlist_for_each_entry(vma,
-                                     ht_head(eb->ctx, eb->exec[i].handle),
+                                     ht_head(lut, eb->exec[i].handle),
                                      ctx_node) {
                         if (vma->ctx_handle != eb->exec[i].handle)
                                 continue;
 
-                        if (!eb_add_vma(eb, vma, i))
-                                return -EINVAL;
+                        err = eb_add_vma(eb, &eb->exec[i], vma);
+                        if (unlikely(err))
+                                return err;
 
                         goto next_vma;
                 }
@@ -224,24 +707,27 @@ next_vma: ;
         }
 
         if (slow_pass < 0)
-                return 0;
+                goto out;
 
         spin_lock(&eb->file->table_lock);
-        /* Grab a reference to the object and release the lock so we can lookup
-         * or create the VMA without using GFP_ATOMIC */
+        /*
+         * Grab a reference to the object and release the lock so we can lookup
+         * or create the VMA without using GFP_ATOMIC
+         */
+        idr = &eb->file->object_idr;
         for (i = slow_pass; i < count; i++) {
                 struct drm_i915_gem_object *obj;
 
                 if (__exec_to_vma(&eb->exec[i]))
                         continue;
 
-                obj = to_intel_bo(idr_find(&eb->file->object_idr,
-                                           eb->exec[i].handle));
+                obj = to_intel_bo(idr_find(idr, eb->exec[i].handle));
                 if (unlikely(!obj)) {
                         spin_unlock(&eb->file->table_lock);
                         DRM_DEBUG("Invalid object handle %d at index %d\n",
                                   eb->exec[i].handle, i);
-                        return -ENOENT;
+                        err = -ENOENT;
+                        goto err;
                 }
 
                 __exec_to_vma(&eb->exec[i]) = INTERMEDIATE | (uintptr_t)obj;
@@ -251,7 +737,7 @@ next_vma: ;
         for (i = slow_pass; i < count; i++) {
                 struct drm_i915_gem_object *obj;
 
-                if ((__exec_to_vma(&eb->exec[i]) & INTERMEDIATE) == 0)
+                if (!(__exec_to_vma(&eb->exec[i]) & INTERMEDIATE))
                        continue;
 
                 /*
@@ -262,12 +748,13 @@ next_vma: ;
                 * from the (obj, vm) we don't run the risk of creating
                 * duplicated vmas for the same vm.
                 */
-                obj = u64_to_ptr(struct drm_i915_gem_object,
+                obj = u64_to_ptr(typeof(*obj),
                                  __exec_to_vma(&eb->exec[i]) & ~INTERMEDIATE);
                 vma = i915_vma_instance(obj, eb->vm, NULL);
                 if (unlikely(IS_ERR(vma))) {
                         DRM_DEBUG("Failed to lookup VMA\n");
-                        return PTR_ERR(vma);
+                        err = PTR_ERR(vma);
+                        goto err;
                 }
 
                 /* First come, first served */
@@ -275,32 +762,31 @@ next_vma: ;
                         vma->ctx = eb->ctx;
                         vma->ctx_handle = eb->exec[i].handle;
                         hlist_add_head(&vma->ctx_node,
-                                       ht_head(eb->ctx, eb->exec[i].handle));
-                        eb->ctx->vma_lut.ht_count++;
+                                       ht_head(lut, eb->exec[i].handle));
+                        lut->ht_count++;
+                        lut->ht_size |= I915_CTX_RESIZE_IN_PROGRESS;
                         if (i915_vma_is_ggtt(vma)) {
                                 GEM_BUG_ON(obj->vma_hashed);
                                 obj->vma_hashed = vma;
                         }
                 }
 
-                if (!eb_add_vma(eb, vma, i))
-                        return -EINVAL;
+                err = eb_add_vma(eb, &eb->exec[i], vma);
+                if (unlikely(err))
+                        goto err;
         }
 
-        if (ht_needs_resize(eb->ctx)) {
-                eb->ctx->vma_lut.ht_size |= I915_CTX_RESIZE_IN_PROGRESS;
-                queue_work(system_highpri_wq, &eb->ctx->vma_lut.resize);
+        if (lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS) {
+                if (ht_needs_resize(lut))
+                        queue_work(system_highpri_wq, &lut->resize);
+                else
+                        lut->ht_size &= ~I915_CTX_RESIZE_IN_PROGRESS;
         }
 
-        return 0;
-#undef INTERMEDIATE
-}
-
-static struct i915_vma *
-eb_get_batch(struct i915_execbuffer *eb)
-{
-        struct i915_vma *vma =
-                exec_to_vma(&eb->exec[eb->args->buffer_count - 1]);
+out:
+        /* take note of the batch buffer before we might reorder the lists */
+        i = eb_batch_index(eb);
+        eb->batch = exec_to_vma(&eb->exec[i]);
 
         /*
         * SNA is doing fancy tricks with compressing batch buffers, which leads
@@ -311,24 +797,36 @@ eb_get_batch(struct i915_execbuffer *eb)
         * Note that actual hangs have only been observed on gen7, but for
         * paranoia do it everywhere.
