From 00208852d351ca6e4a8b9ff0c5376fa3a8ed8eaa Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 17 Oct 2022 16:01:22 -0700 Subject: iommu: Add return value rules to attach_dev op and APIs Cases like VFIO wish to attach a device to an existing domain that was not allocated specifically from the device. This raises a condition where the IOMMU driver can fail the domain attach because the domain and device are incompatible with each other. This is a soft failure that can be resolved by using a different domain. Provide a dedicated errno EINVAL from the IOMMU driver during attach that the reason why the attach failed is because of domain incompatibility. VFIO can use this to know that the attach is a soft failure and it should continue searching. Otherwise, the attach will be a hard failure and VFIO will return the code to userspace. Update kdocs to add rules of return value to the attach_dev op and APIs. Link: https://lore.kernel.org/r/bd56d93c18621104a0fa1b0de31e9b760b81b769.1666042872.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3c9da1f8979e..857898d102b3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -266,6 +266,18 @@ struct iommu_ops { /** * struct iommu_domain_ops - domain specific operations * @attach_dev: attach an iommu domain to a device + * Return: + * * 0 - success + * * EINVAL - can indicate that device and domain are incompatible due to + * some previous configuration of the domain, in which case the + * driver shouldn't log an error, since it is legitimate for a + * caller to test reuse of existing domains. Otherwise, it may + * still represent some other fundamental problem + * * ENOMEM - out of memory + * * ENOSPC - non-ENOMEM type of resource allocation failures + * * EBUSY - device is attached to a domain and cannot be changed + * * ENODEV - device specific errors, not able to be attached + * * - treated as ENODEV by the caller. Use is discouraged * @detach_dev: detach an iommu domain from a device * @map: map a physically contiguous memory region to an iommu domain * @map_pages: map a physically contiguous set of pages of the same size to -- cgit v1.2.3 From 1adf3cc20d693569ebee90fd91fa34b0570fcd6f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:05 +0800 Subject: iommu: Add max_pasids field in struct iommu_device Use this field to keep the number of supported PASIDs that an IOMMU hardware is able to support. This is a generic attribute of an IOMMU and lifting it into the per-IOMMU device structure makes it possible to allocate a PASID for device without calls into the IOMMU drivers. Any iommu driver that supports PASID related features should set this field before enabling them on the devices. In the Intel IOMMU driver, intel_iommu_sm is moved to CONFIG_INTEL_IOMMU enclave so that the pasid_supported() helper could be used in dmar.c without compilation errors. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3c9da1f8979e..e3af4f46e6e0 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -322,12 +322,14 @@ struct iommu_domain_ops { * @list: Used by the iommu-core to keep a list of registered iommus * @ops: iommu-ops for talking to this iommu * @dev: struct device for sysfs handling + * @max_pasids: number of supported PASIDs */ struct iommu_device { struct list_head list; const struct iommu_ops *ops; struct fwnode_handle *fwnode; struct device *dev; + u32 max_pasids; }; /** -- cgit v1.2.3 From 22d2c7afb3697a68c7fc05c935ef662dee06dc60 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:06 +0800 Subject: iommu: Add max_pasids field in struct dev_iommu Use this field to save the number of PASIDs that a device is able to consume. It is a generic attribute of a device and lifting it into the per-device dev_iommu struct could help to avoid the boilerplate code in various IOMMU drivers. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e3af4f46e6e0..ac3f6c6dcc6d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -368,6 +368,7 @@ struct iommu_fault_param { * @fwspec: IOMMU fwspec data * @iommu_dev: IOMMU device this device is linked to * @priv: IOMMU Driver private data + * @max_pasids: number of PASIDs this device can consume * * TODO: migrate other per device data pointers under iommu_dev_data, e.g. * struct iommu_group *iommu_group; @@ -379,6 +380,7 @@ struct dev_iommu { struct iommu_fwspec *fwspec; struct iommu_device *iommu_dev; void *priv; + u32 max_pasids; }; int iommu_device_register(struct iommu_device *iommu, -- cgit v1.2.3 From 942fd5435dccb273f90176b046ae6bbba60cfbd8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:07 +0800 Subject: iommu: Remove SVM_FLAG_SUPERVISOR_MODE support The current kernel DMA with PASID support is based on the SVA with a flag SVM_FLAG_SUPERVISOR_MODE. The IOMMU driver binds the kernel memory address space to a PASID of the device. The device driver programs the device with kernel virtual address (KVA) for DMA access. There have been security and functional issues with this approach: - The lack of IOTLB synchronization upon kernel page table updates. (vmalloc, module/BPF loading, CONFIG_DEBUG_PAGEALLOC etc.) - Other than slight more protection, using kernel virtual address (KVA) has little advantage over physical address. There are also no use cases yet where DMA engines need kernel virtual addresses for in-kernel DMA. This removes SVM_FLAG_SUPERVISOR_MODE support from the IOMMU interface. The device drivers are suggested to handle kernel DMA with PASID through the kernel DMA APIs. The drvdata parameter in iommu_sva_bind_device() and all callbacks is not needed anymore. Cleanup them as well. Link: https://lore.kernel.org/linux-iommu/20210511194726.GP1002214@nvidia.com/ Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Fenghua Yu Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-svm.h | 13 ------------- include/linux/iommu.h | 8 +++----- 2 files changed, 3 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 207ef06ba3e1..f9a0d44f6fdb 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -13,17 +13,4 @@ #define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) #define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5) -/* - * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only - * for access to kernel addresses. No IOTLB flushes are automatically done - * for kernel mappings; it is valid only for access to the kernel's static - * 1:1 mapping of physical memory — not to vmalloc or even module mappings. - * A future API addition may permit the use of such ranges, by means of an - * explicit IOTLB flush call (akin to the DMA API's unmap method). - * - * It is unlikely that we will ever hook into flush_tlb_kernel_range() to - * do such IOTLB flushes automatically. - */ -#define SVM_FLAG_SUPERVISOR_MODE BIT(0) - #endif /* __INTEL_SVM_H__ */ diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ac3f6c6dcc6d..72bb0531aa76 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -247,8 +247,7 @@ struct iommu_ops { int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f); int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); - struct iommu_sva *(*sva_bind)(struct device *dev, struct mm_struct *mm, - void *drvdata); + struct iommu_sva *(*sva_bind)(struct device *dev, struct mm_struct *mm); void (*sva_unbind)(struct iommu_sva *handle); u32 (*sva_get_pasid)(struct iommu_sva *handle); @@ -668,8 +667,7 @@ int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f); int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f); struct iommu_sva *iommu_sva_bind_device(struct device *dev, - struct mm_struct *mm, - void *drvdata); + struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); @@ -1000,7 +998,7 @@ iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) } static inline struct iommu_sva * -iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata) +iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) { return NULL; } -- cgit v1.2.3 From 16603704559c7a68718059c4f75287886c01b20f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:09 +0800 Subject: iommu: Add attach/detach_dev_pasid iommu interfaces Attaching an IOMMU domain to a PASID of a device is a generic operation for modern IOMMU drivers which support PASID-granular DMA address translation. Currently visible usage scenarios include (but not limited): - SVA (Shared Virtual Address) - kernel DMA with PASID - hardware-assist mediated device This adds the set_dev_pasid domain ops for setting the domain onto a PASID of a device and remove_dev_pasid iommu ops for removing any setup on a PASID of device. This also adds interfaces for device drivers to attach/detach/retrieve a domain for a PASID of a device. If multiple devices share a single group, it's fine as long the fabric always routes every TLP marked with a PASID to the host bridge and only the host bridge. For example, ACS achieves this universally and has been checked when pci_enable_pasid() is called. As we can't reliably tell the source apart in a group, all the devices in a group have to be considered as the same source, and mapped to the same PASID table. The DMA ownership is about the whole device (more precisely, iommu group), including the RID and PASIDs. When the ownership is converted, the pasid array must be empty. This also adds necessary checks in the DMA ownership interfaces. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 72bb0531aa76..5d2b78ac5416 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -223,6 +223,9 @@ struct iommu_iotlb_gather { * - IOMMU_DOMAIN_DMA: must use a dma domain * - 0: use the default setting * @default_domain_ops: the default ops for domains + * @remove_dev_pasid: Remove any translation configurations of a specific + * pasid, so that any DMA transactions with this pasid + * will be blocked by the hardware. * @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops */ @@ -256,6 +259,7 @@ struct iommu_ops { struct iommu_page_response *msg); int (*def_domain_type)(struct device *dev); + void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid); const struct iommu_domain_ops *default_domain_ops; unsigned long pgsize_bitmap; @@ -266,6 +270,7 @@ struct iommu_ops { * struct iommu_domain_ops - domain specific operations * @attach_dev: attach an iommu domain to a device * @detach_dev: detach an iommu domain from a device + * @set_dev_pasid: set an iommu domain to a pasid of device * @map: map a physically contiguous memory region to an iommu domain * @map_pages: map a physically contiguous set of pages of the same size to * an iommu domain. @@ -286,6 +291,8 @@ struct iommu_ops { struct iommu_domain_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev); void (*detach_dev)(struct iommu_domain *domain, struct device *dev); + int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid); int (*map)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); @@ -678,6 +685,13 @@ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner); void iommu_group_release_dma_owner(struct iommu_group *group); bool iommu_group_dma_owner_claimed(struct iommu_group *group); +int iommu_attach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); +void iommu_detach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); +struct iommu_domain * +iommu_get_domain_for_dev_pasid(struct device *dev, ioasid_t pasid, + unsigned int type); #else /* CONFIG_IOMMU_API */ struct iommu_ops {}; @@ -1040,6 +1054,24 @@ static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group) { return false; } + +static inline int iommu_attach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + return -ENODEV; +} + +static inline void iommu_detach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ +} + +static inline struct iommu_domain * +iommu_get_domain_for_dev_pasid(struct device *dev, ioasid_t pasid, + unsigned int type) +{ + return NULL; +} #endif /* CONFIG_IOMMU_API */ /** -- cgit v1.2.3 From 136467962e49931dbc6240aea8197fab7e407ba4 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:10 +0800 Subject: iommu: Add IOMMU SVA domain support The SVA iommu_domain represents a hardware pagetable that the IOMMU hardware could use for SVA translation. This adds some infrastructures to support SVA domain in the iommu core. It includes: - Extend the iommu_domain to support a new IOMMU_DOMAIN_SVA domain type. The IOMMU drivers that support allocation of the SVA domain should provide its own SVA domain specific iommu_domain_ops. - Add a helper to allocate an SVA domain. The iommu_domain_free() is still used to free an SVA domain. The report_iommu_fault() should be replaced by the new iommu_report_device_fault(). Leave the existing fault handler with the existing users and the newly added SVA members excludes it. Suggested-by: Jean-Philippe Brucker Suggested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5d2b78ac5416..776baa375967 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -64,6 +64,8 @@ struct iommu_domain_geometry { #define __IOMMU_DOMAIN_PT (1U << 2) /* Domain is identity mapped */ #define __IOMMU_DOMAIN_DMA_FQ (1U << 3) /* DMA-API uses flush queue */ +#define __IOMMU_DOMAIN_SVA (1U << 4) /* Shared process address space */ + /* * This are the possible domain-types * @@ -77,6 +79,8 @@ struct iommu_domain_geometry { * certain optimizations for these domains * IOMMU_DOMAIN_DMA_FQ - As above, but definitely using batched TLB * invalidation. + * IOMMU_DOMAIN_SVA - DMA addresses are shared process addresses + * represented by mm_struct's. */ #define IOMMU_DOMAIN_BLOCKED (0U) #define IOMMU_DOMAIN_IDENTITY (__IOMMU_DOMAIN_PT) @@ -86,15 +90,24 @@ struct iommu_domain_geometry { #define IOMMU_DOMAIN_DMA_FQ (__IOMMU_DOMAIN_PAGING | \ __IOMMU_DOMAIN_DMA_API | \ __IOMMU_DOMAIN_DMA_FQ) +#define IOMMU_DOMAIN_SVA (__IOMMU_DOMAIN_SVA) struct iommu_domain { unsigned type; const struct iommu_domain_ops *ops; unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ - iommu_fault_handler_t handler; - void *handler_token; struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; + union { + struct { + iommu_fault_handler_t handler; + void *handler_token; + }; + struct { /* IOMMU_DOMAIN_SVA */ + struct mm_struct *mm; + int users; + }; + }; }; static inline bool iommu_is_dma_domain(struct iommu_domain *domain) @@ -685,6 +698,8 @@ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner); void iommu_group_release_dma_owner(struct iommu_group *group); bool iommu_group_dma_owner_claimed(struct iommu_group *group); +struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm); int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); void iommu_detach_device_pasid(struct iommu_domain *domain, @@ -1055,6 +1070,12 @@ static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group) return false; } +static inline struct iommu_domain * +iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) +{ + return NULL; +} + static inline int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { -- cgit v1.2.3 From be51b1d6bbff48c7d1943a8ff1e5a55777807f6e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:13 +0800 Subject: iommu/sva: Refactoring iommu_sva_bind/unbind_device() The existing iommu SVA interfaces are implemented by calling the SVA specific iommu ops provided by the IOMMU drivers. There's no need for any SVA specific ops in iommu_ops vector anymore as we can achieve this through the generic attach/detach_dev_pasid domain ops. This refactors the IOMMU SVA interfaces implementation by using the iommu_attach/detach_device_pasid interfaces and align them with the concept of the SVA iommu domain. Put the new SVA code in the SVA related file in order to make it self-contained. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 776baa375967..bee5659d07eb 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -645,6 +645,7 @@ struct iommu_fwspec { */ struct iommu_sva { struct device *dev; + struct iommu_domain *domain; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, @@ -686,11 +687,6 @@ void iommu_release_device(struct device *dev); int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f); int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f); -struct iommu_sva *iommu_sva_bind_device(struct device *dev, - struct mm_struct *mm); -void iommu_sva_unbind_device(struct iommu_sva *handle); -u32 iommu_sva_get_pasid(struct iommu_sva *handle); - int iommu_device_use_default_domain(struct device *dev); void iommu_device_unuse_default_domain(struct device *dev); @@ -1026,21 +1022,6 @@ iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) return -ENODEV; } -static inline struct iommu_sva * -iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) -{ - return NULL; -} - -static inline void iommu_sva_unbind_device(struct iommu_sva *handle) -{ -} - -static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) -{ - return IOMMU_PASID_INVALID; -} - static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) { return NULL; @@ -1154,4 +1135,26 @@ static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_m #endif /* CONFIG_IOMMU_DMA */ +#ifdef CONFIG_IOMMU_SVA +struct iommu_sva *iommu_sva_bind_device(struct device *dev, + struct mm_struct *mm); +void iommu_sva_unbind_device(struct iommu_sva *handle); +u32 iommu_sva_get_pasid(struct iommu_sva *handle); +#else +static inline struct iommu_sva * +iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) +{ + return NULL; +} + +static inline void iommu_sva_unbind_device(struct iommu_sva *handle) +{ +} + +static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) +{ + return IOMMU_PASID_INVALID; +} +#endif /* CONFIG_IOMMU_SVA */ + #endif /* __LINUX_IOMMU_H */ -- cgit v1.2.3 From 1c263576f4735e063e234fa5f43fd3046d36b5b3 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:14 +0800 Subject: iommu: Remove SVA related callbacks from iommu ops These ops'es have been deprecated. There's no need for them anymore. Remove them to avoid dead code. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-11-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bee5659d07eb..c337ef1c97bc 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -227,9 +227,6 @@ struct iommu_iotlb_gather { * driver init to device driver init (default no) * @dev_enable/disable_feat: per device entries to enable/disable * iommu specific features. - * @sva_bind: Bind process address space to device - * @sva_unbind: Unbind process address space from device - * @sva_get_pasid: Get PASID associated to a SVA handle * @page_response: handle page request response * @def_domain_type: device default domain type, return value: * - IOMMU_DOMAIN_IDENTITY: must use an identity domain @@ -263,10 +260,6 @@ struct iommu_ops { int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f); int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); - struct iommu_sva *(*sva_bind)(struct device *dev, struct mm_struct *mm); - void (*sva_unbind)(struct iommu_sva *handle); - u32 (*sva_get_pasid)(struct iommu_sva *handle); - int (*page_response)(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); -- cgit v1.2.3 From 8cc93159f91960b4812ea48887e9e7501babc95a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 31 Oct 2022 08:59:15 +0800 Subject: iommu: Prepare IOMMU domain for IOPF This adds some mechanisms around the iommu_domain so that the I/O page fault handling framework could route a page fault to the domain and call the fault handler from it. Add pointers to the page fault handler and its private data in struct iommu_domain. The fault handler will be called with the private data as a parameter once a page fault is routed to the domain. Any kernel component which owns an iommu domain could install handler and its private parameter so that the page fault could be further routed and handled. This also prepares the SVA implementation to be the first consumer of the per-domain page fault handling model. The I/O page fault handler for SVA is copied to the SVA file with mmget_not_zero() added before mmap_read_lock(). Suggested-by: Jean-Philippe Brucker Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao Tested-by: Tony Zhu Link: https://lore.kernel.org/r/20221031005917.45690-12-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c337ef1c97bc..7d2648058e43 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -98,6 +98,9 @@ struct iommu_domain { unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; + enum iommu_page_response_code (*iopf_handler)(struct iommu_fault *fault, + void *data); + void *fault_data; union { struct { iommu_fault_handler_t handler; -- cgit v1.2.3 From 4989764d8ed3d3d1024e4e831ff2affc40ee01d6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:24 -0400 Subject: iommu: Add IOMMU_CAP_ENFORCE_CACHE_COHERENCY This queries if a domain linked to a device should expect to support enforce_cache_coherency() so iommufd can negotiate the rules for when a domain should be shared or not. For iommufd a device that declares IOMMU_CAP_ENFORCE_CACHE_COHERENCY will not be attached to a domain that does not support it. Link: https://lore.kernel.org/r/1-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 68d7d304cdb7..a09fd32d8cc2 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -124,6 +124,11 @@ enum iommu_cap { IOMMU_CAP_NOEXEC, /* IOMMU_NOEXEC flag */ IOMMU_CAP_PRE_BOOT_PROTECTION, /* Firmware says it used the IOMMU for DMA protection and we should too */ + /* + * Per-device flag indicating if enforce_cache_coherency() will work on + * this device. + */ + IOMMU_CAP_ENFORCE_CACHE_COHERENCY, }; /* These are the possible reserved region types */ -- cgit v1.2.3 From 89395ccedbc153fecbc29342fbb94a6dfadf24cd Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 29 Nov 2022 16:29:25 -0400 Subject: iommu: Add device-centric DMA ownership interfaces These complement the group interfaces used by VFIO and are for use by iommufd. The main difference is that multiple devices in the same group can all share the ownership by passing the same ownership pointer. Move the common code into shared functions. Link: https://lore.kernel.org/r/2-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Signed-off-by: Lu Baolu Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a09fd32d8cc2..1690c334e516 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -707,6 +707,9 @@ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner); void iommu_group_release_dma_owner(struct iommu_group *group); bool iommu_group_dma_owner_claimed(struct iommu_group *group); +int iommu_device_claim_dma_owner(struct device *dev, void *owner); +void iommu_device_release_dma_owner(struct device *dev); + struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm); int iommu_attach_device_pasid(struct iommu_domain *domain, @@ -1064,6 +1067,15 @@ static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group) return false; } +static inline void iommu_device_release_dma_owner(struct device *dev) +{ +} + +static inline int iommu_device_claim_dma_owner(struct device *dev, void *owner) +{ + return -ENODEV; +} + static inline struct iommu_domain * iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) { -- cgit v1.2.3 From 5fe937862c8426f24cd1dcbf7c22fb1a31069b4f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:26 -0400 Subject: interval-tree: Add a utility to iterate over spans in an interval tree The span iterator travels over the indexes of the interval_tree, not the nodes, and classifies spans of indexes as either 'used' or 'hole'. 'used' spans are fully covered by nodes in the tree and 'hole' spans have no node intersecting the span. This is done greedily such that spans are maximally sized and every iteration step switches between used/hole. As an example a trivial allocator can be written as: for (interval_tree_span_iter_first(&span, itree, 0, ULONG_MAX); !interval_tree_span_iter_done(&span); interval_tree_span_iter_next(&span)) if (span.is_hole && span.last_hole - span.start_hole >= allocation_size - 1) return span.start_hole; With all the tricky boundary conditions handled by the library code. The following iommufd patches have several algorithms for its overlapping node interval trees that are significantly simplified with this kind of iteration primitive. As it seems generally useful, put it into lib/. Link: https://lore.kernel.org/r/3-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- include/linux/interval_tree.h | 58 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h index 288c26f50732..2b8026a39906 100644 --- a/include/linux/interval_tree.h +++ b/include/linux/interval_tree.h @@ -27,4 +27,62 @@ extern struct interval_tree_node * interval_tree_iter_next(struct interval_tree_node *node, unsigned long start, unsigned long last); +/** + * struct interval_tree_span_iter - Find used and unused spans. + * @start_hole: Start of an interval for a hole when is_hole == 1 + * @last_hole: Inclusive end of an interval for a hole when is_hole == 1 + * @start_used: Start of a used interval when is_hole == 0 + * @last_used: Inclusive end of a used interval when is_hole == 0 + * @is_hole: 0 == used, 1 == is_hole, -1 == done iteration + * + * This iterator travels over spans in an interval tree. It does not return + * nodes but classifies each span as either a hole, where no nodes intersect, or + * a used, which is fully covered by nodes. Each iteration step toggles between + * hole and used until the entire range is covered. The returned spans always + * fully cover the requested range. + * + * The iterator is greedy, it always returns the largest hole or used possible, + * consolidating all consecutive nodes. + * + * Use interval_tree_span_iter_done() to detect end of iteration. + */ +struct interval_tree_span_iter { + /* private: not for use by the caller */ + struct interval_tree_node *nodes[2]; + unsigned long first_index; + unsigned long last_index; + + /* public: */ + union { + unsigned long start_hole; + unsigned long start_used; + }; + union { + unsigned long last_hole; + unsigned long last_used; + }; + int is_hole; +}; + +void interval_tree_span_iter_first(struct interval_tree_span_iter *state, + struct rb_root_cached *itree, + unsigned long first_index, + unsigned long last_index); +void interval_tree_span_iter_advance(struct interval_tree_span_iter *iter, + struct rb_root_cached *itree, + unsigned long new_index); +void interval_tree_span_iter_next(struct interval_tree_span_iter *state); + +static inline bool +interval_tree_span_iter_done(struct interval_tree_span_iter *state) +{ + return state->is_hole == -1; +} + +#define interval_tree_for_each_span(span, itree, first_index, last_index) \ + for (interval_tree_span_iter_first(span, itree, \ + first_index, last_index); \ + !interval_tree_span_iter_done(span); \ + interval_tree_span_iter_next(span)) + #endif /* _LINUX_INTERVAL_TREE_H */ -- cgit v1.2.3 From 2ff4bed7fee72ba1abfcff5f11ae8f8e570353f2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:29 -0400 Subject: iommufd: File descriptor, context, kconfig and makefiles This is the basic infrastructure of a new miscdevice to hold the iommufd IOCTL API. It provides: - A miscdevice to create file descriptors to run the IOCTL interface over - A table based ioctl dispatch and centralized extendable pre-validation step - An xarray mapping userspace ID's to kernel objects. The design has multiple inter-related objects held within in a single IOMMUFD fd - A simple usage count to build a graph of object relations and protect against hostile userspace racing ioctls The only IOCTL provided in this patch is the generic 'destroy any object by handle' operation. Link: https://lore.kernel.org/r/6-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 include/linux/iommufd.h (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h new file mode 100644 index 000000000000..d1817472c273 --- /dev/null +++ b/include/linux/iommufd.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Intel Corporation + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __LINUX_IOMMUFD_H +#define __LINUX_IOMMUFD_H + +#include +#include +#include + +struct iommufd_ctx; +struct file; + +void iommufd_ctx_get(struct iommufd_ctx *ictx); + +#if IS_ENABLED(CONFIG_IOMMUFD) +struct iommufd_ctx *iommufd_ctx_from_file(struct file *file); +void iommufd_ctx_put(struct iommufd_ctx *ictx); +#else /* !CONFIG_IOMMUFD */ +static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void iommufd_ctx_put(struct iommufd_ctx *ictx) +{ +} +#endif /* CONFIG_IOMMUFD */ +#endif -- cgit v1.2.3 From ce5a23c835aa0f0a931b5bcde1e7811f951b0146 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:30 -0400 Subject: kernel/user: Allow user_struct::locked_vm to be usable for iommufd Following the pattern of io_uring, perf, skb, and bpf, iommfd will use user->locked_vm for accounting pinned pages. Ensure the value is included in the struct and export free_uid() as iommufd is modular. user->locked_vm is the good accounting to use for ulimit because it is per-user, and the security sandboxing of locked pages is not supposed to be per-process. Other places (vfio, vdpa and infiniband) have used mm->pinned_vm and/or mm->locked_vm for accounting pinned pages, but this is only per-process and inconsistent with the new FOLL_LONGTERM users in the kernel. Concurrent work is underway to try to put this in a cgroup, so everything can be consistent and the kernel can provide a FOLL_LONGTERM limit that actually provides security. Link: https://lore.kernel.org/r/7-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- include/linux/sched/user.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index f054d0360a75..4cc52698e214 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -25,7 +25,7 @@ struct user_struct { #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ defined(CONFIG_NET) || defined(CONFIG_IO_URING) || \ - defined(CONFIG_VFIO_PCI_ZDEV_KVM) + defined(CONFIG_VFIO_PCI_ZDEV_KVM) || IS_ENABLED(CONFIG_IOMMUFD) atomic_long_t locked_vm; #endif #ifdef CONFIG_WATCH_QUEUE -- cgit v1.2.3 From f394576eb11dbcd3a740fa41e577b97f0720d26e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:31 -0400 Subject: iommufd: PFN handling for iopt_pages The top of the data structure provides an IO Address Space (IOAS) that is similar to a VFIO container. The IOAS allows map/unmap of memory into ranges of IOVA called iopt_areas. Multiple IOMMU domains (IO page tables) and in-kernel accesses (like VFIO mdevs) can be attached to the IOAS to access the PFNs that those IOVA areas cover. The IO Address Space (IOAS) datastructure is composed of: - struct io_pagetable holding the IOVA map - struct iopt_areas representing populated portions of IOVA - struct iopt_pages representing the storage of PFNs - struct iommu_domain representing each IO page table in the system IOMMU - struct iopt_pages_access representing in-kernel accesses of PFNs (ie VFIO mdevs) - struct xarray pinned_pfns holding a list of pages pinned by in-kernel accesses This patch introduces the lowest part of the datastructure - the movement of PFNs in a tiered storage scheme: 1) iopt_pages::pinned_pfns xarray 2) Multiple iommu_domains 3) The origin of the PFNs, i.e. the userspace pointer PFN have to be copied between all combinations of tiers, depending on the configuration. The interface is an iterator called a 'pfn_reader' which determines which tier each PFN is stored and loads it into a list of PFNs held in a struct pfn_batch. Each step of the iterator will fill up the pfn_batch, then the caller can use the pfn_batch to send the PFNs to the required destination. Repeating this loop will read all the PFNs in an IOVA range. The pfn_reader and pfn_batch also keep track of the pinned page accounting. While PFNs are always stored and accessed as full PAGE_SIZE units the iommu_domain tier can store with a sub-page offset/length to support IOMMUs with a smaller IOPTE size than PAGE_SIZE. Link: https://lore.kernel.org/r/8-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index d1817472c273..26e09d539737 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -13,6 +13,13 @@ struct iommufd_ctx; struct file; +enum { + IOMMUFD_ACCESS_RW_READ = 0, + IOMMUFD_ACCESS_RW_WRITE = 1 << 0, + /* Set if the caller is in a kthread then rw will use kthread_use_mm() */ + IOMMUFD_ACCESS_RW_KTHREAD = 1 << 1, +}; + void iommufd_ctx_get(struct iommufd_ctx *ictx); #if IS_ENABLED(CONFIG_IOMMUFD) -- cgit v1.2.3 From e8d57210035b6377d424ba964961892d01127cf6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:36 -0400 Subject: iommufd: Add kAPI toward external drivers for physical devices Add the four functions external drivers need to connect physical DMA to the IOMMUFD: iommufd_device_bind() / iommufd_device_unbind() Register the device with iommufd and establish security isolation. iommufd_device_attach() / iommufd_device_detach() Connect a bound device to a page table Binding a device creates a device object ID in the uAPI, however the generic API does not yet provide any IOCTLs to manipulate them. Link: https://lore.kernel.org/r/13-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 26e09d539737..185dff3eb32f 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -9,10 +9,19 @@ #include #include #include +#include +struct iommufd_device; struct iommufd_ctx; struct file; +struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, + struct device *dev, u32 *id); +void iommufd_device_unbind(struct iommufd_device *idev); + +int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id); +void iommufd_device_detach(struct iommufd_device *idev); + enum { IOMMUFD_ACCESS_RW_READ = 0, IOMMUFD_ACCESS_RW_WRITE = 1 << 0, -- cgit v1.2.3 From 8d40205f6093f18e07fe3dc5920fc85e9f82b8b3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:37 -0400 Subject: iommufd: Add kAPI toward external drivers for kernel access Kernel access is the mode that VFIO "mdevs" use. In this case there is no struct device and no IOMMU connection. iommufd acts as a record keeper for accesses and returns the actual struct pages back to the caller to use however they need. eg with kmap or the DMA API. Each caller must create a struct iommufd_access with iommufd_access_create(), similar to how iommufd_device_bind() works. Using this struct the caller can access blocks of IOVA using iommufd_access_pin_pages() or iommufd_access_rw(). Callers must provide a callback that immediately unpins any IOVA being used within a range. This happens if userspace unmaps the IOVA under the pin. The implementation forwards the access requests directly to the iopt infrastructure that manages the iopt_pages_access. Link: https://lore.kernel.org/r/14-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 185dff3eb32f..46c481a26d79 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -9,10 +9,12 @@ #include #include #include -#include +struct device; struct iommufd_device; +struct page; struct iommufd_ctx; +struct iommufd_access; struct file; struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, @@ -22,6 +24,11 @@ void iommufd_device_unbind(struct iommufd_device *idev); int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id); void iommufd_device_detach(struct iommufd_device *idev); +struct iommufd_access_ops { + u8 needs_pin_pages : 1; + void (*unmap)(void *data, unsigned long iova, unsigned long length); +}; + enum { IOMMUFD_ACCESS_RW_READ = 0, IOMMUFD_ACCESS_RW_WRITE = 1 << 0, @@ -29,11 +36,24 @@ enum { IOMMUFD_ACCESS_RW_KTHREAD = 1 << 1, }; +struct iommufd_access * +iommufd_access_create(struct iommufd_ctx *ictx, u32 ioas_id, + const struct iommufd_access_ops *ops, void *data); +void iommufd_access_destroy(struct iommufd_access *access); + void iommufd_ctx_get(struct iommufd_ctx *ictx); #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_ctx *iommufd_ctx_from_file(struct file *file); void iommufd_ctx_put(struct iommufd_ctx *ictx); + +int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, + unsigned long length, struct page **out_pages, + unsigned int flags); +void iommufd_access_unpin_pages(struct iommufd_access *access, + unsigned long iova, unsigned long length); +int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, + void *data, size_t len, unsigned int flags); #else /* !CONFIG_IOMMUFD */ static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) { @@ -43,5 +63,26 @@ static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) static inline void iommufd_ctx_put(struct iommufd_ctx *ictx) { } + +static inline int iommufd_access_pin_pages(struct iommufd_access *access, + unsigned long iova, + unsigned long length, + struct page **out_pages, + unsigned int flags) +{ + return -EOPNOTSUPP; +} + +static inline void iommufd_access_unpin_pages(struct iommufd_access *access, + unsigned long iova, + unsigned long length) +{ +} + +static inline int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, + void *data, size_t len, unsigned int flags) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_IOMMUFD */ #endif -- cgit v1.2.3 From d624d6652a65ad4f47a58b8651a1ec1163bb81d3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:38 -0400 Subject: iommufd: vfio container FD ioctl compatibility iommufd can directly implement the /dev/vfio/vfio container IOCTLs by mapping them into io_pagetable operations. A userspace application can test against iommufd and confirm compatibility then simply make a small change to open /dev/iommu instead of /dev/vfio/vfio. For testing purposes /dev/vfio/vfio can be symlinked to /dev/iommu and then all applications will use the compatibility path with no code changes. A later series allows /dev/vfio/vfio to be directly provided by iommufd, which allows the rlimit mode to work the same as well. This series just provides the iommufd side of compatibility. Actually linking this to VFIO_SET_CONTAINER is a followup series, with a link in the cover letter. Internally the compatibility API uses a normal IOAS object that, like vfio, is automatically allocated when the first device is attached. Userspace can also query or set this IOAS object directly using the IOMMU_VFIO_IOAS ioctl. This allows mixing and matching new iommufd only features while still using the VFIO style map/unmap ioctls. While this is enough to operate qemu, it has a few differences: - Resource limits rely on memory cgroups to bound what userspace can do instead of the module parameter dma_entry_limit. - VFIO P2P is not implemented. The DMABUF patches for vfio are a start at a solution where iommufd would import a special DMABUF. This is to avoid further propogating the follow_pfn() security problem. - A full audit for pedantic compatibility details (eg errnos, etc) has not yet been done - powerpc SPAPR is left out, as it is not connected to the iommu_domain framework. It seems interest in SPAPR is minimal as it is currently non-working in v6.1-rc1. They will have to convert to the iommu subsystem framework to enjoy iommfd. The following are not going to be implemented and we expect to remove them from VFIO type1: - SW access 'dirty tracking'. As discussed in the cover letter this will be done in VFIO. - VFIO_TYPE1_NESTING_IOMMU https://lore.kernel.org/all/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/ - VFIO_DMA_MAP_FLAG_VADDR https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/ Link: https://lore.kernel.org/r/15-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 46c481a26d79..84af9a239769 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -54,6 +54,7 @@ void iommufd_access_unpin_pages(struct iommufd_access *access, unsigned long iova, unsigned long length); int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, void *data, size_t len, unsigned int flags); +int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id); #else /* !CONFIG_IOMMUFD */ static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) { @@ -84,5 +85,11 @@ static inline int iommufd_access_rw(struct iommufd_access *access, unsigned long { return -EOPNOTSUPP; } + +static inline int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, + u32 *out_ioas_id) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_IOMMUFD */ #endif -- cgit v1.2.3 From f4b20bb34c83dceade5470288f48f94ce3598ada Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:29:39 -0400 Subject: iommufd: Add kernel support for testing iommufd Provide a mock kernel module for the iommu_domain that allows it to run without any HW and the mocking provides a way to directly validate that the PFNs loaded into the iommu_domain are correct. This exposes the access kAPI toward userspace to allow userspace to explore the functionality of pages.c and io_pagetable.c The mock also simulates the rare case of PAGE_SIZE > iommu page size as the mock will operate at a 2K iommu page size. This allows exercising all of the calculations to support this mismatch. This is also intended to support syzkaller exploring the same space. However, it is an unusually invasive config option to enable all of this. The config option should not be enabled in a production kernel. Link: https://lore.kernel.org/r/16-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Tested-by: Matthew Rosato # s390 Tested-by: Eric Auger # aarch64 Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 84af9a239769..650d45629647 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -34,6 +34,9 @@ enum { IOMMUFD_ACCESS_RW_WRITE = 1 << 0, /* Set if the caller is in a kthread then rw will use kthread_use_mm() */ IOMMUFD_ACCESS_RW_KTHREAD = 1 << 1, + + /* Only for use by selftest */ + __IOMMUFD_ACCESS_RW_SLOW_PATH = 1 << 2, }; struct iommufd_access * -- cgit v1.2.3 From a4d1f91db5021c57e14721ac090616c90386ac70 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:31:51 -0400 Subject: vfio-iommufd: Support iommufd for physical VFIO devices This creates the iommufd_device for the physical VFIO drivers. These are all the drivers that are calling vfio_register_group_dev() and expect the type1 code to setup a real iommu_domain against their parent struct device. The design gives the driver a choice in how it gets connected to iommufd by providing bind_iommufd/unbind_iommufd/attach_ioas callbacks to implement as required. The core code provides three default callbacks for physical mode using a real iommu_domain. This is suitable for drivers using vfio_register_group_dev() Link: https://lore.kernel.org/r/6-v4-42cd2eb0e3eb+335a-vfio_iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Tested-by: Alex Williamson Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- include/linux/vfio.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e7cebeb875dd..a7fc4d747dc2 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -17,6 +17,8 @@ #include struct kvm; +struct iommufd_ctx; +struct iommufd_device; /* * VFIO devices can be placed in a set, this allows all devices to share this @@ -54,6 +56,10 @@ struct vfio_device { struct completion comp; struct list_head group_next; struct list_head iommu_entry; +#if IS_ENABLED(CONFIG_IOMMUFD) + struct iommufd_device *iommufd_device; + bool iommufd_attached; +#endif }; /** @@ -80,6 +86,10 @@ struct vfio_device_ops { char *name; int (*init)(struct vfio_device *vdev); void (*release)(struct vfio_device *vdev); + int (*bind_iommufd)(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id); + void (*unbind_iommufd)(struct vfio_device *vdev); + int (*attach_ioas)(struct vfio_device *vdev, u32 *pt_id); int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -96,6 +106,21 @@ struct vfio_device_ops { void __user *arg, size_t argsz); }; +#if IS_ENABLED(CONFIG_IOMMUFD) +int vfio_iommufd_physical_bind(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id); +void vfio_iommufd_physical_unbind(struct vfio_device *vdev); +int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); +#else +#define vfio_iommufd_physical_bind \ + ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ + u32 *out_device_id)) NULL) +#define vfio_iommufd_physical_unbind \ + ((void (*)(struct vfio_device *vdev)) NULL) +#define vfio_iommufd_physical_attach_ioas \ + ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) +#endif + /** * @migration_set_state: Optional callback to change the migration state for * devices that support migration. It's mandatory for -- cgit v1.2.3 From 4741f2e941298ad7553b65e66624435e14793391 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 29 Nov 2022 16:31:52 -0400 Subject: vfio-iommufd: Support iommufd for emulated VFIO devices Emulated VFIO devices are calling vfio_register_emulated_iommu_dev() and consist of all the mdev drivers. Like the physical drivers, support for iommufd is provided by the driver supplying the correct standard ops. Provide ops from the core that duplicate what vfio_register_emulated_iommu_dev() does. Emulated drivers are where it is more likely to see variation in the iommfd support ops. For instance IDXD will probably need to setup both a iommfd_device context linked to a PASID and an iommufd_access context to support all their mdev operations. Link: https://lore.kernel.org/r/7-v4-42cd2eb0e3eb+335a-vfio_iommufd_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Tested-by: Alex Williamson Tested-by: Nicolin Chen Tested-by: Yi Liu Tested-by: Lixiao Yang Tested-by: Matthew Rosato Tested-by: Yu He Signed-off-by: Jason Gunthorpe --- include/linux/vfio.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index a7fc4d747dc2..d5f84f98c0fa 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -19,6 +19,7 @@ struct kvm; struct iommufd_ctx; struct iommufd_device; +struct iommufd_access; /* * VFIO devices can be placed in a set, this allows all devices to share this @@ -56,8 +57,10 @@ struct vfio_device { struct completion comp; struct list_head group_next; struct list_head iommu_entry; + struct iommufd_access *iommufd_access; #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_device *iommufd_device; + struct iommufd_ctx *iommufd_ictx; bool iommufd_attached; #endif }; @@ -111,6 +114,10 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx, u32 *out_device_id); void vfio_iommufd_physical_unbind(struct vfio_device *vdev); int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); +int vfio_iommufd_emulated_bind(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id); +void vfio_iommufd_emulated_unbind(struct vfio_device *vdev); +int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id); #else #define vfio_iommufd_physical_bind \ ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ @@ -119,6 +126,13 @@ int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); ((void (*)(struct vfio_device *vdev)) NULL) #define vfio_iommufd_physical_attach_ioas \ ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) +#define vfio_iommufd_emulated_bind \ + ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ + u32 *out_device_id)) NULL) +#define vfio_iommufd_emulated_unbind \ + ((void (*)(struct vfio_device *vdev)) NULL) +#define vfio_iommufd_emulated_attach_ioas \ + ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) #endif /** -- cgit v1.2.3