From 59bbf596791b89c7f88fdcac29dfc39c1221d25d Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 9 Nov 2022 15:28:59 +0100 Subject: iommu/s390: Make attach succeed even if the device is in error state If a zPCI device is in the error state while switching IOMMU domains zpci_register_ioat() will fail and we would end up with the device not attached to any domain. In this state since zdev->dma_table == NULL a reset via zpci_hot_reset_device() would wrongfully re-initialize the device for DMA API usage using zpci_dma_init_device(). As automatic recovery is currently disabled while attached to an IOMMU domain this only affects slot resets triggered through other means but will affect automatic recovery once we switch to using dma-iommu. Additionally with that switch common code expects attaching to the default domain to always work so zpci_register_ioat() should only fail if there is no chance to recover anyway, e.g. if the device has been unplugged. Improve the robustness of attach by specifically looking at the status returned by zpci_mod_fc() to determine if the device is unavailable and in this case simply ignore the error. Once the device is reset zpci_hot_reset_device() will then correctly set the domain's DMA translation tables. Signed-off-by: Niklas Schnelle Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221109142903.4080275-2-schnelle@linux.ibm.com Signed-off-by: Joerg Roedel --- arch/s390/pci/pci.c | 11 ++++++----- arch/s390/pci/pci_dma.c | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/s390/pci') diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 73cdc5539384..a703dcd94a68 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -116,20 +116,20 @@ EXPORT_SYMBOL_GPL(pci_proc_domain); /* Modify PCI: Register I/O address translation parameters */ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, - u64 base, u64 limit, u64 iota) + u64 base, u64 limit, u64 iota, u8 *status) { u64 req = ZPCI_CREATE_REQ(zdev->fh, dmaas, ZPCI_MOD_FC_REG_IOAT); struct zpci_fib fib = {0}; - u8 cc, status; + u8 cc; WARN_ON_ONCE(iota & 0x3fff); fib.pba = base; fib.pal = limit; fib.iota = iota | ZPCI_IOTA_RTTO_FLAG; fib.gd = zdev->gisa; - cc = zpci_mod_fc(req, &fib, &status); + cc = zpci_mod_fc(req, &fib, status); if (cc) - zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status); + zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, *status); return cc; } EXPORT_SYMBOL_GPL(zpci_register_ioat); @@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(zpci_disable_device); */ int zpci_hot_reset_device(struct zpci_dev *zdev) { + u8 status; int rc; zpci_dbg(3, "rst fid:%x, fh:%x\n", zdev->fid, zdev->fh); @@ -787,7 +788,7 @@ int zpci_hot_reset_device(struct zpci_dev *zdev) if (zdev->dma_table) rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table)); + virt_to_phys(zdev->dma_table), &status); else rc = zpci_dma_init_device(zdev); if (rc) { diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index 227cf0a62800..dee825ee7305 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -547,6 +547,7 @@ static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int zpci_dma_init_device(struct zpci_dev *zdev) { + u8 status; int rc; /* @@ -598,7 +599,7 @@ int zpci_dma_init_device(struct zpci_dev *zdev) } if (zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table))) { + virt_to_phys(zdev->dma_table), &status)) { rc = -EIO; goto free_bitmap; } -- cgit v1.2.3 From 2ba8336dab5fb81452aea9c21dfc870050a017f3 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 9 Nov 2022 15:29:01 +0100 Subject: iommu/s390: Use RCU to allow concurrent domain_list iteration The s390_domain->devices list is only added to when new devices are attached but is iterated through in read-only fashion for every mapping operation as well as for I/O TLB flushes and thus in performance critical code causing contention on the s390_domain->list_lock. Fortunately such a read-mostly linked list is a standard use case for RCU. This change closely follows the example fpr RCU protected list given in Documentation/RCU/listRCU.rst. Signed-off-by: Niklas Schnelle Link: https://lore.kernel.org/r/20221109142903.4080275-4-schnelle@linux.ibm.com Signed-off-by: Joerg Roedel --- arch/s390/pci/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390/pci') diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index a703dcd94a68..ef38b1514c77 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -996,7 +996,7 @@ void zpci_release_device(struct kref *kref) break; } zpci_dbg(3, "rem fid:%x\n", zdev->fid); - kfree(zdev); + kfree_rcu(zdev, rcu); } int zpci_report_error(struct pci_dev *pdev, -- cgit v1.2.3 From 21c1f9021f0e7d28c3edfcc70e1ca1926ea3774e Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 9 Nov 2022 15:29:03 +0100 Subject: s390/pci: use lock-free I/O translation updates I/O translation tables on s390 use 8 byte page table entries and tables which are allocated lazily but only freed when the entire I/O translation table is torn down. Also each IOVA can at any time only translate to one physical address Furthermore I/O table accesses by the IOMMU hardware are cache coherent. With a bit of care we can thus use atomic updates to manipulate the translation table without having to use a global lock at all. This is done analogous to the existing I/O translation table handling code used on Intel and AMD x86 systems. Signed-off-by: Niklas Schnelle Link: https://lore.kernel.org/r/20221109142903.4080275-6-schnelle@linux.ibm.com Signed-off-by: Joerg Roedel --- arch/s390/pci/pci_dma.c | 74 ++++++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 29 deletions(-) (limited to 'arch/s390/pci') diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index dee825ee7305..ea478d11fbd1 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -63,37 +63,55 @@ static void dma_free_page_table(void *table) kmem_cache_free(dma_page_table_cache, table); } -static unsigned long *dma_get_seg_table_origin(unsigned long *entry) +static unsigned long *dma_get_seg_table_origin(unsigned long *rtep) { + unsigned long old_rte, rte; unsigned long *sto; - if (reg_entry_isvalid(*entry)) - sto = get_rt_sto(*entry); - else { + rte = READ_ONCE(*rtep); + if (reg_entry_isvalid(rte)) { + sto = get_rt_sto(rte); + } else { sto = dma_alloc_cpu_table(); if (!sto) return NULL; - set_rt_sto(entry, virt_to_phys(sto)); - validate_rt_entry(entry); - entry_clr_protected(entry); + set_rt_sto(&rte, virt_to_phys(sto)); + validate_rt_entry(&rte); + entry_clr_protected(&rte); + + old_rte = cmpxchg(rtep, ZPCI_TABLE_INVALID, rte); + if (old_rte != ZPCI_TABLE_INVALID) { + /* Somone else was faster, use theirs */ + dma_free_cpu_table(sto); + sto = get_rt_sto(old_rte); + } } return sto; } -static unsigned long *dma_get_page_table_origin(unsigned long *entry) +static unsigned long *dma_get_page_table_origin(unsigned long *step) { + unsigned long old_ste, ste; unsigned long *pto; - if (reg_entry_isvalid(*entry)) - pto = get_st_pto(*entry); - else { + ste = READ_ONCE(*step); + if (reg_entry_isvalid(ste)) { + pto = get_st_pto(ste); + } else { pto = dma_alloc_page_table(); if (!pto) return NULL; - set_st_pto(entry, virt_to_phys(pto)); - validate_st_entry(entry); - entry_clr_protected(entry); + set_st_pto(&ste, virt_to_phys(pto)); + validate_st_entry(&ste); + entry_clr_protected(&ste); + + old_ste = cmpxchg(step, ZPCI_TABLE_INVALID, ste); + if (old_ste != ZPCI_TABLE_INVALID) { + /* Somone else was faster, use theirs */ + dma_free_page_table(pto); + pto = get_st_pto(old_ste); + } } return pto; } @@ -117,19 +135,24 @@ unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr) return &pto[px]; } -void dma_update_cpu_trans(unsigned long *entry, phys_addr_t page_addr, int flags) +void dma_update_cpu_trans(unsigned long *ptep, phys_addr_t page_addr, int flags) { + unsigned long pte; + + pte = READ_ONCE(*ptep); if (flags & ZPCI_PTE_INVALID) { - invalidate_pt_entry(entry); + invalidate_pt_entry(&pte); } else { - set_pt_pfaa(entry, page_addr); - validate_pt_entry(entry); + set_pt_pfaa(&pte, page_addr); + validate_pt_entry(&pte); } if (flags & ZPCI_TABLE_PROTECTED) - entry_set_protected(entry); + entry_set_protected(&pte); else - entry_clr_protected(entry); + entry_clr_protected(&pte); + + xchg(ptep, pte); } static int __dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa, @@ -137,18 +160,14 @@ static int __dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa, { unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; phys_addr_t page_addr = (pa & PAGE_MASK); - unsigned long irq_flags; unsigned long *entry; int i, rc = 0; if (!nr_pages) return -EINVAL; - spin_lock_irqsave(&zdev->dma_table_lock, irq_flags); - if (!zdev->dma_table) { - rc = -EINVAL; - goto out_unlock; - } + if (!zdev->dma_table) + return -EINVAL; for (i = 0; i < nr_pages; i++) { entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr); @@ -173,8 +192,6 @@ undo_cpu_trans: dma_update_cpu_trans(entry, page_addr, flags); } } -out_unlock: - spin_unlock_irqrestore(&zdev->dma_table_lock, irq_flags); return rc; } @@ -558,7 +575,6 @@ int zpci_dma_init_device(struct zpci_dev *zdev) WARN_ON(zdev->s390_domain); spin_lock_init(&zdev->iommu_bitmap_lock); - spin_lock_init(&zdev->dma_table_lock); zdev->dma_table = dma_alloc_cpu_table(); if (!zdev->dma_table) { -- cgit v1.2.3