From 1442bb87b878f889442c7e8e83d9125e31ef5072 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Fri, 28 Nov 2025 23:01:02 +0100 Subject: s390/boot: Use entire page for PTEs Make boot_pte_alloc() always allocate a full PAGE_SIZE page for PTE tables, instead of carving two 2K PTE tables out of a single 4K page, similar to commit daa8af80d283 ("s390/mm: Allocate page table with PAGE_SIZE granularity"). This mirrors the change in the vmem code and ensures that boot page tables backing the early KASAN shadow can later be fully freed by the vmem page-table teardown helpers (e.g. when unmapping early KASAN shadow on memory hotplug). The leftover-based allocation was originally added to reduce physmem allocator fragmentation when EDAT was disabled. On current hardware EDAT1 is available on all production systems, so the complexity is no longer justified and gets in the way of freeing the shadow mappings. Signed-off-by: Vasily Gorbik Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/boot/vmem.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index fbe64ffdfb96..7d6cc4c85af0 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -244,22 +244,10 @@ static void *boot_crst_alloc(unsigned long val) static pte_t *boot_pte_alloc(void) { - static void *pte_leftover; pte_t *pte; - /* - * handling pte_leftovers this way helps to avoid memory fragmentation - * during POPULATE_KASAN_MAP_SHADOW when EDAT is off - */ - if (!pte_leftover) { - pte_leftover = (void *)physmem_alloc_or_die(RR_VMEM, PAGE_SIZE, PAGE_SIZE); - pte = pte_leftover + _PAGE_TABLE_SIZE; - __arch_set_page_dat(pte, 1); - } else { - pte = pte_leftover; - pte_leftover = NULL; - } - + pte = (void *)physmem_alloc_or_die(RR_VMEM, PAGE_SIZE, PAGE_SIZE); + __arch_set_page_dat(pte, 1); memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); return pte; } -- cgit v1.2.3 From 6a35d02fec5a1e2ab6c0c94ccc5b0c57a580b098 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Fri, 28 Nov 2025 23:01:05 +0100 Subject: s390/vmem: Support 2G page splitting for KASAN shadow freeing Export split_pud_page() so it can be used from the vmem code and teach modify_pud_table() to split PUD-sized mappings when only a subrange needs to be removed. If the range to be removed covers a full PUD-sized mapping, keep the existing behavior: clear the PUD entry and free the backing large page (for non-direct mappings). Otherwise, split the PUD-mapped page into PMD mappings and let the walker handle the smaller ranges. This is needed for KASAN early shadow removal support: memory hotplug freeing the KASAN early shadow is the only expected caller that will try to free 2G PUD-mapped regions of non-direct mappings. Signed-off-by: Vasily Gorbik Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/include/asm/page.h | 2 ++ arch/s390/mm/pageattr.c | 2 +- arch/s390/mm/vmem.c | 6 +++++- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 9240a363c893..c1d63b613bf9 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -166,6 +166,8 @@ static inline int page_reset_referenced(unsigned long addr) return CC_TRANSFORM(cc); } +int split_pud_page(pud_t *pudp, unsigned long addr); + /* Bits int the storage key */ #define _PAGE_CHANGED 0x02 /* HW changed bit */ #define _PAGE_REFERENCED 0x04 /* HW referenced bit */ diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 3042647c9dbf..d3ce04a4b248 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -204,7 +204,7 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, return rc; } -static int split_pud_page(pud_t *pudp, unsigned long addr) +int split_pud_page(pud_t *pudp, unsigned long addr) { unsigned long pmd_addr, prot; pmd_t *pm_dir, *pmdp; diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index d96587b84e81..faed09531499 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -330,10 +330,14 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, if (pud_leaf(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE)) { + if (!direct) + vmem_free_pages(pud_deref(*pud), get_order(PUD_SIZE), altmap); pud_clear(pud); pages++; + continue; + } else { + split_pud_page(pud, addr & PUD_MASK); } - continue; } } else if (pud_none(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && -- cgit v1.2.3 From 8543ecc0e03b9367e36a93d82bdef0bf5a16dc56 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Fri, 28 Nov 2025 23:01:08 +0100 Subject: s390: Unmap early KASAN shadow on memory offlining Teach the memory hotplug path to tear down KASAN shadow that was mapped during early boot when a memory block is offlined. Track for each sclp_mem whether its range was covered by the early KASAN shadow via an early_shadow_mapped flag. When such a block is deconfigured and removed via sclp_config_mem_store(), compute the corresponding shadow range and call vmemmap_free() to unmap the boot mapped shadow, then clear the flag. Using vmemmap_free() for the early shadow is safe despite the use of large mappings in the boot-time KASAN setup. The initial shadow is mapped with 1M and 2G pages, where possible. The minimum hotplug memory block size is 128M and always aligned (the identity mapping is at least 2G aligned), which corresponds to a 16M chunk of at least 1M aligned shadow. PMD-mapped 1M shadow pages therefore never need splitting, and PUD-mapped 2G shadow pages can now be split following the preceding changes. Relax the modify_pagetable() sanity check in vmem so that, with KASAN enabled, it may also operate on the KASAN shadow region in addition to the 1:1 mapping and vmemmap area. This allows the KASAN shadow unmapping to reuse the common vmem helpers. Signed-off-by: Vasily Gorbik Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/mm/vmem.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index faed09531499..eeadff45e0e1 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -437,9 +437,15 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) return -EINVAL; - /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + /* Don't mess with any tables not fully in 1:1 mapping, vmemmap & kasan area */ +#ifdef CONFIG_KASAN + if (WARN_ON_ONCE(!(start >= KASAN_SHADOW_START && end <= KASAN_SHADOW_END) && + end > __abs_lowcore)) + return -EINVAL; +#else if (WARN_ON_ONCE(end > __abs_lowcore)) return -EINVAL; +#endif for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); pgd = pgd_offset_k(addr); -- cgit v1.2.3 From eb9780a1a3c4ffc1f383991ce3fc50da1fe4390d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 28 Nov 2025 10:25:29 +0100 Subject: s390: Select POSIX_CPU_TIMERS_TASK_WORK After support for VIRT_XFER_TO_GUEST_WORK is available for s390 it is possible to also select HAVE_POSIX_CPU_TIMERS_TASK_WORK. See [1] for the reasons why it makes sense, also for architectures which do not support PREEMPT_RT. [1] https://lore.kernel.org/all/20200716201923.228696399@linutronix.de Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 938e5df75b2d..014bf4cb3e9f 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -238,6 +238,7 @@ config S390 select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_PREEMPT_DYNAMIC_KEY select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE -- cgit v1.2.3 From f770950a4709af290f314e691897ec0003fbd8ae Mon Sep 17 00:00:00 2001 From: Tobias Schumacher Date: Thu, 4 Dec 2025 06:05:02 +0100 Subject: s390/pci: Migrate s390 IRQ logic to IRQ domain API s390 is one of the last architectures using the legacy API for setup and teardown of PCI MSI IRQs. Migrate the s390 IRQ allocation and teardown to the MSI parent domain API. For details, see: https://lore.kernel.org/lkml/20221111120501.026511281@linutronix.de In detail, create an MSI parent domain for each PCI domain. When a PCI device sets up MSI or MSI-X IRQs, the library creates a per-device IRQ domain for this device, which is used by the device for allocating and freeing IRQs. The per-device domain delegates this allocation and freeing to the parent-domain. In the end, the corresponding callbacks of the parent domain are responsible for allocating and freeing the IRQs. The allocation is split into two parts: - zpci_msi_prepare() is called once for each device and allocates the required resources. On s390, each PCI function has its own airq vector and a summary bit, which must be configured once per function. This is done in prepare(). - zpci_msi_alloc() can be called multiple times for allocating one or more MSI/MSI-X IRQs. This creates a mapping between the virtual IRQ number in the kernel and the hardware IRQ number. Freeing is split into two counterparts: - zpci_msi_free() reverts the effects of zpci_msi_alloc() and - zpci_msi_teardown() reverts the effects of zpci_msi_prepare(). This is called once when all IRQs are freed before a device is removed. Since the parent domain in the end allocates the IRQs, the hwirq encoding must be unambiguous for all IRQs of all devices. This is achieved by encoding the hwirq using the devfn and the MSI index. Reviewed-by: Niklas Schnelle Reviewed-by: Farhan Ali Signed-off-by: Tobias Schumacher Reviewed-by: Gerd Bayer Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 1 + arch/s390/include/asm/pci.h | 5 + arch/s390/pci/pci.c | 6 + arch/s390/pci/pci_bus.c | 18 ++- arch/s390/pci/pci_irq.c | 332 +++++++++++++++++++++++++++++--------------- 5 files changed, 247 insertions(+), 115 deletions(-) (limited to 'arch') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 014bf4cb3e9f..0e5fad5f06ca 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -255,6 +255,7 @@ config S390 select HOTPLUG_SMT select IOMMU_HELPER if PCI select IOMMU_SUPPORT if PCI + select IRQ_MSI_LIB if PCI select KASAN_VMALLOC if KASAN select LOCK_MM_AND_FIND_VMA select MMU_GATHER_MERGE_VMAS diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index a32f465ecf73..c0ff19dab580 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -109,6 +110,7 @@ struct zpci_bus { struct list_head resources; struct list_head bus_next; struct resource bus_resource; + struct irq_domain *msi_parent_domain; int topo; /* TID if topo_is_tid, PCHID otherwise */ int domain_nr; u8 multifunction : 1; @@ -310,6 +312,9 @@ int zpci_dma_exit_device(struct zpci_dev *zdev); /* IRQ */ int __init zpci_irq_init(void); void __init zpci_irq_exit(void); +int zpci_set_irq(struct zpci_dev *zdev); +int zpci_create_parent_msi_domain(struct zpci_bus *zbus); +void zpci_remove_parent_msi_domain(struct zpci_bus *zbus); /* FMB */ int zpci_fmb_enable_device(struct zpci_dev *); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 93d2c9c780fc..5a6ace9d875a 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -708,6 +708,12 @@ int zpci_reenable_device(struct zpci_dev *zdev) if (rc) return rc; + if (zdev->msi_nr_irqs > 0) { + rc = zpci_set_irq(zdev); + if (rc) + return rc; + } + rc = zpci_iommu_register_ioat(zdev, &status); if (rc) zpci_disable_device(zdev); diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 72adc8f6e94f..66c4bd888b29 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -198,19 +199,27 @@ static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, s zbus->multifunction = zpci_bus_is_multifunction_root(fr); zbus->max_bus_speed = fr->max_bus_speed; + if (zpci_create_parent_msi_domain(zbus)) + goto out_free_domain; + /* * Note that the zbus->resources are taken over and zbus->resources * is empty after a successful call */ bus = pci_create_root_bus(NULL, ZPCI_BUS_NR, ops, zbus, &zbus->resources); - if (!bus) { - zpci_free_domain(zbus->domain_nr); - return -EFAULT; - } + if (!bus) + goto out_remove_msi_domain; zbus->bus = bus; + dev_set_msi_domain(&zbus->bus->dev, zbus->msi_parent_domain); return 0; + +out_remove_msi_domain: + zpci_remove_parent_msi_domain(zbus); +out_free_domain: + zpci_free_domain(zbus->domain_nr); + return -ENOMEM; } static void zpci_bus_release(struct kref *kref) @@ -231,6 +240,7 @@ static void zpci_bus_release(struct kref *kref) mutex_lock(&zbus_list_lock); list_del(&zbus->bus_next); mutex_unlock(&zbus_list_lock); + zpci_remove_parent_msi_domain(zbus); kfree(zbus); } diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 2a06df8c2498..e9dd45f3c09d 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -97,7 +98,7 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev) } /* Register adapter interruptions */ -static int zpci_set_irq(struct zpci_dev *zdev) +int zpci_set_irq(struct zpci_dev *zdev) { int rc; @@ -125,27 +126,53 @@ static int zpci_clear_irq(struct zpci_dev *zdev) static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *dest, bool force) { - struct msi_desc *entry = irq_data_get_msi_desc(data); - struct msi_msg msg = entry->msg; - int cpu_addr = smp_cpu_get_cpu_address(cpumask_first(dest)); + irq_data_update_affinity(data, dest); + return IRQ_SET_MASK_OK; +} - msg.address_lo &= 0xff0000ff; - msg.address_lo |= (cpu_addr << 8); - pci_write_msi_msg(data->irq, &msg); +/* + * Encode the hwirq number for the parent domain. The encoding must be unique + * for each IRQ of each device in the parent domain, so it uses the devfn to + * identify the device and the msi_index to identify the IRQ within that device. + */ +static inline u32 zpci_encode_hwirq(u8 devfn, u16 msi_index) +{ + return (devfn << 16) | msi_index; +} - return IRQ_SET_MASK_OK; +static inline u16 zpci_decode_hwirq_msi_index(irq_hw_number_t hwirq) +{ + return hwirq & 0xffff; +} + +static void zpci_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + struct zpci_dev *zdev = to_zpci_dev(desc->dev); + + if (irq_delivery == DIRECTED) { + int cpu = cpumask_first(irq_data_get_affinity_mask(data)); + + msg->address_lo = zdev->msi_addr & 0xff0000ff; + msg->address_lo |= (smp_cpu_get_cpu_address(cpu) << 8); + } else { + msg->address_lo = zdev->msi_addr & 0xffffffff; + } + msg->address_hi = zdev->msi_addr >> 32; + msg->data = zpci_decode_hwirq_msi_index(data->hwirq); } static struct irq_chip zpci_irq_chip = { .name = "PCI-MSI", - .irq_unmask = pci_msi_unmask_irq, - .irq_mask = pci_msi_mask_irq, + .irq_compose_msi_msg = zpci_compose_msi_msg, }; static void zpci_handle_cpu_local_irq(bool rescan) { struct airq_iv *dibv = zpci_ibv[smp_processor_id()]; union zpci_sic_iib iib = {{0}}; + struct irq_domain *msi_domain; + irq_hw_number_t hwirq; unsigned long bit; int irqs_on = 0; @@ -163,7 +190,9 @@ static void zpci_handle_cpu_local_irq(bool rescan) continue; } inc_irq_stat(IRQIO_MSI); - generic_handle_irq(airq_iv_get_data(dibv, bit)); + hwirq = airq_iv_get_data(dibv, bit); + msi_domain = (struct irq_domain *)airq_iv_get_ptr(dibv, bit); + generic_handle_domain_irq(msi_domain, hwirq); } } @@ -228,6 +257,8 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, struct tpi_info *tpi_info) { union zpci_sic_iib iib = {{0}}; + struct irq_domain *msi_domain; + irq_hw_number_t hwirq; unsigned long si, ai; struct airq_iv *aibv; int irqs_on = 0; @@ -255,7 +286,9 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, break; inc_irq_stat(IRQIO_MSI); airq_iv_lock(aibv, ai); - generic_handle_irq(airq_iv_get_data(aibv, ai)); + hwirq = airq_iv_get_data(aibv, ai); + msi_domain = (struct irq_domain *)airq_iv_get_ptr(aibv, ai); + generic_handle_domain_irq(msi_domain, hwirq); airq_iv_unlock(aibv, ai); } } @@ -277,7 +310,9 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs, zdev->aisb = *bit; /* Create adapter interrupt vector */ - zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL); + zdev->aibv = airq_iv_create(msi_vecs, + AIRQ_IV_PTR | AIRQ_IV_DATA | AIRQ_IV_BITLOCK, + NULL); if (!zdev->aibv) return -ENOMEM; @@ -289,146 +324,220 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs, return 0; } -int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +bool arch_restore_msi_irqs(struct pci_dev *pdev) { - unsigned int hwirq, msi_vecs, irqs_per_msi, i, cpu; struct zpci_dev *zdev = to_zpci(pdev); - struct msi_desc *msi; - struct msi_msg msg; - unsigned long bit; - int cpu_addr; - int rc, irq; + zpci_set_irq(zdev); + return true; +} + +static struct airq_struct zpci_airq = { + .handler = zpci_floating_irq_handler, + .isc = PCI_ISC, +}; + +static void zpci_msi_teardown_directed(struct zpci_dev *zdev) +{ + airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->max_msi); + zdev->msi_first_bit = -1U; + zdev->msi_nr_irqs = 0; +} + +static void zpci_msi_teardown_floating(struct zpci_dev *zdev) +{ + airq_iv_release(zdev->aibv); + zdev->aibv = NULL; + airq_iv_free_bit(zpci_sbv, zdev->aisb); zdev->aisb = -1UL; zdev->msi_first_bit = -1U; + zdev->msi_nr_irqs = 0; +} + +static void zpci_msi_teardown(struct irq_domain *domain, msi_alloc_info_t *arg) +{ + struct zpci_dev *zdev = to_zpci_dev(domain->dev); + + zpci_clear_irq(zdev); + if (irq_delivery == DIRECTED) + zpci_msi_teardown_directed(zdev); + else + zpci_msi_teardown_floating(zdev); +} + +static int zpci_msi_prepare(struct irq_domain *domain, + struct device *dev, int nvec, + msi_alloc_info_t *info) +{ + struct zpci_dev *zdev = to_zpci_dev(dev); + struct pci_dev *pdev = to_pci_dev(dev); + unsigned long bit; + int msi_vecs, rc; msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); if (msi_vecs < nvec) { - pr_info("%s requested %d irqs, allocate system limit of %d", + pr_info("%s requested %d IRQs, allocate system limit of %d\n", pci_name(pdev), nvec, zdev->max_msi); } rc = __alloc_airq(zdev, msi_vecs, &bit); - if (rc < 0) + if (rc) { + pr_err("Allocating adapter IRQs for %s failed\n", pci_name(pdev)); return rc; + } - /* - * Request MSI interrupts: - * When using MSI, nvec_used interrupt sources and their irq - * descriptors are controlled through one msi descriptor. - * Thus the outer loop over msi descriptors shall run only once, - * while two inner loops iterate over the interrupt vectors. - * When using MSI-X, each interrupt vector/irq descriptor - * is bound to exactly one msi descriptor (nvec_used is one). - * So the inner loops are executed once, while the outer iterates - * over the MSI-X descriptors. - */ - hwirq = bit; - msi_for_each_desc(msi, &pdev->dev, MSI_DESC_NOTASSOCIATED) { - if (hwirq - bit >= msi_vecs) - break; - irqs_per_msi = min_t(unsigned int, msi_vecs, msi->nvec_used); - irq = __irq_alloc_descs(-1, 0, irqs_per_msi, 0, THIS_MODULE, - (irq_delivery == DIRECTED) ? - msi->affinity : NULL); - if (irq < 0) - return -ENOMEM; + zdev->msi_first_bit = bit; + zdev->msi_nr_irqs = msi_vecs; + rc = zpci_set_irq(zdev); + if (rc) { + pr_err("Registering adapter IRQs for %s failed\n", + pci_name(pdev)); + + if (irq_delivery == DIRECTED) + zpci_msi_teardown_directed(zdev); + else + zpci_msi_teardown_floating(zdev); + return rc; + } + return 0; +} - for (i = 0; i < irqs_per_msi; i++) { - rc = irq_set_msi_desc_off(irq, i, msi); - if (rc) - return rc; - irq_set_chip_and_handler(irq + i, &zpci_irq_chip, - handle_percpu_irq); - } +static int zpci_msi_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *args) +{ + struct msi_desc *desc = ((msi_alloc_info_t *)args)->desc; + struct zpci_dev *zdev = to_zpci_dev(desc->dev); + struct zpci_bus *zbus = zdev->zbus; + unsigned int cpu, hwirq; + unsigned long bit; + int i; - msg.data = hwirq - bit; - if (irq_delivery == DIRECTED) { - if (msi->affinity) - cpu = cpumask_first(&msi->affinity->mask); - else - cpu = 0; - cpu_addr = smp_cpu_get_cpu_address(cpu); + bit = zdev->msi_first_bit + desc->msi_index; + hwirq = zpci_encode_hwirq(zdev->devfn, desc->msi_index); - msg.address_lo = zdev->msi_addr & 0xff0000ff; - msg.address_lo |= (cpu_addr << 8); + if (desc->msi_index + nr_irqs > zdev->max_msi) + return -EINVAL; + for (i = 0; i < nr_irqs; i++) { + irq_domain_set_info(domain, virq + i, hwirq + i, + &zpci_irq_chip, zdev, + handle_percpu_irq, NULL, NULL); + + if (irq_delivery == DIRECTED) { for_each_possible_cpu(cpu) { - for (i = 0; i < irqs_per_msi; i++) - airq_iv_set_data(zpci_ibv[cpu], - hwirq + i, irq + i); + airq_iv_set_ptr(zpci_ibv[cpu], bit + i, + (unsigned long)zbus->msi_parent_domain); + airq_iv_set_data(zpci_ibv[cpu], bit + i, hwirq + i); } } else { - msg.address_lo = zdev->msi_addr & 0xffffffff; - for (i = 0; i < irqs_per_msi; i++) - airq_iv_set_data(zdev->aibv, hwirq + i, irq + i); + airq_iv_set_ptr(zdev->aibv, bit + i, + (unsigned long)zbus->msi_parent_domain); + airq_iv_set_data(zdev->aibv, bit + i, hwirq + i); } - msg.address_hi = zdev->msi_addr >> 32; - pci_write_msi_msg(irq, &msg); - hwirq += irqs_per_msi; } - zdev->msi_first_bit = bit; - zdev->msi_nr_irqs = hwirq - bit; - - rc = zpci_set_irq(zdev); - if (rc) - return rc; - - return (zdev->msi_nr_irqs == nvec) ? 0 : zdev->msi_nr_irqs; + return 0; } -void arch_teardown_msi_irqs(struct pci_dev *pdev) +static void zpci_msi_clear_airq(struct irq_data *d, int i) { - struct zpci_dev *zdev = to_zpci(pdev); - struct msi_desc *msi; - unsigned int i; - int rc; + struct msi_desc *desc = irq_data_get_msi_desc(d); + struct zpci_dev *zdev = to_zpci_dev(desc->dev); + unsigned long bit; + unsigned int cpu; + u16 msi_index; - /* Disable interrupts */ - rc = zpci_clear_irq(zdev); - if (rc) - return; + msi_index = zpci_decode_hwirq_msi_index(d->hwirq); + bit = zdev->msi_first_bit + msi_index; - /* Release MSI interrupts */ - msi_for_each_desc(msi, &pdev->dev, MSI_DESC_ASSOCIATED) { - for (i = 0; i < msi->nvec_used; i++) { - irq_set_msi_desc(msi->irq + i, NULL); - irq_free_desc(msi->irq + i); + if (irq_delivery == DIRECTED) { + for_each_possible_cpu(cpu) { + airq_iv_set_ptr(zpci_ibv[cpu], bit + i, 0); + airq_iv_set_data(zpci_ibv[cpu], bit + i, 0); } - msi->msg.address_lo = 0; - msi->msg.address_hi = 0; - msi->msg.data = 0; - msi->irq = 0; + } else { + airq_iv_set_ptr(zdev->aibv, bit + i, 0); + airq_iv_set_data(zdev->aibv, bit + i, 0); } +} - if (zdev->aisb != -1UL) { - zpci_ibv[zdev->aisb] = NULL; - airq_iv_free_bit(zpci_sbv, zdev->aisb); - zdev->aisb = -1UL; - } - if (zdev->aibv) { - airq_iv_release(zdev->aibv); - zdev->aibv = NULL; - } +static void zpci_msi_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d; + int i; - if ((irq_delivery == DIRECTED) && zdev->msi_first_bit != -1U) - airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->msi_nr_irqs); + for (i = 0; i < nr_irqs; i++) { + d = irq_domain_get_irq_data(domain, virq + i); + zpci_msi_clear_airq(d, i); + irq_domain_reset_irq_data(d); + } } -bool arch_restore_msi_irqs(struct pci_dev *pdev) +static const struct irq_domain_ops zpci_msi_domain_ops = { + .alloc = zpci_msi_domain_alloc, + .free = zpci_msi_domain_free, +}; + +static bool zpci_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, + struct msi_domain_info *info) { - struct zpci_dev *zdev = to_zpci(pdev); + if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info)) + return false; + + info->ops->msi_prepare = zpci_msi_prepare; + info->ops->msi_teardown = zpci_msi_teardown; - zpci_set_irq(zdev); return true; } -static struct airq_struct zpci_airq = { - .handler = zpci_floating_irq_handler, - .isc = PCI_ISC, +static struct msi_parent_ops zpci_msi_parent_ops = { + .supported_flags = MSI_GENERIC_FLAGS_MASK | + MSI_FLAG_PCI_MSIX | + MSI_FLAG_MULTI_PCI_MSI, + .required_flags = MSI_FLAG_USE_DEF_DOM_OPS | + MSI_FLAG_USE_DEF_CHIP_OPS, + .init_dev_msi_info = zpci_init_dev_msi_info, }; +int zpci_create_parent_msi_domain(struct zpci_bus *zbus) +{ + char fwnode_name[18]; + + snprintf(fwnode_name, sizeof(fwnode_name), "ZPCI_MSI_DOM_%04x", zbus->domain_nr); + struct irq_domain_info info = { + .fwnode = irq_domain_alloc_named_fwnode(fwnode_name), + .ops = &zpci_msi_domain_ops, + }; + + if (!info.fwnode) { + pr_err("Failed to allocate fwnode for MSI IRQ domain\n"); + return -ENOMEM; + } + + if (irq_delivery == FLOATING) + zpci_msi_parent_ops.required_flags |= MSI_FLAG_NO_AFFINITY; + + zbus->msi_parent_domain = msi_create_parent_irq_domain(&info, &zpci_msi_parent_ops); + if (!zbus->msi_parent_domain) { + irq_domain_free_fwnode(info.fwnode); + pr_err("Failed to create MSI IRQ domain\n"); + return -ENOMEM; + } + + return 0; +} + +void zpci_remove_parent_msi_domain(struct zpci_bus *zbus) +{ + struct fwnode_handle *fn; + + fn = zbus->msi_parent_domain->fwnode; + irq_domain_remove(zbus->msi_parent_domain); + irq_domain_free_fwnode(fn); +} + static void __init cpu_enable_directed_irq(void *unused) { union zpci_sic_iib iib = {{0}}; @@ -465,6 +574,7 @@ static int __init zpci_directed_irq_init(void) * is only done on the first vector. */ zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE, + AIRQ_IV_PTR | AIRQ_IV_DATA | AIRQ_IV_CACHELINE | (!cpu ? AIRQ_IV_ALLOC : 0), NULL); -- cgit v1.2.3 From 2f393c228cc519ddf19b8c6c05bf15723241aa96 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 4 Nov 2025 16:40:48 +0100 Subject: KVM: s390: Fix gmap_helper_zap_one_page() again A few checks were missing in gmap_helper_zap_one_page(), which can lead to memory corruption in the guest under specific circumstances. Add the missing checks. Fixes: 5deafa27d9ae ("KVM: s390: Fix to clear PTE when discarding a swapped page") Cc: stable@vger.kernel.org Reported-by: Marc Hartmayer Tested-by: Marc Hartmayer Acked-by: Christian Borntraeger Signed-off-by: Claudio Imbrenda Signed-off-by: Heiko Carstens --- arch/s390/mm/gmap_helpers.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index 549f14ad08af..d41b19925a5a 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -47,6 +47,7 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) { struct vm_area_struct *vma; + unsigned long pgstev; spinlock_t *ptl; pgste_t pgste; pte_t *ptep; @@ -65,9 +66,13 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) if (pte_swap(*ptep)) { preempt_disable(); pgste = pgste_get_lock(ptep); + pgstev = pgste_val(pgste); - ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); - pte_clear(mm, vmaddr, ptep); + if ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || + (pgstev & _PGSTE_GPS_ZERO)) { + ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); + pte_clear(mm, vmaddr, ptep); + } pgste_set_unlock(ptep, pgste); preempt_enable(); -- cgit v1.2.3 From 1a82d430c5f05d4bf15b86a9f0349e4a24ec485c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 2 Dec 2025 14:41:00 +0100 Subject: s390/bug: Add missing CONFIG_BUG ifdef again Fallback to generic BUG implementation in case CONFIG_BUG is disabled. This restores the old behaviour before 'cond_str' support was added. It probably doesn't matter, since nobody should disable CONFIG_BUG, but at least this is consistent to before. Fixes: 6584ff203aec ("bugs/s390: Use 'cond_str' in __EMIT_BUG()") Signed-off-by: Heiko Carstens --- arch/s390/include/asm/bug.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index acb4b13d98c5..063ada55fbd1 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -4,6 +4,8 @@ #include +#ifdef CONFIG_BUG + #ifndef CONFIG_DEBUG_BUGVERBOSE #define _BUGVERBOSE_LOCATION(file, line) #else @@ -52,6 +54,8 @@ do { \ #define HAVE_ARCH_BUG +#endif /* CONFIG_BUG */ + #include #endif /* _ASM_S390_BUG_H */ -- cgit v1.2.3 From 70075e3d0ca0b72cc983d03f7cd9796e43492980 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 8 Dec 2025 13:40:56 +0100 Subject: s390/bug: Add missing alignment All objects are supposed to have a minimal alignment of two, since a couple of instructions only work with even addresses. Add the missing align statement for the file string. Fixes: 6584ff203aec ("bugs/s390: Use 'cond_str' in __EMIT_BUG()") Signed-off-by: Heiko Carstens --- arch/s390/include/asm/bug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index 063ada55fbd1..ee9221bb5d18 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -11,6 +11,7 @@ #else #define __BUGVERBOSE_LOCATION(file, line) \ .pushsection .rodata.str, "aMS", @progbits, 1; \ + .align 2; \ 10002: .ascii file "\0"; \ .popsection; \ \ -- cgit v1.2.3