summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-05-20 10:23:39 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-05-20 10:23:39 -0700
commitdaa121128a2d2ac6006159e2c47676e4fcd21eab (patch)
tree92f5ebb4ebc9be3535c5c3905ba40ab68cbdf964
parent6e51b4b5bbc07e52b226017936874715629932d1 (diff)
parenta6016aac5252da9d22a4dc0b98121b0acdf6d2f5 (diff)
downloadlinux-daa121128a2d2ac6006159e2c47676e4fcd21eab.tar.gz
linux-daa121128a2d2ac6006159e2c47676e4fcd21eab.tar.bz2
linux-daa121128a2d2ac6006159e2c47676e4fcd21eab.zip
Merge tag 'dma-mapping-6.10-2024-05-20' of git://git.infradead.org/users/hch/dma-mapping
Pull dma-mapping updates from Christoph Hellwig: - optimize DMA sync calls when they are no-ops (Alexander Lobakin) - fix swiotlb padding for untrusted devices (Michael Kelley) - add documentation for swiotlb (Michael Kelley) * tag 'dma-mapping-6.10-2024-05-20' of git://git.infradead.org/users/hch/dma-mapping: dma: fix DMA sync for drivers not calling dma_set_mask*() xsk: use generic DMA sync shortcut instead of a custom one page_pool: check for DMA sync shortcut earlier page_pool: don't use driver-set flags field directly page_pool: make sure frag API fields don't span between cachelines iommu/dma: avoid expensive indirect calls for sync operations dma: avoid redundant calls for sync operations dma: compile-out DMA sync op calls when not used iommu/dma: fix zeroing of bounce buffer padding used by untrusted devices swiotlb: remove alloc_size argument to swiotlb_tbl_map_single() Documentation/core-api: add swiotlb documentation
-rw-r--r--Documentation/core-api/index.rst1
-rw-r--r--Documentation/core-api/swiotlb.rst321
-rw-r--r--drivers/iommu/dma-iommu.c34
-rw-r--r--drivers/net/ethernet/engleder/tsnep_main.c2
-rw-r--r--drivers/net/ethernet/freescale/dpaa2/dpaa2-xsk.c2
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_xsk.c2
-rw-r--r--drivers/net/ethernet/intel/ice/ice_xsk.c2
-rw-r--r--drivers/net/ethernet/intel/igc/igc_main.c2
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c4
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_rx.c2
-rw-r--r--drivers/net/ethernet/netronome/nfp/nfd3/xsk.c2
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_main.c2
-rw-r--r--drivers/xen/swiotlb-xen.c2
-rw-r--r--include/linux/device.h4
-rw-r--r--include/linux/dma-map-ops.h12
-rw-r--r--include/linux/dma-mapping.h105
-rw-r--r--include/linux/iova.h5
-rw-r--r--include/linux/swiotlb.h2
-rw-r--r--include/net/page_pool/types.h25
-rw-r--r--include/net/xdp_sock_drv.h7
-rw-r--r--include/net/xsk_buff_pool.h14
-rw-r--r--kernel/dma/Kconfig5
-rw-r--r--kernel/dma/mapping.c69
-rw-r--r--kernel/dma/swiotlb.c62
-rw-r--r--net/core/page_pool.c78
-rw-r--r--net/xdp/xsk_buff_pool.c29
27 files changed, 634 insertions, 163 deletions
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index 7a3a08d81f11..89c517665763 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -102,6 +102,7 @@ more memory-management documentation in Documentation/mm/index.rst.
dma-api-howto
dma-attributes
dma-isa-lpc
+ swiotlb
mm-api
genalloc
pin_user_pages
diff --git a/Documentation/core-api/swiotlb.rst b/Documentation/core-api/swiotlb.rst
new file mode 100644
index 000000000000..5ad2c9ca85bc
--- /dev/null
+++ b/Documentation/core-api/swiotlb.rst
@@ -0,0 +1,321 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+DMA and swiotlb
+===============
+
+swiotlb is a memory buffer allocator used by the Linux kernel DMA layer. It is
+typically used when a device doing DMA can't directly access the target memory
+buffer because of hardware limitations or other requirements. In such a case,
+the DMA layer calls swiotlb to allocate a temporary memory buffer that conforms
+to the limitations. The DMA is done to/from this temporary memory buffer, and
+the CPU copies the data between the temporary buffer and the original target
+memory buffer. This approach is generically called "bounce buffering", and the
+temporary memory buffer is called a "bounce buffer".
+
+Device drivers don't interact directly with swiotlb. Instead, drivers inform
+the DMA layer of the DMA attributes of the devices they are managing, and use
+the normal DMA map, unmap, and sync APIs when programming a device to do DMA.
+These APIs use the device DMA attributes and kernel-wide settings to determine
+if bounce buffering is necessary. If so, the DMA layer manages the allocation,
+freeing, and sync'ing of bounce buffers. Since the DMA attributes are per
+device, some devices in a system may use bounce buffering while others do not.
+
+Because the CPU copies data between the bounce buffer and the original target
+memory buffer, doing bounce buffering is slower than doing DMA directly to the
+original memory buffer, and it consumes more CPU resources. So it is used only
+when necessary for providing DMA functionality.
+
+Usage Scenarios
+---------------
+swiotlb was originally created to handle DMA for devices with addressing
+limitations. As physical memory sizes grew beyond 4 GiB, some devices could
+only provide 32-bit DMA addresses. By allocating bounce buffer memory below
+the 4 GiB line, these devices with addressing limitations could still work and
+do DMA.
+
+More recently, Confidential Computing (CoCo) VMs have the guest VM's memory
+encrypted by default, and the memory is not accessible by the host hypervisor
+and VMM. For the host to do I/O on behalf of the guest, the I/O must be
+directed to guest memory that is unencrypted. CoCo VMs set a kernel-wide option
+to force all DMA I/O to use bounce buffers, and the bounce buffer memory is set
+up as unencrypted. The host does DMA I/O to/from the bounce buffer memory, and
+the Linux kernel DMA layer does "sync" operations to cause the CPU to copy the
+data to/from the original target memory buffer. The CPU copying bridges between
+the unencrypted and the encrypted memory. This use of bounce buffers allows
+device drivers to "just work" in a CoCo VM, with no modifications
+needed to handle the memory encryption complexity.
+
+Other edge case scenarios arise for bounce buffers. For example, when IOMMU
+mappings are set up for a DMA operation to/from a device that is considered
+"untrusted", the device should be given access only to the memory containing
+the data being transferred. But if that memory occupies only part of an IOMMU
+granule, other parts of the granule may contain unrelated kernel data. Since
+IOMMU access control is per-granule, the untrusted device can gain access to
+the unrelated kernel data. This problem is solved by bounce buffering the DMA
+operation and ensuring that unused portions of the bounce buffers do not
+contain any unrelated kernel data.
+
+Core Functionality
+------------------
+The primary swiotlb APIs are swiotlb_tbl_map_single() and
+swiotlb_tbl_unmap_single(). The "map" API allocates a bounce buffer of a
+specified size in bytes and returns the physical address of the buffer. The
+buffer memory is physically contiguous. The expectation is that the DMA layer
+maps the physical memory address to a DMA address, and returns the DMA address
+to the driver for programming into the device. If a DMA operation specifies
+multiple memory buffer segments, a separate bounce buffer must be allocated for
+each segment. swiotlb_tbl_map_single() always does a "sync" operation (i.e., a
+CPU copy) to initialize the bounce buffer to match the contents of the original
+buffer.
+
+swiotlb_tbl_unmap_single() does the reverse. If the DMA operation might have
+updated the bounce buffer memory and DMA_ATTR_SKIP_CPU_SYNC is not set, the
+unmap does a "sync" operation to cause a CPU copy of the data from the bounce
+buffer back to the original buffer. Then the bounce buffer memory is freed.
+
+swiotlb also provides "sync" APIs that correspond to the dma_sync_*() APIs that
+a driver may use when control of a buffer transitions between the CPU and the
+device. The swiotlb "sync" APIs cause a CPU copy of the data between the
+original buffer and the bounce buffer. Like the dma_sync_*() APIs, the swiotlb
+"sync" APIs support doing a partial sync, where only a subset of the bounce
+buffer is copied to/from the original buffer.
+
+Core Functionality Constraints
+------------------------------
+The swiotlb map/unmap/sync APIs must operate without blocking, as they are
+called by the corresponding DMA APIs which may run in contexts that cannot
+block. Hence the default memory pool for swiotlb allocations must be
+pre-allocated at boot time (but see Dynamic swiotlb below). Because swiotlb
+allocations must be physically contiguous, the entire default memory pool is
+allocated as a single contiguous block.
+
+The need to pre-allocate the default swiotlb pool creates a boot-time tradeoff.
+The pool should be large enough to ensure that bounce buffer requests can
+always be satisfied, as the non-blocking requirement means requests can't wait
+for space to become available. But a large pool potentially wastes memory, as
+this pre-allocated memory is not available for other uses in the system. The
+tradeoff is particularly acute in CoCo VMs that use bounce buffers for all DMA
+I/O. These VMs use a heuristic to set the default pool size to ~6% of memory,
+with a max of 1 GiB, which has the potential to be very wasteful of memory.
+Conversely, the heuristic might produce a size that is insufficient, depending
+on the I/O patterns of the workload in the VM. The dynamic swiotlb feature
+described below can help, but has limitations. Better management of the swiotlb
+default memory pool size remains an open issue.
+
+A single allocation from swiotlb is limited to IO_TLB_SIZE * IO_TLB_SEGSIZE
+bytes, which is 256 KiB with current definitions. When a device's DMA settings
+are such that the device might use swiotlb, the maximum size of a DMA segment
+must be limited to that 256 KiB. This value is communicated to higher-level
+kernel code via dma_max_mapping_size() and swiotlb_max_mapping_size(). If the
+higher-level code fails to account for this limit, it may make requests that
+are too large for swiotlb, and get a "swiotlb full" error.
+
+A key device DMA setting is "min_align_mask", which is a power of 2 minus 1
+so that some number of low order bits are set, or it may be zero. swiotlb
+allocations ensure these min_align_mask bits of the physical address of the
+bounce buffer match the same bits in the address of the original buffer. When
+min_align_mask is non-zero, it may produce an "alignment offset" in the address
+of the bounce buffer that slightly reduces the maximum size of an allocation.
+This potential alignment offset is reflected in the value returned by
+swiotlb_max_mapping_size(), which can show up in places like
+/sys/block/<device>/queue/max_sectors_kb. For example, if a device does not use
+swiotlb, max_sectors_kb might be 512 KiB or larger. If a device might use
+swiotlb, max_sectors_kb will be 256 KiB. When min_align_mask is non-zero,
+max_sectors_kb might be even smaller, such as 252 KiB.
+
+swiotlb_tbl_map_single() also takes an "alloc_align_mask" parameter. This
+parameter specifies that the allocation of bounce buffer space must start at a
+physical address with the alloc_align_mask bits set to zero. But the actual
+bounce buffer might start at a larger address if min_align_mask is non-zero.
+Hence there may be pre-padding space that is allocated prior to the start of
+the bounce buffer. Similarly, the end of the bounce buffer is rounded up to an
+alloc_align_mask boundary, potentially resulting in post-padding space. Any
+pre-padding or post-padding space is not initialized by swiotlb code. The
+"alloc_align_mask" parameter is used by IOMMU code when mapping for untrusted
+devices. It is set to the granule size - 1 so that the bounce buffer is
+allocated entirely from granules that are not used for any other purpose.
+
+Data structures concepts
+------------------------
+Memory used for swiotlb bounce buffers is allocated from overall system memory
+as one or more "pools". The default pool is allocated during system boot with a
+default size of 64 MiB. The default pool size may be modified with the
+"swiotlb=" kernel boot line parameter. The default size may also be adjusted
+due to other conditions, such as running in a CoCo VM, as described above. If
+CONFIG_SWIOTLB_DYNAMIC is enabled, additional pools may be allocated later in
+the life of the system. Each pool must be a contiguous range of physical
+memory. The default pool is allocated below the 4 GiB physical address line so
+it works for devices that can only address 32 bits of physical memory (unless
+architecture-specific code provides the SWIOTLB_ANY flag). In a CoCo VM, the
+pool memory must be decrypted before swiotlb is used.
+
+Each pool is divided into "slots" of size IO_TLB_SIZE, which is 2 KiB with
+current definitions. IO_TLB_SEGSIZE contiguous slots (128 slots) constitute
+what might be called a "slot set". When a bounce buffer is allocated, it
+occupies one or more contiguous slots. A slot is never shared by multiple
+bounce buffers. Furthermore, a bounce buffer must be allocated from a single
+slot set, which leads to the maximum bounce buffer size being IO_TLB_SIZE *
+IO_TLB_SEGSIZE. Multiple smaller bounce buffers may co-exist in a single slot
+set if the alignment and size constraints can be met.
+
+Slots are also grouped into "areas", with the constraint that a slot set exists
+entirely in a single area. Each area has its own spin lock that must be held to
+manipulate the slots in that area. The division into areas avoids contending
+for a single global spin lock when swiotlb is heavily used, such as in a CoCo
+VM. The number of areas defaults to the number of CPUs in the system for
+maximum parallelism, but since an area can't be smaller than IO_TLB_SEGSIZE
+slots, it might be necessary to assign multiple CPUs to the same area. The
+number of areas can also be set via the "swiotlb=" kernel boot parameter.
+
+When allocating a bounce buffer, if the area associated with the calling CPU
+does not have enough free space, areas associated with other CPUs are tried
+sequentially. For each area tried, the area's spin lock must be obtained before
+trying an allocation, so contention may occur if swiotlb is relatively busy
+overall. But an allocation request fails only if no area has enough free
+space.
+
+IO_TLB_SIZE, IO_TLB_SEGSIZE, and the number of areas must all be powers of 2 as
+the code uses shifting and bit masking to do many of the calculations. The
+number of areas is rounded up to a power of 2 if necessary to meet this
+requirement.
+
+The default pool is allocated with PAGE_SIZE alignment. If an alloc_align_mask
+argument to swiotlb_tbl_map_single() specifies a larger alignment, one or more
+initial slots in each slot set might not meet the alloc_align_mask criterion.
+Because a bounce buffer allocation can't cross a slot set boundary, eliminating
+those initial slots effectively reduces the max size of a bounce buffer.
+Currently, there's no problem because alloc_align_mask is set based on IOMMU
+granule size, and granules cannot be larger than PAGE_SIZE. But if that were to
+change in the future, the initial pool allocation might need to be done with
+alignment larger than PAGE_SIZE.
+
+Dynamic swiotlb
+---------------
+When CONFIG_SWIOTLB_DYNAMIC is enabled, swiotlb can do on-demand expansion of
+the amount of memory available for allocation as bounce buffers. If a bounce
+buffer request fails due to lack of available space, an asynchronous background
+task is kicked off to allocate memory from general system memory and turn it
+into an swiotlb pool. Creating an additional pool must be done asynchronously
+because the memory allocation may block, and as noted above, swiotlb requests
+are not allowed to block. Once the background task is kicked off, the bounce
+buffer request creates a "transient pool" to avoid returning an "swiotlb full"
+error. A transient pool has the size of the bounce buffer request, and is
+deleted when the bounce buffer is freed. Memory for this transient pool comes
+from the general system memory atomic pool so that creation does not block.
+Creating a transient pool has relatively high cost, particularly in a CoCo VM
+where the memory must be decrypted, so it is done only as a stopgap until the
+background task can add another non-transient pool.
+
+Adding a dynamic pool has limitations. Like with the default pool, the memory
+must be physically contiguous, so the size is limited to MAX_PAGE_ORDER pages
+(e.g., 4 MiB on a typical x86 system). Due to memory fragmentation, a max size
+allocation may not be available. The dynamic pool allocator tries smaller sizes
+until it succeeds, but with a minimum size of 1 MiB. Given sufficient system
+memory fragmentation, dynamically adding a pool might not succeed at all.
+
+The number of areas in a dynamic pool may be different from the number of areas
+in the default pool. Because the new pool size is typically a few MiB at most,
+the number of areas will likely be smaller. For example, with a new pool size
+of 4 MiB and the 256 KiB minimum area size, only 16 areas can be created. If
+the system has more than 16 CPUs, multiple CPUs must share an area, creating
+more lock contention.
+
+New pools added via dynamic swiotlb are linked together in a linear list.
+swiotlb code frequently must search for the pool containing a particular
+swiotlb physical address, so that search is linear and not performant with a
+large number of dynamic pools. The data structures could be improved for
+faster searches.
+
+Overall, dynamic swiotlb works best for small configurations with relatively
+few CPUs. It allows the default swiotlb pool to be smaller so that memory is
+not wasted, with dynamic pools making more space available if needed (as long
+as fragmentation isn't an obstacle). It is less useful for large CoCo VMs.
+
+Data Structure Details
+----------------------
+swiotlb is managed with four primary data structures: io_tlb_mem, io_tlb_pool,
+io_tlb_area, and io_tlb_slot. io_tlb_mem describes a swiotlb memory allocator,
+which includes the default memory pool and any dynamic or transient pools
+linked to it. Limited statistics on swiotlb usage are kept per memory allocator
+and are stored in this data structure. These statistics are available under
+/sys/kernel/debug/swiotlb when CONFIG_DEBUG_FS is set.
+
+io_tlb_pool describes a memory pool, either the default pool, a dynamic pool,
+or a transient pool. The description includes the start and end addresses of
+the memory in the pool, a pointer to an array of io_tlb_area structures, and a
+pointer to an array of io_tlb_slot structures that are associated with the pool.
+
+io_tlb_area describes an area. The primary field is the spin lock used to
+serialize access to slots in the area. The io_tlb_area array for a pool has an
+entry for each area, and is accessed using a 0-based area index derived from the
+calling processor ID. Areas exist solely to allow parallel access to swiotlb
+from multiple CPUs.
+
+io_tlb_slot describes an individual memory slot in the pool, with size
+IO_TLB_SIZE (2 KiB currently). The io_tlb_slot array is indexed by the slot
+index computed from the bounce buffer address relative to the starting memory
+address of the pool. The size of struct io_tlb_slot is 24 bytes, so the
+overhead is about 1% of the slot size.
+
+The io_tlb_slot array is designed to meet several requirements. First, the DMA
+APIs and the corresponding swiotlb APIs use the bounce buffer address as the
+identifier for a bounce buffer. This address is returned by
+swiotlb_tbl_map_single(), and then passed as an argument to
+swiotlb_tbl_unmap_single() and the swiotlb_sync_*() functions. The original
+memory buffer address obviously must be passed as an argument to
+swiotlb_tbl_map_single(), but it is not passed to the other APIs. Consequently,
+swiotlb data structures must save the original memory buffer address so that it
+can be used when doing sync operations. This original address is saved in the
+io_tlb_slot array.
+
+Second, the io_tlb_slot array must handle partial sync requests. In such cases,
+the argument to swiotlb_sync_*() is not the address of the start of the bounce
+buffer but an address somewhere in the middle of the bounce buffer, and the
+address of the start of the bounce buffer isn't known to swiotlb code. But
+swiotlb code must be able to calculate the corresponding original memory buffer
+address to do the CPU copy dictated by the "sync". So an adjusted original
+memory buffer address is populated into the struct io_tlb_slot for each slot
+occupied by the bounce buffer. An adjusted "alloc_size" of the bounce buffer is
+also recorded in each struct io_tlb_slot so a sanity check can be performed on
+the size of the "sync" operation. The "alloc_size" field is not used except for
+the sanity check.
+
+Third, the io_tlb_slot array is used to track available slots. The "list" field
+in struct io_tlb_slot records how many contiguous available slots exist starting
+at that slot. A "0" indicates that the slot is occupied. A value of "1"
+indicates only the current slot is available. A value of "2" indicates the
+current slot and the next slot are available, etc. The maximum value is
+IO_TLB_SEGSIZE, which can appear in the first slot in a slot set, and indicates
+that the entire slot set is available. These values are used when searching for
+available slots to use for a new bounce buffer. They are updated when allocating
+a new bounce buffer and when freeing a bounce buffer. At pool creation time, the
+"list" field is initialized to IO_TLB_SEGSIZE down to 1 for the slots in every
+slot set.
+
+Fourth, the io_tlb_slot array keeps track of any "padding slots" allocated to
+meet alloc_align_mask requirements described above. When
+swiotlb_tbl_map_single() allocates bounce buffer space to meet alloc_align_mask
+requirements, it may allocate pre-padding space across zero or more slots. But
+when swiotlb_tbl_unmap_single() is called with the bounce buffer address, the
+alloc_align_mask value that governed the allocation, and therefore the
+allocation of any padding slots, is not known. The "pad_slots" field records
+the number of padding slots so that swiotlb_tbl_unmap_single() can free them.
+The "pad_slots" value is recorded only in the first non-padding slot allocated
+to the bounce buffer.
+
+Restricted pools
+----------------
+The swiotlb machinery is also used for "restricted pools", which are pools of
+memory separate from the default swiotlb pool, and that are dedicated for DMA
+use by a particular device. Restricted pools provide a level of DMA memory
+protection on systems with limited hardware protection capabilities, such as
+those lacking an IOMMU. Such usage is specified by DeviceTree entries and
+requires that CONFIG_DMA_RESTRICTED_POOL is set. Each restricted pool is based
+on its own io_tlb_mem data structure that is independent of the main swiotlb
+io_tlb_mem.
+
+Restricted pools add swiotlb_alloc() and swiotlb_free() APIs, which are called
+from the dma_alloc_*() and dma_free_*() APIs. The swiotlb_alloc/free() APIs
+allocate/free slots from/to the restricted pool directly and do not go through
+swiotlb_tbl_map/unmap_single().
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index eca1afa36508..f731e4b2a417 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1152,9 +1152,6 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
*/
if (dev_use_swiotlb(dev, size, dir) &&
iova_offset(iovad, phys | size)) {
- void *padding_start;
- size_t padding_size, aligned_size;
-
if (!is_swiotlb_active(dev)) {
dev_warn_once(dev, "DMA bounce buffers are inactive, unable to map unaligned transaction.\n");
return DMA_MAPPING_ERROR;
@@ -1162,24 +1159,30 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
trace_swiotlb_bounced(dev, phys, size);
- aligned_size = iova_align(iovad, size);
- phys = swiotlb_tbl_map_single(dev, phys, size, aligned_size,
+ phys = swiotlb_tbl_map_single(dev, phys, size,
iova_mask(iovad), dir, attrs);
if (phys == DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
- /* Cleanup the padding area. */
- padding_start = phys_to_virt(phys);
- padding_size = aligned_size;
+ /*
+ * Untrusted devices should not see padding areas with random
+ * leftover kernel data, so zero the pre- and post-padding.
+ * swiotlb_tbl_map_single() has initialized the bounce buffer
+ * proper to the contents of the original memory buffer.
+ */
+ if (dev_is_untrusted(dev)) {
+ size_t start, virt = (size_t)phys_to_virt(phys);
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
- (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) {
- padding_start += size;
- padding_size -= size;
- }
+ /* Pre-padding */
+ start = iova_align_down(iovad, virt);
+ memset((void *)start, 0, virt - start);
- memset(padding_start, 0, padding_size);
+ /* Post-padding */
+ start = virt + size;
+ memset((void *)start, 0,
+ iova_align(iovad, start) - start);
+ }
}
if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
@@ -1718,7 +1721,8 @@ static size_t iommu_dma_max_mapping_size(struct device *dev)
}
static const struct dma_map_ops iommu_dma_ops = {
- .flags = DMA_F_PCI_P2PDMA_SUPPORTED,
+ .flags = DMA_F_PCI_P2PDMA_SUPPORTED |
+ DMA_F_CAN_SKIP_SYNC,
.alloc = iommu_dma_alloc,
.free = iommu_dma_free,
.alloc_pages_op = dma_common_alloc_pages,
diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
index 4b15af6b7122..44da335d66bd 100644
--- a/drivers/net/ethernet/engleder/tsnep_main.c
+++ b/drivers/net/ethernet/engleder/tsnep_main.c
@@ -1587,7 +1587,7 @@ static int tsnep_rx_poll_zc(struct tsnep_rx *rx, struct napi_struct *napi,
length = __le32_to_cpu(entry->desc_wb->properties) &
TSNEP_DESC_LENGTH_MASK;
xsk_buff_set_size(entry->xdp, length - ETH_FCS_LEN);
- xsk_buff_dma_sync_for_cpu(entry->xdp, rx->xsk_pool);
+ xsk_buff_dma_sync_for_cpu(entry->xdp);
/* RX metadata with timestamps is in front of actual data,
* subtract metadata size to get length of actual data and
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-xsk.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-xsk.c
index 051748b997f3..a466c2379146 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-xsk.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-xsk.c
@@ -55,7 +55,7 @@ static u32 dpaa2_xsk_run_xdp(struct dpaa2_eth_priv *priv,
xdp_set_data_meta_invalid(xdp_buff);
xdp_buff->rxq = &ch->xdp_rxq;
- xsk_buff_dma_sync_for_cpu(xdp_buff, ch->xsk_pool);
+ xsk_buff_dma_sync_for_cpu(xdp_buff);
xdp_act = bpf_prog_run_xdp(xdp_prog, xdp_buff);
/* xdp.data pointer may have changed */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index a85b425794df..4e885df789ef 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -482,7 +482,7 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
bi = *i40e_rx_bi(rx_ring, next_to_process);
xsk_buff_set_size(bi, size);
- xsk_buff_dma_sync_for_cpu(bi, rx_ring->xsk_pool);
+ xsk_buff_dma_sync_for_cpu(bi);
if (!first)
first = bi;
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index aa81d1162b81..7541f223bf4f 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -878,7 +878,7 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
ICE_RX_FLX_DESC_PKT_LEN_M;
xsk_buff_set_size(xdp, size);
- xsk_buff_dma_sync_for_cpu(xdp, xsk_pool);
+ xsk_buff_dma_sync_for_cpu(xdp);
if (!first) {
first = xdp;
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index b5bcabab7a1d..12f004f46082 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -2812,7 +2812,7 @@ static int igc_clean_rx_irq_zc(struct igc_q_vector *q_vector, const int budget)
}
bi->xdp->data_end = bi->xdp->data + size;
- xsk_buff_dma_sync_for_cpu(bi->xdp, ring->xsk_pool);
+ xsk_buff_dma_sync_for_cpu(bi->xdp);
res = __igc_xdp_run_prog(adapter, prog, bi->xdp);
switch (res) {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index 397cb773fabb..3e3b471e53f0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -303,7 +303,7 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
}
bi->xdp->data_end = bi->xdp->data + size;
- xsk_buff_dma_sync_for_cpu(bi->xdp, rx_ring->xsk_pool);
+ xsk_buff_dma_sync_for_cpu(bi->xdp);
xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, bi->xdp);
if (likely(xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR))) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index b8dd74453655..1b7132fa70de 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -270,7 +270,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
/* mxbuf->rq is set on allocation, but cqe is per-packet so set it here */
mxbuf->cqe = cqe;
xsk_buff_set_size(&mxbuf->xdp, cqe_bcnt);
- xsk_buff_dma_sync_for_cpu(&mxbuf->xdp, rq->xsk_pool);
+ xsk_buff_dma_sync_for_cpu(&mxbuf->xdp);
net_prefetch(mxbuf->xdp.data);
/* Possible flows:
@@ -319,7 +319,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
/* mxbuf->rq is set on allocation, but cqe is per-packet so set it here */
mxbuf->cqe = cqe;
xsk_buff_set_size(&mxbuf->xdp, cqe_bcnt);
- xsk_buff_dma_sync_for_cpu(&mxbuf->xdp, rq->xsk_pool);
+ xsk_buff_dma_sync_for_cpu(&mxbuf->xdp);
net_prefetch(mxbuf->xdp.data);
prog = rcu_dereference(rq->xdp_prog);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index d601b5faaed5..b5333da20e8a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -917,7 +917,7 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
if (!rq->xsk_pool) {
count = mlx5e_refill_rx_wqes(rq, head, wqe_bulk);
- } else if (likely(!rq->xsk_pool->dma_need_sync)) {
+ } else if (likely(!dma_dev_need_sync(rq->pdev))) {
mlx5e_xsk_free_rx_wqes(rq, head, wqe_bulk);
count = mlx5e_xsk_alloc_rx_wqes_batched(rq, head, wqe_bulk);
} else {
diff --git a/drivers/net/ethernet/netronome/nfp/nfd3/xsk.c b/drivers/net/ethernet/netronome/nfp/nfd3/xsk.c
index 45be6954d5aa..01cfa9cc1b5e 100644
--- a/drivers/net/ethernet/netronome/nfp/nfd3/xsk.c
+++ b/drivers/net/ethernet/netronome/nfp/nfd3/xsk.c
@@ -184,7 +184,7 @@ nfp_nfd3_xsk_rx(struct nfp_net_rx_ring *rx_ring, int budget,
xrxbuf->xdp->data += meta_len;
xrxbuf->xdp->data_end = xrxbuf->xdp->data + pkt_len;
xdp_set_data_meta_invalid(xrxbuf->xdp);
- xsk_buff_dma_sync_for_cpu(xrxbuf->xdp, r_vec->xsk_pool);
+ xsk_buff_dma_sync_for_cpu(xrxbuf->xdp);
net_prefetch(xrxbuf->xdp->data);
if (meta_len) {
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 2e9a2da605f6..b3afc7cb7d72 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -5361,7 +5361,7 @@ read_again:
/* RX buffer is good and fit into a XSK pool buffer */
buf->xdp->data_end = buf->xdp->data + buf1_len;
- xsk_buff_dma_sync_for_cpu(buf->xdp, rx_q->xsk_pool);
+ xsk_buff_dma_sync_for_cpu(buf->xdp);
prog = READ_ONCE(priv->xdp_prog);
res = __stmmac_xdp_run_prog(priv, prog, buf->xdp);
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 1c4ef5111651..6579ae3f6dac 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -216,7 +216,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
*/
trace_swiotlb_bounced(dev, dev_addr, size);
- map = swiotlb_tbl_map_single(dev, phys, size, size, 0, dir, attrs);
+ map = swiotlb_tbl_map_single(dev, phys, size, 0, dir, attrs);
if (map == (phys_addr_t)DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
diff --git a/include/linux/device.h b/include/linux/device.h
index b9f5464f44ed..d4b50accff26 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -691,6 +691,7 @@ struct device_physical_location {
* and optionall (if the coherent mask is large enough) also
* for dma allocations. This flag is managed by the dma ops
* instance from ->dma_supported.
+ * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers.
*
* At the lowest level, every device in a Linux system is represented by an
* instance of struct device. The device structure contains the information
@@ -803,6 +804,9 @@ struct device {
#ifdef CONFIG_DMA_OPS_BYPASS
bool dma_ops_bypass : 1;
#endif
+#ifdef CONFIG_DMA_NEED_SYNC