diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-25 11:52:57 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-25 11:52:57 -0800 |
| commit | cac85e4616b1cf4a90844b952b49b9cbc4562530 (patch) | |
| tree | 1d3d5f82188310497ced1a0ade2ab3b7a4346201 /drivers | |
| parent | 84cc6674b76ba2cdac0df8037b4d8a22a6fc1b77 (diff) | |
| parent | d649c34cb916b015fdcb487e51409fcc5caeca8d (diff) | |
| download | linux-cac85e4616b1cf4a90844b952b49b9cbc4562530.tar.gz linux-cac85e4616b1cf4a90844b952b49b9cbc4562530.tar.bz2 linux-cac85e4616b1cf4a90844b952b49b9cbc4562530.zip | |
Merge tag 'vfio-v6.3-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson:
- Remove redundant resource check in vfio-platform (Angus Chen)
- Use GFP_KERNEL_ACCOUNT for persistent userspace allocations, allowing
removal of arbitrary kernel limits in favor of cgroup control (Yishai
Hadas)
- mdev tidy-ups, including removing the module-only build restriction
for sample drivers, Kconfig changes to select mdev support,
documentation movement to keep sample driver usage instructions with
sample drivers rather than with API docs, remove references to
out-of-tree drivers in docs (Christoph Hellwig)
- Fix collateral breakages from mdev Kconfig changes (Arnd Bergmann)
- Make mlx5 migration support match device support, improve source and
target flows to improve pre-copy support and reduce downtime (Yishai
Hadas)
- Convert additional mdev sysfs case to use sysfs_emit() (Bo Liu)
- Resolve copy-paste error in mdev mbochs sample driver Kconfig (Ye
Xingchen)
- Avoid propagating missing reset error in vfio-platform if reset
requirement is relaxed by module option (Tomasz Duszynski)
- Range size fixes in mlx5 variant driver for missed last byte and
stricter range calculation (Yishai Hadas)
- Fixes to suspended vaddr support and locked_vm accounting, excluding
mdev configurations from the former due to potential to indefinitely
block kernel threads, fix underflow and restore locked_vm on new mm
(Steve Sistare)
- Update outdated vfio documentation due to new IOMMUFD interfaces in
recent kernels (Yi Liu)
- Resolve deadlock between group_lock and kvm_lock, finally (Matthew
Rosato)
- Fix NULL pointer in group initialization error path with IOMMUFD (Yan
Zhao)
* tag 'vfio-v6.3-rc1' of https://github.com/awilliam/linux-vfio: (32 commits)
vfio: Fix NULL pointer dereference caused by uninitialized group->iommufd
docs: vfio: Update vfio.rst per latest interfaces
vfio: Update the kdoc for vfio_device_ops
vfio/mlx5: Fix range size calculation upon tracker creation
vfio: no need to pass kvm pointer during device open
vfio: fix deadlock between group lock and kvm lock
vfio: revert "iommu driver notify callback"
vfio/type1: revert "implement notify callback"
vfio/type1: revert "block on invalid vaddr"
vfio/type1: restore locked_vm
vfio/type1: track locked_vm per dma
vfio/type1: prevent underflow of locked_vm via exec()
vfio/type1: exclude mdevs from VFIO_UPDATE_VADDR
vfio: platform: ignore missing reset if disabled at module init
vfio/mlx5: Improve the target side flow to reduce downtime
vfio/mlx5: Improve the source side flow upon pre_copy
vfio/mlx5: Check whether VF is migratable
samples: fix the prompt about SAMPLE_VFIO_MDEV_MBOCHS
vfio/mdev: Use sysfs_emit() to instead of sprintf()
vfio-mdev: add back CONFIG_VFIO dependency
...
Diffstat (limited to 'drivers')
| -rw-r--r-- | drivers/gpu/drm/i915/Kconfig | 3 | ||||
| -rw-r--r-- | drivers/vfio/container.c | 7 | ||||
| -rw-r--r-- | drivers/vfio/fsl-mc/vfio_fsl_mc.c | 2 | ||||
| -rw-r--r-- | drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c | 4 | ||||
| -rw-r--r-- | drivers/vfio/group.c | 46 | ||||
| -rw-r--r-- | drivers/vfio/mdev/Kconfig | 8 | ||||
| -rw-r--r-- | drivers/vfio/mdev/mdev_sysfs.c | 2 | ||||
| -rw-r--r-- | drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 4 | ||||
| -rw-r--r-- | drivers/vfio/pci/mlx5/cmd.c | 79 | ||||
| -rw-r--r-- | drivers/vfio/pci/mlx5/cmd.h | 28 | ||||
| -rw-r--r-- | drivers/vfio/pci/mlx5/main.c | 261 | ||||
| -rw-r--r-- | drivers/vfio/pci/vfio_pci_config.c | 6 | ||||
| -rw-r--r-- | drivers/vfio/pci/vfio_pci_core.c | 7 | ||||
| -rw-r--r-- | drivers/vfio/pci/vfio_pci_igd.c | 2 | ||||
| -rw-r--r-- | drivers/vfio/pci/vfio_pci_intrs.c | 10 | ||||
| -rw-r--r-- | drivers/vfio/pci/vfio_pci_rdwr.c | 2 | ||||
| -rw-r--r-- | drivers/vfio/platform/vfio_platform_common.c | 12 | ||||
| -rw-r--r-- | drivers/vfio/platform/vfio_platform_irq.c | 8 | ||||
| -rw-r--r-- | drivers/vfio/vfio.h | 25 | ||||
| -rw-r--r-- | drivers/vfio/vfio_iommu_type1.c | 248 | ||||
| -rw-r--r-- | drivers/vfio/vfio_main.c | 70 | ||||
| -rw-r--r-- | drivers/vfio/virqfd.c | 2 |
22 files changed, 564 insertions, 272 deletions
diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig index 9c0990c0ec87..3d1cd04ac5fa 100644 --- a/drivers/gpu/drm/i915/Kconfig +++ b/drivers/gpu/drm/i915/Kconfig @@ -127,9 +127,10 @@ config DRM_I915_GVT_KVMGT depends on X86 depends on 64BIT depends on KVM - depends on VFIO_MDEV + depends on VFIO select DRM_I915_GVT select KVM_EXTERNAL_WRITE_TRACKING + select VFIO_MDEV help Choose this option if you want to enable Intel GVT-g graphics diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c index 89f10becf962..d53d08f16973 100644 --- a/drivers/vfio/container.c +++ b/drivers/vfio/container.c @@ -360,7 +360,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) { struct vfio_container *container; - container = kzalloc(sizeof(*container), GFP_KERNEL); + container = kzalloc(sizeof(*container), GFP_KERNEL_ACCOUNT); if (!container) return -ENOMEM; @@ -376,11 +376,6 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) static int vfio_fops_release(struct inode *inode, struct file *filep) { struct vfio_container *container = filep->private_data; - struct vfio_iommu_driver *driver = container->iommu_driver; - - if (driver && driver->ops->notify) - driver->ops->notify(container->iommu_data, - VFIO_IOMMU_CONTAINER_CLOSE); filep->private_data = NULL; diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index defeb8510ace..c89a047a4cd8 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -28,7 +28,7 @@ static int vfio_fsl_mc_open_device(struct vfio_device *core_vdev) int i; vdev->regions = kcalloc(count, sizeof(struct vfio_fsl_mc_region), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!vdev->regions) return -ENOMEM; diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c index 64d01f3fb13d..c51229fccbd6 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c @@ -29,7 +29,7 @@ static int vfio_fsl_mc_irqs_allocate(struct vfio_fsl_mc_device *vdev) irq_count = mc_dev->obj_desc.irq_count; - mc_irq = kcalloc(irq_count, sizeof(*mc_irq), GFP_KERNEL); + mc_irq = kcalloc(irq_count, sizeof(*mc_irq), GFP_KERNEL_ACCOUNT); if (!mc_irq) return -ENOMEM; @@ -77,7 +77,7 @@ static int vfio_set_trigger(struct vfio_fsl_mc_device *vdev, if (fd < 0) /* Disable only */ return 0; - irq->name = kasprintf(GFP_KERNEL, "vfio-irq[%d](%s)", + irq->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-irq[%d](%s)", hwirq, dev_name(&vdev->mc_dev->dev)); if (!irq->name) return -ENOMEM; diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c index e166ad7ce6e7..27d5ba7cf9dc 100644 --- a/drivers/vfio/group.c +++ b/drivers/vfio/group.c @@ -140,7 +140,7 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, ret = iommufd_vfio_compat_ioas_create(iommufd); if (ret) { - iommufd_ctx_put(group->iommufd); + iommufd_ctx_put(iommufd); goto out_unlock; } @@ -157,6 +157,18 @@ out_unlock: return ret; } +static void vfio_device_group_get_kvm_safe(struct vfio_device *device) +{ + spin_lock(&device->group->kvm_ref_lock); + if (!device->group->kvm) + goto unlock; + + _vfio_device_get_kvm_safe(device, device->group->kvm); + +unlock: + spin_unlock(&device->group->kvm_ref_lock); +} + static int vfio_device_group_open(struct vfio_device *device) { int ret; @@ -167,13 +179,23 @@ static int vfio_device_group_open(struct vfio_device *device) goto out_unlock; } + mutex_lock(&device->dev_set->lock); + /* - * Here we pass the KVM pointer with the group under the lock. If the - * device driver will use it, it must obtain a reference and release it - * during close_device. + * Before the first device open, get the KVM pointer currently + * associated with the group (if there is one) and obtain a reference + * now that will be held until the open_count reaches 0 again. Save + * the pointer in the device for use by drivers. */ - ret = vfio_device_open(device, device->group->iommufd, - device->group->kvm); + if (device->open_count == 0) + vfio_device_group_get_kvm_safe(device); + + ret = vfio_device_open(device, device->group->iommufd); + + if (device->open_count == 0) + vfio_device_put_kvm(device); + + mutex_unlock(&device->dev_set->lock); out_unlock: mutex_unlock(&device->group->group_lock); @@ -183,7 +205,14 @@ out_unlock: void vfio_device_group_close(struct vfio_device *device) { mutex_lock(&device->group->group_lock); + mutex_lock(&device->dev_set->lock); + vfio_device_close(device, device->group->iommufd); + + if (device->open_count == 0) + vfio_device_put_kvm(device); + + mutex_unlock(&device->dev_set->lock); mutex_unlock(&device->group->group_lock); } @@ -453,6 +482,7 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, refcount_set(&group->drivers, 1); mutex_init(&group->group_lock); + spin_lock_init(&group->kvm_ref_lock); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); group->iommu_group = iommu_group; @@ -806,9 +836,9 @@ void vfio_file_set_kvm(struct file *file, struct kvm *kvm) if (!vfio_file_is_group(file)) return; - mutex_lock(&group->group_lock); + spin_lock(&group->kvm_ref_lock); group->kvm = kvm; - mutex_unlock(&group->group_lock); + spin_unlock(&group->kvm_ref_lock); } EXPORT_SYMBOL_GPL(vfio_file_set_kvm); diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig index 646dbed44eb2..e5fb84e07965 100644 --- a/drivers/vfio/mdev/Kconfig +++ b/drivers/vfio/mdev/Kconfig @@ -1,10 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only config VFIO_MDEV - tristate "Mediated device driver framework" - default n - help - Provides a framework to virtualize devices. - See Documentation/driver-api/vfio-mediated-device.rst for more details. - - If you don't know what do here, say N. + tristate diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index abe3359dd477..e4490639d383 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -96,7 +96,7 @@ static MDEV_TYPE_ATTR_RO(device_api); static ssize_t name_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - return sprintf(buf, "%s\n", + return sysfs_emit(buf, "%s\n", mtype->pretty_name ? mtype->pretty_name : mtype->sysfs_name); } diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 0bba3b05c6c7..a117eaf21c14 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -744,7 +744,7 @@ hisi_acc_vf_pci_resume(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct hisi_acc_vf_migration_file *migf; - migf = kzalloc(sizeof(*migf), GFP_KERNEL); + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); if (!migf) return ERR_PTR(-ENOMEM); @@ -863,7 +863,7 @@ hisi_acc_open_saving_migf(struct hisi_acc_vf_core_device *hisi_acc_vdev) struct hisi_acc_vf_migration_file *migf; int ret; - migf = kzalloc(sizeof(*migf), GFP_KERNEL); + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); if (!migf) return ERR_PTR(-ENOMEM); diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 64e68d13cb98..deed156e6165 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -7,6 +7,29 @@ enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 }; +static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id) +{ + int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + void *query_cap = NULL, *cap; + int ret; + + query_cap = kzalloc(query_sz, GFP_KERNEL); + if (!query_cap) + return -ENOMEM; + + ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap, + MLX5_CAP_GENERAL_2); + if (ret) + goto out; + + cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability); + if (!MLX5_GET(cmd_hca_cap_2, cap, migratable)) + ret = -EOPNOTSUPP; +out: + kfree(query_cap); + return ret; +} + static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, u16 *vhca_id); static void @@ -195,6 +218,10 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, if (mvdev->vf_id < 0) goto end; + ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1); + if (ret) + goto end; + if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1, &mvdev->vhca_id)) goto end; @@ -373,7 +400,7 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, struct mlx5_vhca_data_buffer *buf; int ret; - buf = kzalloc(sizeof(*buf), GFP_KERNEL); + buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); if (!buf) return ERR_PTR(-ENOMEM); @@ -473,7 +500,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) } static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, - size_t image_size) + size_t image_size, bool initial_pre_copy) { struct mlx5_vf_migration_file *migf = header_buf->migf; struct mlx5_vf_migration_header header = {}; @@ -481,7 +508,9 @@ static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, struct page *page; u8 *to_buff; - header.image_size = cpu_to_le64(image_size); + header.record_size = cpu_to_le64(image_size); + header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY); + header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA); page = mlx5vf_get_migration_page(header_buf, 0); if (!page) return -EINVAL; @@ -489,12 +518,13 @@ static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, memcpy(to_buff, &header, sizeof(header)); kunmap_local(to_buff); header_buf->length = sizeof(header); - header_buf->header_image_size = image_size; header_buf->start_pos = header_buf->migf->max_pos; migf->max_pos += header_buf->length; spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&header_buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); + if (initial_pre_copy) + migf->pre_copy_initial_bytes += sizeof(header); return 0; } @@ -508,11 +538,14 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) if (!status) { size_t image_size; unsigned long flags; + bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY && + !async_data->last_chunk; image_size = MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size); if (async_data->header_buf) { - status = add_buf_header(async_data->header_buf, image_size); + status = add_buf_header(async_data->header_buf, image_size, + initial_pre_copy); if (status) goto err; } @@ -522,6 +555,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); + if (initial_pre_copy) + migf->pre_copy_initial_bytes += image_size; migf->state = async_data->last_chunk ? MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY; wake_up_interruptible(&migf->poll_wait); @@ -583,11 +618,16 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, } if (MLX5VF_PRE_COPY_SUPP(mvdev)) { - header_buf = mlx5vf_get_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); - if (IS_ERR(header_buf)) { - err = PTR_ERR(header_buf); - goto err_free; + if (async_data->last_chunk && migf->buf_header) { + header_buf = migf->buf_header; + migf->buf_header = NULL; + } else { + header_buf = mlx5vf_get_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(header_buf)) { + err = PTR_ERR(header_buf); + goto err_free; + } } } @@ -790,7 +830,7 @@ static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev, node = interval_tree_iter_first(ranges, 0, ULONG_MAX); for (i = 0; i < num_ranges; i++) { void *addr_range_i_base = range_list_ptr + record_size * i; - unsigned long length = node->last - node->start; + unsigned long length = node->last - node->start + 1; MLX5_SET64(page_track_range, addr_range_i_base, start_address, node->start); @@ -800,7 +840,7 @@ static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev, } WARN_ON(node); - log_addr_space_size = ilog2(total_ranges_len); + log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len)); if (log_addr_space_size < (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) || log_addr_space_size > @@ -1032,18 +1072,18 @@ mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev, void *in; int err; - qp = kzalloc(sizeof(*qp), GFP_KERNEL); + qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT); if (!qp) return ERR_PTR(-ENOMEM); - qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr); - log_rq_stride = ilog2(MLX5_SEND_WQE_DS); - log_rq_sz = ilog2(qp->rq.wqe_cnt); err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node); if (err) goto err_free; if (max_recv_wr) { + qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr); + log_rq_stride = ilog2(MLX5_SEND_WQE_DS); + log_rq_sz = ilog2(qp->rq.wqe_cnt); err = mlx5_frag_buf_alloc_node(mdev, wq_get_byte_sz(log_rq_sz, log_rq_stride), &qp->buf, mdev->priv.numa_node); @@ -1213,12 +1253,13 @@ static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, int i; recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!recv_buf->page_list) return -ENOMEM; for (;;) { - filled = alloc_pages_bulk_array(GFP_KERNEL, npages - done, + filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, + npages - done, recv_buf->page_list + done); if (!filled) goto err; @@ -1248,7 +1289,7 @@ static int register_dma_recv_pages(struct mlx5_core_dev *mdev, recv_buf->dma_addrs = kvcalloc(recv_buf->npages, sizeof(*recv_buf->dma_addrs), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!recv_buf->dma_addrs) return -ENOMEM; diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 5483171d57ad..aec4c69dd6c1 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/vfio_pci_core.h> #include <linux/mlx5/driver.h> +#include <linux/mlx5/vport.h> #include <linux/mlx5/cq.h> #include <linux/mlx5/qp.h> @@ -26,15 +27,33 @@ enum mlx5_vf_migf_state { enum mlx5_vf_load_state { MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER, MLX5_VF_LOAD_STATE_READ_HEADER, + MLX5_VF_LOAD_STATE_PREP_HEADER_DATA, + MLX5_VF_LOAD_STATE_READ_HEADER_DATA, MLX5_VF_LOAD_STATE_PREP_IMAGE, MLX5_VF_LOAD_STATE_READ_IMAGE, MLX5_VF_LOAD_STATE_LOAD_IMAGE, }; +struct mlx5_vf_migration_tag_stop_copy_data { + __le64 stop_copy_size; +}; + +enum mlx5_vf_migf_header_flags { + MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY = 0, + MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL = 1 << 0, +}; + +enum mlx5_vf_migf_header_tag { + MLX5_MIGF_HEADER_TAG_FW_DATA = 0, + MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE = 1 << 0, +}; + struct mlx5_vf_migration_header { - __le64 image_size; + __le64 record_size; /* For future use in case we may need to change the kernel protocol */ - __le64 flags; + __le32 flags; /* Use mlx5_vf_migf_header_flags */ + __le32 tag; /* Use mlx5_vf_migf_header_tag */ + __u8 data[]; /* Its size is given in the record_size */ }; struct mlx5_vhca_data_buffer { @@ -42,7 +61,6 @@ struct mlx5_vhca_data_buffer { loff_t start_pos; u64 length; u64 allocated_length; - u64 header_image_size; u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; @@ -72,6 +90,10 @@ struct mlx5_vf_migration_file { enum mlx5_vf_load_state load_state; u32 pdn; loff_t max_pos; + u64 record_size; + u32 record_tag; + u64 stop_copy_prep_size; + u64 pre_copy_initial_bytes; struct mlx5_vhca_data_buffer *buf; struct mlx5_vhca_data_buffer *buf_header; spinlock_t list_lock; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 9feb89c6d939..e897537a9e8a 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -21,8 +21,8 @@ #include "cmd.h" -/* Arbitrary to prevent userspace from consuming endless memory */ -#define MAX_MIGRATION_SIZE (512*1024*1024) +/* Device specification max LOAD size */ +#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1) static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) { @@ -73,12 +73,13 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, int ret; to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); - page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL); + page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT); if (!page_list) return -ENOMEM; do { - filled = alloc_pages_bulk_array(GFP_KERNEL, to_fill, page_list); + filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, + page_list); if (!filled) { ret = -ENOMEM; goto err; @@ -87,7 +88,7 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, ret = sg_alloc_append_table_from_pages( &buf->table, page_list, filled, 0, filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (ret) goto err; @@ -303,6 +304,87 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) wake_up_interruptible(&migf->poll_wait); } +static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf) +{ + size_t size = sizeof(struct mlx5_vf_migration_header) + + sizeof(struct mlx5_vf_migration_tag_stop_copy_data); + struct mlx5_vf_migration_tag_stop_copy_data data = {}; + struct mlx5_vhca_data_buffer *header_buf = NULL; + struct mlx5_vf_migration_header header = {}; + unsigned long flags; + struct page *page; + u8 *to_buff; + int ret; + + header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE); + if (IS_ERR(header_buf)) + return PTR_ERR(header_buf); + + header.record_size = cpu_to_le64(sizeof(data)); + header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL); + header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE); + page = mlx5vf_get_migration_page(header_buf, 0); + if (!page) { + ret = -EINVAL; + goto err; + } + to_buff = kmap_local_page(page); + memcpy(to_buff, &header, sizeof(header)); + header_buf->length = sizeof(header); + data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length); + memcpy(to_buff + sizeof(header), &data, sizeof(data)); + header_buf->length += sizeof(data); + kunmap_local(to_buff); + header_buf->start_pos = header_buf->migf->max_pos; + migf->max_pos += header_buf->length; + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&header_buf->buf_elm, &migf->buf_list); + spin_unlock_irqrestore(&migf->list_lock, flags); + migf->pre_copy_initial_bytes = size; + return 0; +err: + mlx5vf_put_data_buffer(header_buf); + return ret; +} + +static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf, + size_t state_size) +{ + struct mlx5_vhca_data_buffer *buf; + size_t inc_state_size; + int ret; + + /* let's be ready for stop_copy size that might grow by 10 percents */ + if (check_add_overflow(state_size, state_size / 10, &inc_state_size)) + inc_state_size = state_size; + + buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + migf->buf = buf; + buf = mlx5vf_get_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } + + migf->buf_header = buf; + ret = mlx5vf_add_stop_copy_header(migf); + if (ret) + goto err_header; + return 0; + +err_header: + mlx5vf_put_data_buffer(migf->buf_header); + migf->buf_header = NULL; +err: + mlx5vf_put_data_buffer(migf->buf); + migf->buf = NULL; + return ret; +} + static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -313,7 +395,7 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, loff_t *pos = &filp->f_pos; unsigned long minsz; size_t inc_length = 0; - bool end_of_data; + bool end_of_data = false; int ret; if (cmd != VFIO_MIG_GET_PRECOPY_INFO) @@ -357,25 +439,19 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, goto err_migf_unlock; } - buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data); - if (buf) { - if (buf->start_pos == 0) { - info.initial_bytes = buf->header_image_size - *pos; - } else if (buf->start_pos == - sizeof(struct mlx5_vf_migration_header)) { - /* First data buffer following the header */ - info.initial_bytes = buf->start_pos + - buf->length - *pos; - } else { - info.dirty_bytes = buf->start_pos + buf->length - *pos; - } + if (migf->pre_copy_initial_bytes > *pos) { + info.initial_bytes = migf->pre_copy_initial_bytes - *pos; } else { - if (!end_of_data) { - ret = -EINVAL; - goto err_migf_unlock; + buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data); + if (buf) { + info.dirty_bytes = buf->start_pos + buf->length - *pos; + } else { + if (!end_of_data) { + ret = -EINVAL; + goto err_migf_unlock; + } + info.dirty_bytes = inc_length; } - - info.dirty_bytes = inc_length; } if (!end_of_data || !inc_length) { @@ -440,10 +516,16 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) if (ret) goto err; - buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); - if (IS_ERR(buf)) { - ret = PTR_ERR(buf); - goto err; + /* Checking whether we have a matching pre-allocated buffer that can fit */ + if (migf->buf && migf->buf->allocated_length >= length) { + buf = migf->buf; + migf->buf = NULL; + } else { + buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } } ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); @@ -467,7 +549,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) size_t length; int ret; - migf = kzalloc(sizeof(*migf), GFP_KERNEL); + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); if (!migf) return ERR_PTR(-ENOMEM); @@ -502,6 +584,12 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) if (ret) goto out_pd; + if (track) { + ret = mlx5vf_prep_stop_copy(migf, length); + if (ret) + goto out_pd; + } + buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); @@ -515,7 +603,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) out_save: mlx5vf_free_data_buffer(buf); out_pd: - mlx5vf_cmd_dealloc_pd(migf); + mlx5fv_cmd_clean_migf_resources(migf); out_free: fput(migf->filp); end: @@ -564,7 +652,7 @@ mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf, { int ret; - if (requested_length > MAX_MIGRATION_SIZE) + if (requested_length > MAX_LOAD_SIZE) return -ENOMEM; if (vhca_buf->allocated_length < requested_length) { @@ -616,6 +704,56 @@ mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf, } static int +mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *vhca_buf, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + size_t copy_len, to_copy; + size_t required_data; + u8 *to_buff; + int ret; + + required_data = migf->record_size - vhca_buf->length; + to_copy = min_t(size_t, *len, required_data); + copy_len = to_copy; + while (to_copy) { + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos, + done); + if (ret) + return ret; + } + + *len -= copy_len; + if (vhca_buf->length == migf->record_size) { + switch (migf->record_tag) { + case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE: + { + struct page *page; + + page = mlx5vf_get_migration_page(vhca_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + migf->stop_copy_prep_size = min_t(u64, + le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE); + kunmap_local(to_buff); + break; + } + default: + /* Optional tag */ + break; + } + + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; + migf->max_pos += migf->record_size; + vhca_buf->length = 0; + } + + return 0; +} + +static int mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf, struct mlx5_vhca_data_buffer *vhca_buf, const char __user **buf, @@ -645,23 +783,38 @@ mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf, *len |
