diff options
33 files changed, 756 insertions, 422 deletions
diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index fdf7d69378ec..bbd548b66b42 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -60,7 +60,7 @@ devices as examples, as these devices are the first devices to use this module:: | mdev.ko | | +-----------+ | mdev_register_parent() +--------------+ | | | +<------------------------+ | - | | | | | nvidia.ko |<-> physical + | | | | | ccw_device.ko|<-> physical | | | +------------------------>+ | device | | | | callbacks +--------------+ | | Physical | | @@ -69,12 +69,6 @@ devices as examples, as these devices are the first devices to use this module:: | | | | | i915.ko |<-> physical | | | +------------------------>+ | device | | | | callbacks +--------------+ - | | | | - | | | | mdev_register_parent() +--------------+ - | | | +<------------------------+ | - | | | | | ccw_device.ko|<-> physical - | | | +------------------------>+ | device - | | | | callbacks +--------------+ | +-----------+ | +---------------+ @@ -270,106 +264,6 @@ these callbacks are supported in the TYPE1 IOMMU module. To enable them for other IOMMU backend modules, such as PPC64 sPAPR module, they need to provide these two callback functions. -Using the Sample Code -===================== - -mtty.c in samples/vfio-mdev/ directory is a sample driver program to -demonstrate how to use the mediated device framework. - -The sample driver creates an mdev device that simulates a serial port over a PCI -card. - -1. Build and load the mtty.ko module. - - This step creates a dummy device, /sys/devices/virtual/mtty/mtty/ - - Files in this device directory in sysfs are similar to the following:: - - # tree /sys/devices/virtual/mtty/mtty/ - /sys/devices/virtual/mtty/mtty/ - |-- mdev_supported_types - | |-- mtty-1 - | | |-- available_instances - | | |-- create - | | |-- device_api - | | |-- devices - | | `-- name - | `-- mtty-2 - | |-- available_instances - | |-- create - | |-- device_api - | |-- devices - | `-- name - |-- mtty_dev - | `-- sample_mtty_dev - |-- power - | |-- autosuspend_delay_ms - | |-- control - | |-- runtime_active_time - | |-- runtime_status - | `-- runtime_suspended_time - |-- subsystem -> ../../../../class/mtty - `-- uevent - -2. Create a mediated device by using the dummy device that you created in the - previous step:: - - # echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001" > \ - /sys/devices/virtual/mtty/mtty/mdev_supported_types/mtty-2/create - -3. Add parameters to qemu-kvm:: - - -device vfio-pci,\ - sysfsdev=/sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001 - -4. Boot the VM. - - In the Linux guest VM, with no hardware on the host, the device appears - as follows:: - - # lspci -s 00:05.0 -xxvv - 00:05.0 Serial controller: Device 4348:3253 (rev 10) (prog-if 02 [16550]) - Subsystem: Device 4348:3253 - Physical Slot: 5 - Control: I/O+ Mem- BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr- - Stepping- SERR- FastB2B- DisINTx- - Status: Cap- 66MHz- UDF- FastB2B- ParErr- DEVSEL=medium >TAbort- - <TAbort- <MAbort- >SERR- <PERR- INTx- - Interrupt: pin A routed to IRQ 10 - Region 0: I/O ports at c150 [size=8] - Region 1: I/O ports at c158 [size=8] - Kernel driver in use: serial - 00: 48 43 53 32 01 00 00 02 10 02 00 07 00 00 00 00 - 10: 51 c1 00 00 59 c1 00 00 00 00 00 00 00 00 00 00 - 20: 00 00 00 00 00 00 00 00 00 00 00 00 48 43 53 32 - 30: 00 00 00 00 00 00 00 00 00 00 00 00 0a 01 00 00 - - In the Linux guest VM, dmesg output for the device is as follows: - - serial 0000:00:05.0: PCI INT A -> Link[LNKA] -> GSI 10 (level, high) -> IRQ 10 - 0000:00:05.0: ttyS1 at I/O 0xc150 (irq = 10) is a 16550A - 0000:00:05.0: ttyS2 at I/O 0xc158 (irq = 10) is a 16550A - - -5. In the Linux guest VM, check the serial ports:: - - # setserial -g /dev/ttyS* - /dev/ttyS0, UART: 16550A, Port: 0x03f8, IRQ: 4 - /dev/ttyS1, UART: 16550A, Port: 0xc150, IRQ: 10 - /dev/ttyS2, UART: 16550A, Port: 0xc158, IRQ: 10 - -6. Using minicom or any terminal emulation program, open port /dev/ttyS1 or - /dev/ttyS2 with hardware flow control disabled. - -7. Type data on the minicom terminal or send data to the terminal emulation - program and read the data. - - Data is loop backed from hosts mtty driver. - -8. Destroy the mediated device that you created:: - - # echo 1 > /sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001/remove - References ========== diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst index c663b6f97825..50b690f7f663 100644 --- a/Documentation/driver-api/vfio.rst +++ b/Documentation/driver-api/vfio.rst @@ -249,19 +249,21 @@ VFIO bus driver API VFIO bus drivers, such as vfio-pci make use of only a few interfaces into VFIO core. When devices are bound and unbound to the driver, -the driver should call vfio_register_group_dev() and -vfio_unregister_group_dev() respectively:: +Following interfaces are called when devices are bound to and +unbound from the driver:: - void vfio_init_group_dev(struct vfio_device *device, - struct device *dev, - const struct vfio_device_ops *ops); - void vfio_uninit_group_dev(struct vfio_device *device); int vfio_register_group_dev(struct vfio_device *device); + int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); -The driver should embed the vfio_device in its own structure and call -vfio_init_group_dev() to pre-configure it before going to registration -and call vfio_uninit_group_dev() after completing the un-registration. +The driver should embed the vfio_device in its own structure and use +vfio_alloc_device() to allocate the structure, and can register +@init/@release callbacks to manage any private state wrapping the +vfio_device:: + + vfio_alloc_device(dev_struct, member, dev, ops); + void vfio_put_device(struct vfio_device *device); + vfio_register_group_dev() indicates to the core to begin tracking the iommu_group of the specified dev and register the dev as owned by a VFIO bus driver. Once vfio_register_group_dev() returns it is possible for userspace to @@ -270,28 +272,64 @@ ready before calling it. The driver provides an ops structure for callbacks similar to a file operations structure:: struct vfio_device_ops { - int (*open)(struct vfio_device *vdev); + char *name; + int (*init)(struct vfio_device *vdev); void (*release)(struct vfio_device *vdev); + int (*bind_iommufd)(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id); + void (*unbind_iommufd)(struct vfio_device *vdev); + int (*attach_ioas)(struct vfio_device *vdev, u32 *pt_id); + int (*open_device)(struct vfio_device *vdev); + void (*close_device)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos); - ssize_t (*write)(struct vfio_device *vdev, - const char __user *buf, - size_t size, loff_t *ppos); + ssize_t (*write)(struct vfio_device *vdev, const char __user *buf, + size_t count, loff_t *size); long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); - int (*mmap)(struct vfio_device *vdev, - struct vm_area_struct *vma); + int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); + void (*request)(struct vfio_device *vdev, unsigned int count); + int (*match)(struct vfio_device *vdev, char *buf); + void (*dma_unmap)(struct vfio_device *vdev, u64 iova, u64 length); + int (*device_feature)(struct vfio_device *device, u32 flags, + void __user *arg, size_t argsz); }; Each function is passed the vdev that was originally registered -in the vfio_register_group_dev() call above. This allows the bus driver -to obtain its private data using container_of(). The open/release -callbacks are issued when a new file descriptor is created for a -device (via VFIO_GROUP_GET_DEVICE_FD). The ioctl interface provides -a direct pass through for VFIO_DEVICE_* ioctls. The read/write/mmap -interfaces implement the device region access defined by the device's -own VFIO_DEVICE_GET_REGION_INFO ioctl. +in the vfio_register_group_dev() or vfio_register_emulated_iommu_dev() +call above. This allows the bus driver to obtain its private data using +container_of(). + +:: + + - The init/release callbacks are issued when vfio_device is initialized + and released. + + - The open/close device callbacks are issued when the first + instance of a file descriptor for the device is created (eg. + via VFIO_GROUP_GET_DEVICE_FD) for a user session. + + - The ioctl callback provides a direct pass through for some VFIO_DEVICE_* + ioctls. + + - The [un]bind_iommufd callbacks are issued when the device is bound to + and unbound from iommufd. + + - The attach_ioas callback is issued when the device is attached to an + IOAS managed by the bound iommufd. The attached IOAS is automatically + detached when the device is unbound from iommufd. + + - The read/write/mmap callbacks implement the device region access defined + by the device's own VFIO_DEVICE_GET_REGION_INFO ioctl. + + - The request callback is issued when device is going to be unregistered, + such as when trying to unbind the device from the vfio bus driver. + - The dma_unmap callback is issued when a range of iovas are unmapped + in the container or IOAS attached by the device. Drivers which make + use of the vfio page pinning interface must implement this callback in + order to unpin pages within the dma_unmap range. Drivers must tolerate + this callback even before calls to open_device(). PPC64 sPAPR implementation note ------------------------------- diff --git a/Documentation/s390/vfio-ap.rst b/Documentation/s390/vfio-ap.rst index 00f4a04f6d4c..d46e98c7c1ec 100644 --- a/Documentation/s390/vfio-ap.rst +++ b/Documentation/s390/vfio-ap.rst @@ -553,7 +553,6 @@ These are the steps: * ZCRYPT * S390_AP_IOMMU * VFIO - * VFIO_MDEV * KVM If using make menuconfig select the following to build the vfio_ap module:: diff --git a/MAINTAINERS b/MAINTAINERS index a3d7c8945762..84bd678b8af9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21882,7 +21882,6 @@ F: tools/testing/selftests/filesystems/fat/ VFIO DRIVER M: Alex Williamson <alex.williamson@redhat.com> -R: Cornelia Huck <cohuck@redhat.com> L: kvm@vger.kernel.org S: Maintained T: git https://github.com/awilliam/linux-vfio.git diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 933771b0b07a..078cd1a773a3 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -714,7 +714,9 @@ config EADM_SCH config VFIO_CCW def_tristate n prompt "Support for VFIO-CCW subchannels" - depends on S390_CCW_IOMMU && VFIO_MDEV + depends on S390_CCW_IOMMU + depends on VFIO + select VFIO_MDEV help This driver allows usage of I/O subchannels via VFIO-CCW. @@ -724,8 +726,10 @@ config VFIO_CCW config VFIO_AP def_tristate n prompt "VFIO support for AP devices" - depends on S390_AP_IOMMU && VFIO_MDEV && KVM + depends on S390_AP_IOMMU && KVM + depends on VFIO depends on ZCRYPT + select VFIO_MDEV help This driver grants access to Adjunct Processor (AP) devices via the VFIO mediated device interface. diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 74b35ec2ad28..3c68fe49042c 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -594,7 +594,6 @@ CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m CONFIG_MLX5_VFIO_PCI=m -CONFIG_VFIO_MDEV=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index cec71268e3bc..9ab91632f74c 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -583,7 +583,6 @@ CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m CONFIG_MLX5_VFIO_PCI=m -CONFIG_VFIO_MDEV=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig index 9c0990c0ec87..3d1cd04ac5fa 100644 --- a/drivers/gpu/drm/i915/Kconfig +++ b/drivers/gpu/drm/i915/Kconfig @@ -127,9 +127,10 @@ config DRM_I915_GVT_KVMGT depends on X86 depends on 64BIT depends on KVM - depends on VFIO_MDEV + depends on VFIO select DRM_I915_GVT select KVM_EXTERNAL_WRITE_TRACKING + select VFIO_MDEV help Choose this option if you want to enable Intel GVT-g graphics diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c index 89f10becf962..d53d08f16973 100644 --- a/drivers/vfio/container.c +++ b/drivers/vfio/container.c @@ -360,7 +360,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) { struct vfio_container *container; - container = kzalloc(sizeof(*container), GFP_KERNEL); + container = kzalloc(sizeof(*container), GFP_KERNEL_ACCOUNT); if (!container) return -ENOMEM; @@ -376,11 +376,6 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) static int vfio_fops_release(struct inode *inode, struct file *filep) { struct vfio_container *container = filep->private_data; - struct vfio_iommu_driver *driver = container->iommu_driver; - - if (driver && driver->ops->notify) - driver->ops->notify(container->iommu_data, - VFIO_IOMMU_CONTAINER_CLOSE); filep->private_data = NULL; diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index defeb8510ace..c89a047a4cd8 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -28,7 +28,7 @@ static int vfio_fsl_mc_open_device(struct vfio_device *core_vdev) int i; vdev->regions = kcalloc(count, sizeof(struct vfio_fsl_mc_region), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!vdev->regions) return -ENOMEM; diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c index 64d01f3fb13d..c51229fccbd6 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c @@ -29,7 +29,7 @@ static int vfio_fsl_mc_irqs_allocate(struct vfio_fsl_mc_device *vdev) irq_count = mc_dev->obj_desc.irq_count; - mc_irq = kcalloc(irq_count, sizeof(*mc_irq), GFP_KERNEL); + mc_irq = kcalloc(irq_count, sizeof(*mc_irq), GFP_KERNEL_ACCOUNT); if (!mc_irq) return -ENOMEM; @@ -77,7 +77,7 @@ static int vfio_set_trigger(struct vfio_fsl_mc_device *vdev, if (fd < 0) /* Disable only */ return 0; - irq->name = kasprintf(GFP_KERNEL, "vfio-irq[%d](%s)", + irq->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-irq[%d](%s)", hwirq, dev_name(&vdev->mc_dev->dev)); if (!irq->name) return -ENOMEM; diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c index e166ad7ce6e7..27d5ba7cf9dc 100644 --- a/drivers/vfio/group.c +++ b/drivers/vfio/group.c @@ -140,7 +140,7 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, ret = iommufd_vfio_compat_ioas_create(iommufd); if (ret) { - iommufd_ctx_put(group->iommufd); + iommufd_ctx_put(iommufd); goto out_unlock; } @@ -157,6 +157,18 @@ out_unlock: return ret; } +static void vfio_device_group_get_kvm_safe(struct vfio_device *device) +{ + spin_lock(&device->group->kvm_ref_lock); + if (!device->group->kvm) + goto unlock; + + _vfio_device_get_kvm_safe(device, device->group->kvm); + +unlock: + spin_unlock(&device->group->kvm_ref_lock); +} + static int vfio_device_group_open(struct vfio_device *device) { int ret; @@ -167,13 +179,23 @@ static int vfio_device_group_open(struct vfio_device *device) goto out_unlock; } + mutex_lock(&device->dev_set->lock); + /* - * Here we pass the KVM pointer with the group under the lock. If the - * device driver will use it, it must obtain a reference and release it - * during close_device. + * Before the first device open, get the KVM pointer currently + * associated with the group (if there is one) and obtain a reference + * now that will be held until the open_count reaches 0 again. Save + * the pointer in the device for use by drivers. */ - ret = vfio_device_open(device, device->group->iommufd, - device->group->kvm); + if (device->open_count == 0) + vfio_device_group_get_kvm_safe(device); + + ret = vfio_device_open(device, device->group->iommufd); + + if (device->open_count == 0) + vfio_device_put_kvm(device); + + mutex_unlock(&device->dev_set->lock); out_unlock: mutex_unlock(&device->group->group_lock); @@ -183,7 +205,14 @@ out_unlock: void vfio_device_group_close(struct vfio_device *device) { mutex_lock(&device->group->group_lock); + mutex_lock(&device->dev_set->lock); + vfio_device_close(device, device->group->iommufd); + + if (device->open_count == 0) + vfio_device_put_kvm(device); + + mutex_unlock(&device->dev_set->lock); mutex_unlock(&device->group->group_lock); } @@ -453,6 +482,7 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, refcount_set(&group->drivers, 1); mutex_init(&group->group_lock); + spin_lock_init(&group->kvm_ref_lock); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); group->iommu_group = iommu_group; @@ -806,9 +836,9 @@ void vfio_file_set_kvm(struct file *file, struct kvm *kvm) if (!vfio_file_is_group(file)) return; - mutex_lock(&group->group_lock); + spin_lock(&group->kvm_ref_lock); group->kvm = kvm; - mutex_unlock(&group->group_lock); + spin_unlock(&group->kvm_ref_lock); } EXPORT_SYMBOL_GPL(vfio_file_set_kvm); diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig index 646dbed44eb2..e5fb84e07965 100644 --- a/drivers/vfio/mdev/Kconfig +++ b/drivers/vfio/mdev/Kconfig @@ -1,10 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only config VFIO_MDEV - tristate "Mediated device driver framework" - default n - help - Provides a framework to virtualize devices. - See Documentation/driver-api/vfio-mediated-device.rst for more details. - - If you don't know what do here, say N. + tristate diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index abe3359dd477..e4490639d383 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -96,7 +96,7 @@ static MDEV_TYPE_ATTR_RO(device_api); static ssize_t name_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - return sprintf(buf, "%s\n", + return sysfs_emit(buf, "%s\n", mtype->pretty_name ? mtype->pretty_name : mtype->sysfs_name); } diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 0bba3b05c6c7..a117eaf21c14 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -744,7 +744,7 @@ hisi_acc_vf_pci_resume(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct hisi_acc_vf_migration_file *migf; - migf = kzalloc(sizeof(*migf), GFP_KERNEL); + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); if (!migf) return ERR_PTR(-ENOMEM); @@ -863,7 +863,7 @@ hisi_acc_open_saving_migf(struct hisi_acc_vf_core_device *hisi_acc_vdev) struct hisi_acc_vf_migration_file *migf; int ret; - migf = kzalloc(sizeof(*migf), GFP_KERNEL); + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); if (!migf) return ERR_PTR(-ENOMEM); diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 64e68d13cb98..deed156e6165 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -7,6 +7,29 @@ enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 }; +static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id) +{ + int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + void *query_cap = NULL, *cap; + int ret; + + query_cap = kzalloc(query_sz, GFP_KERNEL); + if (!query_cap) + return -ENOMEM; + + ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap, + MLX5_CAP_GENERAL_2); + if (ret) + goto out; + + cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability); + if (!MLX5_GET(cmd_hca_cap_2, cap, migratable)) + ret = -EOPNOTSUPP; +out: + kfree(query_cap); + return ret; +} + static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, u16 *vhca_id); static void @@ -195,6 +218,10 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, if (mvdev->vf_id < 0) goto end; + ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1); + if (ret) + goto end; + if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1, &mvdev->vhca_id)) goto end; @@ -373,7 +400,7 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, struct mlx5_vhca_data_buffer *buf; int ret; - buf = kzalloc(sizeof(*buf), GFP_KERNEL); + buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); if (!buf) return ERR_PTR(-ENOMEM); @@ -473,7 +500,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) } static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, - size_t image_size) + size_t image_size, bool initial_pre_copy) { struct mlx5_vf_migration_file *migf = header_buf->migf; struct mlx5_vf_migration_header header = {}; @@ -481,7 +508,9 @@ static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, struct page *page; u8 *to_buff; - header.image_size = cpu_to_le64(image_size); + header.record_size = cpu_to_le64(image_size); + header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY); + header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA); page = mlx5vf_get_migration_page(header_buf, 0); if (!page) return -EINVAL; @@ -489,12 +518,13 @@ static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, memcpy(to_buff, &header, sizeof(header)); kunmap_local(to_buff); header_buf->length = sizeof(header); - header_buf->header_image_size = image_size; header_buf->start_pos = header_buf->migf->max_pos; migf->max_pos += header_buf->length; spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&header_buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); + if (initial_pre_copy) + migf->pre_copy_initial_bytes += sizeof(header); return 0; } @@ -508,11 +538,14 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) if (!status) { size_t image_size; unsigned long flags; + bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY && + !async_data->last_chunk; image_size = MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size); if (async_data->header_buf) { - status = add_buf_header(async_data->header_buf, image_size); + status = add_buf_header(async_data->header_buf, image_size, + initial_pre_copy); if (status) goto err; } @@ -522,6 +555,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); + if (initial_pre_copy) + migf->pre_copy_initial_bytes += image_size; migf->state = async_data->last_chunk ? MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY; wake_up_interruptible(&migf->poll_wait); @@ -583,11 +618,16 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, } if (MLX5VF_PRE_COPY_SUPP(mvdev)) { - header_buf = mlx5vf_get_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); - if (IS_ERR(header_buf)) { - err = PTR_ERR(header_buf); - goto err_free; + if (async_data->last_chunk && migf->buf_header) { + header_buf = migf->buf_header; + migf->buf_header = NULL; + } else { + header_buf = mlx5vf_get_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(header_buf)) { + err = PTR_ERR(header_buf); + goto err_free; + } } } @@ -790,7 +830,7 @@ static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev, node = interval_tree_iter_first(ranges, 0, ULONG_MAX); for (i = 0; i < num_ranges; i++) { void *addr_range_i_base = range_list_ptr + record_size * i; - unsigned long length = node->last - node->start; + unsigned long length = node->last - node->start + 1; MLX5_SET64(page_track_range, addr_range_i_base, start_address, node->start); @@ -800,7 +840,7 @@ static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev, } WARN_ON(node); - log_addr_space_size = ilog2(total_ranges_len); + log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len)); if (log_addr_space_size < (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) || log_addr_space_size > @@ -1032,18 +1072,18 @@ mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev, void *in; int err; - qp = kzalloc(sizeof(*qp), GFP_KERNEL); + qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT); if (!qp) return ERR_PTR(-ENOMEM); - qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr); - log_rq_stride = ilog2(MLX5_SEND_WQE_DS); - log_rq_sz = ilog2(qp->rq.wqe_cnt); err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node); if (err) goto err_free; if (max_recv_wr) { + qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr); + log_rq_stride = ilog2(MLX5_SEND_WQE_DS); + log_rq_sz = ilog2(qp->rq.wqe_cnt); err = mlx5_frag_buf_alloc_node(mdev, wq_get_byte_sz(log_rq_sz, log_rq_stride), &qp->buf, mdev->priv.numa_node); @@ -1213,12 +1253,13 @@ static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, int i; recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!recv_buf->page_list) return -ENOMEM; for (;;) { - filled = alloc_pages_bulk_array(GFP_KERNEL, npages - done, + filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, + npages - done, recv_buf->page_list + done); if (!filled) goto err; @@ -1248,7 +1289,7 |
