From cd48ebc5c4f2e94830b238f035ebf04f1c3a4433 Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Thu, 22 Sep 2022 20:35:07 +0800 Subject: vfio/mlx5: Switch to use module_pci_driver() macro Since PCI provides the helper macro module_pci_driver(), replace the open-coded module_init()/module_exit() boilerplate with it. Signed-off-by: Shang XiaoJing Reviewed-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220922123507.11222-1-shangxiaojing@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'drivers/vfio/pci/mlx5') diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index fd6ccb8454a2..457138b92f13 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -676,18 +676,7 @@ static struct pci_driver mlx5vf_pci_driver = { .driver_managed_dma = true, }; -static void __exit mlx5vf_pci_cleanup(void) -{ - pci_unregister_driver(&mlx5vf_pci_driver); -} - -static int __init mlx5vf_pci_init(void) -{ - return pci_register_driver(&mlx5vf_pci_driver); -} - -module_init(mlx5vf_pci_init); -module_exit(mlx5vf_pci_cleanup); +module_pci_driver(mlx5vf_pci_driver); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Max Gurtovoy "); -- cgit v1.2.3 From 4e016f969529f2aec0545e90119e7eb3cb124c46 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 6 Nov 2022 19:46:18 +0200 Subject: vfio: Add an option to get migration data size Add an option to get the migration data size by introducing a new migration feature named VFIO_DEVICE_FEATURE_MIG_DATA_SIZE. Upon VFIO_DEVICE_FEATURE_GET, the estimated data length that will be required to complete STOP_COPY is returned. This lets user space judge, before moving to STOP_COPY, whether it can meet the downtime SLA based on the returned estimate. The patch also includes the mlx5 and hisi implementations of this new option, making it feature complete for the existing drivers in this area.
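For reference, querying the new feature from user space can look like the following minimal sketch. It relies on the UAPI this series adds to <linux/vfio.h> (VFIO_DEVICE_FEATURE_MIG_DATA_SIZE and struct vfio_device_feature_mig_data_size); device_fd and the error handling are illustrative only:

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int query_stop_copy_size(int device_fd, __u64 *stop_copy_len)
{
	/* The feature payload directly follows the generic feature header. */
	struct {
		struct vfio_device_feature hdr;
		struct vfio_device_feature_mig_data_size data;
	} get = {};

	get.hdr.argsz = sizeof(get);
	get.hdr.flags = VFIO_DEVICE_FEATURE_GET |
			VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, &get.hdr))
		return -errno;

	/* Estimated bytes needed to complete STOP_COPY. */
	*stop_copy_len = get.data.stop_copy_length;
	return 0;
}

User space can weigh the returned estimate against its available bandwidth and downtime budget before entering STOP_COPY.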
Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Reviewed-by: Longfang Liu Link: https://lore.kernel.org/r/20221106174630.25909-2-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'drivers/vfio/pci/mlx5') diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 457138b92f13..6e9cf2aacc52 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -512,6 +512,23 @@ mlx5vf_pci_set_device_state(struct vfio_device *vdev, return res; } +static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct mlx5vf_pci_core_device *mvdev = container_of( + vdev, struct mlx5vf_pci_core_device, core_device.vdev); + size_t state_size; + int ret; + + mutex_lock(&mvdev->state_mutex); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, + &state_size); + if (!ret) + *stop_copy_length = state_size; + mlx5vf_state_mutex_unlock(mvdev); + return ret; +} + static int mlx5vf_pci_get_device_state(struct vfio_device *vdev, enum vfio_device_mig_state *curr_state) { @@ -577,6 +594,7 @@ static void mlx5vf_pci_close_device(struct vfio_device *core_vdev) static const struct vfio_migration_ops mlx5vf_pci_mig_ops = { .migration_set_state = mlx5vf_pci_set_device_state, .migration_get_state = mlx5vf_pci_get_device_state, + .migration_get_data_size = mlx5vf_pci_get_data_size, }; static const struct vfio_log_ops mlx5vf_pci_log_ops = { -- cgit v1.2.3 From 2f5d8cef45c30edcf3972d345f606df563d3a48e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 6 Nov 2022 19:46:19 +0200 Subject: vfio/mlx5: Fix a typo in mlx5vf_cmd_load_vhca_state() Fix a typo in mlx5vf_cmd_load_vhca_state() so that it uses the 'load' memory layout. Since the in/out sizes are equal for the save and load commands, there was no functional issue. Fixes: f1d98f346ee3 ("vfio/mlx5: Expose migration commands over mlx5 device") Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20221106174630.25909-3-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/vfio/pci/mlx5') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index c604b70437a5..0848bc905d3e 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -378,8 +378,8 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf) { struct mlx5_core_dev *mdev; - u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {}; - u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; + u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; u32 pdn, mkey; int err; -- cgit v1.2.3 From 0e7caa65d707b93fbb4322c6313f739fa9103dfa Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:27 +0200 Subject: vfio/mlx5: Enforce a single SAVE command at a time Enforce a single SAVE command at a time. Since the SAVE command is asynchronous, we must allow only one command in flight at a time; this preserves ordering between multiple calls and protects against races on the migration file data structure. This is a must for the next patches in the series, where, as part of PRE_COPY, multiple images may be saved and multiple SAVE commands may be issued from different flows.
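The serialization scheme boils down to the following standalone sketch (the names here are illustrative; the driver's actual fields appear in the diff below):

#include <linux/completion.h>

/*
 * A completion used as a binary semaphore: one token is deposited at
 * init time, each SAVE consumes it before issuing the async command,
 * and the async cleanup path returns it, so at most one SAVE command
 * is in flight at any time.
 */
static struct completion save_comp;

static void save_sem_init(void)
{
	init_completion(&save_comp);
	complete(&save_comp);		/* deposit the single token */
}

static int save_sem_acquire(void)	/* ioctl context, before SAVE */
{
	return wait_for_completion_interruptible(&save_comp);
}

static void save_sem_release(void)	/* async cleanup callback */
{
	complete(&save_comp);
}

A completion is used rather than a mutex because the 'unlock' runs in a different kernel thread than the 'lock', which lockdep cannot model for a mutex.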
Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-4-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 6 ++++++ drivers/vfio/pci/mlx5/cmd.h | 1 + drivers/vfio/pci/mlx5/main.c | 7 +++++++ 3 files changed, 14 insertions(+) (limited to 'drivers/vfio/pci/mlx5') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 0848bc905d3e..55ee8036f59c 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -281,6 +281,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); mlx5_core_dealloc_pd(mdev, async_data->pdn); kvfree(async_data->out); + complete(&migf->save_comp); fput(migf->filp); } @@ -321,6 +322,10 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, return -ENOTCONN; mdev = mvdev->mdev; + err = wait_for_completion_interruptible(&migf->save_comp); + if (err) + return err; + err = mlx5_core_alloc_pd(mdev, &pdn); if (err) return err; @@ -371,6 +376,7 @@ err_create_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); err_dma_map: mlx5_core_dealloc_pd(mdev, pdn); + complete(&migf->save_comp); return err; } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 921d5720a1e5..8ffa7699872c 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -37,6 +37,7 @@ struct mlx5_vf_migration_file { unsigned long last_offset; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; + struct completion save_comp; struct mlx5_async_ctx async_ctx; struct mlx5vf_async_data async_data; }; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 6e9cf2aacc52..0d71ebb2a972 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -245,6 +245,13 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); init_waitqueue_head(&migf->poll_wait); + init_completion(&migf->save_comp); + /* + * save_comp is being used as a binary semaphore built from + * a completion. A normal mutex cannot be used because the lock is + * passed between kernel threads and lockdep can't model this. + */ + complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, -- cgit v1.2.3 From 9945a67ea4b30657dd998c7fbbea1b3950747168 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:28 +0200 Subject: vfio/mlx5: Refactor PD usage This patch refactors PD usage so that the PD's life cycle matches that of the migration file, instead of allocating and destroying it upon each SAVE/LOAD command. This is a preparation step towards the PRE_COPY series, where multiple images will be saved/loaded and a single PD can simply be reused.
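The resulting life cycle can be sketched as follows (a simplified illustration with locking and some error paths trimmed; the example_*() wrappers are hypothetical, while mlx5vf_cmd_alloc_pd() and mlx5fv_cmd_clean_migf_resources() are the helpers this patch introduces):

static struct mlx5_vf_migration_file *
example_migf_open(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
	if (!migf)
		return ERR_PTR(-ENOMEM);
	migf->mvdev = mvdev;
	/* The PD is allocated once, when the migration file is created... */
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret) {
		kfree(migf);
		return ERR_PTR(ret);
	}
	/* ...and every subsequent SAVE/LOAD reuses migf->pdn for its MKEY. */
	return migf;
}

static void example_migf_close(struct mlx5_vf_migration_file *migf)
{
	/* Released only on teardown, together with the file's other resources. */
	mlx5fv_cmd_clean_migf_resources(migf);
	kfree(migf);
}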
Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-5-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 53 +++++++++++++++++++++++++++++--------------- drivers/vfio/pci/mlx5/cmd.h | 5 ++++- drivers/vfio/pci/mlx5/main.c | 44 ++++++++++++++++++++++++++---------- 3 files changed, 71 insertions(+), 31 deletions(-) (limited to 'drivers/vfio/pci/mlx5') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 55ee8036f59c..a97eac49e3d6 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -279,7 +279,6 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mlx5_core_destroy_mkey(mdev, async_data->mkey); dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); - mlx5_core_dealloc_pd(mdev, async_data->pdn); kvfree(async_data->out); complete(&migf->save_comp); fput(migf->filp); @@ -314,7 +313,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; struct mlx5vf_async_data *async_data; struct mlx5_core_dev *mdev; - u32 pdn, mkey; + u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); @@ -326,16 +325,12 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (err) return err; - err = mlx5_core_alloc_pd(mdev, &pdn); - if (err) - return err; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); if (err) goto err_dma_map; - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); + err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); if (err) goto err_create_mkey; @@ -357,7 +352,6 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, migf->total_length = 0; get_file(migf->filp); async_data->mkey = mkey; - async_data->pdn = pdn; err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, out_size, mlx5vf_save_callback, @@ -375,7 +369,6 @@ err_out: err_create_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); err_dma_map: - mlx5_core_dealloc_pd(mdev, pdn); complete(&migf->save_comp); return err; } @@ -386,7 +379,7 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_core_dev *mdev; u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; - u32 pdn, mkey; + u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); @@ -400,15 +393,11 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, } mdev = mvdev->mdev; - err = mlx5_core_alloc_pd(mdev, &pdn); - if (err) - goto end; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); if (err) - goto err_reg; + goto end; - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); + err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); if (err) goto err_mkey; @@ -424,13 +413,41 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, mlx5_core_destroy_mkey(mdev, mkey); err_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); -err_reg: - mlx5_core_dealloc_pd(mdev, pdn); end: mutex_unlock(&migf->lock); return err; } +int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) +{ + int err; + + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return -ENOTCONN; + + err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn); + return err; +} + +void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) +{ + lockdep_assert_held(&migf->mvdev->state_mutex); + if 
(migf->mvdev->mdev_detach) + return; + + mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn); +} + +void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) +{ + lockdep_assert_held(&migf->mvdev->state_mutex); + + WARN_ON(migf->mvdev->mdev_detach); + + mlx5vf_cmd_dealloc_pd(migf); +} + static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes, u32 req_nodes) { diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 8ffa7699872c..ba760f956d53 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -16,7 +16,6 @@ struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; int status; - u32 pdn; u32 mkey; void *out; }; @@ -27,6 +26,7 @@ struct mlx5_vf_migration_file { u8 disabled:1; u8 is_err:1; + u32 pdn; struct sg_append_table table; size_t total_length; size_t allocated_length; @@ -127,6 +127,9 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); +int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); +void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); +void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 0d71ebb2a972..1916f7c1468c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -236,12 +236,15 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf, O_RDONLY); if (IS_ERR(migf->filp)) { - int err = PTR_ERR(migf->filp); - - kfree(migf); - return ERR_PTR(err); + ret = PTR_ERR(migf->filp); + goto end; } + migf->mvdev = mvdev; + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out_free; + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); init_waitqueue_head(&migf->poll_wait); @@ -257,20 +260,25 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &migf->total_length); if (ret) - goto out_free; + goto out_pd; ret = mlx5vf_add_migration_pages( migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE)); if (ret) - goto out_free; + goto out_pd; - migf->mvdev = mvdev; ret = mlx5vf_cmd_save_vhca_state(mvdev, migf); if (ret) - goto out_free; + goto out_save; return migf; +out_save: + mlx5vf_disable_fd(migf); +out_pd: + mlx5vf_cmd_dealloc_pd(migf); out_free: fput(migf->filp); +end: + kfree(migf); return ERR_PTR(ret); } @@ -352,6 +360,7 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); if (!migf) @@ -360,20 +369,30 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf, O_WRONLY); if (IS_ERR(migf->filp)) { - int err = PTR_ERR(migf->filp); - - kfree(migf); - return ERR_PTR(err); + ret = PTR_ERR(migf->filp); + goto end; } + + migf->mvdev = mvdev; + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out_free; + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); return migf; +out_free: + 
fput(migf->filp); +end: + kfree(migf); + return ERR_PTR(ret); } void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) { if (mvdev->resuming_migf) { mlx5vf_disable_fd(mvdev->resuming_migf); + mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf); fput(mvdev->resuming_migf->filp); mvdev->resuming_migf = NULL; } @@ -381,6 +400,7 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); cancel_work_sync(&mvdev->saving_migf->async_data.work); mlx5vf_disable_fd(mvdev->saving_migf); + mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); fput(mvdev->saving_migf->filp); mvdev->saving_migf = NULL; } -- cgit v1.2.3 From 91454f8b9bf4ce6be1d9a0b4de401bc3c6313a95 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:29 +0200 Subject: vfio/mlx5: Refactor MKEY usage This patch refactors MKEY usage so that the MKEY's life cycle matches that of the migration file, instead of allocating and destroying it upon each SAVE/LOAD command. This is a preparation step towards the PRE_COPY series, where multiple images will be saved/loaded. We achieve this by introducing a new struct, mlx5_vhca_data_buffer, which holds the mkey and its related fields such as the sg_append_table, allocated_length, etc. These fields were moved out of the migration file's main struct into the dedicated mlx5_vhca_data_buffer struct, with the proper helpers in place. For now there is a single mlx5_vhca_data_buffer per migration file; however, coming patches will use multiple of them to support multiple images. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-6-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 162 +++++++++++++++++++++++++++---------------- drivers/vfio/pci/mlx5/cmd.h | 37 +++++++--- drivers/vfio/pci/mlx5/main.c | 92 +++++++++++++----------- 3 files changed, 178 insertions(+), 113 deletions(-) (limited to 'drivers/vfio/pci/mlx5') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index a97eac49e3d6..ed4c472d2eae 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -210,11 +210,11 @@ err_exec: } static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, - struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf, struct mlx5_vhca_recv_buf *recv_buf, u32 *mkey) { - size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) : + size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) : recv_buf->npages; int err = 0, inlen; __be64 *mtt; @@ -232,10 +232,10 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, DIV_ROUND_UP(npages, 2)); mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - if (migf) { + if (buf) { struct sg_dma_page_iter dma_iter; - for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0) + for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); } else { int i; @@ -255,20 +255,99 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); - MLX5_SET64(mkc, mkc, len, - migf ?
migf->total_length : (npages * PAGE_SIZE)); + MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE); err = mlx5_core_create_mkey(mdev, mkey, in, inlen); kvfree(in); return err; } +static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev; + struct mlx5_core_dev *mdev = mvdev->mdev; + int ret; + + lockdep_assert_held(&mvdev->state_mutex); + if (mvdev->mdev_detach) + return -ENOTCONN; + + if (buf->dmaed || !buf->allocated_length) + return -EINVAL; + + ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + if (ret) + return ret; + + ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey); + if (ret) + goto err; + + buf->dmaed = true; + + return 0; +err: + dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + return ret; +} + +void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + struct mlx5_vf_migration_file *migf = buf->migf; + struct sg_page_iter sg_iter; + + lockdep_assert_held(&migf->mvdev->state_mutex); + WARN_ON(migf->mvdev->mdev_detach); + + if (buf->dmaed) { + mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); + dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, + buf->dma_dir, 0); + } + + /* Undo alloc_pages_bulk_array() */ + for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) + __free_page(sg_page_iter_page(&sg_iter)); + sg_free_append_table(&buf->table); + kfree(buf); +} + +struct mlx5_vhca_data_buffer * +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, + enum dma_data_direction dma_dir) +{ + struct mlx5_vhca_data_buffer *buf; + int ret; + + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + return ERR_PTR(-ENOMEM); + + buf->dma_dir = dma_dir; + buf->migf = migf; + if (length) { + ret = mlx5vf_add_migration_pages(buf, + DIV_ROUND_UP_ULL(length, PAGE_SIZE)); + if (ret) + goto end; + + ret = mlx5vf_dma_data_buffer(buf); + if (ret) + goto end; + } + + return buf; +end: + mlx5vf_free_data_buffer(buf); + return ERR_PTR(ret); +} + void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, struct mlx5vf_async_data, work); struct mlx5_vf_migration_file *migf = container_of(async_data, struct mlx5_vf_migration_file, async_data); - struct mlx5_core_dev *mdev = migf->mvdev->mdev; mutex_lock(&migf->lock); if (async_data->status) { @@ -276,9 +355,6 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); - - mlx5_core_destroy_mkey(mdev, async_data->mkey); - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); kvfree(async_data->out); complete(&migf->save_comp); fput(migf->filp); @@ -292,7 +368,7 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { - WRITE_ONCE(migf->total_length, + WRITE_ONCE(migf->buf->length, MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size)); wake_up_interruptible(&migf->poll_wait); @@ -307,39 +383,28 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) } int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf) + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf) { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; struct mlx5vf_async_data *async_data; - struct mlx5_core_dev *mdev; - u32 mkey; int err; 
lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; - mdev = mvdev->mdev; err = wait_for_completion_interruptible(&migf->save_comp); if (err) return err; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, - 0); - if (err) - goto err_dma_map; - - err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); - if (err) - goto err_create_mkey; - MLX5_SET(save_vhca_state_in, in, opcode, MLX5_CMD_OP_SAVE_VHCA_STATE); MLX5_SET(save_vhca_state_in, in, op_mod, 0); MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); - MLX5_SET(save_vhca_state_in, in, mkey, mkey); - MLX5_SET(save_vhca_state_in, in, size, migf->total_length); + MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); + MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); async_data = &migf->async_data; async_data->out = kvzalloc(out_size, GFP_KERNEL); @@ -348,10 +413,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, goto err_out; } - /* no data exists till the callback comes back */ - migf->total_length = 0; get_file(migf->filp); - async_data->mkey = mkey; err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, out_size, mlx5vf_save_callback, @@ -365,57 +427,33 @@ err_exec: fput(migf->filp); kvfree(async_data->out); err_out: - mlx5_core_destroy_mkey(mdev, mkey); -err_create_mkey: - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); -err_dma_map: complete(&migf->save_comp); return err; } int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf) + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf) { - struct mlx5_core_dev *mdev; u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; - u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; - mutex_lock(&migf->lock); - if (!migf->total_length) { - err = -EINVAL; - goto end; - } - - mdev = mvdev->mdev; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); + err = mlx5vf_dma_data_buffer(buf); if (err) - goto end; - - err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); - if (err) - goto err_mkey; + return err; MLX5_SET(load_vhca_state_in, in, opcode, MLX5_CMD_OP_LOAD_VHCA_STATE); MLX5_SET(load_vhca_state_in, in, op_mod, 0); MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id); - MLX5_SET(load_vhca_state_in, in, mkey, mkey); - MLX5_SET(load_vhca_state_in, in, size, migf->total_length); - - err = mlx5_cmd_exec_inout(mdev, load_vhca_state, in, out); - - mlx5_core_destroy_mkey(mdev, mkey); -err_mkey: - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); -end: - mutex_unlock(&migf->lock); - return err; + MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey); + MLX5_SET(load_vhca_state_in, in, size, buf->length); + return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out); } int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) @@ -445,6 +483,10 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) WARN_ON(migf->mvdev->mdev_detach); + if (migf->buf) { + mlx5vf_free_data_buffer(migf->buf); + migf->buf = NULL; + } mlx5vf_cmd_dealloc_pd(migf); } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index ba760f956d53..b0f08dfc8120 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -12,11 +12,25 @@ #include #include +struct mlx5_vhca_data_buffer { + struct sg_append_table table; + loff_t start_pos; 
+ u64 length; + u64 allocated_length; + u32 mkey; + enum dma_data_direction dma_dir; + u8 dmaed:1; + struct mlx5_vf_migration_file *migf; + /* Optimize mlx5vf_get_migration_page() for sequential access */ + struct scatterlist *last_offset_sg; + unsigned int sg_last_entry; + unsigned long last_offset; +}; + struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; int status; - u32 mkey; void *out; }; @@ -27,14 +41,7 @@ struct mlx5_vf_migration_file { u8 is_err:1; u32 pdn; - struct sg_append_table table; - size_t total_length; - size_t allocated_length; - - /* Optimize mlx5vf_get_migration_page() for sequential access */ - struct scatterlist *last_offset_sg; - unsigned int sg_last_entry; - unsigned long last_offset; + struct mlx5_vhca_data_buffer *buf; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; @@ -124,12 +131,20 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf); + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf); + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf); int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); +struct mlx5_vhca_data_buffer * +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir); +void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); +int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, + unsigned int npages); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 1916f7c1468c..5f694fce854c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -33,7 +33,7 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) } static struct page * -mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, +mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, unsigned long offset) { unsigned long cur_offset = 0; @@ -41,20 +41,20 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, unsigned int i; /* All accesses are sequential */ - if (offset < migf->last_offset || !migf->last_offset_sg) { - migf->last_offset = 0; - migf->last_offset_sg = migf->table.sgt.sgl; - migf->sg_last_entry = 0; + if (offset < buf->last_offset || !buf->last_offset_sg) { + buf->last_offset = 0; + buf->last_offset_sg = buf->table.sgt.sgl; + buf->sg_last_entry = 0; } - cur_offset = migf->last_offset; + cur_offset = buf->last_offset; - for_each_sg(migf->last_offset_sg, sg, - migf->table.sgt.orig_nents - migf->sg_last_entry, i) { + for_each_sg(buf->last_offset_sg, sg, + buf->table.sgt.orig_nents - buf->sg_last_entry, i) { if (offset < sg->length + cur_offset) { - migf->last_offset_sg = sg; - migf->sg_last_entry += i; - migf->last_offset = cur_offset; + buf->last_offset_sg = sg; + buf->sg_last_entry 
+= i; + buf->last_offset = cur_offset; return nth_page(sg_page(sg), (offset - cur_offset) / PAGE_SIZE); } @@ -63,8 +63,8 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, return NULL; } -static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf, - unsigned int npages) +int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, + unsigned int npages) { unsigned int to_alloc = npages; struct page **page_list; @@ -85,13 +85,13 @@ static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf, } to_alloc -= filled; ret = sg_alloc_append_table_from_pages( - &migf->table, page_list, filled, 0, + &buf->table, page_list, filled, 0, filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, GFP_KERNEL); if (ret) goto err; - migf->allocated_length += filled * PAGE_SIZE; + buf->allocated_length += filled * PAGE_SIZE; /* clean input for another bulk allocation */ memset(page_list, 0, filled * sizeof(*page_list)); to_fill = min_t(unsigned int, to_alloc, @@ -108,16 +108,8 @@ err: static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) { - struct sg_page_iter sg_iter; - mutex_lock(&migf->lock); - /* Undo alloc_pages_bulk_array() */ - for_each_sgtable_page(&migf->table.sgt, &sg_iter, 0) - __free_page(sg_page_iter_page(&sg_iter)); - sg_free_append_table(&migf->table); migf->disabled = true; - migf->total_length = 0; - migf->allocated_length = 0; migf->filp->f_pos = 0; mutex_unlock(&migf->lock); } @@ -136,6 +128,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; ssize_t done = 0; if (pos) @@ -144,16 +137,16 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(migf->total_length) || migf->is_err)) + READ_ONCE(vhca_buf->length) || migf->is_err)) return -ERESTARTSYS; } mutex_lock(&migf->lock); - if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(migf->total_length)) { + if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(vhca_buf->length)) { done = -EAGAIN; goto out_unlock; } - if (*pos > migf->total_length) { + if (*pos > vhca_buf->length) { done = -EINVAL; goto out_unlock; } @@ -162,7 +155,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, goto out_unlock; } - len = min_t(size_t, migf->total_length - *pos, len); + len = min_t(size_t, vhca_buf->length - *pos, len); while (len) { size_t page_offset; struct page *page; @@ -171,7 +164,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, int ret; page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(migf, *pos - page_offset); + page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); if (!page) { if (done == 0) done = -EINVAL; @@ -208,7 +201,7 @@ static __poll_t mlx5vf_save_poll(struct file *filp, mutex_lock(&migf->lock); if (migf->disabled || migf->is_err) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - else if (READ_ONCE(migf->total_length)) + else if (READ_ONCE(migf->buf->length)) pollflags = EPOLLIN | EPOLLRDNORM; mutex_unlock(&migf->lock); @@ -227,6 +220,8 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + struct mlx5_vhca_data_buffer *buf; + size_t length; int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); @@ -257,22 +252,23 @@ 
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, - &migf->total_length); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length); if (ret) goto out_pd; - ret = mlx5vf_add_migration_pages( - migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE)); - if (ret) + buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); goto out_pd; + } - ret = mlx5vf_cmd_save_vhca_state(mvdev, migf); + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf); if (ret) goto out_save; + migf->buf = buf; return migf; out_save: - mlx5vf_disable_fd(migf); + mlx5vf_free_data_buffer(buf); out_pd: mlx5vf_cmd_dealloc_pd(migf); out_free: @@ -286,6 +282,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; loff_t requested_length; ssize_t done = 0; @@ -306,10 +303,10 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, goto out_unlock; } - if (migf->allocated_length < requested_length) { + if (vhca_buf->allocated_length < requested_length) { done = mlx5vf_add_migration_pages( - migf, - DIV_ROUND_UP(requested_length - migf->allocated_length, + vhca_buf, + DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, PAGE_SIZE)); if (done) goto out_unlock; @@ -323,7 +320,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, int ret; page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(migf, *pos - page_offset); + page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); if (!page) { if (done == 0) done = -EINVAL; @@ -342,7 +339,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, len -= page_len; done += page_len; buf += page_len; - migf->total_length += page_len; + vhca_buf->length += page_len; } out_unlock: mutex_unlock(&migf->lock); @@ -360,6 +357,7 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + struct mlx5_vhca_data_buffer *buf; int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); @@ -378,9 +376,18 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) if (ret) goto out_free; + buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_pd; + } + + migf->buf = buf; stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); return migf; +out_pd: + mlx5vf_cmd_dealloc_pd(migf); out_free: fput(migf->filp); end: @@ -474,7 +481,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { ret = mlx5vf_cmd_load_vhca_state(mvdev, - mvdev->resuming_migf); + mvdev->resuming_migf, + mvdev->resuming_migf->buf); if (ret) return ERR_PTR(ret); mlx5vf_disable_fds(mvdev); -- cgit v1.2.3 From 8b599d143419669e57da3881d8293f17809688d7 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:30 +0200 Subject: vfio/mlx5: Refactor migration file state Refactor the migration file state to be a mutually exclusive enum. As part of this, drop the 'disabled' state, since 'error' is equivalent from a functional point of view.
Next patches from the series will extend this enum for other relevant states. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-7-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 2 +- drivers/vfio/pci/mlx5/cmd.h | 7 +++++-- drivers/vfio/pci/mlx5/main.c | 11 ++++++----- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'drivers/vfio/pci/mlx5') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index ed4c472d2eae..fcba12326185 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -351,7 +351,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { - migf->is_err = true; + migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index b0f08dfc8120..14403e654e4e 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -12,6 +12,10 @@ #include #include +enum mlx5_vf_migf_state { + MLX5_MIGF_STATE_ERROR = 1, +}; + struct mlx5_vhca_data_buffer { struct sg_append_table table; loff_t start_pos; @@ -37,8 +41,7 @@ struct mlx5vf_async_data { struct mlx5_vf_migration_file { struct file *filp; struct mutex lock; - u8 disabled:1; - u8 is_err:1; + enum mlx5_vf_migf_state state; u32 pdn; struct mlx5_vhca_data_buffer *buf; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 5f694fce854c..d95646c2f010 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -109,7 +109,7 @@ err: static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) { mutex_lock(&migf->lock); - migf->disabled = true; + migf->state = MLX5_MIGF_STATE_ERROR; migf->filp->f_pos = 0; mutex_unlock(&migf->lock); } @@ -137,7 +137,8 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(vhca_buf->length) || migf->is_err)) + READ_ONCE(vhca_buf->length) || + migf->state == MLX5_MIGF_STATE_ERROR)) return -ERESTARTSYS; } @@ -150,7 +151,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, done = -EINVAL; goto out_unlock; } - if (migf->disabled || migf->is_err) { + if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } @@ -199,7 +200,7 @@ static __poll_t mlx5vf_save_poll(struct file *filp, poll_wait(filp, &migf->poll_wait, wait); mutex_lock(&migf->lock); - if (migf->disabled || migf->is_err) + if (migf->state == MLX5_MIGF_STATE_ERROR) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; else if (READ_ONCE(migf->buf->length)) pollflags = EPOLLIN | EPOLLRDNORM; @@ -298,7 +299,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, return -ENOMEM; mutex_lock(&migf->lock); - if (migf->disabled) { + if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } -- cgit v1.2.3 From c668878381b5702f867ec7f43ee3b74259c6ea03 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:31 +0200 Subject: vfio/mlx5: Refactor to use queue based data chunks Refactor to use queue based data chunks on the migration file. The SAVE command adds a chunk to the tail of the queue while the read() API finds the required chunk and returns its data. 
In case the queue is empty but the migration file's state is MLX5_MIGF_STATE_COMPLETE, read() will not block but will return 0 to indicate end of file. This is a step towards maintaining multiple images and their metadata (i.e. headers) on the migration file, as part of the next patches in the series. Note: at this point we still use a single chunk on the migration file, but the code is now ready to support multiple chunks. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-8-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 24 +++++-- drivers/vfio/pci/mlx5/cmd.h | 5 ++ drivers/vfio/pci/mlx5/main.c | 145 +++++++++++++++++++++++++++++++++---------- 3 files changed, 136 insertions(+), 38 deletions(-) (limited to 'drivers/vfio/pci/mlx5') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index fcba12326185..0e36b4c8c816 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -351,6 +351,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { + migf->buf = async_data->buf; migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } @@ -368,9 +369,15 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { - WRITE_ONCE(migf->buf->length, - MLX5_GET(save_vhca_state_out, async_data->out, - actual_image_size)); + unsigned long flags; + + async_data->buf->length = + MLX5_GET(save_vhca_state_out, async_data->out, + actual_image_size); + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); + spin_unlock_irqrestore(&migf->list_lock, flags); + migf->state = MLX5_MIGF_STATE_COMPLETE; wake_up_interruptible(&migf->poll_wait); } @@ -407,6 +414,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); async_data = &migf->async_data; + async_data->buf = buf; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; @@ -479,14 +487,22 @@ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) { - lockdep_assert_held(&migf->mvdev->state_mutex); + struct mlx5_vhca_data_buffer *entry; + lockdep_assert_held(&migf->mvdev->state_mutex); WARN_ON(migf->mvdev->mdev_detach); if (migf->buf) { mlx5vf_free_data_buffer(migf->buf); migf->buf = NULL; } + + while ((entry = list_first_entry_or_null(&migf->buf_list, + struct mlx5_vhca_data_buffer, buf_elm))) { + list_del(&entry->buf_elm); + mlx5vf_free_data_buffer(entry); + } + mlx5vf_cmd_dealloc_pd(migf); } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 14403e654e4e..6e594689566e 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -14,6 +14,7 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, + MLX5_MIGF_STATE_COMPLETE, }; struct mlx5_vhca_data_buffer { @@ -24,6 +25,7 @@ struct mlx5_vhca_data_buffer { u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; + struct list_head buf_elm; struct mlx5_vf_migration_file *migf; /* Optimize mlx5vf_get_migration_page() for sequential access */ struct scatterlist *last_offset_sg; @@ -34,6 +36,7 @@ struct mlx5_vhca_data_buffer { struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; + struct mlx5_vhca_data_buffer *buf; int
status; void *out; }; @@ -45,6 +48,8 @@ struct mlx5_vf_migration_file { u32 pdn; struct mlx5_vhca_data_buffer *buf; + spinlock_t list_lock; + struct list_head buf_list; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index d95646c2f010..ca16425811c4 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -124,11 +124,90 @@ static int mlx5vf_release_file(struct inode *inode, struct file *filp) return 0; } +static struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos, + bool *end_of_data) +{ + struct mlx5_vhca_data_buffer *buf; + bool found = false; + + *end_of_data = false; + spin_lock_irq(&migf->list_lock); + if (list_empty(&migf->buf_list)) { + *end_of_data = true; + goto end; + } + + buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer, + buf_elm); + if (pos >= buf->start_pos && + pos < buf->start_pos + buf->length) { + found = true; + goto end; + } + + /* + * As we use a stream based FD we may expect having the data always + * on first chunk + */ + migf->state = MLX5_MIGF_STATE_ERROR; + +end: + spin_unlock_irq(&migf->list_lock); + return found ? buf : NULL; +} + +static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, + char __user **buf, size_t *len, loff_t *pos) +{ + unsigned long offset; + ssize_t done = 0; + size_t copy_len; + + copy_len = min_t(size_t, + vhca_buf->start_pos + vhca_buf->length - *pos, *len); + while (copy_len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *from_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + offset -= page_offset; + page = mlx5vf_get_migration_page(vhca_buf, offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset); + from_buff = kmap_local_page(page); + ret = copy_to_user(*buf, from_buff + page_offset, page_len); + kunmap_local(from_buff); + if (ret) + return -EFAULT; + *pos += page_len; + *len -= page_len; + *buf += page_len; + done += page_len; + copy_len -= page_len; + } + + if (*pos >= vhca_buf->start_pos + vhca_buf->length) { + spin_lock_irq(&vhca_buf->migf->list_lock); + list_del_init(&vhca_buf->buf_elm); + spin_unlock_irq(&vhca_buf->migf->list_lock); + } + + return done; +} + static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; - struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; + struct mlx5_vhca_data_buffer *vhca_buf; + bool first_loop_call = true; + bool end_of_data; ssize_t done = 0; if (pos) @@ -137,53 +216,47 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(vhca_buf->length) || - migf->state == MLX5_MIGF_STATE_ERROR)) + !list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_ERROR || + migf->state == MLX5_MIGF_STATE_COMPLETE)) return -ERESTARTSYS; } mutex_lock(&migf->lock); - if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(vhca_buf->length)) { - done = -EAGAIN; - goto out_unlock; - } - if (*pos > vhca_buf->length) { - done = -EINVAL; - goto out_unlock; - } if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } - len = min_t(size_t, vhca_buf->length - *pos, len); while (len) { - size_t page_offset; - struct page *page; - size_t 
page_len; - u8 *from_buff; - int ret; + ssize_t count; + + vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos, + &end_of_data); + if (first_loop_call) { + first_loop_call = false; + if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) { + if (filp->f_flags & O_NONBLOCK) { + done = -EAGAIN; + goto out_unlock; + } + } + } - page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); - if (!page) { - if (done == 0) - done = -EINVAL; + if (end_of_data) + goto out_unlock; + + if (!vhca_buf) { + done = -EINVAL; goto out_unlock; } - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); - from_buff = kmap_local_page(page); - ret = copy_to_user(buf, from_buff + page_offset, page_len); - kunmap_local(from_buff); - if (ret) { - done = -EFAULT; + count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos); + if (count < 0) { + done = count; goto out_unlock; } - *pos += page_len; - len -= page_len; - done += page_len; - buf += page_len; + done += count; } out_unlock: @@ -202,7 +275,8 @@ static __poll_t mlx5vf_save_poll(struct file *filp, mutex_lock(&migf->lock); if (migf->state == MLX5_MIGF_STATE_ERROR) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - else if (READ_ONCE(migf->buf->length)) + else if (!list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_COMPLETE) pollflags = EPOLLIN | EPOLLRDNORM; mutex_unlock(&migf->lock); @@ -253,6 +327,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); + INIT_LIST_HEAD(&migf->buf_list); + spin_lock_init(&migf->list_lock); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length); if (ret) goto out_pd; @@ -266,7 +342,6 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf); if (ret) goto out_save; - migf->buf = buf; return migf; out_save: mlx5vf_free_data_buffer(buf); @@ -386,6 +461,8 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->buf = buf; stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + INIT_LIST_HEAD(&migf->buf_list); + spin_lock_init(&migf->list_lock); return migf; out_pd: mlx5vf_cmd_dealloc_pd(migf); -- cgit v1.2.3 From 3319d287f4c04b9deece8ea00e27a70bbe32941b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:32 +0200 Subject: vfio/mlx5: Introduce device transitions of PRE_COPY In order to support PRE_COPY, the mlx5 driver transfers multiple states (images) of the device: the source VF can save and transfer multiple states, and the target VF will load them in that order. The device saves three kinds of states: 1) Initial state - when the device moves to the PRE_COPY state. 2) Middle states - during the PRE_COPY phase, via VFIO_MIG_GET_PRECOPY_INFO; there can be multiple states of this kind. 3) Final state - when the device moves to the STOP_COPY state. After moving to the PRE_COPY state, the user holds the saving migf FD and can use it; for example, the user can start transferring data via the read() callback. The user can also switch from PRE_COPY to STOP_COPY whenever they see fit, which invokes saving of the final state. This means the mlx5 VFIO device can be switched to STOP_COPY without transferring any data in the PRE_COPY state. Therefore, when the device moves to STOP_COPY, mlx5 will store the final state on a dedicated queue entry on the list.
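From user space, a PRE_COPY save can then be driven as in the sketch below. It uses the generic VFIO migration UAPI (VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE and struct vfio_device_feature_mig_state); device_fd, the helper names and the minimal error handling are illustrative only:

#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int set_mig_state(int device_fd, __u32 new_state)
{
	struct {
		struct vfio_device_feature hdr;
		struct vfio_device_feature_mig_state state;
	} set = {};

	set.hdr.argsz = sizeof(set);
	set.hdr.flags = VFIO_DEVICE_FEATURE_SET |
			VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	set.state.device_state = new_state;
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, &set.hdr))
		return -1;
	return set.state.data_fd;	/* saving FD, when one is created */
}

static int precopy_save(int device_fd)
{
	char chunk[4096];
	ssize_t n;
	int data_fd;

	/* RUNNING -> PRE_COPY: the initial state becomes readable. */
	data_fd = set_mig_state(device_fd, VFIO_DEVICE_STATE_PRE_COPY);
	if (data_fd < 0)
		return -1;

	/*
	 * While the device keeps running, read() transfers the initial
	 * and middle states; poll() plus VFIO_MIG_GET_PRECOPY_INFO on
	 * data_fd tell the caller how much is left and when stopping
	 * is cheap enough.
	 */

	/* PRE_COPY -> STOP_COPY: the device stops and queues the final state. */
	if (set_mig_state(device_fd, VFIO_DEVICE_STATE_STOP_COPY) < 0)
		return -1;
	while ((n = read(data_fd, chunk, sizeof(chunk))) > 0)
		;	/* forward the final state; 0 marks end of stream */

	close(data_fd);
	return n < 0 ? -1 : 0;
}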
Co-developed-by: Shay Drory Signed-off-by: Shay Drory Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-9-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 96 ++++++++++++++++++++++++++++++++++++++++---- drivers/vfio/pci/mlx5/cmd.h | 16 +++++++- drivers/vfio/pci/mlx5/main.c | 90 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 184 insertions(+), 18 deletions(-) (limited to 'drivers/vfio/pci/mlx5') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 0e36b4c8c816..5fcece201d4c 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -14,18 +14,36 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) { + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {}; u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {}; + int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; + /* + * In case PRE_COPY is used, saving_migf is exposed while the device is + * running. Make sure to run only once there is no active save command. + * Running both in parallel, might end-up with a failure in the save + * command once it will try to turn on 'tracking' on a suspended device. + */ + if (migf) { + err = wait_for_completion_interruptible(&migf->save_comp); + if (err) + return err; + } + MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA); MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(suspend_vhca_in, in, op_mod, op_mod); - return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); + err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); + if (migf) + complete(&migf->save_comp); + + return err; } int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) @@ -45,7 +63,7 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) } int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size) + size_t *state_size, u8 query_flags) { u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; @@ -59,6 +77,8 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); + MLX5_SET(query_vhca_migration_state_in, in, incremental, + query_flags & MLX5VF_QUERY_INC); ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, out); @@ -342,6 +362,56 @@ end: return ERR_PTR(ret); } +void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + spin_lock_irq(&buf->migf->list_lock); + list_add_tail(&buf->buf_elm, &buf->migf->avail_list); + spin_unlock_irq(&buf->migf->list_lock); +} + +struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir) +{ + struct mlx5_vhca_data_buffer *buf, *temp_buf; + struct list_head free_list; + + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return ERR_PTR(-ENOTCONN); + + INIT_LIST_HEAD(&free_list); + + spin_lock_irq(&migf->list_lock); + list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { + if (buf->dma_dir == dma_dir) { + list_del_init(&buf->buf_elm); + if 
(buf->allocated_length >= length) { + spin_unlock_irq(&migf->list_lock); + goto found; + } + /* + * Prevent holding redundant buffers. Put in a free + * list and call at the end not under the spin lock + * (&migf->list_lock) to mlx5vf_free_data_buffer which + * might sleep. + */ + list_add(&buf->buf_elm, &free_list); + } + } + spin_unlock_irq(&migf->list_lock); + buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir); + +found: + while ((temp_buf = list_first_entry_or_null(&free_list, + struct mlx5_vhca_data_buffer, buf_elm))) { + list_del(&temp_buf->buf_elm); + mlx5vf_free_data_buffer(temp_buf); + } + + return buf; +} + void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, @@ -351,7 +421,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { - migf->buf = async_data->buf; + mlx5vf_put_data_buffer(async_data->buf); migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } @@ -369,15 +439,19 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { + size_t image_size; unsigned long flags; - async_data->buf->length = - MLX5_GET(save_vhca_state_out, async_data->out, - actual_image_size); + image_size = MLX5_GET(save_vhca_state_out, async_data->out, + actual_image_size); + async_data->buf->length = image_size; + async_data->buf->start_pos = migf->max_pos; + migf->max_pos += async_data->buf->length; spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); - migf->state = MLX5_MIGF_STATE_COMPLETE; + if (async_data->last_chunk) + migf->state = MLX5_MIGF_STATE_COMPLETE; wake_up_interruptible(&migf->poll_wait); } @@ -391,7 +465,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, - struct mlx5_vhca_data_buffer *buf) + struct mlx5_vhca_data_buffer *buf, bool inc, + bool track) { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; @@ -412,9 +487,12 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); + MLX5_SET(save_vhca_state_in, in, incremental, inc); + MLX5_SET(save_vhca_state_in, in, set_track, track); async_data = &migf->async_data; async_data->buf = buf; + async_data->last_chunk = !track; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; @@ -497,6 +575,8 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) migf->buf = NULL; } + list_splice(&migf->avail_list, &migf->buf_list); + while ((entry = list_first_entry_or_null(&migf->buf_list, struct mlx5_vhca_data_buffer, buf_elm))) { list_del(&entry->buf_elm); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 6e594689566e..34e61c7aa23d 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -38,6 +38,7 @@ struct mlx5vf_async_data { struct work_struct work; struct mlx5_vhca_data_buffer *buf; int status; + u8 last_chunk:1; void *out; }; @@ -47,9 +48,11 @@ struct mlx5_vf_migration_file { enum mlx5_vf_migf_state state; u32 pdn; + loff_t max_pos; struct 
mlx5_vhca_data_buffer *buf; spinlock_t list_lock; struct list_head buf_list; + struct list_head avail_list; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; @@ -129,10 +132,14 @@ struct mlx5vf_pci_core_device { struct mlx5_core_dev *mdev; }; +enum { + MLX5VF_QUERY_INC = (1UL << 0), +}; + int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size); + size_t *state_size, u8 query_flags); void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, const struct vfio_migration_ops *mig_ops, const struct vfio_log_ops *log_ops); @@ -140,7 +147,8 @@ void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, - struct mlx5_vhca_data_buffer *buf); + struct mlx5_vhca_data_buffer *buf, bool inc, + bool track); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, struct mlx5_vhca_data_buffer *buf); @@ -151,6 +159,10 @@ struct mlx5_vhca_data_buffer * mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, size_t length, enum dma_data_direction dma_dir); void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); +struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir); +void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, unsigned int npages); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index ca16425811c4..9cabba456044 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -195,6 +195,7 @@ static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, if (*pos >= vhca_buf->start_pos + vhca_buf->length) { spin_lock_irq(&vhca_buf->migf->list_lock); list_del_init(&vhca_buf->buf_elm); + list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); spin_unlock_irq(&vhca_buf->migf->list_lock); } @@ -283,6 +284,16 @@ static __poll_t mlx5vf_save_poll(struct file *filp, return pollflags; } +/* + * FD is exposed and user can use it after receiving an error. + * Mark migf in error, and wake the user. 
+ */ +static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) +{ + migf->state = MLX5_MIGF_STATE_ERROR; + wake_up_interruptible(&migf->poll_wait); +} + static const struct file_operations mlx5vf_save_fops = { .owner = THIS_MODULE, .read = mlx5vf_save_read, @@ -291,8 +302,42 @@ static const struct file_operations mlx5vf_save_fops = { .llseek = no_llseek, }; +static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) +{ + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; + struct mlx5_vhca_data_buffer *buf; + size_t length; + int ret; + + if (migf->state == MLX5_MIGF_STATE_ERROR) + return -ENODEV; + + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, + MLX5VF_QUERY_INC); + if (ret) + goto err; + + buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } + + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); + if (ret) + goto err_save; + + return 0; + +err_save: + mlx5vf_put_data_buffer(buf); +err: + mlx5vf_mark_err(migf); + return ret; +} + static struct mlx5_vf_migration_file * -mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) +mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) { struct mlx5_vf_migration_file *migf; struct