// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
*/
#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>
#include "cmd.h"
/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
#define MAX_CHUNK_SIZE SZ_8M
static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
return container_of(core_device, struct mlx5vf_pci_core_device,
core_device);
}
struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
unsigned long offset)
{
unsigned long cur_offset = 0;
struct scatterlist *sg;
unsigned int i;
/* All accesses are sequential */
if (offset < buf->last_offset || !buf->last_offset_sg) {
buf->last_offset = 0;
buf->last_offset_sg = buf->table.sgt.sgl;
buf->sg_last_entry = 0;
}
cur_offset = buf->last_offset;
for_each_sg(buf->last_offset_sg, sg,
buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
if (offset < sg->length + cur_offset) {
buf->last_offset_sg = sg;
buf->sg_last_entry += i;
buf->last_offset = cur_offset;
return nth_page(sg_page(sg),
(offset - cur_offset) / PAGE_SIZE);
}
cur_offset += sg->length;
}
return NULL;
}
int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
unsigned int npages)
{
unsigned int to_alloc = npages;
struct page **page_list;
unsigned long filled;
unsigned int to_fill;
int ret;
to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
if (!page_list)
return -ENOMEM;
do {
filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
page_list);
if (!filled) {
ret = -ENOMEM;
goto err;
}
to_alloc -= filled;
ret = sg_alloc_append_table_from_pages(
&buf->table, page_list, filled, 0,
filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
GFP_KERNEL_ACCOUNT);
if (ret)
goto err;
buf->allocated_length += filled * PAGE_SIZE;
/* clean input for another bulk allocation */
memset(page_list, 0, filled * sizeof(*page_list));
to_fill = min_t(unsigned int, to_alloc,
PAGE_SIZE / sizeof(*page_list));
} while (to_alloc > 0);
kvfree(page_list);
return 0;
err:
kvfree(page_list);
return ret;
}
static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
mutex_lock(&migf->lock);
migf->state = MLX5_MIGF_STATE_ERROR;
migf->filp->f_pos = 0;
mutex_unlock(&migf->lock);
}
static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
struct mlx5_vf_migration_file *migf = filp->private_data;
mlx5vf_disable_fd(migf);
mutex_destroy(&migf->lock);
kfree(migf);
return 0;
}
static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
bool *end_of_data)
{
struct mlx5_vhca_data_buffer *buf;
bool found = false;
*end_of_data = false;
spin_lock_irq(&migf->list_lock);
if (list_empty(&migf->buf_list)) {
*end_of_data = true;
goto end;
}
buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
buf_elm);
if (pos >= buf->start_pos &&
pos < buf->start_pos + buf->length) {
found = true;
goto end;
}
/*
* As we use a stream based FD we may expect having the data always
* on first chunk
*/
migf->state = MLX5_MIGF_STATE_ERROR;
end:
spin_unlock_irq(&migf->list_lock);
return found ? buf : NULL;
}
static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
{
struct mlx5_vf_migration_file *migf = vhca_buf->migf;
if (vhca_buf->stop_copy_chunk_num) {
bool is_header = vhca_buf->dma_dir == DMA_NONE;
u8 chunk_num = vhca_buf->stop_copy_chunk_num;
size_t next_required_umem_size = 0;
if (is_header)
migf->buf_header[chunk_num - 1] = vhca_buf;
else
migf->buf[chunk_num - 1] = vhca_buf;
spin_lock_irq(&migf->list_lock);
list_del_init(&vhca_buf->buf_elm);
if (!is_header) {
next_required_umem_size =
migf->next_required_umem_size;
migf->next_required_umem_size = 0;
migf->num_ready_chunks--;
}
spin_unlock_irq(&migf->list_lock);
if (next_required_umem_size)
mlx5vf_mig_file_set_save_work(migf, chunk_num,
next_required_umem_size);
return;
}
spin_lock_irq(&migf->list_lock);
list_del_init(&vhca_buf->buf_elm);
list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
spin_unlock_irq(&migf->list_lock);
}
static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
char __user **buf, size_t *len, loff_t *pos)
{
unsigned long offset;
ssize_t done = 0;
size_t copy_len;
copy_len = min_t(size_t,
vhca_buf->start_pos + vhca_buf->length - *pos, *len);
while (copy_len) {
size_t page_offset;
struct page *page;
size_t page_len;
u8 *from_buff;
int ret;
offset = *pos - vhca_buf->start_pos;
page_offset = offset % PAGE_SIZE;
offset -= page_offset;
page = mlx5vf_get_migration_page(vhca_buf, offset);
if (!page)
return -EINVAL;
page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
from_buff = kmap_local_page(page);
ret = copy_to_user(*buf, from_buff + page_offset, page_len);
kunmap_local(from_buff);
if (ret)
return -EFAULT;
*pos += page_len;
*len -= page_len;
*buf += page_len;
done += page_len;
copy_len -= page_len;
}
if (*pos >= vhca_buf->start_pos + vhca_buf->length)
mlx5vf_buf_read_done(vhca_buf);
return done;
}
static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
loff_t *pos)
{
struct mlx5_vf_migration_file *migf = filp->private_data;
struct mlx5_vhca_data_buffer *vhca_buf;
bool first_loop_call = true;
bool end_of_data;
ssize_t done = 0;
if (pos)
return -ESPIPE;
pos = &filp->f_pos;
if (!(filp->f_flags & O_NONBLOCK)) {
if (wait_event_interruptible(migf->poll_wait,
!list_empty(&migf->buf_list) ||
migf->state == MLX5_MIGF_STATE_ERROR ||
migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
migf->state == MLX5_MIGF_STATE_PRE_COPY ||
migf->state == MLX5_MIGF_STATE_COMPLETE))
return -ERESTARTSYS;
}
mutex_lock(&migf->lock);
if (migf->state == MLX5_MIGF_STATE_ERROR) {
done = -ENODEV;
goto out_unlock;
}
while (len) {
ssize_t count;
vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
&end_of_data);
if (first_loop_call) {
first_loop_call = false;
/* Temporary end of file as part of PRE_COPY */
if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
done = -ENOMSG;
goto out_unlock;
}
if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
i
|