summaryrefslogtreecommitdiff
path: root/drivers/vhost
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-08-06 09:20:13 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-08-06 09:20:13 -0400
commit0803e04011c2e107b9611660301edde94d7010cc (patch)
tree75699c1999c71a93dc8194a9cac338412e36d78d /drivers/vhost
parent80fac0f577a35c437219a2786c1804ab8ca1e998 (diff)
parentb226acab2f6aaa45c2af27279b63f622b23a44bd (diff)
downloadlinux-0803e04011c2e107b9611660301edde94d7010cc.tar.gz
linux-0803e04011c2e107b9611660301edde94d7010cc.tar.bz2
linux-0803e04011c2e107b9611660301edde94d7010cc.zip
Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
Pull virtio/vhost updates from Michael Tsirkin: - new vsock device support in host and guest - platform IOMMU support in host and guest, including compatibility quirks for legacy systems. - misc fixes and cleanups. * tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: VSOCK: Use kvfree() vhost: split out vringh Kconfig vhost: detect 32 bit integer wrap around vhost: new device IOTLB API vhost: drop vringh dependency vhost: convert pre sorted vhost memory array to interval tree vhost: introduce vhost memory accessors VSOCK: Add Makefile and Kconfig VSOCK: Introduce vhost_vsock.ko VSOCK: Introduce virtio_transport.ko VSOCK: Introduce virtio_vsock_common.ko VSOCK: defer sock removal to transports VSOCK: transport-specific vsock_transport functions vhost: drop vringh dependency vop: pull in vhost Kconfig virtio: new feature to detect IOMMU device quirk balloon: check the number of available pages in leak balloon vhost: lockless enqueuing vhost: simplify work flushing
Diffstat (limited to 'drivers/vhost')
-rw-r--r--drivers/vhost/Kconfig18
-rw-r--r--drivers/vhost/Kconfig.vringh5
-rw-r--r--drivers/vhost/Makefile4
-rw-r--r--drivers/vhost/net.c67
-rw-r--r--drivers/vhost/vhost.c927
-rw-r--r--drivers/vhost/vhost.h64
-rw-r--r--drivers/vhost/vsock.c719
7 files changed, 1612 insertions, 192 deletions
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 533eaf04f12f..40764ecad9ce 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -2,7 +2,6 @@ config VHOST_NET
tristate "Host kernel accelerator for virtio net"
depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
select VHOST
- select VHOST_RING
---help---
This kernel module can be loaded in host kernel to accelerate
guest networking with virtio_net. Not to be confused with virtio_net
@@ -15,17 +14,24 @@ config VHOST_SCSI
tristate "VHOST_SCSI TCM fabric driver"
depends on TARGET_CORE && EVENTFD && m
select VHOST
- select VHOST_RING
default n
---help---
Say M here to enable the vhost_scsi TCM fabric module
for use with virtio-scsi guests
-config VHOST_RING
- tristate
+config VHOST_VSOCK
+ tristate "vhost virtio-vsock driver"
+ depends on VSOCKETS && EVENTFD
+ select VIRTIO_VSOCKETS_COMMON
+ select VHOST
+ default n
---help---
- This option is selected by any driver which needs to access
- the host side of a virtio ring.
+ This kernel module can be loaded in the host kernel to provide AF_VSOCK
+ sockets for communicating with guests. The guests must have the
+ virtio_transport.ko driver loaded to use the virtio-vsock device.
+
+ To compile this driver as a module, choose M here: the module will be called
+ vhost_vsock.
config VHOST
tristate
diff --git a/drivers/vhost/Kconfig.vringh b/drivers/vhost/Kconfig.vringh
new file mode 100644
index 000000000000..6a4490c09d7f
--- /dev/null
+++ b/drivers/vhost/Kconfig.vringh
@@ -0,0 +1,5 @@
+config VHOST_RING
+ tristate
+ ---help---
+ This option is selected by any driver which needs to access
+ the host side of a virtio ring.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index e0441c34db1c..6b012b986b57 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -4,5 +4,9 @@ vhost_net-y := net.o
obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
vhost_scsi-y := scsi.o
+obj-$(CONFIG_VHOST_VSOCK) += vhost_vsock.o
+vhost_vsock-y := vsock.o
+
obj-$(CONFIG_VHOST_RING) += vringh.o
+
obj-$(CONFIG_VHOST) += vhost.o
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e032ca397371..5dc128a8da83 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -61,7 +61,8 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
enum {
VHOST_NET_FEATURES = VHOST_FEATURES |
(1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
- (1ULL << VIRTIO_NET_F_MRG_RXBUF)
+ (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+ (1ULL << VIRTIO_F_IOMMU_PLATFORM)
};
enum {
@@ -334,7 +335,7 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
{
unsigned long uninitialized_var(endtime);
int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
- out_num, in_num, NULL, NULL);
+ out_num, in_num, NULL, NULL);
if (r == vq->num && vq->busyloop_timeout) {
preempt_disable();
@@ -344,7 +345,7 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
cpu_relax_lowlatency();
preempt_enable();
r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
- out_num, in_num, NULL, NULL);
+ out_num, in_num, NULL, NULL);
}
return r;
@@ -377,6 +378,9 @@ static void handle_tx(struct vhost_net *net)
if (!sock)
goto out;
+ if (!vq_iotlb_prefetch(vq))
+ goto out;
+
vhost_disable_notify(&net->dev, vq);
hdr_size = nvq->vhost_hlen;
@@ -652,6 +656,10 @@ static void handle_rx(struct vhost_net *net)
sock = vq->private_data;
if (!sock)
goto out;
+
+ if (!vq_iotlb_prefetch(vq))
+ goto out;
+
vhost_disable_notify(&net->dev, vq);
vhost_net_disable_vq(net, vq);
@@ -1052,20 +1060,20 @@ static long vhost_net_reset_owner(struct vhost_net *n)
struct socket *tx_sock = NULL;
struct socket *rx_sock = NULL;
long err;
- struct vhost_memory *memory;
+ struct vhost_umem *umem;
mutex_lock(&n->dev.mutex);
err = vhost_dev_check_owner(&n->dev);
if (err)
goto done;
- memory = vhost_dev_reset_owner_prepare();
- if (!memory) {
+ umem = vhost_dev_reset_owner_prepare();
+ if (!umem) {
err = -ENOMEM;
goto done;
}
vhost_net_stop(n, &tx_sock, &rx_sock);
vhost_net_flush(n);
- vhost_dev_reset_owner(&n->dev, memory);
+ vhost_dev_reset_owner(&n->dev, umem);
vhost_net_vq_reset(n);
done:
mutex_unlock(&n->dev.mutex);
@@ -1096,10 +1104,14 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
}
mutex_lock(&n->dev.mutex);
if ((features & (1 << VHOST_F_LOG_ALL)) &&
- !vhost_log_access_ok(&n->dev)) {
- mutex_unlock(&n->dev.mutex);
- return -EFAULT;
+ !vhost_log_access_ok(&n->dev))
+ goto out_unlock;
+
+ if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
+ if (vhost_init_device_iotlb(&n->dev, true))
+ goto out_unlock;
}
+
for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
mutex_lock(&n->vqs[i].vq.mutex);
n->vqs[i].vq.acked_features = features;
@@ -1109,6 +1121,10 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
}
mutex_unlock(&n->dev.mutex);
return 0;
+
+out_unlock:
+ mutex_unlock(&n->dev.mutex);
+ return -EFAULT;
}
static long vhost_net_set_owner(struct vhost_net *n)
@@ -1182,9 +1198,40 @@ static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
}
#endif
+static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ struct vhost_net *n = file->private_data;
+ struct vhost_dev *dev = &n->dev;
+ int noblock = file->f_flags & O_NONBLOCK;
+
+ return vhost_chr_read_iter(dev, to, noblock);
+}
+
+static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct vhost_net *n = file->private_data;
+ struct vhost_dev *dev = &n->dev;
+
+ return vhost_chr_write_iter(dev, from);
+}
+
+static unsigned int vhost_net_chr_poll(struct file *file, poll_table *wait)
+{
+ struct vhost_net *n = file->private_data;
+ struct vhost_dev *dev = &n->dev;
+
+ return vhost_chr_poll(file, dev, wait);
+}
+
static const struct file_operations vhost_net_fops = {
.owner = THIS_MODULE,
.release = vhost_net_release,
+ .read_iter = vhost_net_chr_read_iter,
+ .write_iter = vhost_net_chr_write_iter,
+ .poll = vhost_net_chr_poll,
.unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = vhost_net_compat_ioctl,
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 669fef1e2bb6..c6f2d89c0e97 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -27,6 +27,7 @@
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/sort.h>
+#include <linux/interval_tree_generic.h>
#include "vhost.h"
@@ -34,6 +35,10 @@ static ushort max_mem_regions = 64;
module_param(max_mem_regions, ushort, 0444);
MODULE_PARM_DESC(max_mem_regions,
"Maximum number of memory regions in memory map. (default: 64)");
+static int max_iotlb_entries = 2048;
+module_param(max_iotlb_entries, int, 0444);
+MODULE_PARM_DESC(max_iotlb_entries,
+ "Maximum number of iotlb entries. (default: 2048)");
enum {
VHOST_MEMORY_F_LOG = 0x1,
@@ -42,6 +47,10 @@ enum {
#define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
#define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
+INTERVAL_TREE_DEFINE(struct vhost_umem_node,
+ rb, __u64, __subtree_last,
+ START, LAST, , vhost_umem_interval_tree);
+
#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
@@ -131,6 +140,19 @@ static void vhost_reset_is_le(struct vhost_virtqueue *vq)
vq->is_le = virtio_legacy_is_little_endian();
}
+struct vhost_flush_struct {
+ struct vhost_work work;
+ struct completion wait_event;
+};
+
+static void vhost_flush_work(struct vhost_work *work)
+{
+ struct vhost_flush_struct *s;
+
+ s = container_of(work, struct vhost_flush_struct, work);
+ complete(&s->wait_event);
+}
+
static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
@@ -155,11 +177,9 @@ static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
{
- INIT_LIST_HEAD(&work->node);
+ clear_bit(VHOST_WORK_QUEUED, &work->flags);
work->fn = fn;
init_waitqueue_head(&work->done);
- work->flushing = 0;
- work->queue_seq = work->done_seq = 0;
}
EXPORT_SYMBOL_GPL(vhost_work_init);
@@ -211,31 +231,17 @@ void vhost_poll_stop(struct vhost_poll *poll)
}
EXPORT_SYMBOL_GPL(vhost_poll_stop);
-static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
- unsigned seq)
-{
- int left;
-
- spin_lock_irq(&dev->work_lock);
- left = seq - work->done_seq;
- spin_unlock_irq(&dev->work_lock);
- return left <= 0;
-}
-
void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
{
- unsigned seq;
- int flushing;
+ struct vhost_flush_struct flush;
- spin_lock_irq(&dev->work_lock);
- seq = work->queue_seq;
- work->flushing++;
- spin_unlock_irq(&dev->work_lock);
- wait_event(work->done, vhost_work_seq_done(dev, work, seq));
- spin_lock_irq(&dev->work_lock);
- flushing = --work->flushing;
- spin_unlock_irq(&dev->work_lock);
- BUG_ON(flushing < 0);
+ if (dev->worker) {
+ init_completion(&flush.wait_event);
+ vhost_work_init(&flush.work, vhost_flush_work);
+
+ vhost_work_queue(dev, &flush.work);
+ wait_for_completion(&flush.wait_event);
+ }
}
EXPORT_SYMBOL_GPL(vhost_work_flush);
@@ -249,16 +255,16 @@ EXPORT_SYMBOL_GPL(vhost_poll_flush);
void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
{
- unsigned long flags;
+ if (!dev->worker)
+ return;
- spin_lock_irqsave(&dev->work_lock, flags);
- if (list_empty(&work->node)) {
- list_add_tail(&work->node, &dev->work_list);
- work->queue_seq++;
- spin_unlock_irqrestore(&dev->work_lock, flags);
+ if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
+ /* We can only add the work to the list after we're
+ * sure it was not in the list.
+ */
+ smp_mb();
+ llist_add(&work->node, &dev->work_list);
wake_up_process(dev->worker);
- } else {
- spin_unlock_irqrestore(&dev->work_lock, flags);
}
}
EXPORT_SYMBOL_GPL(vhost_work_queue);
@@ -266,7 +272,7 @@ EXPORT_SYMBOL_GPL(vhost_work_queue);
/* A lockless hint for busy polling code to exit the loop */
bool vhost_has_work(struct vhost_dev *dev)
{
- return !list_empty(&dev->work_list);
+ return !llist_empty(&dev->work_list);
}
EXPORT_SYMBOL_GPL(vhost_has_work);
@@ -300,17 +306,18 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->call_ctx = NULL;
vq->call = NULL;
vq->log_ctx = NULL;
- vq->memory = NULL;
vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);
vq->busyloop_timeout = 0;
+ vq->umem = NULL;
+ vq->iotlb = NULL;
}
static int vhost_worker(void *data)
{
struct vhost_dev *dev = data;
- struct vhost_work *work = NULL;
- unsigned uninitialized_var(seq);
+ struct vhost_work *work, *work_next;
+ struct llist_node *node;
mm_segment_t oldfs = get_fs();
set_fs(USER_DS);
@@ -320,35 +327,25 @@ static int vhost_worker(void *data)
/* mb paired w/ kthread_stop */
set_current_state(TASK_INTERRUPTIBLE);
- spin_lock_irq(&dev->work_lock);
- if (work) {
- work->done_seq = seq;
- if (work->flushing)
- wake_up_all(&work->done);
- }
-
if (kthread_should_stop()) {
- spin_unlock_irq(&dev->work_lock);
__set_current_state(TASK_RUNNING);
break;
}
- if (!list_empty(&dev->work_list)) {
- work = list_first_entry(&dev->work_list,
- struct vhost_work, node);
- list_del_init(&work->node);
- seq = work->queue_seq;
- } else
- work = NULL;
- spin_unlock_irq(&dev->work_lock);
- if (work) {
+ node = llist_del_all(&dev->work_list);
+ if (!node)
+ schedule();
+
+ node = llist_reverse_order(node);
+ /* make sure flag is seen after deletion */
+ smp_wmb();
+ llist_for_each_entry_safe(work, work_next, node, node) {
+ clear_bit(VHOST_WORK_QUEUED, &work->flags);
__set_current_state(TASK_RUNNING);
work->fn(work);
if (need_resched())
schedule();
- } else
- schedule();
-
+ }
}
unuse_mm(dev->mm);
set_fs(oldfs);
@@ -407,11 +404,16 @@ void vhost_dev_init(struct vhost_dev *dev,
mutex_init(&dev->mutex);
dev->log_ctx = NULL;
dev->log_file = NULL;
- dev->memory = NULL;
+ dev->umem = NULL;
+ dev->iotlb = NULL;
dev->mm = NULL;
- spin_lock_init(&dev->work_lock);
- INIT_LIST_HEAD(&dev->work_list);
dev->worker = NULL;
+ init_llist_head(&dev->work_list);
+ init_waitqueue_head(&dev->wait);
+ INIT_LIST_HEAD(&dev->read_list);
+ INIT_LIST_HEAD(&dev->pending_list);
+ spin_lock_init(&dev->iotlb_lock);
+
for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
@@ -512,27 +514,36 @@ err_mm:
}
EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
-struct vhost_memory *vhost_dev_reset_owner_prepare(void)
+static void *vhost_kvzalloc(unsigned long size)
{
- return kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL);
+ void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+
+ if (!n)
+ n = vzalloc(size);
+ return n;
+}
+
+struct vhost_umem *vhost_dev_reset_owner_prepare(void)
+{
+ return vhost_kvzalloc(sizeof(struct vhost_umem));
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
/* Caller should have device mutex */
-void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_memory *memory)
+void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem)
{
int i;
vhost_dev_cleanup(dev, true);
/* Restore memory to default empty mapping. */
- memory->nregions = 0;
- dev->memory = memory;
+ INIT_LIST_HEAD(&umem->umem_list);
+ dev->umem = umem;
/* We don't need VQ locks below since vhost_dev_cleanup makes sure
* VQs aren't running.
*/
for (i = 0; i < dev->nvqs; ++i)
- dev->vqs[i]->memory = memory;
+ dev->vqs[i]->umem = umem;
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);
@@ -549,6 +560,47 @@ void vhost_dev_stop(struct vhost_dev *dev)
}
EXPORT_SYMBOL_GPL(vhost_dev_stop);
+static void vhost_umem_free(struct vhost_umem *umem,
+ struct vhost_umem_node *node)
+{
+ vhost_umem_interval_tree_remove(node, &umem->umem_tree);
+ list_del(&node->link);
+ kfree(node);
+ umem->numem--;
+}
+
+static void vhost_umem_clean(struct vhost_umem *umem)
+{
+ struct vhost_umem_node *node, *tmp;
+
+ if (!umem)
+ return;
+
+ list_for_each_entry_safe(node, tmp, &umem->umem_list, link)
+ vhost_umem_free(umem, node);
+
+ kvfree(umem);
+}
+
+static void vhost_clear_msg(struct vhost_dev *dev)
+{
+ struct vhost_msg_node *node, *n;
+
+ spin_lock(&dev->iotlb_lock);
+
+ list_for_each_entry_safe(node, n, &dev->read_list, node) {
+ list_del(&node->node);
+ kfree(node);
+ }
+
+ list_for_each_entry_safe(node, n, &dev->pending_list, node) {
+ list_del(&node->node);
+ kfree(node);
+ }
+
+ spin_unlock(&dev->iotlb_lock);
+}
+
/* Caller should have device mutex if and only if locked is set */
void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
{
@@ -575,9 +627,13 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
fput(dev->log_file);
dev->log_file = NULL;
/* No one will access memory at this point */
- kvfree(dev->memory);
- dev->memory = NULL;
- WARN_ON(!list_empty(&dev->work_list));
+ vhost_umem_clean(dev->umem);
+ dev->umem = NULL;
+ vhost_umem_clean(dev->iotlb);
+ dev->iotlb = NULL;
+ vhost_clear_msg(dev);
+ wake_up_interruptible_poll(&dev->wait, POLLIN | POLLRDNORM);
+ WARN_ON(!llist_empty(&dev->work_list));
if (dev->worker) {
kthread_stop(dev->worker);
dev->worker = NULL;
@@ -601,26 +657,34 @@ static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
(sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);
}
+static bool vhost_overflow(u64 uaddr, u64 size)
+{
+ /* Make sure 64 bit math will not overflow. */
+ return uaddr > ULONG_MAX || size > ULONG_MAX || uaddr > ULONG_MAX - size;
+}
+
/* Caller should have vq mutex and device mutex. */
-static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
+static int vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem,
int log_all)
{
- int i;
+ struct vhost_umem_node *node;
- if (!mem)
+ if (!umem)
return 0;
- for (i = 0; i < mem->nregions; ++i) {
- struct vhost_memory_region *m = mem->regions + i;
- unsigned long a = m->userspace_addr;
- if (m->memory_size > ULONG_MAX)
+ list_for_each_entry(node, &umem->umem_list, link) {
+ unsigned long a = node->userspace_addr;
+
+ if (vhost_overflow(node->userspace_addr, node->size))
return 0;
- else if (!access_ok(VERIFY_WRITE, (void __user *)a,
- m->memory_size))
+
+
+ if (!access_ok(VERIFY_WRITE, (void __user *)a,
+ node->size))
return 0;
else if (log_all && !log_access_ok(log_base,
- m->guest_phys_addr,
- m->memory_size))
+ node->start,
+ node->size))
return 0;
}
return 1;
@@ -628,7 +692,7 @@ static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
/* Can we switch to this memory table? */
/* Caller should have device mutex but not vq mutex */
-static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
+static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
int log_all)
{
int i;
@@ -641,7 +705,8 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);
/* If ring is inactive, will check when it's enabled. */
if (d->vqs[i]->private_data)
- ok = vq_memory_access_ok(d->vqs[i]->log_base, mem, log);
+ ok = vq_memory_access_ok(d->vqs[i]->log_base,
+ umem, log);
else
ok = 1;
mutex_unlock(&d->vqs[i]->mutex);
@@ -651,12 +716,385 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
return 1;
}
+static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
+ struct iovec iov[], int iov_size, int access);
+
+static int vhost_copy_to_user(struct vhost_virtqueue *vq, void *to,
+ const void *from, unsigned size)
+{
+ int ret;
+
+ if (!vq->iotlb)
+ return __copy_to_user(to, from, size);
+ else {
+ /* This function should be called after iotlb
+ * prefetch, which means we're sure that all vq
+ * could be access through iotlb. So -EAGAIN should
+ * not happen in this case.
+ */
+ /* TODO: more fast path */
+ struct iov_iter t;
+ ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,
+ ARRAY_SIZE(vq->iotlb_iov),
+ VHOST_ACCESS_WO);
+ if (ret < 0)
+ goto out;
+ iov_iter_init(&t, WRITE, vq->iotlb_iov, ret, size);
+ ret = copy_to_iter(from, size, &t);
+ if (ret == size)
+ ret = 0;
+ }
+out:
+ return ret;
+}
+
+static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
+ void *from, unsigned size)
+{
+ int ret;
+
+ if (!vq->iotlb)
+ return __copy_from_user(to, from, size);
+ else {
+ /* This function should be called after iotlb
+ * prefetch, which means we're sure that vq
+ * could be access through iotlb. So -EAGAIN should
+ * not happen in this case.
+ */
+ /* TODO: more fast path */
+ struct iov_iter f;
+ ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov,
+ ARRAY_SIZE(vq->iotlb_iov),
+ VHOST_ACCESS_RO);
+ if (ret < 0) {
+ vq_err(vq, "IOTLB translation failure: uaddr "
+ "%p size 0x%llx\n", from,
+ (unsigned long long) size);
+ goto out;
+ }
+ iov_iter_init(&f, READ, vq->iotlb_iov, ret, size);
+ ret = copy_from_iter(to, size, &f);
+ if (ret == size)
+ ret = 0;
+ }
+
+out:
+ return ret;
+}
+
+static void __user *__vhost_get_user(struct vhost_virtqueue *vq,
+ void *addr, unsigned size)
+{
+ int ret;
+
+ /* This function should be called after iotlb
+ * prefetch, which means we're sure that vq
+ * could be access through iotlb. So -EAGAIN should
+ * not happen in this case.
+ */
+ /* TODO: more fast path */
+ ret = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov,
+ ARRAY_SIZE(vq->iotlb_iov),
+ VHOST_ACCESS_RO);
+ if (ret < 0) {
+ vq_err(vq, "IOTLB translation failure: uaddr "
+ "%p size 0x%llx\n", addr,
+ (unsigned long long) size);
+ return NULL;
+ }
+
+ if (ret != 1 || vq->iotlb_iov[0].iov_len != size) {
+ vq_err(vq, "Non atomic userspace memory access: uaddr "
+ "%p size 0x%llx\n", addr,
+ (unsigned long long) size);
+ return NULL;
+ }
+
+ return vq->iotlb_iov[0].iov_base;
+}
+
+#define vhost_put_user(vq, x, ptr) \
+({ \
+ int ret = -EFAULT; \
+ if (!vq->iotlb) { \
+ ret = __put_user(x, ptr); \
+ } else { \
+ __typeof__(ptr) to = \
+ (__typeof__(ptr)) __vhost_get_user(vq, ptr, sizeof(*ptr)); \
+ if (to != NULL) \
+ ret = __put_user(x, to); \
+ else \
+ ret = -EFAULT; \
+ } \
+ ret; \
+})
+
+#define vhost_get_user(vq, x, ptr) \
+({ \
+ int ret; \
+ if (!vq->iotlb) { \
+ ret = __get_user(x, ptr); \
+ } else { \
+ __typeof__(ptr) from = \
+ (__typeof__(ptr)) __vhost_get_user(vq, ptr, sizeof(*ptr)); \
+ if (from != NULL) \
+ ret = __get_user(x, from); \
+ else \
+ ret = -EFAULT; \
+ } \
+ ret; \
+})
+
+static void vhost_dev_lock_vqs(struct vhost_dev *d)
+{
+ int i = 0;
+ for (i = 0; i < d->nvqs; ++i)
+ mutex_lock(&d->vqs[i]->mutex);
+}
+
+static void vhost_dev_unlock_vqs(struct vhost_dev *d)
+{
+ int i = 0;
+ for (i = 0; i < d->nvqs; ++i)
+ mutex_unlock(&d->vqs[i]->mutex);
+}
+
+static int vhost_new_umem_range(struct vhost_umem *umem,
+ u64 start, u64 size, u64 end,
+ u64 userspace_addr, int perm)
+{
+ struct vhost_umem_node *tmp, *node = kmalloc(sizeof(*node), GFP_ATOMIC);
+
+ if (!node)
+ return -ENOMEM;
+
+ if (umem->numem == max_iotlb_entries) {
+ tmp = list_first_entry(&umem->umem_list, typeof(*tmp), link);
+ vhost_umem_free(umem, tmp);
+ }
+
+ node->start = start;
+ node->size = size;
+ node->last = end;
+ node->userspace_addr = userspace_addr;
+ node->perm = perm;
+ INIT_LIST_HEAD(&node->link);
+ list_add_tail(&node->link, &umem->umem_list);
+ vhost_umem_interval_tree_insert(node, &umem->umem_tree);
+ umem->numem++;
+
+ return 0;
+}
+
+static void vhost_del_umem_range(struct vhost_umem *umem,
+ u64 start, u64 end)
+{
+ struct vhost_umem_node *node;
+
+ while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
+ start, end)))
+ vhost_umem_free(umem, node);
+}
+
+static void vhost_iotlb_notify_vq(struct vhost_dev *d,
+ struct vhost_iotlb_msg *msg)
+{
+ struct vhost_msg_node *node, *n;
+
+ spin_lock(&d->iotlb_lock);
+
+ list_for_each_entry_safe(node, n, &d->pending_list, node) {
+ struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb;
+ if (msg->iova <= vq_msg->iova &&
+ msg->iova + msg->size - 1 > vq_msg->iova &&
+ vq_msg->type == VHOST_IOTLB_MISS) {
+ vhost_poll_queue(&node->vq->poll);
+ list_del(&node->node);
+ kfree(node);
+ }
+ }
+
+ spin_unlock(&d->iotlb_lock);
+}
+
+static int umem_access_ok(u64 uaddr, u64 size, int access)
+{
+ unsigned long a = uaddr;
+
+ /* Make sure 64 bit math will not overflow. */
+ if (vhost_overflow(uaddr, size))
+ return -EFAULT;
+
+ if ((access & VHOST_ACCESS_RO) &&
+ !access_ok(VERIFY_READ, (void __user *)a, size))
+ return -EFAULT;
+ if ((access & VHOST_ACCESS_WO) &&
+ !access_ok(VERIFY_WRITE, (void __user *)a, size))
+ return -EFAULT;
+ return 0;
+}
+
+int vhost_process_iotlb_msg(struct vhost_dev *dev,
+ struct vhost_iotlb_msg *msg)
+{
+ int ret = 0;
+
+ vhost_dev_lock_vqs(dev);
+ switch (msg->type) {
+ case VHOST_IOTLB_UPDATE:
+ if (!dev->iotlb) {
+ ret = -EFAULT;
+ break;
+ }
+ if (umem_access_ok(msg->uaddr, msg->size, msg->perm)) {
+ ret = -EFAULT;
+ break;
+ }
+ if (vhost_new_umem_range(dev->iotlb, msg->iova, msg->size,
+ msg->iova + msg->size - 1,
+ msg->uaddr, msg->perm)) {
+ ret = -ENOMEM;
+ break;
+ }
+ vhost_iotlb_notify_vq(dev, msg);
+ break;
+ case VHOST_IOTLB_INVALIDATE:
+ vhost_del_umem_range(dev->iotlb, msg->iova,
+ msg->iova + msg->size - 1);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ vhost_dev_unlock_vqs(dev);
+ return ret;
+}
+ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
+ struct iov_iter *from)
+{
+ struct vhost_msg_node node;
+ unsigned size = sizeof(struct vhost_msg);
+ size_t ret;
+ int err;
+
+ if (iov_iter_count(from) < size)
+ return 0;
+ ret = copy_from_iter(&node.msg, size, from);
+ if (ret != size)
+ goto done;
+
+ switch (node.msg.type) {
+ case VHOST_IOTLB_MSG:
+ err = vhost_process_iotlb_msg(dev, &node.msg.iotlb);
+ if (err)
+ ret = err;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+done:
+ return ret;
+}
+EXPORT_SYMBOL(vhost_chr_write_iter);
+
+unsigned int vhost_chr_poll(struct file *file, struct vhost_dev *dev,
+ poll_table *wait)
+{
+ unsigned int mask = 0;
+
+ poll_wait(file, &dev->wait, wait);
+
+ if (!list_empty(&dev->read_list))
+ mask |= POLLIN | POLLRDNORM;
+
+ return mask;
+}
+EXPORT_SYMBOL(vhost_chr_poll);
+
+ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
+ int noblock)
+{
+ DEFINE_WAIT(wait);
+ struct vhost_msg_node *node;
+ ssize_t ret = 0;
+ unsigned size = sizeof(struct vhost_msg);
+
+ if (iov_iter_count(to) < size)
+ return 0;
+
+ while (1) {
+ if (!noblock)
+ prepare_to_wait(&dev->wait, &wait,
+ TASK_INTERRUPTIBLE);
+
+ node = vhost_dequeue_msg(dev, &dev->read_list);
+ if (node)
+ break;
+ if (noblock) {
+ ret = -EAGAIN;
+ break;
+ }
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ if (!dev->iotlb) {
+ ret = -EBADFD;
+ break;
+ }
+
+ schedule();
+ }
+
+ if (!noblock)
+ finish_wait(&dev->wait, &wait);
+
+ if (node) {
+ ret = copy_to_iter(&node->msg, size, to);
+
+ if (ret != size || node->msg.type != VHOST_IOTLB_MISS) {
+ kfree(node);
+ return ret;
+ }
+
+ vhost_enqueue_msg(dev, &dev->pending_list, node);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vhost_chr_read_iter);
+
+static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
+{
+ struct vhost_dev *dev = vq->dev;
+ struct vhost_msg_node *node;
+ struct vhost_iotlb_msg *msg;
+
+ node = vhost_new_msg(vq, VHOST_IOTLB_MISS);
+ if (!node)
+ return -ENOMEM;
+
+ msg = &node->msg.iotlb;
+ msg->type = VHOST_IOTLB_MISS;
+ msg->iova = iova;
+ msg->perm = access;
+
+ vhost_enqueue_msg(dev, &dev->read_list, node);
+
+ return 0;
+}
+
static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
struct vring_desc __user *desc,
struct vring_avail __user *avail,
struct vring_used __user *used)
+
{
size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
+
return access_ok(VERIFY_READ, desc, num * sizeof *desc) &&
access_ok(VERIFY_READ, avail,
sizeof *avail + num * sizeof *avail->ring + s) &&
@@ -664,11 +1102,59 @@ static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
sizeof *used + num * sizeof *used->ring + s);
}
+static int iotlb_access_ok(struct vhost_virtqueue *vq,
+ int access, u64 addr, u64 len)
+{
+ const struct vhost_umem_node *node;
+ struct vhost_umem *umem = vq->iotlb;
+ u64 s = 0, size;
+
+ while (len > s) {
+ node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
+ addr,
+ addr + len - 1);
+ if (node == NULL || node->start > addr) {
+ vhost_iotlb_miss(vq, addr, access);
+ return false;
+ } else if (!(node->perm & access)) {
+ /* Report the possible access violation by
+ * request another translation from userspace.
+ */
+ return false;
+ }
+
+ size = node->size - addr + node->start;
+ s += size;
+ addr += size;
+ }
+
+ return true;
+}
+
+int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+{
+ size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
+ unsigned int num = vq->num;
+
+ if (!vq->iotlb)
+ return 1;
+
+ return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
+ num * sizeof *vq->desc) &&
+ iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
+ sizeof *vq->avail +
+ num * sizeof *vq->avail->ring + s) &&
+ iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used,
+ sizeof *vq->used +
+ num * sizeof *vq->used->ring + s);
+}
+EXPORT_SYMBOL_GPL(vq_iotlb_prefetch);