aboutsummaryrefslogtreecommitdiff
path: root/drivers/vhost
diff options
context:
space:
mode:
authorLinus Torvalds2019-07-17 11:26:09 -0700
committerLinus Torvalds2019-07-17 11:26:09 -0700
commit3a1d5384b7decbff6519daa9c65a35665e227323 (patch)
tree7442f1b74d452d82d6702f8cd25173cc81c0c634 /drivers/vhost
parent37d4607ebbbf5d8b74cbcb9434a5ce6897a51864 (diff)
parent5e663f0410fa2f355042209154029842ba1abd43 (diff)
Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
Pull virtio, vhost updates from Michael Tsirkin: "Fixes, features, performance: - new iommu device - vhost guest memory access using vmap (just meta-data for now) - minor fixes" * tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: virtio-mmio: add error check for platform_get_irq scsi: virtio_scsi: Use struct_size() helper iommu/virtio: Add event queue iommu/virtio: Add probe request iommu: Add virtio-iommu driver PCI: OF: Initialize dev->fwnode appropriately of: Allow the iommu-map property to omit untranslated devices dt-bindings: virtio: Add virtio-pci-iommu node dt-bindings: virtio-mmio: Add IOMMU description vhost: fix clang build warning vhost: access vq metadata through kernel virtual address vhost: factor out setting vring addr and num vhost: introduce helpers to get the size of metadata area vhost: rename vq_iotlb_prefetch() to vq_meta_prefetch() vhost: fine grain userspace memory accessors vhost: generalize adding used elem
Diffstat (limited to 'drivers/vhost')
-rw-r--r--drivers/vhost/net.c4
-rw-r--r--drivers/vhost/vhost.c850
-rw-r--r--drivers/vhost/vhost.h43
3 files changed, 771 insertions, 126 deletions
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 247e5585af5d..1a2dd53caade 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -956,7 +956,7 @@ static void handle_tx(struct vhost_net *net)
if (!sock)
goto out;
- if (!vq_iotlb_prefetch(vq))
+ if (!vq_meta_prefetch(vq))
goto out;
vhost_disable_notify(&net->dev, vq);
@@ -1125,7 +1125,7 @@ static void handle_rx(struct vhost_net *net)
if (!sock)
goto out;
- if (!vq_iotlb_prefetch(vq))
+ if (!vq_meta_prefetch(vq))
goto out;
vhost_disable_notify(&net->dev, vq);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index ff8892c38666..0536f8526359 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -298,6 +298,160 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
__vhost_vq_meta_reset(d->vqs[i]);
}
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+static void vhost_map_unprefetch(struct vhost_map *map)
+{
+ kfree(map->pages);
+ map->pages = NULL;
+ map->npages = 0;
+ map->addr = NULL;
+}
+
+static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
+{
+ struct vhost_map *map[VHOST_NUM_ADDRS];
+ int i;
+
+ spin_lock(&vq->mmu_lock);
+ for (i = 0; i < VHOST_NUM_ADDRS; i++) {
+ map[i] = rcu_dereference_protected(vq->maps[i],
+ lockdep_is_held(&vq->mmu_lock));
+ if (map[i])
+ rcu_assign_pointer(vq->maps[i], NULL);
+ }
+ spin_unlock(&vq->mmu_lock);
+
+ synchronize_rcu();
+
+ for (i = 0; i < VHOST_NUM_ADDRS; i++)
+ if (map[i])
+ vhost_map_unprefetch(map[i]);
+
+}
+
+static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
+{
+ int i;
+
+ vhost_uninit_vq_maps(vq);
+ for (i = 0; i < VHOST_NUM_ADDRS; i++)
+ vq->uaddrs[i].size = 0;
+}
+
+static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
+ unsigned long start,
+ unsigned long end)
+{
+ if (unlikely(!uaddr->size))
+ return false;
+
+ return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
+}
+
+static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
+ int index,
+ unsigned long start,
+ unsigned long end)
+{
+ struct vhost_uaddr *uaddr = &vq->uaddrs[index];
+ struct vhost_map *map;
+ int i;
+
+ if (!vhost_map_range_overlap(uaddr, start, end))
+ return;
+
+ spin_lock(&vq->mmu_lock);
+ ++vq->invalidate_count;
+
+ map = rcu_dereference_protected(vq->maps[index],
+ lockdep_is_held(&vq->mmu_lock));
+ if (map) {
+ if (uaddr->write) {
+ for (i = 0; i < map->npages; i++)
+ set_page_dirty(map->pages[i]);
+ }
+ rcu_assign_pointer(vq->maps[index], NULL);
+ }
+ spin_unlock(&vq->mmu_lock);
+
+ if (map) {
+ synchronize_rcu();
+ vhost_map_unprefetch(map);
+ }
+}
+
+static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
+ int index,
+ unsigned long start,
+ unsigned long end)
+{
+ if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
+ return;
+
+ spin_lock(&vq->mmu_lock);
+ --vq->invalidate_count;
+ spin_unlock(&vq->mmu_lock);
+}
+
+static int vhost_invalidate_range_start(struct mmu_notifier *mn,
+ const struct mmu_notifier_range *range)
+{
+ struct vhost_dev *dev = container_of(mn, struct vhost_dev,
+ mmu_notifier);
+ int i, j;
+
+ if (!mmu_notifier_range_blockable(range))
+ return -EAGAIN;
+
+ for (i = 0; i < dev->nvqs; i++) {
+ struct vhost_virtqueue *vq = dev->vqs[i];
+
+ for (j = 0; j < VHOST_NUM_ADDRS; j++)
+ vhost_invalidate_vq_start(vq, j,
+ range->start,
+ range->end);
+ }
+
+ return 0;
+}
+
+static void vhost_invalidate_range_end(struct mmu_notifier *mn,
+ const struct mmu_notifier_range *range)
+{
+ struct vhost_dev *dev = container_of(mn, struct vhost_dev,
+ mmu_notifier);
+ int i, j;
+
+ for (i = 0; i < dev->nvqs; i++) {
+ struct vhost_virtqueue *vq = dev->vqs[i];
+
+ for (j = 0; j < VHOST_NUM_ADDRS; j++)
+ vhost_invalidate_vq_end(vq, j,
+ range->start,
+ range->end);
+ }
+}
+
+static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
+ .invalidate_range_start = vhost_invalidate_range_start,
+ .invalidate_range_end = vhost_invalidate_range_end,
+};
+
+static void vhost_init_maps(struct vhost_dev *dev)
+{
+ struct vhost_virtqueue *vq;
+ int i, j;
+
+ dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
+
+ for (i = 0; i < dev->nvqs; ++i) {
+ vq = dev->vqs[i];
+ for (j = 0; j < VHOST_NUM_ADDRS; j++)
+ RCU_INIT_POINTER(vq->maps[j], NULL);
+ }
+}
+#endif
+
static void vhost_vq_reset(struct vhost_dev *dev,
struct vhost_virtqueue *vq)
{
@@ -326,7 +480,11 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->busyloop_timeout = 0;
vq->umem = NULL;
vq->iotlb = NULL;
+ vq->invalidate_count = 0;
__vhost_vq_meta_reset(vq);
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ vhost_reset_vq_maps(vq);
+#endif
}
static int vhost_worker(void *data)
@@ -427,6 +585,32 @@ bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
}
EXPORT_SYMBOL_GPL(vhost_exceeds_weight);
+static size_t vhost_get_avail_size(struct vhost_virtqueue *vq,
+ unsigned int num)
+{
+ size_t event __maybe_unused =
+ vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
+
+ return sizeof(*vq->avail) +
+ sizeof(*vq->avail->ring) * num + event;
+}
+
+static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
+ unsigned int num)
+{
+ size_t event __maybe_unused =
+ vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
+
+ return sizeof(*vq->used) +
+ sizeof(*vq->used->ring) * num + event;
+}
+
+static size_t vhost_get_desc_size(struct vhost_virtqueue *vq,
+ unsigned int num)
+{
+ return sizeof(*vq->desc) * num;
+}
+
void vhost_dev_init(struct vhost_dev *dev,
struct vhost_virtqueue **vqs, int nvqs,
int iov_limit, int weight, int byte_weight)
@@ -450,7 +634,9 @@ void vhost_dev_init(struct vhost_dev *dev,
INIT_LIST_HEAD(&dev->read_list);
INIT_LIST_HEAD(&dev->pending_list);
spin_lock_init(&dev->iotlb_lock);
-
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ vhost_init_maps(dev);
+#endif
for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
@@ -459,6 +645,7 @@ void vhost_dev_init(struct vhost_dev *dev,
vq->heads = NULL;
vq->dev = dev;
mutex_init(&vq->mutex);
+ spin_lock_init(&vq->mmu_lock);
vhost_vq_reset(dev, vq);
if (vq->handle_kick)
vhost_poll_init(&vq->poll, vq->handle_kick,
@@ -538,7 +725,18 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
if (err)
goto err_cgroup;
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
+ if (err)
+ goto err_mmu_notifier;
+#endif
+
return 0;
+
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+err_mmu_notifier:
+ vhost_dev_free_iovecs(dev);
+#endif
err_cgroup:
kthread_stop(worker);
dev->worker = NULL;
@@ -629,6 +827,107 @@ static void vhost_clear_msg(struct vhost_dev *dev)
spin_unlock(&dev->iotlb_lock);
}
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
+ int index, unsigned long uaddr,
+ size_t size, bool write)
+{
+ struct vhost_uaddr *addr = &vq->uaddrs[index];
+
+ addr->uaddr = uaddr;
+ addr->size = size;
+ addr->write = write;
+}
+
+static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
+{
+ vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
+ (unsigned long)vq->desc,
+ vhost_get_desc_size(vq, vq->num),
+ false);
+ vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
+ (unsigned long)vq->avail,
+ vhost_get_avail_size(vq, vq->num),
+ false);
+ vhost_setup_uaddr(vq, VHOST_ADDR_USED,
+ (unsigned long)vq->used,
+ vhost_get_used_size(vq, vq->num),
+ true);
+}
+
+static int vhost_map_prefetch(struct vhost_virtqueue *vq,
+ int index)
+{
+ struct vhost_map *map;
+ struct vhost_uaddr *uaddr = &vq->uaddrs[index];
+ struct page **pages;
+ int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
+ int npinned;
+ void *vaddr, *v;
+ int err;
+ int i;
+
+ spin_lock(&vq->mmu_lock);
+
+ err = -EFAULT;
+ if (vq->invalidate_count)
+ goto err;
+
+ err = -ENOMEM;
+ map = kmalloc(sizeof(*map), GFP_ATOMIC);
+ if (!map)
+ goto err;
+
+ pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
+ if (!pages)
+ goto err_pages;
+
+ err = EFAULT;
+ npinned = __get_user_pages_fast(uaddr->uaddr, npages,
+ uaddr->write, pages);
+ if (npinned > 0)
+ release_pages(pages, npinned);
+ if (npinned != npages)
+ goto err_gup;
+
+ for (i = 0; i < npinned; i++)
+ if (PageHighMem(pages[i]))
+ goto err_gup;
+
+ vaddr = v = page_address(pages[0]);
+
+ /* For simplicity, fallback to userspace address if VA is not
+ * contigious.
+ */
+ for (i = 1; i < npinned; i++) {
+ v += PAGE_SIZE;
+ if (v != page_address(pages[i]))
+ goto err_gup;
+ }
+
+ map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
+ map->npages = npages;
+ map->pages = pages;
+
+ rcu_assign_pointer(vq->maps[index], map);
+ /* No need for a synchronize_rcu(). This function should be
+ * called by dev->worker so we are serialized with all
+ * readers.
+ */
+ spin_unlock(&vq->mmu_lock);
+
+ return 0;
+
+err_gup:
+ kfree(pages);
+err_pages:
+ kfree(map);
+err:
+ spin_unlock(&vq->mmu_lock);
+ return err;
+}
+#endif
+
void vhost_dev_cleanup(struct vhost_dev *dev)
{
int i;
@@ -658,8 +957,16 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
kthread_stop(dev->worker);
dev->worker = NULL;
}
- if (dev->mm)
+ if (dev->mm) {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ mmu_notifier_unregister(&dev->mmu_notifier, dev->mm);
+#endif
mmput(dev->mm);
+ }
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ for (i = 0; i < dev->nvqs; i++)
+ vhost_uninit_vq_maps(dev->vqs[i]);
+#endif
dev->mm = NULL;
}
EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
@@ -886,6 +1193,113 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
ret; \
})
+static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ struct vhost_map *map;
+ struct vring_used *used;
+
+ if (!vq->iotlb) {
+ rcu_read_lock();
+
+ map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+ if (likely(map)) {
+ used = map->addr;
+ *((__virtio16 *)&used->ring[vq->num]) =
+ cpu_to_vhost16(vq, vq->avail_idx);
+ rcu_read_unlock();
+ return 0;
+ }
+
+ rcu_read_unlock();
+ }
+#endif
+
+ return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
+ vhost_avail_event(vq));
+}
+
+static inline int vhost_put_used(struct vhost_virtqueue *vq,
+ struct vring_used_elem *head, int idx,
+ int count)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ struct vhost_map *map;
+ struct vring_used *used;
+ size_t size;
+
+ if (!vq->iotlb) {
+ rcu_read_lock();
+
+ map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+ if (likely(map)) {
+ used = map->addr;
+ size = count * sizeof(*head);
+ memcpy(used->ring + idx, head, size);
+ rcu_read_unlock();
+ return 0;
+ }
+
+ rcu_read_unlock();
+ }
+#endif
+
+ return vhost_copy_to_user(vq, vq->used->ring + idx, head,
+ count * sizeof(*head));
+}
+
+static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
+
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ struct vhost_map *map;
+ struct vring_used *used;
+
+ if (!vq->iotlb) {
+ rcu_read_lock();
+
+ map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+ if (likely(map)) {
+ used = map->addr;
+ used->flags = cpu_to_vhost16(vq, vq->used_flags);
+ rcu_read_unlock();
+ return 0;
+ }
+
+ rcu_read_unlock();
+ }
+#endif
+
+ return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
+ &vq->used->flags);
+}
+
+static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
+
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ struct vhost_map *map;
+ struct vring_used *used;
+
+ if (!vq->iotlb) {
+ rcu_read_lock();
+
+ map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+ if (likely(map)) {
+ used = map->addr;
+ used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
+ rcu_read_unlock();
+ return 0;
+ }
+
+ rcu_read_unlock();
+ }
+#endif
+
+ return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
+ &vq->used->idx);
+}
+
#define vhost_get_user(vq, x, ptr, type) \
({ \
int ret; \
@@ -924,6 +1338,155 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
mutex_unlock(&d->vqs[i]->mutex);
}
+static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
+ __virtio16 *idx)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ struct vhost_map *map;
+ struct vring_avail *avail;
+
+ if (!vq->iotlb) {
+ rcu_read_lock();
+
+ map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+ if (likely(map)) {
+ avail = map->addr;
+ *idx = avail->idx;
+ rcu_read_unlock();
+ return 0;
+ }
+
+ rcu_read_unlock();
+ }
+#endif
+
+ return vhost_get_avail(vq, *idx, &vq->avail->idx);
+}
+
+static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
+ __virtio16 *head, int idx)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ struct vhost_map *map;
+ struct vring_avail *avail;
+
+ if (!vq->iotlb) {
+ rcu_read_lock();
+
+ map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+ if (likely(map)) {
+ avail = map->addr;
+ *head = avail->ring[idx & (vq->num - 1)];
+ rcu_read_unlock();
+ return 0;
+ }
+
+ rcu_read_unlock();
+ }
+#endif
+
+ return vhost_get_avail(vq, *head,
+ &vq->avail->ring[idx & (vq->num - 1)]);
+}
+
+static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
+ __virtio16 *flags)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ struct vhost_map *map;
+ struct vring_avail *avail;
+
+ if (!vq->iotlb) {
+ rcu_read_lock();
+
+ map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+ if (likely(map)) {
+ avail = map->addr;
+ *flags = avail->flags;
+ rcu_read_unlock();
+ return 0;
+ }
+
+ rcu_read_unlock();
+ }
+#endif
+
+ return vhost_get_avail(vq, *flags, &vq->avail->flags);
+}
+
+static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
+ __virtio16 *event)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ struct vhost_map *map;
+ struct vring_avail *avail;
+
+ if (!vq->iotlb) {
+ rcu_read_lock();
+ map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+ if (likely(map)) {
+ avail = map->addr;
+ *event = (__virtio16)avail->ring[vq->num];
+ rcu_read_unlock();
+ return 0;
+ }
+ rcu_read_unlock();
+ }
+#endif
+
+ return vhost_get_avail(vq, *event, vhost_used_event(vq));
+}
+
+static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
+ __virtio16 *idx)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ struct vhost_map *map;
+ struct vring_used *used;
+
+ if (!vq->iotlb) {
+ rcu_read_lock();
+
+ map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+ if (likely(map)) {
+ used = map->addr;
+ *idx = used->idx;
+ rcu_read_unlock();
+ return 0;
+ }
+
+ rcu_read_unlock();
+ }
+#endif
+
+ return vhost_get_used(vq, *idx, &vq->used->idx);
+}
+
+static inline int vhost_get_desc(struct vhost_virtqueue *vq,
+ struct vring_desc *desc, int idx)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ struct vhost_map *map;
+ struct vring_desc *d;
+
+ if (!vq->iotlb) {
+ rcu_read_lock();
+
+ map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]);
+ if (likely(map)) {
+ d = map->addr;
+ *desc = *(d + idx);
+ rcu_read_unlock();
+ return 0;
+ }
+
+ rcu_read_unlock();
+ }
+#endif
+
+ return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
+}
+
static int vhost_new_umem_range(struct vhost_umem *umem,
u64 start, u64 size, u64 end,
u64 userspace_addr, int perm)
@@ -1209,13 +1772,9 @@ static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
struct vring_used __user *used)
{
- size_t s __maybe_unused = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
-
- return access_ok(desc, num * sizeof *desc) &&
- access_ok(avail,
- sizeof *avail + num * sizeof *avail->ring + s) &&
- access_ok(used,
- sizeof *used + num * sizeof *used->ring + s);
+ return access_ok(desc, vhost_get_desc_size(vq, num)) &&
+ access_ok(avail, vhost_get_avail_size(vq, num)) &&
+ access_ok(used, vhost_get_used_size(vq, num));
}
static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
@@ -1265,26 +1824,42 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
return true;
}
-int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
+{
+ struct vhost_map __rcu *map;
+ int i;
+
+ for (i = 0; i < VHOST_NUM_ADDRS; i++) {
+ rcu_read_lock();
+ map = rcu_dereference(vq->maps[i]);
+ rcu_read_unlock();
+ if (unlikely(!map))
+ vhost_map_prefetch(vq, i);
+ }
+}
+#endif
+
+int vq_meta_prefetch(struct vhost_virtqueue *vq)
{
- size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
unsigned int num = vq->num;
- if (!vq->iotlb)
+ if (!vq->iotlb) {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ vhost_vq_map_prefetch(vq);
+#endif
return 1;
+ }
return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
- num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+ vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
- sizeof *vq->avail +
- num * sizeof(*vq->avail->ring) + s,
+ vhost_get_avail_size(vq, num),
VHOST_ADDR_AVAIL) &&
iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used,
- sizeof *vq->used +
- num * sizeof(*vq->used->ring) + s,
- VHOST_ADDR_USED);
+ vhost_get_used_size(vq, num), VHOST_ADDR_USED);
}
-EXPORT_SYMBOL_GPL(vq_iotlb_prefetch);
+EXPORT_SYMBOL_GPL(vq_meta_prefetch);
/* Can we log writes? */
/* Caller should have device mutex but not vq mutex */
@@ -1299,13 +1874,10 @@ EXPORT_SYMBOL_GPL(vhost_log_access_ok);
static bool vq_log_access_ok(struct vhost_virtqueue *vq,
void __user *log_base)
{
- size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
-
return vq_memory_access_ok(log_base, vq->umem,
vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
(!vq->log_used || log_access_ok(log_base, vq->log_addr,
- sizeof *vq->used +
- vq->num * sizeof *vq->used->ring + s));
+ vhost_get_used_size(vq, vq->num)));
}
/* Can we start vq? */
@@ -1405,6 +1977,121 @@ err:
return -EFAULT;
}
+static long vhost_vring_set_num(struct vhost_dev *d,
+ struct vhost_virtqueue *vq,
+ void __user *argp)
+{
+ struct vhost_vring_state s;
+
+ /* Resizing ring with an active backend?
+ * You don't want to do that. */
+ if (vq->private_data)
+ return -EBUSY;
+
+ if (copy_from_user(&s, argp, sizeof s))
+ return -EFAULT;
+
+ if (!s.num || s.num > 0xffff || (s.num & (s.num - 1)))
+ return -EINVAL;
+ vq->num = s.num;
+
+ return 0;
+}
+
+static long vhost_vring_set_addr(struct vhost_dev *d,
+ struct vhost_virtqueue *vq,
+ void __user *argp)
+{
+ struct vhost_vring_addr a;
+
+ if (copy_from_user(&a, argp, sizeof a))
+ return -EFAULT;
+ if (a.flags & ~(0x1 << VHOST_VRING_F_LOG))
+ return -EOPNOTSUPP;
+
+ /* For 32bit, verify that the top 32bits of the user
+ data are set to zero. */
+ if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||
+ (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||
+ (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr)
+ return -EFAULT;
+
+ /* Make sure it's safe to cast pointers to vring types. */
+ BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE);
+ BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);
+ if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||
+ (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||
+ (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1)))
+ return -EINVAL;
+
+ /* We only verify access here if backend is configured.
+ * If it is not, we don't as size might not have been setup.
+ * We will verify when backend is configured. */
+ if (vq->private_data) {
+ if (!vq_access_ok(vq, vq->num,
+ (void __user *)(unsigned long)a.desc_user_addr,
+ (void __user *)(unsigned long)a.avail_user_addr,
+ (void __user *)(unsigned long)a.used_user_addr))
+ return -EINVAL;
+
+ /* Also validate log access for used ring if enabled. */
+ if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) &&
+ !log_access_ok(vq->log_base, a.log_guest_addr,
+ sizeof *vq->used +
+ vq->num * sizeof *vq->used->ring))
+ return -EINVAL;
+ }
+
+ vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
+ vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
+ vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
+ vq->log_addr = a.log_guest_addr;
+ vq->used = (void __user *)(unsigned long)a.used_user_addr;
+
+ return 0;
+}
+
+static long vhost_vring_set_num_addr(struct vhost_dev *d,
+ struct vhost_virtqueue *vq,
+ unsigned int ioctl,
+ void __user *argp)
+{
+ long r;
+
+ mutex_lock(&vq->mutex);
+
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ /* Unregister MMU notifer to allow invalidation callback
+ * can access vq->uaddrs[] without holding a lock.
+ */
+ if (d->mm)
+ mmu_notifier_unregister(&d->mmu_notifier, d->mm);
+
+ vhost_uninit_vq_maps(vq);
+#endif
+
+ switch (ioctl) {
+ case VHOST_SET_VRING_NUM:
+ r = vhost_vring_set_num(d, vq, argp);
+ break;
+ case VHOST_SET_VRING_ADDR:
+ r = vhost_vring_set_addr(d, vq, argp);
+ break;
+ default:
+ BUG();
+ }
+
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ vhost_setup_vq_uaddr(vq);
+
+ if (d->mm)
+ mmu_notifier_register(&d->mmu_notifier, d->mm);
+#endif
+
+ mutex_unlock(&vq->mutex);
+
+ return r;
+}
long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
struct file *eventfp, *filep = NULL;
@@ -1414,7 +2101,6 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
struct vhost_virtqueue *vq;
struct vhost_vring_state s;
struct vhost_vring_file f;
- struct vhost_vring_addr a;
u32 idx;
long r;
@@ -1427,26 +2113,14 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
idx = array_index_nospec(idx, d->nvqs);
vq = d->vqs[idx];
+ if (ioctl == VHOST_SET_VRING_NUM ||
+ ioctl == VHOST_SET_VRING_ADDR) {
+ return vhost_vring_set_num_addr(d, vq, ioctl, argp);
+ }
+
mutex_lock(&vq->mutex);
switch (ioctl) {
- case VHOST_SET_VRING_NUM:
- /* Resizing ring with an active backend?
- * You don't want to do that. */
- if (vq->private_data) {
- r = -EBUSY;
- break;
- }
- if (copy_from_user(&s, argp, sizeof s)) {
- r = -EFAULT;
- break;
- }
- if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) {
- r = -EINVAL;
- break;
- }
- vq->num = s.num;
- break;
case VHOST_SET_VRING_BASE:
/* Moving base with an active backend?
* You don't want to do that. */
@@ -1472,62 +2146,6 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
if (copy_to_user(argp, &s, sizeof s))
r = -EFAULT;
break;
- case VHOST_SET_VRING_ADDR:
- if (copy_from_user(&a, argp, sizeof a)) {
- r = -EFAULT;
- break;
- }
- if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) {
- r = -EOPNOTSUPP;
- break;
- }
- /* For 32bit, verify that the top 32bits of the user
- data are set to zero. */
- if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||
- (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||
- (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) {
- r = -EFAULT;
- break;
- }
-
- /* Make sure it's safe to cast pointers to vring types. */
- BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE);
- BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);
- if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||
- (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||
- (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1))) {
- r = -EINVAL;
- break;
- }
-
- /* We only verify access here if backend is configured.
- * If it is not, we don't as size might not have been setup.
- * We will verify when backend is configured. */
- if (vq->private_data) {
- if (!vq_access_ok(vq, vq->num,
- (void __user *)(unsigned long)a.desc_user_addr,
- (void __user *)(unsigned long)a.avail_user_addr,
- (void __user *)(unsigned long)a.used_user_addr)) {
- r = -EINVAL;
- break;
- }
-
- /* Also validate log access for used ring if enabled. */
- if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) &&
- !log_access_ok(vq->log_base, a.log_guest_addr,
- sizeof *vq->used +
- vq->num * sizeof *vq->used->ring)) {
- r = -EINVAL;
- break;
- }
- }
-
- vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
- vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
- vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
- vq->log_addr = a.log_guest_addr;
- vq->used = (void __user *)(unsigned long)a.used_user_addr;
- break;
case VHOST_SET_VRING_KICK:
if (copy_from_user(&f, argp, sizeof f)) {
r = -EFAULT;
@@ -1861,8 +2479,7 @@ EXPORT_SYMBOL_GPL(vhost_log_write);
static int vhost_update_used_flags(struct vhost_virtqueue *vq)
{
void __user *used;
- if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
- &vq->used->flags) < 0)
+ if (vhost_put_used_flags(vq))
return -EFAULT;
if (unlikely(vq->log_used)) {
/* Make sure the flag is seen before log. */
@@ -1879,8 +2496,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
{
- if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
- vhost_avail_event(vq)))
+ if (vhost_put_avail_event(vq))
return -EFAULT;
if (unlikely(vq->log_used)) {
void __user *used;
@@ -1916,7 +2532,7 @@ int vhost_vq_init_access(struct vhost_virtqueue *vq)
r = -EFAULT;
goto err;
}
- r = vhost_get_used(vq, last_used_idx, &vq->used->idx);
+ r = vhost_get_used_idx(vq, &last_used_idx);
if (r) {
vq_err(vq, "Can't access used idx at %p\n",
&vq->used->idx);
@@ -2115,7 +2731,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
last_avail_idx = vq->last_avail_idx;
if (vq->avail_idx == vq->last_avail_idx) {
- if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
+ if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
vq_err(vq, "Failed to access avail idx at %p\n",
&vq->avail->idx);
return -EFAULT;
@@ -2142,8 +2758,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
/* Grab the next descriptor number they're advertising, and increment
* the index we've seen. */
- if (unlikely(vhost_get_avail(vq, ring_head,
- &vq->avail->ring[last_avail_idx & (vq->num - 1)]))) {
+ if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
vq_err(vq, "Failed to read head: idx %d address %p\n",
last_avail_idx,
&vq->avail->ring[last_avail_idx % vq->num]);
@@ -2178,8 +2793,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
i, vq->num, head);
return -EINVAL;
}
- ret = vhost_copy_from_user(vq, &desc, vq->desc + i,
- sizeof desc);
+ ret = vhost_get_desc(vq, &desc, i);
if (unlikely(ret)) {
vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
i, vq->desc + i);
@@ -2272,16 +2886,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
start = vq->last_used_idx & (vq->num - 1);
used = vq->used->ring + start;
- if (count == 1) {
- if (vhost_put_user(vq, heads[0].id, &used->id)) {
- vq_err(vq, "Failed to write used id");
- return -EFAULT;
- }
- if (vhost_put_user(vq, heads[0].len, &used->len)) {
- vq_err(vq, "Failed to write used len");
- return -EFAULT;
- }
- } else if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) {
+ if (vhost_put_used(vq, heads, start, count)) {
vq_err(vq, "Failed to write used");
return -EFAULT;
}
@@ -2323,8 +2928,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
/* Make sure buffer is written before we update index. */
smp_wmb();
- if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
- &vq->used->idx)) {
+ if (vhost_put_used_idx(vq)) {
vq_err(vq, "Failed to increment used idx");
return -EFAULT;
}
@@ -2357,7 +2961,7 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
__virtio16 flags;
- if (vhost_get_avail(vq, flags, &vq->avail->flags)) {
+ if (vhost_get_avail_flags(vq, &flags)) {
vq_err(vq, "Failed to get flags");
return true;
}
@@ -2371,7 +2975,7 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
if (unlikely(!v))
return true;
- if (vhost_get_avail(vq, event, vhost_used_event(vq))) {
+ if (vhost_get_used_event(vq, &event)) {
vq_err(vq, "Failed to get used event idx");
return true;
}
@@ -2416,7 +3020,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
if (vq->avail_idx != vq->last_avail_idx)
return false;
- r = vhost_get_avail(vq, avail_idx, &vq->avail->idx);
+ r = vhost_get_avail_idx(vq, &avail_idx);
if (unlikely(r))
return false;
vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
@@ -2452,7 +3056,7 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
/* They could have slipped one in as we were doing that: make
* sure it's written, then check again. */
smp_mb();
- r = vhost_get_avail(vq, avail_idx, &vq->avail->idx);
+ r = vhost_get_avail_idx(vq, &avail_idx);
if (r) {
vq_err(vq, "Failed to check avail idx at %p: %d\n",
&vq->avail->idx, r);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 27a78a9b8cc7..819296332913 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -12,6 +12,9 @@
#include <linux/virtio_config.h>
#include <linux/virtio_ring.h>
#include <linux/atomic.h>
+#include <linux/pagemap.h>
+#include <linux/mmu_notifier.h>
+#include <asm/cacheflush.h>
struct vhost_work;
typedef void (*vhost_work_fn_t)(struct vhost_work *work);
@@ -80,6 +83,24 @@ enum vhost_uaddr_type {
VHOST_NUM_ADDRS = 3,
};
+struct vhost_map {
+ int npages;
+ void *addr;
+ struct page **pages;
+};
+
+struct vhost_uaddr {
+ unsigned long uaddr;
+ size_t size;
+ bool write;
+};
+
+#if defined(CONFIG_MMU_NOTIFIER) && ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 0
+#define VHOST_ARCH_CAN_ACCEL_UACCESS 1
+#else
+#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
+#endif
+
/* The virtqueue structure describes a queue attached to a device. */
struct vhost_virtqueue {
struct vhost_dev *dev;
@@ -90,7 +111,22 @@ struct vhost_virtqueue {
struct vring_desc __user *desc;
struct vring_avail __user *avail;
struct vring_used __user *used;
+
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ /* Read by memory accessors, modified by meta data
+ * prefetching, MMU notifier and vring ioctl().
+ * Synchonrized through mmu_lock (writers) and RCU (writers
+ * and readers).
+ */
+ struct vhost_map __rcu *maps[VHOST_NUM_ADDRS];
+ /* Read by MMU notifier, modified by vring ioctl(),
+ * synchronized through MMU notifier
+ * registering/unregistering.
+ */
+ struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
+#endif
const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
+
struct file *kick;
struct eventfd_ctx *call_ctx;
struct eventfd_ctx *error_ctx;
@@ -145,6 +181,8 @@ struct vhost_virtqueue {
bool user_be;
#endif
u32 busyloop_timeout;
+ spinlock_t mmu_lock;
+ int invalidate_count;
};
struct vhost_msg_node {
@@ -158,6 +196,9 @@ struct vhost_msg_node {
struct vhost_dev {
struct mm_struct *mm;
+#ifdef CONFIG_MMU_NOTIFIER
+ struct mmu_notifier mmu_notifier;
+#endif
struct mutex mutex;
struct vhost_virtqueue **vqs;
int nvqs;
@@ -212,7 +253,7 @@ bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
unsigned int log_num, u64 len,
struct iovec *iov, int count);
-int vq_iotlb_prefetch(struct vhost_virtqueue *vq);
+int vq_meta_prefetch(struct vhost_virtqueue *vq);
struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type);
void vhost_enqueue_msg(struct vhost_dev *dev,