author     Luca Boccassi <luca.boccassi@gmail.com>    2018-04-23 14:39:05 +0100
committer  Luca Boccassi <luca.boccassi@gmail.com>    2018-04-23 14:39:39 +0100
commit     d038355bf358f713efbb182f174e2a8a09042e2b (patch)
tree       25e6274e1f59814b2c6e93f13e846dab972d8a17
parent     c3f15def2ebe9cc255cf0e5cf32aa171f5b4326d (diff)
New upstream version 17.11.2 (tag: upstream/17.11.2)
Change-Id: Iccf80e68222e7fc45af045bfed265a962b5ccca6
Signed-off-by: Luca Boccassi <luca.boccassi@gmail.com>
-rw-r--r--   doc/guides/rel_notes/release_17_11.rst       |  15
-rw-r--r--   examples/vhost/virtio_net.c                  |  94
-rw-r--r--   examples/vhost_scsi/vhost_scsi.c             |  56
-rw-r--r--   lib/librte_eal/common/include/rte_version.h  |   2
-rw-r--r--   lib/librte_vhost/rte_vhost.h                 |  46
-rw-r--r--   lib/librte_vhost/rte_vhost_version.map       |   6
-rw-r--r--   lib/librte_vhost/vhost.c                     |  39
-rw-r--r--   lib/librte_vhost/vhost.h                     |   8
-rw-r--r--   lib/librte_vhost/vhost_user.c                |  58
-rw-r--r--   lib/librte_vhost/virtio_net.c                | 411
-rw-r--r--   pkg/dpdk.spec                                |   2

11 files changed, 619 insertions(+), 118 deletions(-)
diff --git a/doc/guides/rel_notes/release_17_11.rst b/doc/guides/rel_notes/release_17_11.rst
index 77f2ea06..fad7a7eb 100644
--- a/doc/guides/rel_notes/release_17_11.rst
+++ b/doc/guides/rel_notes/release_17_11.rst
@@ -1075,3 +1075,18 @@ Fixes in 17.11 LTS Release
 * vhost: fix mbuf free
 * vhost: protect active rings from async ring changes
 * vhost: remove pending IOTLB entry if miss request failed
+
+17.11.2
+~~~~~~~
+
+* examples/vhost: move to safe GPA translation API
+* examples/vhost_scsi: move to safe GPA translation API
+* vhost: add support for non-contiguous indirect descs tables (fixes CVE-2018-1059)
+* vhost: check all range is mapped when translating GPAs (fixes CVE-2018-1059)
+* vhost: deprecate unsafe GPA translation API (fixes CVE-2018-1059)
+* vhost: ensure all range is mapped when translating QVAs (fixes CVE-2018-1059)
+* vhost: fix indirect descriptors table translation size (fixes CVE-2018-1059)
+* vhost: handle virtually non-contiguous buffers in Rx (fixes CVE-2018-1059)
+* vhost: handle virtually non-contiguous buffers in Rx-mrg (fixes CVE-2018-1059)
+* vhost: handle virtually non-contiguous buffers in Tx (fixes CVE-2018-1059)
+* vhost: introduce safe API for GPA translation (fixes CVE-2018-1059)
diff --git a/examples/vhost/virtio_net.c b/examples/vhost/virtio_net.c
index 1ab57f52..31c3dd06 100644
--- a/examples/vhost/virtio_net.c
+++ b/examples/vhost/virtio_net.c
@@ -85,16 +85,20 @@ enqueue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	    struct rte_mbuf *m, uint16_t desc_idx)
 {
 	uint32_t desc_avail, desc_offset;
+	uint64_t desc_chunck_len;
 	uint32_t mbuf_avail, mbuf_offset;
 	uint32_t cpy_len;
 	struct vring_desc *desc;
-	uint64_t desc_addr;
+	uint64_t desc_addr, desc_gaddr;
 	struct virtio_net_hdr virtio_hdr = {0, 0, 0, 0, 0, 0};
 	/* A counter to avoid desc dead loop chain */
 	uint16_t nr_desc = 1;
 
 	desc = &vr->desc[desc_idx];
-	desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
+	desc_chunck_len = desc->len;
+	desc_gaddr = desc->addr;
+	desc_addr = rte_vhost_va_from_guest_pa(
+			dev->mem, desc_gaddr, &desc_chunck_len);
 	/*
 	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
 	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
@@ -106,9 +110,42 @@ enqueue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	rte_prefetch0((void *)(uintptr_t)desc_addr);
 
 	/* write virtio-net header */
-	*(struct virtio_net_hdr *)(uintptr_t)desc_addr = virtio_hdr;
+	if (likely(desc_chunck_len >= dev->hdr_len)) {
+		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = virtio_hdr;
+		desc_offset = dev->hdr_len;
+	} else {
+		uint64_t len;
+		uint64_t remain = dev->hdr_len;
+		uint64_t src = (uint64_t)(uintptr_t)&virtio_hdr, dst;
+		uint64_t guest_addr = desc_gaddr;
+
+		while (remain) {
+			len = remain;
+			dst = rte_vhost_va_from_guest_pa(dev->mem,
+					guest_addr, &len);
+			if (unlikely(!dst || !len))
+				return -1;
+
+			rte_memcpy((void *)(uintptr_t)dst,
+					(void *)(uintptr_t)src,
+					len);
+
+			remain -= len;
+			guest_addr += len;
+			dst += len;
+		}
+
+		desc_chunck_len = desc->len - dev->hdr_len;
+		desc_gaddr += dev->hdr_len;
+		desc_addr = rte_vhost_va_from_guest_pa(
+				dev->mem, desc_gaddr,
+				&desc_chunck_len);
+		if (unlikely(!desc_addr))
+			return -1;
+
+		desc_offset = 0;
+	}
 
-	desc_offset = dev->hdr_len;
 	desc_avail  = desc->len - dev->hdr_len;
 
 	mbuf_avail  = rte_pktmbuf_data_len(m);
@@ -133,15 +170,28 @@ enqueue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 				return -1;
 
 			desc = &vr->desc[desc->next];
-			desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
+			desc_chunck_len = desc->len;
+			desc_gaddr = desc->addr;
+			desc_addr = rte_vhost_va_from_guest_pa(
+					dev->mem, desc_gaddr, &desc_chunck_len);
 			if (unlikely(!desc_addr))
 				return -1;
 
 			desc_offset = 0;
 			desc_avail  = desc->len;
+		} else if (unlikely(desc_chunck_len == 0)) {
+			desc_chunck_len = desc_avail;
+			desc_gaddr += desc_offset;
+			desc_addr = rte_vhost_va_from_guest_pa(dev->mem,
+					desc_gaddr,
+					&desc_chunck_len);
+			if (unlikely(!desc_addr))
+				return -1;
+
+			desc_offset = 0;
 		}
 
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+		cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
 		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
 			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
 			cpy_len);
@@ -150,6 +200,7 @@ enqueue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 		mbuf_offset += cpy_len;
 		desc_avail  -= cpy_len;
 		desc_offset += cpy_len;
+		desc_chunck_len -= cpy_len;
 	}
 
 	return 0;
@@ -223,8 +274,9 @@ dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	    struct rte_mempool *mbuf_pool)
 {
 	struct vring_desc *desc;
-	uint64_t desc_addr;
+	uint64_t desc_addr, desc_gaddr;
 	uint32_t desc_avail, desc_offset;
+	uint64_t desc_chunck_len;
 	uint32_t mbuf_avail, mbuf_offset;
 	uint32_t cpy_len;
 	struct rte_mbuf *cur = m, *prev = m;
@@ -236,7 +288,10 @@ dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 			(desc->flags & VRING_DESC_F_INDIRECT))
 		return -1;
 
-	desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
+	desc_chunck_len = desc->len;
+	desc_gaddr = desc->addr;
+	desc_addr = rte_vhost_va_from_guest_pa(
+			dev->mem, desc_gaddr, &desc_chunck_len);
 	if (unlikely(!desc_addr))
 		return -1;
 
@@ -250,7 +305,10 @@ dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	 * header.
 	 */
 	desc = &vr->desc[desc->next];
-	desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
+	desc_chunck_len = desc->len;
+	desc_gaddr = desc->addr;
+	desc_addr = rte_vhost_va_from_guest_pa(
+			dev->mem, desc_gaddr, &desc_chunck_len);
 	if (unlikely(!desc_addr))
 		return -1;
 	rte_prefetch0((void *)(uintptr_t)desc_addr);
@@ -262,7 +320,7 @@ dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 	mbuf_offset = 0;
 	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
 	while (1) {
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+		cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
 		rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
 			(void *)((uintptr_t)(desc_addr + desc_offset)),
 			cpy_len);
@@ -272,6 +330,7 @@ dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 		mbuf_offset += cpy_len;
 		desc_avail  -= cpy_len;
 		desc_offset += cpy_len;
+		desc_chunck_len -= cpy_len;
 
 		/* This desc reaches to its end, get the next one */
 		if (desc_avail == 0) {
@@ -283,13 +342,26 @@ dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr,
 				return -1;
 
 			desc = &vr->desc[desc->next];
-			desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
+			desc_chunck_len = desc->len;
+			desc_gaddr = desc->addr;
+			desc_addr = rte_vhost_va_from_guest_pa(
+					dev->mem, desc_gaddr, &desc_chunck_len);
 			if (unlikely(!desc_addr))
 				return -1;
 			rte_prefetch0((void *)(uintptr_t)desc_addr);
 
 			desc_offset = 0;
 			desc_avail  = desc->len;
+		} else if (unlikely(desc_chunck_len == 0)) {
+			desc_chunck_len = desc_avail;
+			desc_gaddr += desc_offset;
+			desc_addr = rte_vhost_va_from_guest_pa(dev->mem,
+					desc_gaddr,
+					&desc_chunck_len);
+			if (unlikely(!desc_addr))
+				return -1;
+
+			desc_offset = 0;
 		}
 
 		/*
diff --git a/examples/vhost_scsi/vhost_scsi.c b/examples/vhost_scsi/vhost_scsi.c
index b4f1f8d2..b40f9936 100644
--- a/examples/vhost_scsi/vhost_scsi.c
+++ b/examples/vhost_scsi/vhost_scsi.c
@@ -68,7 +68,7 @@ vhost_scsi_ctrlr_find(__rte_unused const char *ctrlr_name)
 	return g_vhost_ctrlr;
 }
 
-static uint64_t gpa_to_vva(int vid, uint64_t gpa)
+static uint64_t gpa_to_vva(int vid, uint64_t gpa, uint64_t *len)
 {
 	char path[PATH_MAX];
 	struct vhost_scsi_ctrlr *ctrlr;
@@ -88,7 +88,7 @@ static uint64_t gpa_to_vva(int vid, uint64_t gpa)
 
 	assert(ctrlr->mem != NULL);
 
-	return rte_vhost_gpa_to_vva(ctrlr->mem, gpa);
+	return rte_vhost_va_from_guest_pa(ctrlr->mem, gpa, len);
 }
 
 static struct vring_desc *
@@ -138,15 +138,29 @@ static void
 vhost_process_read_payload_chain(struct vhost_scsi_task *task)
 {
 	void *data;
+	uint64_t chunck_len;
 
 	task->iovs_cnt = 0;
+	chunck_len = task->desc->len;
 	task->resp = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
-						   task->desc->addr);
+						   task->desc->addr,
+						   &chunck_len);
+	if (!task->resp || chunck_len != task->desc->len) {
+		fprintf(stderr, "failed to translate desc address.\n");
+		return;
+	}
 
 	while (descriptor_has_next(task->desc)) {
 		task->desc = descriptor_get_next(task->vq->desc, task->desc);
+		chunck_len = task->desc->len;
 		data = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
-						     task->desc->addr);
+						     task->desc->addr,
+						     &chunck_len);
+		if (!data || chunck_len != task->desc->len) {
+			fprintf(stderr, "failed to translate desc address.\n");
+			return;
+		}
+
 		task->iovs[task->iovs_cnt].iov_base = data;
 		task->iovs[task->iovs_cnt].iov_len = task->desc->len;
 		task->data_len += task->desc->len;
@@ -158,12 +172,20 @@ static void
 vhost_process_write_payload_chain(struct vhost_scsi_task *task)
 {
 	void *data;
+	uint64_t chunck_len;
 
 	task->iovs_cnt = 0;
 
 	do {
+		chunck_len = task->desc->len;
 		data = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
-						     task->desc->addr);
+						     task->desc->addr,
+						     &chunck_len);
+		if (!data || chunck_len != task->desc->len) {
+			fprintf(stderr, "failed to translate desc address.\n");
+			return;
+		}
+
 		task->iovs[task->iovs_cnt].iov_base = data;
 		task->iovs[task->iovs_cnt].iov_len = task->desc->len;
 		task->data_len += task->desc->len;
@@ -171,8 +193,12 @@ vhost_process_write_payload_chain(struct vhost_scsi_task *task)
 		task->desc = descriptor_get_next(task->vq->desc, task->desc);
 	} while (descriptor_has_next(task->desc));
 
+	chunck_len = task->desc->len;
 	task->resp = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
-						   task->desc->addr);
+						   task->desc->addr,
+						   &chunck_len);
+	if (!task->resp || chunck_len != task->desc->len)
+		fprintf(stderr, "failed to translate desc address.\n");
 }
 
 static struct vhost_block_dev *
@@ -218,6 +244,7 @@ process_requestq(struct vhost_scsi_ctrlr *ctrlr, uint32_t q_idx)
 	int req_idx;
 	uint16_t last_idx;
 	struct vhost_scsi_task *task;
+	uint64_t chunck_len;
 
 	last_idx = scsi_vq->last_used_idx & (vq->size - 1);
 	req_idx = vq->avail->ring[last_idx];
@@ -235,16 +262,27 @@ process_requestq(struct vhost_scsi_ctrlr *ctrlr, uint32_t q_idx)
 	assert((task->desc->flags & VRING_DESC_F_INDIRECT) == 0);
 	scsi_vq->last_used_idx++;
 
+	chunck_len = task->desc->len;
 	task->req = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
-						  task->desc->addr);
+						  task->desc->addr,
+						  &chunck_len);
+	if (!task->req || chunck_len != task->desc->len) {
+		fprintf(stderr, "failed to translate desc address.\n");
+		return;
+	}
 
 	task->desc = descriptor_get_next(task->vq->desc, task->desc);
 	if (!descriptor_has_next(task->desc)) {
 		task->dxfer_dir = SCSI_DIR_NONE;
+		chunck_len = task->desc->len;
 		task->resp = (void *)(uintptr_t)
 				      gpa_to_vva(task->bdev->vid,
-						 task->desc->addr);
-
+						 task->desc->addr,
+						 &chunck_len);
+		if (!task->resp || chunck_len != task->desc->len) {
+			fprintf(stderr, "failed to translate desc address.\n");
+			return;
+		}
 	} else if (!descriptor_is_wr(task->desc)) {
 		task->dxfer_dir = SCSI_DIR_TO_DEV;
 		vhost_process_write_payload_chain(task);
diff --git a/lib/librte_eal/common/include/rte_version.h b/lib/librte_eal/common/include/rte_version.h
index a0611f3d..4616a080 100644
--- a/lib/librte_eal/common/include/rte_version.h
+++ b/lib/librte_eal/common/include/rte_version.h
@@ -66,7 +66,7 @@ extern "C" {
 /**
  * Patch level number i.e. the z in yy.mm.z
  */
-#define RTE_VER_MINOR 1
+#define RTE_VER_MINOR 2
 
 /**
  * Extra string to be appended to version number
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index f6536449..3fc6034d 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -117,6 +117,11 @@ struct vhost_device_ops {
 /**
  * Convert guest physical address to host virtual address
  *
+ * This function is deprecated because unsafe.
+ * New rte_vhost_va_from_guest_pa() should be used instead to ensure
+ * guest physical ranges are fully and contiguously mapped into
+ * process virtual address space.
+ *
  * @param mem
  *  the guest memory regions
  * @param gpa
@@ -124,6 +129,7 @@ struct vhost_device_ops {
  * @return
  *  the host virtual address on success, 0 on failure
  */
+__rte_deprecated
 static __rte_always_inline uint64_t
 rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa)
 {
@@ -142,6 +148,46 @@ rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa)
 	return 0;
 }
 
+/**
+ * Convert guest physical address to host virtual address safely
+ *
+ * This variant of rte_vhost_gpa_to_vva() takes care all the
+ * requested length is mapped and contiguous in process address
+ * space.
+ *
+ * @param mem
+ *  the guest memory regions
+ * @param gpa
+ *  the guest physical address for querying
+ * @param len
+ *  the size of the requested area to map, updated with actual size mapped
+ * @return
+ *  the host virtual address on success, 0 on failure
+ */
+static __rte_always_inline uint64_t
+rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem,
+						   uint64_t gpa, uint64_t *len)
+{
+	struct rte_vhost_mem_region *r;
+	uint32_t i;
+
+	for (i = 0; i < mem->nregions; i++) {
+		r = &mem->regions[i];
+		if (gpa >= r->guest_phys_addr &&
+		    gpa <  r->guest_phys_addr + r->size) {
+
+			if (unlikely(*len > r->guest_phys_addr + r->size - gpa))
+				*len = r->guest_phys_addr + r->size - gpa;
+
+			return gpa - r->guest_phys_addr +
+			       r->host_user_addr;
+		}
+	}
+	*len = 0;
+
+	return 0;
+}
+
 #define RTE_VHOST_NEED_LOG(features)	((features) & (1ULL << VHOST_F_LOG_ALL))
 
 /**
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index 1e704953..9cb1d8ca 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -52,3 +52,9 @@ DPDK_17.08 {
 	rte_vhost_rx_queue_count;
 
 } DPDK_17.05;
+
+DPDK_17.11.2 {
+	global;
+
+	rte_vhost_va_from_guest_pa;
+} DPDK_17.08;
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 51ea720a..a8ed40b1 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -58,17 +58,17 @@ struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
 /* Called with iotlb_lock read-locked */
 uint64_t
 __vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		    uint64_t iova, uint64_t size, uint8_t perm)
+		    uint64_t iova, uint64_t *size, uint8_t perm)
 {
 	uint64_t vva, tmp_size;
 
-	if (unlikely(!size))
+	if (unlikely(!*size))
 		return 0;
 
-	tmp_size = size;
+	tmp_size = *size;
 
 	vva = vhost_user_iotlb_cache_find(vq, iova, &tmp_size, perm);
-	if (tmp_size == size)
+	if (tmp_size == *size)
 		return vva;
 
 	iova += tmp_size;
@@ -158,32 +158,39 @@ free_device(struct virtio_net *dev)
 int
 vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
-	uint64_t size;
+	uint64_t req_size, size;
 
 	if (!(dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)))
 		goto out;
 
-	size = sizeof(struct vring_desc) * vq->size;
+	req_size = sizeof(struct vring_desc) * vq->size;
+	size = req_size;
 	vq->desc = (struct vring_desc *)(uintptr_t)vhost_iova_to_vva(dev, vq,
 						vq->ring_addrs.desc_user_addr,
-						size, VHOST_ACCESS_RW);
-	if (!vq->desc)
+						&size, VHOST_ACCESS_RW);
+	if (!vq->desc || size != req_size)
 		return -1;
 
-	size = sizeof(struct vring_avail);
-	size += sizeof(uint16_t) * vq->size;
+	req_size = sizeof(struct vring_avail);
+	req_size += sizeof(uint16_t) * vq->size;
+	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
+		req_size += sizeof(uint16_t);
+	size = req_size;
 	vq->avail = (struct vring_avail *)(uintptr_t)vhost_iova_to_vva(dev, vq,
 						vq->ring_addrs.avail_user_addr,
-						size, VHOST_ACCESS_RW);
-	if (!vq->avail)
+						&size, VHOST_ACCESS_RW);
+	if (!vq->avail || size != req_size)
 		return -1;
 
-	size = sizeof(struct vring_used);
-	size += sizeof(struct vring_used_elem) * vq->size;
+	req_size = sizeof(struct vring_used);
+	req_size += sizeof(struct vring_used_elem) * vq->size;
+	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
+		req_size += sizeof(uint16_t);
+	size = req_size;
 	vq->used = (struct vring_used *)(uintptr_t)vhost_iova_to_vva(dev, vq,
 						vq->ring_addrs.used_user_addr,
-						size, VHOST_ACCESS_RW);
-	if (!vq->used)
+						&size, VHOST_ACCESS_RW);
+	if (!vq->used || size != req_size)
 		return -1;
 
 out:
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index c8f2a817..16d6b891 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -381,18 +381,18 @@ struct vhost_device_ops const *vhost_driver_callback_get(const char *path);
 void vhost_backend_cleanup(struct virtio_net *dev);
 
 uint64_t __vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
-			uint64_t iova, uint64_t size, uint8_t perm);
+			uint64_t iova, uint64_t *len, uint8_t perm);
 int vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq);
 void vring_invalidate(struct virtio_net *dev, struct vhost_virtqueue *vq);
 
 static __rte_always_inline uint64_t
 vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
-			uint64_t iova, uint64_t size, uint8_t perm)
+			uint64_t iova, uint64_t *len, uint8_t perm)
 {
 	if (!(dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)))
-		return rte_vhost_gpa_to_vva(dev->mem, iova);
+		return rte_vhost_va_from_guest_pa(dev->mem, iova, len);
 
-	return __vhost_iova_to_vva(dev, vq, iova, size, perm);
+	return __vhost_iova_to_vva(dev, vq, iova, len, perm);
 }
 
 #endif /* _VHOST_NET_CDEV_H_ */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index 3acaacf5..50e654db 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -329,21 +329,26 @@ numa_realloc(struct virtio_net *dev, int index __rte_unused)
 
 /* Converts QEMU virtual address to Vhost virtual address. */
 static uint64_t
-qva_to_vva(struct virtio_net *dev, uint64_t qva)
+qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len)
 {
-	struct rte_vhost_mem_region *reg;
+	struct rte_vhost_mem_region *r;
 	uint32_t i;
 
 	/* Find the region where the address lives. */
 	for (i = 0; i < dev->mem->nregions; i++) {
-		reg = &dev->mem->regions[i];
+		r = &dev->mem->regions[i];
+
+		if (qva >= r->guest_user_addr &&
+		    qva <  r->guest_user_addr + r->size) {
+
+			if (unlikely(*len > r->guest_user_addr + r->size - qva))
+				*len = r->guest_user_addr + r->size - qva;
 
-		if (qva >= reg->guest_user_addr &&
-		    qva <  reg->guest_user_addr + reg->size) {
-			return qva - reg->guest_user_addr +
-			       reg->host_user_addr;
+			return qva - r->guest_user_addr +
+			       r->host_user_addr;
 		}
 	}
+	*len = 0;
 
 	return 0;
 }
@@ -356,20 +361,20 @@ qva_to_vva(struct virtio_net *dev, uint64_t qva)
  */
 static uint64_t
 ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		uint64_t ra, uint64_t size)
+		uint64_t ra, uint64_t *size)
 {
 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
 		uint64_t vva;
 
 		vva = vhost_user_iotlb_cache_find(vq, ra,
-					&size, VHOST_ACCESS_RW);
+					size, VHOST_ACCESS_RW);
 		if (!vva)
 			vhost_user_iotlb_miss(dev, ra, VHOST_ACCESS_RW);
 
 		return vva;
 	}
 
-	return qva_to_vva(dev, ra);
+	return qva_to_vva(dev, ra, size);
 }
 
 static struct virtio_net *
@@ -377,16 +382,18 @@ translate_ring_addresses(struct virtio_net *dev, int vq_index)
 {
 	struct vhost_virtqueue *vq = dev->virtqueue[vq_index];
 	struct vhost_vring_addr *addr = &vq->ring_addrs;
+	uint64_t len;
 
 	/* The addresses are converted from QEMU virtual to Vhost virtual. */
 	if (vq->desc && vq->avail && vq->used)
 		return dev;
 
+	len = sizeof(struct vring_desc) * vq->size;
 	vq->desc = (struct vring_desc *)(uintptr_t)ring_addr_to_vva(dev,
-			vq, addr->desc_user_addr, sizeof(struct vring_desc));
-	if (vq->desc == 0) {
+			vq, addr->desc_user_addr, &len);
+	if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) {
 		RTE_LOG(DEBUG, VHOST_CONFIG,
-			"(%d) failed to find desc ring address.\n",
+			"(%d) failed to map desc ring.\n",
 			dev->vid);
 		return dev;
 	}
@@ -395,20 +402,26 @@ translate_ring_addresses(struct virtio_net *dev, int vq_index)
 	vq = dev->virtqueue[vq_index];
 	addr = &vq->ring_addrs;
 
+	len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
 	vq->avail = (struct vring_avail *)(uintptr_t)ring_addr_to_vva(dev,
-			vq, addr->avail_user_addr, sizeof(struct vring_avail));
-	if (vq->avail == 0) {
+			vq, addr->avail_user_addr, &len);
+	if (vq->avail == 0 ||
+			len != sizeof(struct vring_avail) +
+			sizeof(uint16_t) * vq->size) {
 		RTE_LOG(DEBUG, VHOST_CONFIG,
-			"(%d) failed to find avail ring address.\n",
+			"(%d) failed to map avail ring.\n",
 			dev->vid);
 		return dev;
 	}
 
+	len = sizeof(struct vring_used) +
+		sizeof(struct vring_used_elem) * vq->size;
 	vq->used = (struct vring_used *)(uintptr_t)ring_addr_to_vva(dev,
-			vq, addr->used_user_addr, sizeof(struct vring_used));
-	if (vq->used == 0) {
+			vq, addr->used_user_addr, &len);
+	if (vq->used == 0 || len != sizeof(struct vring_used) +
+			sizeof(struct vring_used_elem) * vq->size) {
 		RTE_LOG(DEBUG, VHOST_CONFIG,
-			"(%d) failed to find used ring address.\n",
+			"(%d) failed to map used ring.\n",
 			dev->vid);
 		return dev;
 	}
@@ -1094,11 +1107,12 @@ vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg)
 	struct virtio_net *dev = *pdev;
 	struct vhost_iotlb_msg *imsg = &msg->payload.iotlb;
 	uint16_t i;
-	uint64_t vva;
+	uint64_t vva, len;
 
 	switch (imsg->type) {
 	case VHOST_IOTLB_UPDATE:
-		vva = qva_to_vva(dev, imsg->uaddr);
+		len = imsg->size;
+		vva = qva_to_vva(dev, imsg->uaddr, &len);
 		if (!vva)
 			return -1;
 
@@ -1106,7 +1120,7 @@ vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg)
 			struct vhost_virtqueue *vq = dev->virtqueue[i];
 
 			vhost_user_iotlb_cache_insert(vq, imsg->iova, vva,
-					imsg->size, imsg->perm);
+					len, imsg->perm);
 
 			if (is_vring_iotlb_update(vq, imsg))
 				*pdev = dev = translate_ring_addresses(dev, i);
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index d3470307..a013c07b 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -45,6 +45,7 @@
 #include <rte_sctp.h>
 #include <rte_arp.h>
 #include <rte_spinlock.h>
+#include <rte_malloc.h>
 
 #include "iotlb.h"
 #include "vhost.h"
@@ -59,6 +60,46 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
 	return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
 }
 
+static __rte_always_inline struct vring_desc *
+alloc_copy_ind_table(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		struct vring_desc *desc)
+{
+	struct vring_desc *idesc;
+	uint64_t src, dst;
+	uint64_t len, remain = desc->len;
+	uint64_t desc_addr = desc->addr;
+
+	idesc = rte_malloc(__func__, desc->len, 0);
+	if (unlikely(!idesc))
+		return 0;
+
+	dst = (uint64_t)(uintptr_t)idesc;
+
+	while (remain) {
+		len = remain;
+		src = vhost_iova_to_vva(dev, vq, desc_addr, &len,
+				VHOST_ACCESS_RO);
+		if (unlikely(!src || !len)) {
+			rte_free(idesc);
+			return 0;
+		}
+
+		rte_memcpy((void *)(uintptr_t)dst, (void *)(uintptr_t)src, len);
+
+		remain -= len;
+		dst += len;
+		desc_addr += len;
+	}
+
+	return idesc;
+}
+
+static __rte_always_inline void
+free_ind_table(struct vring_desc *idesc)
+{
+	rte_free(idesc);
+}
+
 static __rte_always_inline void
 do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			  uint16_t to, uint16_t from, uint16_t size)
@@ -204,8 +245,9 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint32_t desc_avail, desc_offset;
 	uint32_t mbuf_avail, mbuf_offset;
 	uint32_t cpy_len;
+	uint64_t desc_chunck_len;
 	struct vring_desc *desc;
-	uint64_t desc_addr;
+	uint64_t desc_addr, desc_gaddr;
 	/* A counter to avoid desc dead loop chain */
 	uint16_t nr_desc = 1;
 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
@@ -213,8 +255,10 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	int error = 0;
 
 	desc = &descs[desc_idx];
-	desc_addr = vhost_iova_to_vva(dev, vq, desc->addr,
-					desc->len, VHOST_ACCESS_RW);
+	desc_chunck_len = desc->len;
+	desc_gaddr = desc->addr;
+	desc_addr = vhost_iova_to_vva(dev, vq, desc_gaddr,
+					&desc_chunck_len, VHOST_ACCESS_RW);
 	/*
 	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
 	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
@@ -227,12 +271,58 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 	rte_prefetch0((void *)(uintptr_t)desc_addr);
 
-	virtio_enqueue_offload(m, (struct virtio_net_hdr *)(uintptr_t)desc_addr);
-	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
+	if (likely(desc_chunck_len >= dev->vhost_hlen)) {
+		virtio_enqueue_offload(m,
+				(struct virtio_net_hdr *)(uintptr_t)desc_addr);
+		PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
+		vhost_log_write(dev, desc_gaddr, dev->vhost_hlen);
+	} else {
+		struct virtio_net_hdr vnet_hdr;
+		uint64_t remain = dev->vhost_hlen;
+		uint64_t len;
+		uint64_t src = (uint64_t)(uintptr_t)&vnet_hdr, dst;
+		uint64_t guest_addr = desc_gaddr;
+
+		virtio_enqueue_offload(m, &vnet_hdr);
+
+		while (remain) {
+			len = remain;
+			dst = vhost_iova_to_vva(dev, vq, guest_addr,
+					&len, VHOST_ACCESS_RW);
+			if (unlikely(!dst || !len)) {
+				error = -1;
+				goto out;
+			}
+
+			rte_memcpy((void *)(uintptr_t)dst,
+					(void *)(uintptr_t)src, len);
+
+			PRINT_PACKET(dev, (uintptr_t)dst, len, 0);
+			vhost_log_write(dev, guest_addr, len);
+			remain -= len;
+			guest_addr += len;
+			dst += len;
+		}
+	}
 
-	desc_offset = dev->vhost_hlen;
 	desc_avail  = desc->len - dev->vhost_hlen;
+	if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
+		desc_chunck_len = desc_avail;
+		desc_gaddr = desc->addr + dev->vhost_hlen;
+		desc_addr = vhost_iova_to_vva(dev,
+				vq, desc_gaddr,
+				&desc_chunck_len,
+				VHOST_ACCESS_RW);
+		if (unlikely(!desc_addr)) {
+			error = -1;
+			goto out;
+		}
+
+		desc_offset = 0;
+	} else {
+		desc_offset = dev->vhost_hlen;
+		desc_chunck_len -= dev->vhost_hlen;
+	}
 
 	mbuf_avail  = rte_pktmbuf_data_len(m);
 	mbuf_offset = 0;
@@ -258,8 +348,10 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			}
 
 			desc = &descs[desc->next];
-			desc_addr = vhost_iova_to_vva(dev, vq, desc->addr,
-							desc->len,
+			desc_chunck_len = desc->len;
+			desc_gaddr = desc->addr;
+			desc_addr = vhost_iova_to_vva(dev, vq, desc_gaddr,
+							&desc_chunck_len,
 							VHOST_ACCESS_RW);
 			if (unlikely(!desc_addr)) {
 				error = -1;
@@ -268,15 +360,26 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 			desc_offset = 0;
 			desc_avail  = desc->len;
+		} else if (unlikely(desc_chunck_len == 0)) {
+			desc_chunck_len = desc_avail;
+			desc_gaddr += desc_offset;
+			desc_addr = vhost_iova_to_vva(dev,
+					vq, desc_gaddr,
+					&desc_chunck_len, VHOST_ACCESS_RW);
+			if (unlikely(!desc_addr)) {
+				error = -1;
+				goto out;
+			}
+			desc_offset = 0;
 		}
 
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+		cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
 		if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
 			rte_memcpy((void *)((uintptr_t)(desc_addr +
 							desc_offset)),
 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
 				cpy_len);
-			vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
+			vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
 			PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
 				     cpy_len, 0);
 		} else {
@@ -284,7 +387,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 				(void *)((uintptr_t)(desc_addr + desc_offset));
 			batch_copy[copy_nb].src =
 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
-			batch_copy[copy_nb].log_addr = desc->addr + desc_offset;
+			batch_copy[copy_nb].log_addr = desc_gaddr + desc_offset;
 			batch_copy[copy_nb].len = cpy_len;
 			copy_nb++;
 		}
@@ -293,6 +396,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		mbuf_offset += cpy_len;
 		desc_avail  -= cpy_len;
 		desc_offset += cpy_len;
+		desc_chunck_len -= cpy_len;
 	}
 
 out:
@@ -371,20 +475,34 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 	rte_prefetch0(&vq->desc[desc_indexes[0]]);
 
 	for (i = 0; i < count; i++) {
+		struct vring_desc *idesc = NULL;
 		uint16_t desc_idx = desc_indexes[i];
 		int err;
 
 		if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) {
+			uint64_t dlen = vq->desc[desc_idx].len;
 			descs = (struct vring_desc *)(uintptr_t)
 				vhost_iova_to_vva(dev,
 						vq, vq->desc[desc_idx].addr,
-						vq->desc[desc_idx].len,
-						VHOST_ACCESS_RO);
+						&dlen, VHOST_ACCESS_RO);
 			if (unlikely(!descs)) {
 				count = i;
 				break;
 			}
 
+			if (unlikely(dlen < vq->desc[desc_idx].len)) {
+				/*
+				 * The indirect desc table is not contiguous
+				 * in process VA space, we have to copy it.
+				 */
+				idesc = alloc_copy_ind_table(dev, vq,
+							&vq->desc[desc_idx]);
+				if (unlikely(!idesc))
+					break;
+
+				descs = idesc;
+			}
+
 			desc_idx = 0;
 			sz = vq->desc[desc_idx].len / sizeof(*descs);
 		} else {
@@ -395,11 +513,15 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 		err = copy_mbuf_to_desc(dev, vq, descs, pkts[i], desc_idx, sz);
 		if (unlikely(err)) {
 			count = i;
+			free_ind_table(idesc);
 			break;
 		}
 
 		if (i + 1 < count)
 			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
+
+		if (unlikely(!!idesc))
+			free_ind_table(idesc);
 	}
 
 	do_data_copy_enqueue(dev, vq);
@@ -438,24 +560,41 @@ fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
 	uint32_t vec_id = *vec_idx;
 	uint32_t len    = 0;
+	uint64_t dlen;
 	struct vring_desc *descs = vq->desc;
+	struct vring_desc *idesc = NULL;
 
 	*desc_chain_head = idx;
 
 	if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
+		dlen = vq->desc[idx].len;
 		descs = (struct vring_desc *)(uintptr_t)
 			vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
-						vq->desc[idx].len,
+						&dlen,
 						VHOST_ACCESS_RO);
 		if (unlikely(!descs))
 			return -1;
 
+		if (unlikely(dlen < vq->desc[idx].len)) {
+			/*
+			 * The indirect desc table is not contiguous
+			 * in process VA space, we have to copy it.
+			 */
+			idesc = alloc_copy_ind_table(dev, vq, &vq->desc[idx]);
+			if (unlikely(!idesc))
+				return -1;
+
+			descs = idesc;
+		}
+
 		idx = 0;
 	}
 
 	while (1) {
-		if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
+		if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size)) {
+			free_ind_table(idesc);
 			return -1;
+		}
 
 		len += descs[idx].len;
 		buf_vec[vec_id].buf_addr = descs[idx].addr;
@@ -472,6 +611,9 @@ fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	*desc_chain_len = len;
 	*vec_idx = vec_id;
 
+	if (unlikely(!!idesc))
+		free_ind_table(idesc);
+
 	return 0;
 }
@@ -526,13 +668,15 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			    uint16_t num_buffers)
 {
 	uint32_t vec_idx = 0;
-	uint64_t desc_addr;
+	uint64_t desc_addr, desc_gaddr;
 	uint32_t mbuf_offset, mbuf_avail;
 	uint32_t desc_offset, desc_avail;
 	uint32_t cpy_len;
+	uint64_t desc_chunck_len;
 	uint64_t hdr_addr, hdr_phys_addr;
 	struct rte_mbuf *hdr_mbuf;
 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
+	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
 	uint16_t copy_nb = vq->batch_copy_nb_elems;
 	int error = 0;
@@ -541,9 +685,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		goto out;
 	}
 
-	desc_addr = vhost_iova_to_vva(dev, vq, buf_vec[vec_idx].buf_addr,
-					buf_vec[vec_idx].buf_len,
-					VHOST_ACCESS_RW);
+	desc_chunck_len = buf_vec[vec_idx].buf_len;
+	desc_gaddr = buf_vec[vec_idx].buf_addr;
+	desc_addr = vhost_iova_to_vva(dev, vq,
+					desc_gaddr,
+					&desc_chunck_len,
+					VHOST_ACCESS_RW);
 	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) {
 		error = -1;
 		goto out;
@@ -551,14 +698,35 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 	hdr_mbuf = m;
 	hdr_addr = desc_addr;
-	hdr_phys_addr = buf_vec[vec_idx].buf_addr;
+	if (unlikely(desc_chunck_len < dev->vhost_hlen))
+		hdr = &tmp_hdr;
+	else
+		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
+	hdr_phys_addr = desc_gaddr;
 	rte_prefetch0((void *)(uintptr_t)hdr_addr);
 
 	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
 		dev->vid, num_buffers);
 
 	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
-	desc_offset = dev->vhost_hlen;
+	if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
+		desc_chunck_len = desc_avail;
+		desc_gaddr += dev->vhost_hlen;
+		desc_addr = vhost_iova_to_vva(dev, vq,
+				desc_gaddr,
+				&desc_chunck_len,
+				VHOST_ACCESS_RW);
+		if (unlikely(!desc_addr)) {
+			error = -1;
+			goto out;
+		}
+
+		desc_offset = 0;
+	} else {
+		desc_offset = dev->vhost_hlen;
+		desc_chunck_len -= dev->vhost_hlen;
+	}
+
 
 	mbuf_avail  = rte_pktmbuf_data_len(m);
 	mbuf_offset = 0;
@@ -566,10 +734,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		/* done with current desc buf, get the next one */
 		if (desc_avail == 0) {
 			vec_idx++;
+			desc_chunck_len = buf_vec[vec_idx].buf_len;
+			desc_gaddr = buf_vec[vec_idx].buf_addr;
 			desc_addr = vhost_iova_to_vva(dev, vq,
-					buf_vec[vec_idx].buf_addr,
-					buf_vec[vec_idx].buf_len,
+					desc_gaddr,
+					&desc_chunck_len,
 					VHOST_ACCESS_RW);
 			if (unlikely(!desc_addr)) {
 				error = -1;
@@ -580,6 +750,17 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			rte_prefetch0((void *)(uintptr_t)desc_addr);
 			desc_offset = 0;
 			desc_avail  = buf_vec[vec_idx].buf_len;
+		} else if (unlikely(desc_chunck_len == 0)) {
+			desc_chunck_len = desc_avail;
+			desc_gaddr += desc_offset;
+			desc_addr = vhost_iova_to_vva(dev, vq,
+					desc_gaddr,
+					&desc_chunck_len, VHOST_ACCESS_RW);
+			if (unlikely(!desc_addr)) {
+				error = -1;
+				goto out;
+			}
+			desc_offset = 0;
 		}
 
 		/* done with current mbuf, get the next one */
@@ -591,30 +772,55 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		}
 
 		if (hdr_addr) {
-			struct virtio_net_hdr_mrg_rxbuf *hdr;
-
-			hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)
-				hdr_addr;
 			virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
 			ASSIGN_UNLESS_EQUAL(hdr->num_buffers, num_buffers);
 
-			vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen);
-			PRINT_PACKET(dev, (uintptr_t)hdr_addr,
-				     dev->vhost_hlen, 0);
+			if (unlikely(hdr == &tmp_hdr)) {
+				uint64_t len;
+				uint64_t remain = dev->vhost_hlen;
+				uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
+				uint64_t guest_addr = hdr_phys_addr;
+
+				while (remain) {
+					len = remain;
+					dst = vhost_iova_to_vva(dev, vq,
+							guest_addr, &len,
+							VHOST_ACCESS_RW);
+					if (unlikely(!dst || !len)) {
+						error = -1;
+						goto out;
+					}
+
+					rte_memcpy((void *)(uintptr_t)dst,
+							(void *)(uintptr_t)src,
+							len);
+
+					PRINT_PACKET(dev, (uintptr_t)dst,
+							len, 0);
+					vhost_log_write(dev, guest_addr, len);
+
+					remain -= len;
+					guest_addr += len;
+					dst += len;
+				}
+			} else {
+				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
+						dev->vhost_hlen, 0);
+				vhost_log_write(dev, hdr_phys_addr,
+						dev->vhost_hlen);
+			}
 
 			hdr_addr = 0;
 		}
 
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+		cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
 
 		if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
 			rte_memcpy((void *)((uintptr_t)(desc_addr +
 							desc_offset)),
 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
 				cpy_len);
-			vhost_log_write(dev,
-				buf_vec[vec_idx].buf_addr + desc_offset,
-				cpy_len);
+			vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
 			PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
 				cpy_len, 0);
 		} else {
@@ -622,8 +828,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 				(void *)((uintptr_t)(desc_addr + desc_offset));
 			batch_copy[copy_nb].src =
 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
-			batch_copy[copy_nb].log_addr =
-				buf_vec[vec_idx].buf_addr + desc_offset;
+			batch_copy[copy_nb].log_addr = desc_gaddr + desc_offset;
 			batch_copy[copy_nb].len = cpy_len;
 			copy_nb++;
 		}
@@ -632,6 +837,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		mbuf_offset += cpy_len;
 		desc_avail  -= cpy_len;
 		desc_offset += cpy_len;
+		desc_chunck_len -= cpy_len;
 	}
 
 out:
@@ -907,11 +1113,13 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		  struct rte_mempool *mbuf_pool)
 {
 	struct vring_desc *desc;
-	uint64_t desc_addr;
+	uint64_t desc_addr, desc_gaddr;
 	uint32_t desc_avail, desc_offset;
 	uint32_t mbuf_avail, mbuf_offset;
 	uint32_t cpy_len;
+	uint64_t desc_chunck_len;
 	struct rte_mbuf *cur = m, *prev = m;
+	struct virtio_net_hdr tmp_hdr;
 	struct virtio_net_hdr *hdr = NULL;
 	/* A counter to avoid desc dead loop chain */
 	uint32_t nr_desc = 1;
@@ -926,9 +1134,11 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		goto out;
 	}
 
+	desc_chunck_len = desc->len;
+	desc_gaddr = desc->addr;
 	desc_addr = vhost_iova_to_vva(dev,
-					vq, desc->addr,
-					desc->len,
+					vq, desc_gaddr,
+					&desc_chunck_len,
 					VHOST_ACCESS_RO);
 	if (unlikely(!desc_addr)) {
 		error = -1;
@@ -936,8 +1146,40 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	}
 
 	if (virtio_net_with_host_offload(dev)) {
-		hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
-		rte_prefetch0(hdr);
+		if (unlikely(desc_chunck_len < sizeof(struct virtio_net_hdr))) {
+			uint64_t len = desc_chunck_len;
+			uint64_t remain = sizeof(struct virtio_net_hdr);
+			uint64_t src = desc_addr;
+			uint64_t dst = (uint64_t)(uintptr_t)&tmp_hdr;
+			uint64_t guest_addr = desc_gaddr;
+
+			/*
+			 * No luck, the virtio-net header doesn't fit
+			 * in a contiguous virtual area.
+			 */
+			while (remain) {
+				len = remain;
+				src = vhost_iova_to_vva(dev, vq,
+						guest_addr, &len,
+						VHOST_ACCESS_RO);
+				if (unlikely(!src || !len)) {
+					error = -1;
+					goto out;
+				}
+
+				rte_memcpy((void *)(uintptr_t)dst,
+						(void *)(uintptr_t)src, len);
+
+				guest_addr += len;
+				remain -= len;
+				dst += len;
+			}
+
+			hdr = &tmp_hdr;
+		} else {
+			hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
+			rte_prefetch0(hdr);
+		}
 	}
 
 	/*
@@ -953,9 +1195,11 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			goto out;
 		}
 
+		desc_chunck_len = desc->len;
+		desc_gaddr = desc->addr;
 		desc_addr = vhost_iova_to_vva(dev,
-						vq, desc->addr,
-						desc->len,
+						vq, desc_gaddr,
+						&desc_chunck_len,
 						VHOST_ACCESS_RO);
 		if (unlikely(!desc_addr)) {
 			error = -1;
@@ -967,19 +1211,37 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		nr_desc    += 1;
 	} else {
 		desc_avail  = desc->len - dev->vhost_hlen;
-		desc_offset = dev->vhost_hlen;
+
+		if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
+			desc_chunck_len = desc_avail;
+			desc_gaddr += dev->vhost_hlen;
+			desc_addr = vhost_iova_to_vva(dev,
+					vq, desc_gaddr,
+					&desc_chunck_len,
+					VHOST_ACCESS_RO);
+			if (unlikely(!desc_addr)) {
+				error = -1;
+				goto out;
+			}
+
+			desc_offset = 0;
+		} else {
+			desc_offset = dev->vhost_hlen;
+			desc_chunck_len -= dev->vhost_hlen;
+		}
 	}
 
 	rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset));
 
-	PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), desc_avail, 0);
+	PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
+			desc_chunck_len, 0);
 
 	mbuf_offset = 0;
 	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
 	while (1) {
 		uint64_t hpa;
 
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+		cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
 
 		/*
 		 * A desc buf might across two host physical pages that are
@@ -987,7 +1249,7 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		 * will be copied even though zero copy is enabled.
 		 */
 		if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
-					desc->addr + desc_offset, cpy_len)))) {
+					desc_gaddr + desc_offset, cpy_len)))) {
 			cur->data_len = cpy_len;
 			cur->data_off = 0;
 			cur->buf_addr = (void *)(uintptr_t)(desc_addr
@@ -1002,7 +1264,8 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		} else {
 			if (likely(cpy_len > MAX_BATCH_LEN ||
 				   copy_nb >= vq->size ||
-				   (hdr && cur == m))) {
+				   (hdr && cur == m) ||
+				   desc->len != desc_chunck_len)) {
 				rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
 								   mbuf_offset),
 					   (void *)((uintptr_t)(desc_addr +
@@ -1023,6 +1286,7 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		mbuf_avail  -= cpy_len;
 		mbuf_offset += cpy_len;
 		desc_avail  -= cpy_len;
+		desc_chunck_len -= cpy_len;
 		desc_offset += cpy_len;
 
 		/* This desc reaches to its end, get the next one */
@@ -1041,9 +1305,11 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 				goto out;
 			}
 
+			desc_chunck_len = desc->len;
+			desc_gaddr = desc->addr;
 			desc_addr = vhost_iova_to_vva(dev,
-							vq, desc->addr,
-							desc->len,
+							vq, desc_gaddr,
+							&desc_chunck_len,
 							VHOST_ACCESS_RO);
 			if (unlikely(!desc_addr)) {
 				error = -1;
@@ -1055,7 +1321,23 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			desc_offset = 0;
 			desc_avail  = desc->len;
 
-			PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
+			PRINT_PACKET(dev, (uintptr_t)desc_addr,
+					desc_chunck_len, 0);
+		} else if (unlikely(desc_chunck_len == 0)) {
+			desc_chunck_len = desc_avail;
+			desc_gaddr += desc_offset;
+			desc_addr = vhost_iova_to_vva(dev, vq,
+					desc_gaddr,
+					&desc_chunck_len,
+					VHOST_ACCESS_RO);
+			if (unlikely(!desc_addr)) {
+				error = -1;
+				goto out;
+			}
+			desc_offset = 0;
+
+			PRINT_PACKET(dev, (uintptr_t)desc_addr,
+					desc_chunck_len, 0);
 		}
 
 		/*
@@ -1317,22 +1599,37 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	/* Prefetch descriptor index. */
 	rte_prefetch0(&vq->desc[desc_indexes[0]]);
 	for (i = 0; i < count; i++) {
-		struct vring_desc *desc;
+		struct vring_desc *desc, *idesc = NULL;
 		uint16_t sz, idx;
+		uint64_t dlen;
 		int err;
 
 		if (likely(i + 1 < count))
 			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
 
 		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
+			dlen = vq->desc[desc_indexes[i]].len;
 			desc = (struct vring_desc *)(uintptr_t)
 				vhost_iova_to_vva(dev, vq,
 						vq->desc[desc_indexes[i]].addr,
-						sizeof(*desc),
+						&dlen,
 						VHOST_ACCESS_RO);
 			if (unlikely(!desc))
 				break;
 
+			if (unlikely(dlen < vq->desc[desc_indexes[i]].len)) {
+				/*
+				 * The indirect desc table is not contiguous
+				 * in process VA space, we have to copy it.
+				 */
+				idesc = alloc_copy_ind_table(dev, vq,
+						&vq->desc[desc_indexes[i]]);
+				if (unlikely(!idesc))
+					break;
+
+				desc = idesc;
+			}
+
 			rte_prefetch0(desc);
 			sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
 			idx = 0;
@@ -1346,6 +1643,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 		if (unlikely(pkts[i] == NULL)) {
 			RTE_LOG(ERR, VHOST_DATA,
 				"Failed to allocate memory for mbuf.\n");
+			free_ind_table(idesc);
 			break;
 		}
 
@@ -1353,6 +1651,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 					mbuf_pool);
 		if (unlikely(err)) {
 			rte_pktmbuf_free(pkts[i]);
+			free_ind_table(idesc);
 			break;
 		}
 
@@ -1362,6 +1661,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 			zmbuf = get_zmbuf(vq);
 			if (!zmbuf) {
 				rte_pktmbuf_free(pkts[i]);
+				free_ind_table(idesc);
 				break;
 			}
 			zmbuf->mbuf = pkts[i];
@@ -1378,6 +1678,9 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 			vq->nr_zmbuf += 1;
 			TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
 		}
+
+		if (unlikely(!!idesc))
+			free_ind_table(idesc);
 	}
 
 	vq->last_avail_idx += i;
diff --git a/pkg/dpdk.spec b/pkg/dpdk.spec
index bbb16f1c..8681e4c8 100644
--- a/pkg/dpdk.spec
+++ b/pkg/dpdk.spec
@@ -30,7 +30,7 @@
 # OF THE POSSIBILITY OF SUCH DAMAGE.
 
 Name: dpdk
-Version: 17.11.1
+Version: 17.11.2
 Release: 1
 Packager: packaging@6wind.com
 URL: http://dpdk.org
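For users of the vhost library, the practical consequence of this release is that rte_vhost_gpa_to_vva() is deprecated in favour of rte_vhost_va_from_guest_pa(), which additionally reports how many bytes of the requested range are actually mapped contiguously. Below is a minimal sketch of how an external vhost backend might use the new API; the helper name copy_from_guest() is illustrative and not part of DPDK, but rte_vhost_va_from_guest_pa() is used with the exact signature introduced by the patch above.

```c
#include <stdint.h>
#include <string.h>

#include <rte_vhost.h>

/*
 * Illustrative helper (not a DPDK API): copy 'size' bytes starting at
 * guest physical address 'gpa' into 'dst'.  The translation is redone
 * at every chunk boundary, so guest ranges that are not virtually
 * contiguous in the backend process are still copied safely.
 */
static int
copy_from_guest(struct rte_vhost_memory *mem, void *dst,
		uint64_t gpa, uint64_t size)
{
	uint64_t vva, len;

	while (size) {
		len = size;
		/* 'len' is updated with the length actually mapped. */
		vva = rte_vhost_va_from_guest_pa(mem, gpa, &len);
		if (!vva || !len)
			return -1; /* range not (fully) mapped */

		memcpy(dst, (void *)(uintptr_t)vva, len);

		dst   = (uint8_t *)dst + len;
		gpa  += len;
		size -= len;
	}

	return 0;
}
```

The old single-shot call returned a pointer with no length check at all, which is the root of CVE-2018-1059: a guest could describe a buffer crossing the end of a memory region and have the backend read or write past the mapping. Callers that cannot loop should at least verify, as the vhost_scsi example above does, that the length returned through the pointer argument still equals the length they requested.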