aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
author: Luca Boccassi <luca.boccassi@gmail.com> 2018-04-23 14:16:57 +0100
committer: Luca Boccassi <luca.boccassi@gmail.com> 2018-04-23 14:17:34 +0100
commit: 39157ec04095ab012d11db23c462844634bfbb8f (patch)
tree: 643f83dc46445aa7834fe271ce2c21a5cb278cee /lib
parent: 47d9763a1dd3103d732da9eec350cfc1cd784717 (diff)
New upstream version 16.11.5 (upstream/16.11.5)
Change-Id: I47171042629a57c6958d50251351e668ca5f3d8b
Signed-off-by: Luca Boccassi <luca.boccassi@gmail.com>
Diffstat (limited to 'lib')
-rw-r--r-- lib/librte_eal/bsdapp/contigmem/contigmem.c | 1
-rw-r--r-- lib/librte_eal/bsdapp/eal/eal_memory.c | 2
-rw-r--r-- lib/librte_eal/common/eal_common_memzone.c | 3
-rw-r--r-- lib/librte_eal/common/eal_common_pci_uio.c | 1
-rw-r--r-- lib/librte_eal/common/include/arch/ppc_64/rte_atomic.h | 8
-rw-r--r-- lib/librte_eal/common/include/arch/x86/rte_atomic.h | 44
-rw-r--r-- lib/librte_eal/common/include/rte_debug.h | 2
-rw-r--r-- lib/librte_eal/common/include/rte_version.h | 2
-rw-r--r-- lib/librte_eal/common/malloc_elem.c | 1
-rw-r--r-- lib/librte_eal/common/malloc_heap.c | 6
-rw-r--r-- lib/librte_eal/common/malloc_heap.h | 2
-rw-r--r-- lib/librte_eal/common/rte_keepalive.c | 28
-rw-r--r-- lib/librte_eal/linuxapp/eal/eal_pci.c | 1
-rw-r--r-- lib/librte_eal/linuxapp/eal/eal_vfio.c | 92
-rw-r--r-- lib/librte_eal/linuxapp/eal/eal_vfio.h | 25
-rw-r--r-- lib/librte_eal/linuxapp/igb_uio/compat.h | 4
-rw-r--r-- lib/librte_eal/linuxapp/igb_uio/igb_uio.c | 22
-rw-r--r-- lib/librte_eal/linuxapp/kni/ethtool/igb/igb_main.c | 41
-rw-r--r-- lib/librte_eal/linuxapp/kni/ethtool/igb/kcompat.h | 4
-rw-r--r-- lib/librte_ether/rte_ethdev.c | 1
-rw-r--r-- lib/librte_ether/rte_ethdev.h | 18
-rw-r--r-- lib/librte_lpm/rte_lpm.c | 8
-rw-r--r-- lib/librte_mbuf/rte_mbuf.h | 13
-rw-r--r-- lib/librte_pdump/rte_pdump.c | 4
-rw-r--r-- lib/librte_vhost/socket.c | 4
-rw-r--r-- lib/librte_vhost/vhost.c | 2
-rw-r--r-- lib/librte_vhost/vhost.h | 2
-rw-r--r-- lib/librte_vhost/vhost_user.c | 104
-rw-r--r-- lib/librte_vhost/virtio_net.c | 56
29 files changed, 443 insertions, 58 deletions
diff --git a/lib/librte_eal/bsdapp/contigmem/contigmem.c b/lib/librte_eal/bsdapp/contigmem/contigmem.c
index e8fb9087..9676d8b3 100644
--- a/lib/librte_eal/bsdapp/contigmem/contigmem.c
+++ b/lib/librte_eal/bsdapp/contigmem/contigmem.c
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <sys/rwlock.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
+#include <sys/vmmeter.h>
#include <machine/bus.h>
diff --git a/lib/librte_eal/bsdapp/eal/eal_memory.c b/lib/librte_eal/bsdapp/eal/eal_memory.c
index 3614da8d..248312d9 100644
--- a/lib/librte_eal/bsdapp/eal/eal_memory.c
+++ b/lib/librte_eal/bsdapp/eal/eal_memory.c
@@ -150,7 +150,7 @@ rte_eal_hugepage_attach(void)
/* Map the shared hugepage_info into the process address spaces */
hpi = mmap(NULL, sizeof(struct hugepage_info), PROT_READ, MAP_PRIVATE,
fd_hugepage_info, 0);
- if (hpi == NULL) {
+ if (hpi == MAP_FAILED) {
RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path());
goto error;
}
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 64f4e0ad..b58d85b7 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -236,7 +236,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
return NULL;
}
- const struct malloc_elem *elem = malloc_elem_from_data(mz_addr);
+ struct malloc_elem *elem = malloc_elem_from_data(mz_addr);
/* fill the zone in config */
mz = get_next_free_memzone();
@@ -244,6 +244,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
if (mz == NULL) {
RTE_LOG(ERR, EAL, "%s(): Cannot find free memzone but there is room "
"in config!\n", __func__);
+ malloc_elem_free(elem);
rte_errno = ENOSPC;
return NULL;
}
diff --git a/lib/librte_eal/common/eal_common_pci_uio.c b/lib/librte_eal/common/eal_common_pci_uio.c
index 367a6816..6f91ff9c 100644
--- a/lib/librte_eal/common/eal_common_pci_uio.c
+++ b/lib/librte_eal/common/eal_common_pci_uio.c
@@ -117,7 +117,6 @@ pci_uio_map_resource(struct rte_pci_device *dev)
dev->intr_handle.fd = -1;
dev->intr_handle.uio_cfg_fd = -1;
- dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
/* secondary processes - use already recorded details */
if (rte_eal_process_type() != RTE_PROC_PRIMARY)
diff --git a/lib/librte_eal/common/include/arch/ppc_64/rte_atomic.h b/lib/librte_eal/common/include/arch/ppc_64/rte_atomic.h
index fb4fccb4..37f5eff2 100644
--- a/lib/librte_eal/common/include/arch/ppc_64/rte_atomic.h
+++ b/lib/librte_eal/common/include/arch/ppc_64/rte_atomic.h
@@ -64,9 +64,9 @@ extern "C" {
* occur before the STORE operations generated after.
*/
#ifdef RTE_ARCH_64
-#define rte_wmb() {asm volatile("lwsync" : : : "memory"); }
+#define rte_wmb() asm volatile("lwsync" : : : "memory")
#else
-#define rte_wmb() {asm volatile("sync" : : : "memory"); }
+#define rte_wmb() asm volatile("sync" : : : "memory")
#endif
/**
@@ -76,9 +76,9 @@ extern "C" {
* occur before the LOAD operations generated after.
*/
#ifdef RTE_ARCH_64
-#define rte_rmb() {asm volatile("lwsync" : : : "memory"); }
+#define rte_rmb() asm volatile("lwsync" : : : "memory")
#else
-#define rte_rmb() {asm volatile("sync" : : : "memory"); }
+#define rte_rmb() asm volatile("sync" : : : "memory")
#endif
#define rte_smp_mb() rte_mb()
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic.h b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
index 00b1cdf5..d12b679a 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic.h
@@ -55,12 +55,52 @@ extern "C" {
#define rte_rmb() _mm_lfence()
-#define rte_smp_mb() rte_mb()
-
#define rte_smp_wmb() rte_compiler_barrier()
#define rte_smp_rmb() rte_compiler_barrier()
+/*
+ * From Intel Software Development Manual; Vol 3;
+ * 8.2.2 Memory Ordering in P6 and More Recent Processor Families:
+ * ...
+ * . Reads are not reordered with other reads.
+ * . Writes are not reordered with older reads.
+ * . Writes to memory are not reordered with other writes,
+ * with the following exceptions:
+ * . streaming stores (writes) executed with the non-temporal move
+ * instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and
+ * . string operations (see Section 8.2.4.1).
+ * ...
+ * . Reads may be reordered with older writes to different locations but not
+ * with older writes to the same location.
+ * . Reads or writes cannot be reordered with I/O instructions,
+ * locked instructions, or serializing instructions.
+ * . Reads cannot pass earlier LFENCE and MFENCE instructions.
+ * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE instructions.
+ * . LFENCE instructions cannot pass earlier reads.
+ * . SFENCE instructions cannot pass earlier writes ...
+ * . MFENCE instructions cannot pass earlier reads, writes ...
+ *
+ * As pointed out by Java guys, that makes it possible to use lock-prefixed
+ * instructions to get the same effect as mfence, and on most modern HW
+ * that gives better performance than using mfence:
+ * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+ * Basic idea is to use lock prefixed add with some dummy memory location
+ * as the destination. From their experiments 128B(2 cache lines) below
+ * current stack pointer looks like a good candidate.
+ * So below we use that technique for rte_smp_mb() implementation.
+ */
+
+static inline void __attribute__((always_inline))
+rte_smp_mb(void)
+{
+#ifdef RTE_ARCH_I686
+ asm volatile("lock addl $0, -128(%%esp); " ::: "memory");
+#else
+ asm volatile("lock addl $0, -128(%%rsp); " ::: "memory");
+#endif
+}
+
/*------------------------- 16 bit atomic operations -------------------------*/
#ifndef RTE_FORCE_INTRINSICS
diff --git a/lib/librte_eal/common/include/rte_debug.h b/lib/librte_eal/common/include/rte_debug.h
index cab6fb4c..ec1dce03 100644
--- a/lib/librte_eal/common/include/rte_debug.h
+++ b/lib/librte_eal/common/include/rte_debug.h
@@ -86,7 +86,7 @@ void rte_dump_registers(void);
#endif
#define RTE_VERIFY(exp) do { \
if (unlikely(!(exp))) \
- rte_panic("line %d\tassert \"" #exp "\" failed\n", __LINE__); \
+ rte_panic("line %d\tassert \"%s\" failed\n", __LINE__, #exp); \
} while (0)
/*
diff --git a/lib/librte_eal/common/include/rte_version.h b/lib/librte_eal/common/include/rte_version.h
index e92737d2..4a9f4821 100644
--- a/lib/librte_eal/common/include/rte_version.h
+++ b/lib/librte_eal/common/include/rte_version.h
@@ -66,7 +66,7 @@ extern "C" {
/**
* Patch level number i.e. the z in yy.mm.z
*/
-#define RTE_VER_MINOR 4
+#define RTE_VER_MINOR 5
/**
* Extra string to be appended to version number
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 77a86151..e2bd3ac2 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -98,6 +98,7 @@ elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align,
if ((new_data_start & bmask) != ((end_pt - 1) & bmask)) {
end_pt = RTE_ALIGN_FLOOR(end_pt, bound);
new_data_start = RTE_ALIGN_FLOOR((end_pt - size), align);
+ end_pt = new_data_start + size;
if (((end_pt - 1) & bmask) != (new_data_start & bmask))
return NULL;
}
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 267a4c6c..c731f1cd 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -178,12 +178,14 @@ malloc_heap_alloc(struct malloc_heap *heap,
* Function to retrieve data for heap on given socket
*/
int
-malloc_heap_get_stats(const struct malloc_heap *heap,
+malloc_heap_get_stats(struct malloc_heap *heap,
struct rte_malloc_socket_stats *socket_stats)
{
size_t idx;
struct malloc_elem *elem;
+ rte_spinlock_lock(&heap->lock);
+
/* Initialise variables for heap */
socket_stats->free_count = 0;
socket_stats->heap_freesz_bytes = 0;
@@ -205,6 +207,8 @@ malloc_heap_get_stats(const struct malloc_heap *heap,
socket_stats->heap_allocsz_bytes = (socket_stats->heap_totalsz_bytes -
socket_stats->heap_freesz_bytes);
socket_stats->alloc_count = heap->alloc_count;
+
+ rte_spinlock_unlock(&heap->lock);
return 0;
}
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index 3ccbef0f..3b1166f0 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -57,7 +57,7 @@ malloc_heap_alloc(struct malloc_heap *heap, const char *type, size_t size,
unsigned flags, size_t align, size_t bound);
int
-malloc_heap_get_stats(const struct malloc_heap *heap,
+malloc_heap_get_stats(struct malloc_heap *heap,
struct rte_malloc_socket_stats *socket_stats);
int
diff --git a/lib/librte_eal/common/rte_keepalive.c b/lib/librte_eal/common/rte_keepalive.c
index 9765d1bd..4625fab0 100644
--- a/lib/librte_eal/common/rte_keepalive.c
+++ b/lib/librte_eal/common/rte_keepalive.c
@@ -42,8 +42,12 @@
struct rte_keepalive {
/** Core Liveness. */
- enum rte_keepalive_state __rte_cache_aligned state_flags[
- RTE_KEEPALIVE_MAXCORES];
+ struct {
+ /*
+ * Each element must be cache aligned to prevent false sharing.
+ */
+ enum rte_keepalive_state core_state __rte_cache_aligned;
+ } live_data[RTE_KEEPALIVE_MAXCORES];
/** Last-seen-alive timestamps */
uint64_t last_alive[RTE_KEEPALIVE_MAXCORES];
@@ -96,19 +100,22 @@ rte_keepalive_dispatch_pings(__rte_unused void *ptr_timer,
if (keepcfg->active_cores[idx_core] == 0)
continue;
- switch (keepcfg->state_flags[idx_core]) {
+ switch (keepcfg->live_data[idx_core].core_state) {
case RTE_KA_STATE_UNUSED:
break;
case RTE_KA_STATE_ALIVE: /* Alive */
- keepcfg->state_flags[idx_core] = RTE_KA_STATE_MISSING;
+ keepcfg->live_data[idx_core].core_state =
+ RTE_KA_STATE_MISSING;
keepcfg->last_alive[idx_core] = rte_rdtsc();
break;
case RTE_KA_STATE_MISSING: /* MIA */
print_trace("Core MIA. ", keepcfg, idx_core);
- keepcfg->state_flags[idx_core] = RTE_KA_STATE_DEAD;
+ keepcfg->live_data[idx_core].core_state =
+ RTE_KA_STATE_DEAD;
break;
case RTE_KA_STATE_DEAD: /* Dead */
- keepcfg->state_flags[idx_core] = RTE_KA_STATE_GONE;
+ keepcfg->live_data[idx_core].core_state =
+ RTE_KA_STATE_GONE;
print_trace("Core died. ", keepcfg, idx_core);
if (keepcfg->callback)
keepcfg->callback(
@@ -119,7 +126,8 @@ rte_keepalive_dispatch_pings(__rte_unused void *ptr_timer,
case RTE_KA_STATE_GONE: /* Buried */
break;
case RTE_KA_STATE_DOZING: /* Core going idle */
- keepcfg->state_flags[idx_core] = RTE_KA_STATE_SLEEP;
+ keepcfg->live_data[idx_core].core_state =
+ RTE_KA_STATE_SLEEP;
keepcfg->last_alive[idx_core] = rte_rdtsc();
break;
case RTE_KA_STATE_SLEEP: /* Idled core */
@@ -129,7 +137,7 @@ rte_keepalive_dispatch_pings(__rte_unused void *ptr_timer,
keepcfg->relay_callback(
keepcfg->relay_callback_data,
idx_core,
- keepcfg->state_flags[idx_core],
+ keepcfg->live_data[idx_core].core_state,
keepcfg->last_alive[idx_core]
);
}
@@ -173,11 +181,11 @@ rte_keepalive_register_core(struct rte_keepalive *keepcfg, const int id_core)
void
rte_keepalive_mark_alive(struct rte_keepalive *keepcfg)
{
- keepcfg->state_flags[rte_lcore_id()] = RTE_KA_STATE_ALIVE;
+ keepcfg->live_data[rte_lcore_id()].core_state = RTE_KA_STATE_ALIVE;
}
void
rte_keepalive_mark_sleep(struct rte_keepalive *keepcfg)
{
- keepcfg->state_flags[rte_lcore_id()] = RTE_KA_STATE_DOZING;
+ keepcfg->live_data[rte_lcore_id()].core_state = RTE_KA_STATE_DOZING;
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c
index 876ba381..b0d0c3c6 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
@@ -623,7 +623,6 @@ pci_ioport_map(struct rte_pci_device *dev, int bar __rte_unused,
if (!found)
return -1;
- dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
p->base = start;
RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%x\n", start);
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 702f7a2e..dd451071 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -50,12 +50,15 @@
static struct vfio_config vfio_cfg;
static int vfio_type1_dma_map(int);
+static int vfio_spapr_dma_map(int);
static int vfio_noiommu_dma_map(int);
/* IOMMU types we support */
static const struct vfio_iommu_type iommu_types[] = {
/* x86 IOMMU, otherwise known as type 1 */
{ RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map},
+ /* ppc64 IOMMU, otherwise known as spapr */
+ { RTE_VFIO_SPAPR, "sPAPR", &vfio_spapr_dma_map},
/* IOMMU-less mode */
{ RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
};
@@ -339,7 +342,7 @@ vfio_enable(const char *modname)
int
vfio_is_enabled(const char *modname)
{
- const int mod_available = rte_eal_check_module(modname);
+ const int mod_available = rte_eal_check_module(modname) > 0;
return vfio_cfg.vfio_enabled && mod_available;
}
@@ -540,6 +543,93 @@ vfio_type1_dma_map(int vfio_container_fd)
}
static int
+vfio_spapr_dma_map(int vfio_container_fd)
+{
+ const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+ int i, ret;
+
+ struct vfio_iommu_spapr_register_memory reg = {
+ .argsz = sizeof(reg),
+ .flags = 0
+ };
+ struct vfio_iommu_spapr_tce_info info = {
+ .argsz = sizeof(info),
+ };
+ struct vfio_iommu_spapr_tce_create create = {
+ .argsz = sizeof(create),
+ };
+ struct vfio_iommu_spapr_tce_remove remove = {
+ .argsz = sizeof(remove),
+ };
+
+ /* query spapr iommu info */
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot get iommu info, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ /* remove default DMA of 32 bit window */
+ remove.start_addr = info.dma32_window_start;
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot remove default DMA window, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ /* calculate window size based on number of hugepages configured */
+ create.window_size = rte_eal_get_physmem_size();
+ create.page_shift = __builtin_ctzll(ms->hugepage_sz);
+ create.levels = 2;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot create new DMA window, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+ for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+ struct vfio_iommu_type1_dma_map dma_map;
+
+ if (ms[i].addr == NULL)
+ break;
+
+ reg.vaddr = (uintptr_t) ms[i].addr;
+ reg.size = ms[i].len;
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = ms[i].addr_64;
+ dma_map.size = ms[i].len;
+ dma_map.iova = ms[i].phys_addr;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ }
+
+ return 0;
+}
+
+static int
vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
{
/* No-IOMMU mode does not need DMA mapping */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 29f7f3ec..ac31a4fc 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -54,6 +54,31 @@
#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
+#ifndef VFIO_SPAPR_TCE_v2_IOMMU
+#define RTE_VFIO_SPAPR 7
+#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17)
+#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19)
+#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20)
+struct vfio_iommu_spapr_register_memory {
+ uint32_t argsz;
+ uint32_t flags;
+ uint64_t vaddr;
+ uint64_t size;
+};
+struct vfio_iommu_spapr_tce_create {
+ uint32_t argsz;
+ uint32_t page_shift;
+ uint64_t window_size;
+ uint32_t levels;
+};
+struct vfio_iommu_spapr_tce_remove {
+ uint32_t argsz;
+ uint64_t start_addr;
+};
+#else
+#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU
+#endif
+
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)
#define RTE_VFIO_NOIOMMU 8
#else
diff --git a/lib/librte_eal/linuxapp/igb_uio/compat.h b/lib/librte_eal/linuxapp/igb_uio/compat.h
index 0d781e48..38259330 100644
--- a/lib/librte_eal/linuxapp/igb_uio/compat.h
+++ b/lib/librte_eal/linuxapp/igb_uio/compat.h
@@ -123,3 +123,7 @@ static bool pci_check_and_mask_intx(struct pci_dev *pdev)
}
#endif /* < 3.3.0 */
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)
+#define HAVE_ALLOC_IRQ_VECTORS 1
+#endif
diff --git a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
index df41e457..9f00f07a 100644
--- a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
+++ b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
@@ -325,7 +325,9 @@ static int
igbuio_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
{
struct rte_uio_pci_dev *udev;
+#ifndef HAVE_ALLOC_IRQ_VECTORS
struct msix_entry msix_entry;
+#endif
int err;
udev = kzalloc(sizeof(struct rte_uio_pci_dev), GFP_KERNEL);
@@ -379,6 +381,7 @@ igbuio_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
switch (igbuio_intr_mode_preferred) {
case RTE_INTR_MODE_MSIX:
/* Only 1 msi-x vector needed */
+#ifndef HAVE_ALLOC_IRQ_VECTORS
msix_entry.entry = 0;
if (pci_enable_msix(dev, &msix_entry, 1) == 0) {
dev_dbg(&dev->dev, "using MSI-X");
@@ -386,6 +389,15 @@ igbuio_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
udev->mode = RTE_INTR_MODE_MSIX;
break;
}
+#else
+ if (pci_alloc_irq_vectors(dev, 1, 1, PCI_IRQ_MSIX) == 1) {
+ dev_dbg(&dev->dev, "using MSI-X");
+ udev->info.irq_flags = IRQF_NO_THREAD;
+ udev->info.irq = pci_irq_vector(dev, 0);
+ udev->mode = RTE_INTR_MODE_MSIX;
+ break;
+ }
+#endif
/* fall back to INTX */
case RTE_INTR_MODE_LEGACY:
if (pci_intx_mask_supported(dev)) {
@@ -429,8 +441,13 @@ fail_remove_group:
sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp);
fail_release_iomem:
igbuio_pci_release_iomem(&udev->info);
+#ifndef HAVE_ALLOC_IRQ_VECTORS
if (udev->mode == RTE_INTR_MODE_MSIX)
pci_disable_msix(udev->pdev);
+#else
+ if (udev->mode == RTE_INTR_MODE_MSIX)
+ pci_free_irq_vectors(udev->pdev);
+#endif
pci_disable_device(dev);
fail_free:
kfree(udev);
@@ -446,8 +463,13 @@ igbuio_pci_remove(struct pci_dev *dev)
sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp);
uio_unregister_device(&udev->info);
igbuio_pci_release_iomem(&udev->info);
+#ifndef HAVE_ALLOC_IRQ_VECTORS
if (udev->mode == RTE_INTR_MODE_MSIX)
pci_disable_msix(dev);
+#else
+ if (udev->mode == RTE_INTR_MODE_MSIX)
+ pci_free_irq_vectors(dev);
+#endif
pci_disable_device(dev);
pci_set_drvdata(dev, NULL);
kfree(udev);
diff --git a/lib/librte_eal/linuxapp/kni/ethtool/igb/igb_main.c b/lib/librte_eal/linuxapp/kni/ethtool/igb/igb_main.c
index acb1a69b..3c683e17 100644
--- a/lib/librte_eal/linuxapp/kni/ethtool/igb/igb_main.c
+++ b/lib/librte_eal/linuxapp/kni/ethtool/igb/igb_main.c
@@ -137,11 +137,20 @@ static void igb_clean_all_tx_rings(struct igb_adapter *);
static void igb_clean_all_rx_rings(struct igb_adapter *);
static void igb_clean_tx_ring(struct igb_ring *);
static void igb_set_rx_mode(struct net_device *);
+#ifdef HAVE_TIMER_SETUP
+static void igb_update_phy_info(struct timer_list *);
+static void igb_watchdog(struct timer_list *);
+#else
static void igb_update_phy_info(unsigned long);
static void igb_watchdog(unsigned long);
+#endif
static void igb_watchdog_task(struct work_struct *);
static void igb_dma_err_task(struct work_struct *);
+#ifdef HAVE_TIMER_SETUP
+static void igb_dma_err_timer(struct timer_list *);
+#else
static void igb_dma_err_timer(unsigned long data);
+#endif
static netdev_tx_t igb_xmit_frame(struct sk_buff *skb, struct net_device *);
static struct net_device_stats *igb_get_stats(struct net_device *);
static int igb_change_mtu(struct net_device *, int);
@@ -2806,6 +2815,12 @@ static int __devinit igb_probe(struct pci_dev *pdev,
/* Check if Media Autosense is enabled */
if (hw->mac.type == e1000_82580)
igb_init_mas(adapter);
+#ifdef HAVE_TIMER_SETUP
+ timer_setup(&adapter->watchdog_timer, &igb_watchdog, 0);
+ if (adapter->flags & IGB_FLAG_DETECT_BAD_DMA)
+ timer_setup(&adapter->dma_err_timer, &igb_dma_err_timer, 0);
+ timer_setup(&adapter->phy_info_timer, &igb_update_phy_info, 0);
+#else
setup_timer(&adapter->watchdog_timer, &igb_watchdog,
(unsigned long) adapter);
if (adapter->flags & IGB_FLAG_DETECT_BAD_DMA)
@@ -2813,6 +2828,7 @@ static int __devinit igb_probe(struct pci_dev *pdev,
(unsigned long) adapter);
setup_timer(&adapter->phy_info_timer, &igb_update_phy_info,
(unsigned long) adapter);
+#endif
INIT_WORK(&adapter->reset_task, igb_reset_task);
INIT_WORK(&adapter->watchdog_task, igb_watchdog_task);
@@ -4543,9 +4559,15 @@ static void igb_spoof_check(struct igb_adapter *adapter)
/* Need to wait a few seconds after link up to get diagnostic information from
* the phy */
+#ifdef HAVE_TIMER_SETUP
+static void igb_update_phy_info(struct timer_list *t)
+{
+ struct igb_adapter *adapter = from_timer(adapter, t, phy_info_timer);
+#else
static void igb_update_phy_info(unsigned long data)
{
struct igb_adapter *adapter = (struct igb_adapter *) data;
+#endif
e1000_get_phy_info(&adapter->hw);
}
@@ -4594,9 +4616,15 @@ bool igb_has_link(struct igb_adapter *adapter)
* igb_watchdog - Timer Call-back
* @data: pointer to adapter cast into an unsigned long
**/
+#ifdef HAVE_TIMER_SETUP
+static void igb_watchdog(struct timer_list *t)
+{
+ struct igb_adapter *adapter = from_timer(adapter, t, watchdog_timer);
+#else
static void igb_watchdog(unsigned long data)
{
struct igb_adapter *adapter = (struct igb_adapter *)data;
+#endif
/* Do the rest outside of interrupt context */
schedule_work(&adapter->watchdog_task);
}
@@ -4854,9 +4882,15 @@ dma_timer_reset:
* igb_dma_err_timer - Timer Call-back
* @data: pointer to adapter cast into an unsigned long
**/
+#ifdef HAVE_TIMER_SETUP
+static void igb_dma_err_timer(struct timer_list *t)
+{
+ struct igb_adapter *adapter = from_timer(adapter, t, dma_err_timer);
+#else
static void igb_dma_err_timer(unsigned long data)
{
struct igb_adapter *adapter = (struct igb_adapter *)data;
+#endif
/* Do the rest outside of interrupt context */
schedule_work(&adapter->dma_err_task);
}
@@ -10051,6 +10085,12 @@ int igb_kni_probe(struct pci_dev *pdev,
igb_init_mas(adapter);
#ifdef NO_KNI
+#ifdef HAVE_TIMER_SETUP
+ timer_setup(&adapter->watchdog_timer, &igb_watchdog, 0);
+ if (adapter->flags & IGB_FLAG_DETECT_BAD_DMA)
+ timer_setup(&adapter->dma_err_timer, &igb_dma_err_timer, 0);
+ timer_setup(&adapter->phy_info_timer, &igb_update_phy_info, 0);
+#else
setup_timer(&adapter->watchdog_timer, &igb_watchdog,
(unsigned long) adapter);
if (adapter->flags & IGB_FLAG_DETECT_BAD_DMA)
@@ -10058,6 +10098,7 @@ int igb_kni_probe(struct pci_dev *pdev,
(unsigned long) adapter);
setup_timer(&adapter->phy_info_timer, &igb_update_phy_info,
(unsigned long) adapter);
+#endif
INIT_WORK(&adapter->reset_task, igb_reset_task);
INIT_WORK(&adapter->watchdog_task, igb_watchdog_task);
diff --git a/lib/librte_eal/linuxapp/kni/ethtool/igb/kcompat.h b/lib/librte_eal/linuxapp/kni/ethtool/igb/kcompat.h
index aea253b1..88bd18ec 100644
--- a/lib/librte_eal/linuxapp/kni/ethtool/igb/kcompat.h
+++ b/lib/librte_eal/linuxapp/kni/ethtool/igb/kcompat.h
@@ -3937,4 +3937,8 @@ skb_set_hash(struct sk_buff *skb, __u32 hash, __always_unused int type)
#define HAVE_PCI_ENABLE_MSIX
#endif
+#if defined(timer_setup) && defined(from_timer)
+#define HAVE_TIMER_SETUP
+#endif
+
#endif /* _KCOMPAT_H_ */
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index ea545250..4679dc69 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -94,6 +94,7 @@ static const struct rte_eth_xstats_name_off rte_stats_strings[] = {
{"tx_good_packets", offsetof(struct rte_eth_stats, opackets)},
{"rx_good_bytes", offsetof(struct rte_eth_stats, ibytes)},
{"tx_good_bytes", offsetof(struct rte_eth_stats, obytes)},
+ {"rx_missed_errors", offsetof(struct rte_eth_stats, imissed)},
{"rx_errors", offsetof(struct rte_eth_stats, ierrors)},
{"tx_errors", offsetof(struct rte_eth_stats, oerrors)},
{"rx_mbuf_allocation_errors", offsetof(struct rte_eth_stats,
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 11ec1fa8..dc6d0cc9 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -262,17 +262,17 @@ __extension__
struct rte_eth_link {
uint32_t link_speed; /**< ETH_SPEED_NUM_ */
uint16_t link_duplex : 1; /**< ETH_LINK_[HALF/FULL]_DUPLEX */
- uint16_t link_autoneg : 1; /**< ETH_LINK_SPEED_[AUTONEG/FIXED] */
+ uint16_t link_autoneg : 1; /**< ETH_LINK_[AUTONEG/FIXED] */
uint16_t link_status : 1; /**< ETH_LINK_[DOWN/UP] */
} __attribute__((aligned(8))); /**< aligned for atomic64 read/write */
/* Utility constants */
-#define ETH_LINK_HALF_DUPLEX 0 /**< Half-duplex connection. */
-#define ETH_LINK_FULL_DUPLEX 1 /**< Full-duplex connection. */
-#define ETH_LINK_DOWN 0 /**< Link is down. */
-#define ETH_LINK_UP 1 /**< Link is up. */
-#define ETH_LINK_FIXED 0 /**< No autonegotiation. */
-#define ETH_LINK_AUTONEG 1 /**< Autonegotiated. */
+#define ETH_LINK_HALF_DUPLEX 0 /**< Half-duplex connection (see link_duplex). */
+#define ETH_LINK_FULL_DUPLEX 1 /**< Full-duplex connection (see link_duplex). */
+#define ETH_LINK_DOWN 0 /**< Link is down (see link_status). */
+#define ETH_LINK_UP 1 /**< Link is up (see link_status). */
+#define ETH_LINK_FIXED 0 /**< No autonegotiation (see link_autoneg). */
+#define ETH_LINK_AUTONEG 1 /**< Autonegotiated (see link_autoneg). */
/**
* A structure used to configure the ring threshold registers of an RX/TX
@@ -1694,7 +1694,7 @@ struct rte_eth_dev_data {
enum rte_kernel_driver kdrv; /**< Kernel driver passthrough */
int numa_node; /**< NUMA node connection */
const char *drv_name; /**< Driver name */
-};
+} __rte_cache_aligned;
/** Device supports hotplug detach */
#define RTE_ETH_DEV_DETACHABLE 0x0001
@@ -1965,7 +1965,7 @@ int rte_eth_rx_queue_setup(uint8_t port_id, uint16_t rx_queue_id,
* the DMA memory allocated for the transmit descriptors of the ring.
* @param tx_conf
* The pointer to the configuration data to be used for the transmit queue.
- * NULL value is allowed, in which case default RX configuration
+ * NULL value is allowed, in which case default TX configuration
* will be used.
* The *tx_conf* structure contains the following data:
* - The *tx_thresh* structure with the values of the Prefetch, Host, and
diff --git a/lib/librte_lpm/rte_lpm.c b/lib/librte_lpm/rte_lpm.c
index 978ac601..56fc8b0b 100644
--- a/lib/librte_lpm/rte_lpm.c
+++ b/lib/librte_lpm/rte_lpm.c
@@ -908,7 +908,7 @@ add_depth_big_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked, uint8_t depth,
*/
struct rte_lpm_tbl_entry_v20 new_tbl24_entry = {
- { .group_idx = (uint8_t)tbl8_group_index, },
+ .group_idx = (uint8_t)tbl8_group_index,
.valid = VALID,
.valid_group = 1,
.depth = 0,
@@ -954,7 +954,7 @@ add_depth_big_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked, uint8_t depth,
*/
struct rte_lpm_tbl_entry_v20 new_tbl24_entry = {
- { .group_idx = (uint8_t)tbl8_group_index, },
+ .group_idx = (uint8_t)tbl8_group_index,
.valid = VALID,
.valid_group = 1,
.depth = 0,
@@ -1361,7 +1361,7 @@ delete_depth_small_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked,
*/
struct rte_lpm_tbl_entry_v20 new_tbl24_entry = {
- {.next_hop = lpm->rules_tbl[sub_rule_index].next_hop,},
+ .next_hop = lpm->rules_tbl[sub_rule_index].next_hop,
.valid = VALID,
.valid_group = 0,
.depth = sub_rule_depth,
@@ -1664,7 +1664,7 @@ delete_depth_big_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked,
} else if (tbl8_recycle_index > -1) {
/* Update tbl24 entry. */
struct rte_lpm_tbl_entry_v20 new_tbl24_entry = {
- { .next_hop = lpm->tbl8[tbl8_recycle_index].next_hop, },
+ .next_hop = lpm->tbl8[tbl8_recycle_index].next_hop,
.valid = VALID,
.valid_group = 0,
.depth = lpm->tbl8[tbl8_recycle_index].depth,
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 8a814df2..bc015d03 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -1233,13 +1233,14 @@ rte_pktmbuf_free_seg(struct rte_mbuf *m)
* segment is added back into its original mempool.
*
* @param m
- * The packet mbuf to be freed.
+ * The packet mbuf to be freed. If NULL, the function does nothing.
*/
static inline void rte_pktmbuf_free(struct rte_mbuf *m)
{
struct rte_mbuf *m_next;
- __rte_mbuf_sanity_check(m, 1);
+ if (m != NULL)
+ __rte_mbuf_sanity_check(m, 1);
while (m != NULL) {
m_next = m->next;
@@ -1361,12 +1362,10 @@ static inline uint16_t rte_pktmbuf_tailroom(const struct rte_mbuf *m)
*/
static inline struct rte_mbuf *rte_pktmbuf_lastseg(struct rte_mbuf *m)
{
- struct rte_mbuf *m2 = (struct rte_mbuf *)m;
-
__rte_mbuf_sanity_check(m, 1);
- while (m2->next != NULL)
- m2 = m2->next;
- return m2;
+ while (m->next != NULL)
+ m = m->next;
+ return m;
}
/**
diff --git a/lib/librte_pdump/rte_pdump.c b/lib/librte_pdump/rte_pdump.c
index e64bf598..1a5147aa 100644
--- a/lib/librte_pdump/rte_pdump.c
+++ b/lib/librte_pdump/rte_pdump.c
@@ -582,7 +582,7 @@ rte_pdump_init(const char *path)
if (ret != 0) {
RTE_LOG(ERR, PDUMP,
"Failed to create the pdump thread:%s, %s:%d\n",
- strerror(errno), __func__, __LINE__);
+ strerror(ret), __func__, __LINE__);
return -1;
}
/* Set thread_name for aid in debugging. */
@@ -605,7 +605,7 @@ rte_pdump_uninit(void)
if (ret != 0) {
RTE_LOG(ERR, PDUMP,
"Failed to cancel the pdump thread:%s, %s:%d\n",
- strerror(errno), __func__, __LINE__);
+ strerror(ret), __func__, __LINE__);
return -1;
}
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index 84e05951..a95fbfbb 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -438,7 +438,7 @@ vhost_user_reconnect_init(void)
ret = pthread_create(&reconn_tid, NULL,
vhost_user_client_reconnect, NULL);
- if (ret < 0)
+ if (ret != 0)
RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread");
return ret;
@@ -525,7 +525,7 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
if (vsocket->reconnect && reconn_tid == 0) {
- if (vhost_user_reconnect_init() < 0) {
+ if (vhost_user_reconnect_init() != 0) {
free(vsocket->path);
free(vsocket);
goto out;
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 3c3f6a42..36fdfb55 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -202,6 +202,8 @@ alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
dev->virtqueue[virt_tx_q_idx] = virtqueue + VIRTIO_TXQ;
init_vring_queue_pair(dev, qp_idx);
+ rte_spinlock_init(&dev->virtqueue[virt_rx_q_idx]->access_lock);
+ rte_spinlock_init(&dev->virtqueue[virt_tx_q_idx]->access_lock);
dev->virt_qp_nb += 1;
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index d97df1d8..9f60ff81 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -91,6 +91,8 @@ struct vhost_virtqueue {
/* Backend value to determine if device should be started/stopped */
int backend;
+ rte_spinlock_t access_lock;
+
/* Used to notify the guest (trigger interrupt) */
int callfd;
/* Currently unused as polling mode is enabled */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index d25e1c02..80348dbf 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -39,6 +39,7 @@
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
+#include <stdbool.h>
#include <assert.h>
#ifdef RTE_LIBRTE_VHOST_NUMA
#include <numaif.h>
@@ -488,6 +489,30 @@ dump_guest_pages(struct virtio_net *dev)
#define dump_guest_pages(dev)
#endif
+static bool
+vhost_memory_changed(struct VhostUserMemory *new,
+ struct virtio_memory *old)
+{
+ uint32_t i;
+ /* Return true if "new" differs from "old" in region count or any region. */
+ if (new->nregions != old->nregions)
+ return true;
+ /* Counts match, so regions are compared pairwise by index. */
+ for (i = 0; i < new->nregions; ++i) {
+ VhostUserMemoryRegion *new_r = &new->regions[i];
+ struct virtio_memory_region *old_r = &old->regions[i];
+ /* Only guest-physical address, size and userspace address are compared. */
+ if (new_r->guest_phys_addr != old_r->guest_phys_addr)
+ return true;
+ if (new_r->memory_size != old_r->size)
+ return true;
+ if (new_r->userspace_addr != old_r->guest_user_addr)
+ return true;
+ }
+ /* No difference found in any compared field. */
+ return false;
+}
+
static int
vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
@@ -500,6 +525,16 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
uint32_t i;
int fd;
+ if (dev->mem && !vhost_memory_changed(&memory, dev->mem)) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "(%d) memory regions not changed\n", dev->vid);
+
+ for (i = 0; i < memory.nregions; i++)
+ close(pmsg->fds[i]);
+
+ return 0;
+ }
+
/* Remove from the data plane. */
if (dev->flags & VIRTIO_DEV_RUNNING) {
dev->flags &= ~VIRTIO_DEV_RUNNING;
@@ -917,12 +952,47 @@ send_vhost_message(int sockfd, struct VhostUserMsg *msg)
return ret;
}
+static void
+vhost_user_lock_all_queue_pairs(struct virtio_net *dev)
+{
+ unsigned int i = 0;
+ unsigned int vq_num = 0;
+ /* Lock access_lock on non-NULL virtqueues until virt_qp_nb * 2 are locked. */
+ while (vq_num < dev->virt_qp_nb * 2) {
+ struct vhost_virtqueue *vq = dev->virtqueue[i];
+ /* NULL slots are skipped without counting; only i advances for them. */
+ if (vq) {
+ rte_spinlock_lock(&vq->access_lock);
+ vq_num++;
+ }
+ i++; /* NOTE(review): i is unbounded here - confirm enough non-NULL slots exist */
+ }
+}
+
+static void
+vhost_user_unlock_all_queue_pairs(struct virtio_net *dev)
+{
+ unsigned int i = 0;
+ unsigned int vq_num = 0;
+ /* Unlock access_lock on non-NULL virtqueues until virt_qp_nb * 2 are done. */
+ while (vq_num < dev->virt_qp_nb * 2) {
+ struct vhost_virtqueue *vq = dev->virtqueue[i];
+ /* Same ascending walk as vhost_user_lock_all_queue_pairs(); NULLs skipped. */
+ if (vq) {
+ rte_spinlock_unlock(&vq->access_lock);
+ vq_num++;
+ }
+ i++; /* NOTE(review): i is unbounded here - confirm enough non-NULL slots exist */
+ }
+}
+
int
vhost_user_msg_handler(int vid, int fd)
{
struct virtio_net *dev;
struct VhostUserMsg msg;
int ret;
+ int unlock_required = 0;
dev = get_device(vid);
if (dev == NULL)
@@ -945,6 +1015,37 @@ vhost_user_msg_handler(int vid, int fd)
RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
vhost_message_str[msg.request]);
+
+ /*
+ * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE
+ * and VHOST_USER_RESET_OWNER, since they are sent when virtio stops
+ * and the device is destroyed. destroy_device waits for queues to be
+ * inactive, so it is safe. Otherwise taking the access_lock
+ * would cause a deadlock.
+ */
+ switch (msg.request) {
+ case VHOST_USER_SET_FEATURES:
+ case VHOST_USER_SET_PROTOCOL_FEATURES:
+ case VHOST_USER_SET_OWNER:
+ case VHOST_USER_SET_MEM_TABLE:
+ case VHOST_USER_SET_LOG_BASE:
+ case VHOST_USER_SET_LOG_FD:
+ case VHOST_USER_SET_VRING_NUM:
+ case VHOST_USER_SET_VRING_ADDR:
+ case VHOST_USER_SET_VRING_BASE:
+ case VHOST_USER_SET_VRING_KICK:
+ case VHOST_USER_SET_VRING_CALL:
+ case VHOST_USER_SET_VRING_ERR:
+ case VHOST_USER_SET_VRING_ENABLE:
+ case VHOST_USER_SEND_RARP:
+ vhost_user_lock_all_queue_pairs(dev);
+ unlock_required = 1;
+ break;
+ default:
+ break;
+
+ }
+
switch (msg.request) {
case VHOST_USER_GET_FEATURES:
msg.payload.u64 = vhost_user_get_features();
@@ -1034,5 +1135,8 @@ vhost_user_msg_handler(int vid, int fd)
}
+ if (unlock_required)
+ vhost_user_unlock_all_queue_pairs(dev);
+
return 0;
}
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 0027f393..0024f729 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -44,6 +44,7 @@
#include <rte_udp.h>
#include <rte_sctp.h>
#include <rte_arp.h>
+#include <rte_spinlock.h>
#include "vhost.h"
@@ -313,8 +314,11 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
}
vq = dev->virtqueue[queue_id];
+
+ rte_spinlock_lock(&vq->access_lock);
+
if (unlikely(vq->enabled == 0))
- return 0;
+ goto out_access_unlock;
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
start_idx = vq->last_used_idx;
@@ -322,7 +326,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
count = RTE_MIN(count, free_entries);
count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
if (count == 0)
- return 0;
+ goto out_access_unlock;
LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
dev->vid, start_idx, start_idx + count);
@@ -388,6 +392,10 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0))
eventfd_write(vq->callfd, (eventfd_t)1);
+
+out_access_unlock:
+ rte_spinlock_unlock(&vq->access_lock);
+
return count;
}
@@ -582,12 +590,15 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
}
vq = dev->virtqueue[queue_id];
+
+ rte_spinlock_lock(&vq->access_lock);
+
if (unlikely(vq->enabled == 0))
- return 0;
+ goto out_access_unlock;
count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
if (count == 0)
- return 0;
+ goto out_access_unlock;
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
@@ -631,6 +642,9 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
eventfd_write(vq->callfd, (eventfd_t)1);
}
+out_access_unlock:
+ rte_spinlock_unlock(&vq->access_lock);
+
return pkt_idx;
}
@@ -875,7 +889,8 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
desc->addr + desc_offset, cpy_len)))) {
cur->data_len = cpy_len;
cur->data_off = 0;
- cur->buf_addr = (void *)(uintptr_t)desc_addr;
+ cur->buf_addr = (void *)(uintptr_t)(desc_addr
+ + desc_offset);
cur->buf_physaddr = hpa;
/*
@@ -1027,6 +1042,22 @@ mbuf_is_consumed(struct rte_mbuf *m)
return true;
}
+static inline void __attribute__((always_inline))
+restore_mbuf(struct rte_mbuf *m)
+{
+ uint32_t mbuf_size, priv_size;
+ /* Reset buf_addr and buf_physaddr of each segment in the chain starting at m. */
+ while (m) {
+ priv_size = rte_pktmbuf_priv_size(m->pool);
+ mbuf_size = sizeof(struct rte_mbuf) + priv_size;
+ /* start of buffer is after mbuf structure and priv data */
+ /* NOTE(review): presumably undoes the zero-copy buf_addr override - confirm. */
+ m->buf_addr = (char *)m + mbuf_size;
+ m->buf_physaddr = rte_mempool_virt2phy(NULL, m) + mbuf_size;
+ m = m->next;
+ }
+}
+
uint16_t
rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
@@ -1051,9 +1082,13 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
}
vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
+
+ if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
return 0;
+ if (unlikely(vq->enabled == 0))
+ goto out_access_unlock;
+
if (unlikely(dev->dequeue_zero_copy)) {
struct zcopy_mbuf *zmbuf, *next;
int nr_updated = 0;
@@ -1069,6 +1104,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
nr_updated += 1;
TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
+ restore_mbuf(zmbuf->mbuf);
rte_pktmbuf_free(zmbuf->mbuf);
put_zmbuf(zmbuf);
vq->nr_zmbuf -= 1;
@@ -1102,7 +1138,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
if (rarp_mbuf == NULL) {
RTE_LOG(ERR, VHOST_DATA,
"Failed to allocate memory for mbuf.\n");
- return 0;
+ goto out_access_unlock;
}
if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
@@ -1116,7 +1152,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
free_entries = *((volatile uint16_t *)&vq->avail->idx) -
vq->last_avail_idx;
if (free_entries == 0)
- goto out;
+ goto out_access_unlock;
LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
@@ -1209,7 +1245,9 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
update_used_idx(dev, vq, i);
}
-out:
+out_access_unlock:
+ rte_spinlock_unlock(&vq->access_lock);
+
if (unlikely(rarp_mbuf != NULL)) {
/*
* Inject it to the head of "pkts" array, so that switch's mac