aboutsummaryrefslogtreecommitdiffstats
path: root/lib/librte_eal/linuxapp/eal
diff options
context:
space:
mode:
Diffstat (limited to 'lib/librte_eal/linuxapp/eal')
-rw-r--r--lib/librte_eal/linuxapp/eal/Makefile6
-rw-r--r--lib/librte_eal/linuxapp/eal/eal.c128
-rw-r--r--lib/librte_eal/linuxapp/eal/eal_alarm.c5
-rw-r--r--lib/librte_eal/linuxapp/eal/eal_debug.c4
-rw-r--r--lib/librte_eal/linuxapp/eal/eal_hugepage_info.c9
-rw-r--r--lib/librte_eal/linuxapp/eal/eal_interrupts.c117
-rw-r--r--lib/librte_eal/linuxapp/eal/eal_memory.c103
-rw-r--r--lib/librte_eal/linuxapp/eal/eal_pci.c149
-rw-r--r--lib/librte_eal/linuxapp/eal/eal_pci_init.h3
-rw-r--r--lib/librte_eal/linuxapp/eal/eal_pci_uio.c3
-rw-r--r--lib/librte_eal/linuxapp/eal/eal_pci_vfio.c88
-rw-r--r--lib/librte_eal/linuxapp/eal/eal_vfio.c386
-rw-r--r--lib/librte_eal/linuxapp/eal/eal_vfio.h67
-rw-r--r--lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c23
-rw-r--r--lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h14
-rw-r--r--lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h5
-rw-r--r--lib/librte_eal/linuxapp/eal/rte_eal_version.map74
17 files changed, 867 insertions, 317 deletions
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 4e206f09..640afd08 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -37,7 +37,7 @@ ARCH_DIR ?= $(RTE_ARCH)
EXPORT_MAP := rte_eal_version.map
VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR)
-LIBABIVER := 3
+LIBABIVER := 4
VPATH += $(RTE_SDK)/lib/librte_eal/common
@@ -87,6 +87,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_cpuflags.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_string_fns.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hexdump.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_devargs.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_bus.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_dev.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_options.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_thread.c
@@ -130,7 +131,4 @@ INC := rte_interrupts.h rte_kni_common.h rte_dom0_common.h
SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUXAPP)-include/exec-env := \
$(addprefix include/exec-env/,$(INC))
-DEPDIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += lib/librte_eal/common
-DEPDIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += lib/librte_eal/common/arch/$(ARCH_DIR)
-
include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 2075282e..7c78f2dc 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -61,6 +61,7 @@
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
+#include <rte_errno.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_log.h>
@@ -69,6 +70,7 @@
#include <rte_string_fns.h>
#include <rte_cpuflags.h>
#include <rte_interrupts.h>
+#include <rte_bus.h>
#include <rte_pci.h>
#include <rte_dev.h>
#include <rte_devargs.h>
@@ -210,7 +212,7 @@ rte_eal_config_create(void)
rte_panic("Cannot mmap memory for rte_config\n");
}
memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
- rte_config.mem_config = (struct rte_mem_config *) rte_mem_cfg_addr;
+ rte_config.mem_config = rte_mem_cfg_addr;
/* store address of the config in the config itself so that secondary
* processes could later map the config into this exact location */
@@ -490,8 +492,6 @@ eal_log_level_parse(int argc, char **argv)
argvopt = argv;
optind = 1;
- eal_reset_internal_config(&internal_config);
-
while ((opt = getopt_long(argc, argvopt, eal_short_options,
eal_long_options, &option_index)) != EOF) {
@@ -739,6 +739,12 @@ static int rte_eal_vfio_setup(void)
}
#endif
+static void rte_eal_init_alert(const char *msg)
+{
+ fprintf(stderr, "EAL: FATAL: %s\n", msg);
+ RTE_LOG(ERR, EAL, "%s\n", msg);
+}
+
/* Launch threads, called at application init(). */
int
rte_eal_init(int argc, char **argv)
@@ -751,33 +757,51 @@ rte_eal_init(int argc, char **argv)
char thread_name[RTE_MAX_THREAD_NAME_LEN];
/* checks if the machine is adequate */
- rte_cpu_check_supported();
+ if (!rte_cpu_is_supported()) {
+ rte_eal_init_alert("unsupported cpu type.");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
- if (!rte_atomic32_test_and_set(&run_once))
+ if (!rte_atomic32_test_and_set(&run_once)) {
+ rte_eal_init_alert("already called initialization.");
+ rte_errno = EALREADY;
return -1;
+ }
logid = strrchr(argv[0], '/');
logid = strdup(logid ? logid + 1: argv[0]);
thread_id = pthread_self();
- eal_log_level_parse(argc, argv);
+ eal_reset_internal_config(&internal_config);
/* set log level as early as possible */
- rte_set_log_level(internal_config.log_level);
+ eal_log_level_parse(argc, argv);
- if (rte_eal_cpu_init() < 0)
- rte_panic("Cannot detect lcores\n");
+ if (rte_eal_cpu_init() < 0) {
+ rte_eal_init_alert("Cannot detect lcores.");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
fctret = eal_parse_args(argc, argv);
- if (fctret < 0)
- exit(1);
+ if (fctret < 0) {
+ rte_eal_init_alert("Invalid 'command line' arguments.");
+ rte_errno = EINVAL;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
if (internal_config.no_hugetlbfs == 0 &&
internal_config.process_type != RTE_PROC_SECONDARY &&
internal_config.xen_dom0_support == 0 &&
- eal_hugepage_info_init() < 0)
- rte_panic("Cannot get hugepage information\n");
+ eal_hugepage_info_init() < 0) {
+ rte_eal_init_alert("Cannot get hugepage information.");
+ rte_errno = EACCES;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
if (internal_config.no_hugetlbfs)
@@ -799,39 +823,59 @@ rte_eal_init(int argc, char **argv)
rte_config_init();
- if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0)
- rte_panic("Cannot init logs\n");
-
- if (rte_eal_pci_init() < 0)
- rte_panic("Cannot init PCI\n");
+ if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) {
+ rte_eal_init_alert("Cannot init logging.");
+ rte_errno = ENOMEM;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
#ifdef VFIO_PRESENT
- if (rte_eal_vfio_setup() < 0)
- rte_panic("Cannot init VFIO\n");
+ if (rte_eal_vfio_setup() < 0) {
+ rte_eal_init_alert("Cannot init VFIO\n");
+ rte_errno = EAGAIN;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
#endif
- if (rte_eal_memory_init() < 0)
- rte_panic("Cannot init memory\n");
+ if (rte_eal_memory_init() < 0) {
+ rte_eal_init_alert("Cannot init memory\n");
+ rte_errno = ENOMEM;
+ return -1;
+ }
/* the directories are locked during eal_hugepage_info_init */
eal_hugedirs_unlock();
- if (rte_eal_memzone_init() < 0)
- rte_panic("Cannot init memzone\n");
+ if (rte_eal_memzone_init() < 0) {
+ rte_eal_init_alert("Cannot init memzone\n");
+ rte_errno = ENODEV;
+ return -1;
+ }
- if (rte_eal_tailqs_init() < 0)
- rte_panic("Cannot init tail queues for objects\n");
+ if (rte_eal_tailqs_init() < 0) {
+ rte_eal_init_alert("Cannot init tail queues for objects\n");
+ rte_errno = EFAULT;
+ return -1;
+ }
- if (rte_eal_alarm_init() < 0)
- rte_panic("Cannot init interrupt-handling thread\n");
+ if (rte_eal_alarm_init() < 0) {
+ rte_eal_init_alert("Cannot init interrupt-handling thread\n");
+ /* rte_eal_alarm_init sets rte_errno on failure. */
+ return -1;
+ }
- if (rte_eal_timer_init() < 0)
- rte_panic("Cannot init HPET or TSC timers\n");
+ if (rte_eal_timer_init() < 0) {
+ rte_eal_init_alert("Cannot init HPET or TSC timers\n");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
eal_check_mem_on_local_socket();
if (eal_plugins_init() < 0)
- rte_panic("Cannot init plugins\n");
+ rte_eal_init_alert("Cannot init plugins\n");
eal_thread_init_master(rte_config.master_lcore);
@@ -841,11 +885,16 @@ rte_eal_init(int argc, char **argv)
rte_config.master_lcore, (int)thread_id, cpuset,
ret == 0 ? "" : "...");
- if (rte_eal_dev_init() < 0)
- rte_panic("Cannot init pmd devices\n");
+ if (rte_eal_intr_init() < 0) {
+ rte_eal_init_alert("Cannot init interrupt-handling thread\n");
+ return -1;
+ }
- if (rte_eal_intr_init() < 0)
- rte_panic("Cannot init interrupt-handling thread\n");
+ if (rte_bus_scan()) {
+ rte_eal_init_alert("Cannot scan the buses for devices\n");
+ rte_errno = ENODEV;
+ return -1;
+ }
RTE_LCORE_FOREACH_SLAVE(i) {
@@ -883,9 +932,12 @@ rte_eal_init(int argc, char **argv)
rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
rte_eal_mp_wait_lcore();
- /* Probe & Initialize PCI devices */
- if (rte_eal_pci_probe())
- rte_panic("Cannot probe PCI\n");
+ /* Probe all the buses and devices/drivers on them */
+ if (rte_bus_probe()) {
+ rte_eal_init_alert("Cannot probe devices\n");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
rte_eal_mcfg_complete();
diff --git a/lib/librte_eal/linuxapp/eal/eal_alarm.c b/lib/librte_eal/linuxapp/eal/eal_alarm.c
index 8b042abc..fbae4613 100644
--- a/lib/librte_eal/linuxapp/eal/eal_alarm.c
+++ b/lib/librte_eal/linuxapp/eal/eal_alarm.c
@@ -83,7 +83,7 @@ static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER;
static struct rte_intr_handle intr_handle = {.fd = -1 };
static int handler_registered = 0;
-static void eal_alarm_callback(struct rte_intr_handle *hdl, void *arg);
+static void eal_alarm_callback(void *arg);
int
rte_eal_alarm_init(void)
@@ -102,8 +102,7 @@ error:
}
static void
-eal_alarm_callback(struct rte_intr_handle *hdl __rte_unused,
- void *arg __rte_unused)
+eal_alarm_callback(void *arg __rte_unused)
{
struct timespec now;
struct alarm_entry *ap;
diff --git a/lib/librte_eal/linuxapp/eal/eal_debug.c b/lib/librte_eal/linuxapp/eal/eal_debug.c
index 5fbc17c5..e1c75548 100644
--- a/lib/librte_eal/linuxapp/eal/eal_debug.c
+++ b/lib/librte_eal/linuxapp/eal/eal_debug.c
@@ -31,7 +31,9 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#ifdef RTE_BACKTRACE
#include <execinfo.h>
+#endif
#include <stdarg.h>
#include <signal.h>
#include <stdlib.h>
@@ -47,6 +49,7 @@
/* dump the stack of the calling core */
void rte_dump_stack(void)
{
+#ifdef RTE_BACKTRACE
void *func[BACKTRACE_SIZE];
char **symb = NULL;
int size;
@@ -64,6 +67,7 @@ void rte_dump_stack(void)
}
free(symb);
+#endif /* RTE_BACKTRACE */
}
/* not implemented in this environment */
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 18858e2d..7a21e8f6 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -283,9 +283,12 @@ eal_hugepage_info_init(void)
struct dirent *dirent;
dir = opendir(sys_dir_path);
- if (dir == NULL)
- rte_panic("Cannot open directory %s to read system hugepage "
- "info\n", sys_dir_path);
+ if (dir == NULL) {
+ RTE_LOG(ERR, EAL,
+ "Cannot open directory %s to read system hugepage info\n",
+ sys_dir_path);
+ return -1;
+ }
for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
struct hugepage_info *hpi;
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 47a3b20a..2e3bd12a 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -46,6 +46,7 @@
#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <assert.h>
+#include <stdbool.h>
#include <rte_common.h>
#include <rte_interrupts.h>
@@ -136,7 +137,7 @@ static pthread_t intr_thread;
/* enable legacy (INTx) interrupts */
static int
-vfio_enable_intx(struct rte_intr_handle *intr_handle) {
+vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
struct vfio_irq_set *irq_set;
char irq_set_buf[IRQ_SET_BUF_LEN];
int len, ret;
@@ -183,7 +184,7 @@ vfio_enable_intx(struct rte_intr_handle *intr_handle) {
/* disable legacy (INTx) interrupts */
static int
-vfio_disable_intx(struct rte_intr_handle *intr_handle) {
+vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
struct vfio_irq_set *irq_set;
char irq_set_buf[IRQ_SET_BUF_LEN];
int len, ret;
@@ -194,14 +195,14 @@ vfio_disable_intx(struct rte_intr_handle *intr_handle) {
irq_set = (struct vfio_irq_set *) irq_set_buf;
irq_set->argsz = len;
irq_set->count = 1;
- irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
+ irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
irq_set->start = 0;
ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
if (ret) {
- RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
+ RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
intr_handle->fd);
return -1;
}
@@ -226,7 +227,7 @@ vfio_disable_intx(struct rte_intr_handle *intr_handle) {
/* enable MSI interrupts */
static int
-vfio_enable_msi(struct rte_intr_handle *intr_handle) {
+vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
int len, ret;
char irq_set_buf[IRQ_SET_BUF_LEN];
struct vfio_irq_set *irq_set;
@@ -255,7 +256,7 @@ vfio_enable_msi(struct rte_intr_handle *intr_handle) {
/* disable MSI interrupts */
static int
-vfio_disable_msi(struct rte_intr_handle *intr_handle) {
+vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
struct vfio_irq_set *irq_set;
char irq_set_buf[IRQ_SET_BUF_LEN];
int len, ret;
@@ -280,7 +281,7 @@ vfio_disable_msi(struct rte_intr_handle *intr_handle) {
/* enable MSI-X interrupts */
static int
-vfio_enable_msix(struct rte_intr_handle *intr_handle) {
+vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
int len, ret;
char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
struct vfio_irq_set *irq_set;
@@ -290,12 +291,10 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
irq_set = (struct vfio_irq_set *) irq_set_buf;
irq_set->argsz = len;
- if (!intr_handle->max_intr)
- intr_handle->max_intr = 1;
- else if (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID)
- intr_handle->max_intr = RTE_MAX_RXTX_INTR_VEC_ID + 1;
-
- irq_set->count = intr_handle->max_intr;
+ /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */
+ irq_set->count = intr_handle->max_intr ?
+ (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
+ RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
irq_set->start = 0;
@@ -318,7 +317,7 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
/* disable MSI-X interrupts */
static int
-vfio_disable_msix(struct rte_intr_handle *intr_handle) {
+vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
struct vfio_irq_set *irq_set;
char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
int len, ret;
@@ -343,7 +342,7 @@ vfio_disable_msix(struct rte_intr_handle *intr_handle) {
#endif
static int
-uio_intx_intr_disable(struct rte_intr_handle *intr_handle)
+uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
{
unsigned char command_high;
@@ -367,7 +366,7 @@ uio_intx_intr_disable(struct rte_intr_handle *intr_handle)
}
static int
-uio_intx_intr_enable(struct rte_intr_handle *intr_handle)
+uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
{
unsigned char command_high;
@@ -391,7 +390,7 @@ uio_intx_intr_enable(struct rte_intr_handle *intr_handle)
}
static int
-uio_intr_disable(struct rte_intr_handle *intr_handle)
+uio_intr_disable(const struct rte_intr_handle *intr_handle)
{
const int value = 0;
@@ -405,7 +404,7 @@ uio_intr_disable(struct rte_intr_handle *intr_handle)
}
static int
-uio_intr_enable(struct rte_intr_handle *intr_handle)
+uio_intr_enable(const struct rte_intr_handle *intr_handle)
{
const int value = 1;
@@ -419,7 +418,7 @@ uio_intr_enable(struct rte_intr_handle *intr_handle)
}
int
-rte_intr_callback_register(struct rte_intr_handle *intr_handle,
+rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
rte_intr_callback_fn cb, void *cb_arg)
{
int ret, wake_thread;
@@ -491,7 +490,7 @@ rte_intr_callback_register(struct rte_intr_handle *intr_handle,
}
int
-rte_intr_callback_unregister(struct rte_intr_handle *intr_handle,
+rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
rte_intr_callback_fn cb_fn, void *cb_arg)
{
int ret;
@@ -555,8 +554,11 @@ rte_intr_callback_unregister(struct rte_intr_handle *intr_handle,
}
int
-rte_intr_enable(struct rte_intr_handle *intr_handle)
+rte_intr_enable(const struct rte_intr_handle *intr_handle)
{
+ if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
+ return 0;
+
if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
return -1;
@@ -599,8 +601,11 @@ rte_intr_enable(struct rte_intr_handle *intr_handle)
}
int
-rte_intr_disable(struct rte_intr_handle *intr_handle)
+rte_intr_disable(const struct rte_intr_handle *intr_handle)
{
+ if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
+ return 0;
+
if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
return -1;
@@ -645,6 +650,7 @@ rte_intr_disable(struct rte_intr_handle *intr_handle)
static int
eal_intr_process_interrupts(struct epoll_event *events, int nfds)
{
+ bool call = false;
int n, bytes_read;
struct rte_intr_source *src;
struct rte_intr_callback *cb;
@@ -693,13 +699,18 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds)
bytes_read = sizeof(buf.vfio_intr_count);
break;
#endif
+ case RTE_INTR_HANDLE_VDEV:
case RTE_INTR_HANDLE_EXT:
+ bytes_read = 0;
+ call = true;
+ break;
+
default:
bytes_read = 1;
break;
}
- if (src->intr_handle.type != RTE_INTR_HANDLE_EXT) {
+ if (bytes_read > 0) {
/**
* read out to clear the ready-to-be-read flag
* for epoll_wait.
@@ -716,12 +727,14 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds)
} else if (bytes_read == 0)
RTE_LOG(ERR, EAL, "Read nothing from file "
"descriptor %d\n", events[n].data.fd);
+ else
+ call = true;
}
/* grab a lock, again to call callbacks and update status. */
rte_spinlock_lock(&intr_lock);
- if (bytes_read > 0) {
+ if (call) {
/* Finally, call all callbacks. */
TAILQ_FOREACH(cb, &src->callbacks, next) {
@@ -731,8 +744,7 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds)
rte_spinlock_unlock(&intr_lock);
/* call the actual callback */
- active_cb.cb_fn(&src->intr_handle,
- active_cb.cb_arg);
+ active_cb.cb_fn(active_cb.cb_arg);
/*get the lock back. */
rte_spinlock_lock(&intr_lock);
@@ -832,7 +844,7 @@ eal_intr_thread_main(__rte_unused void *arg)
TAILQ_FOREACH(src, &intr_sources, next) {
if (src->callbacks.tqh_first == NULL)
continue; /* skip those with no callbacks */
- ev.events = EPOLLIN | EPOLLPRI;
+ ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
ev.data.fd = src->intr_handle.fd;
/**
@@ -872,13 +884,16 @@ rte_eal_intr_init(void)
* create a pipe which will be waited by epoll and notified to
* rebuild the wait list of epoll.
*/
- if (pipe(intr_pipe.pipefd) < 0)
+ if (pipe(intr_pipe.pipefd) < 0) {
+ rte_errno = errno;
return -1;
+ }
/* create the host thread to wait/handle the interrupt */
ret = pthread_create(&intr_thread, NULL,
eal_intr_thread_main, NULL);
if (ret != 0) {
+ rte_errno = ret;
RTE_LOG(ERR, EAL,
"Failed to create thread for interrupt handling\n");
} else {
@@ -913,6 +928,14 @@ eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
bytes_read = sizeof(buf.vfio_intr_count);
break;
#endif
+ case RTE_INTR_HANDLE_VDEV:
+ /* for vdev, fd points to:
+ * a. eventfd which does not need to read out;
+ * b. datapath fd which needs PMD to read out.
+ */
+ return;
+ case RTE_INTR_HANDLE_EXT:
+ return;
default:
bytes_read = 1;
RTE_LOG(INFO, EAL, "unexpected intr type\n");
@@ -1141,6 +1164,24 @@ rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
return rc;
}
+void
+rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
+{
+ uint32_t i;
+ struct rte_epoll_event *rev;
+
+ for (i = 0; i < intr_handle->nb_efd; i++) {
+ rev = &intr_handle->elist[i];
+ if (rev->status == RTE_EPOLL_INVALID)
+ continue;
+ if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
+ /* force free if the entry valid */
+ eal_epoll_data_safe_free(rev);
+ rev->status = RTE_EPOLL_INVALID;
+ }
+ }
+}
+
int
rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
{
@@ -1157,12 +1198,14 @@ rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
RTE_LOG(ERR, EAL,
"can't setup eventfd, error %i (%s)\n",
errno, strerror(errno));
- return -1;
+ return -errno;
}
intr_handle->efds[i] = fd;
}
intr_handle->nb_efd = n;
intr_handle->max_intr = NB_OTHER_INTR + n;
+ } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
+ /* do nothing, and let vdev driver to initialize this struct */
} else {
intr_handle->efds[0] = intr_handle->fd;
intr_handle->nb_efd = RTE_MIN(nb_efd, 1U);
@@ -1176,19 +1219,8 @@ void
rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
{
uint32_t i;
- struct rte_epoll_event *rev;
-
- for (i = 0; i < intr_handle->nb_efd; i++) {
- rev = &intr_handle->elist[i];
- if (rev->status == RTE_EPOLL_INVALID)
- continue;
- if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
- /* force free if the entry valid */
- eal_epoll_data_safe_free(rev);
- rev->status = RTE_EPOLL_INVALID;
- }
- }
+ rte_intr_free_epoll_fd(intr_handle);
if (intr_handle->max_intr > intr_handle->nb_efd) {
for (i = 0; i < intr_handle->nb_efd; i++)
close(intr_handle->efds[i]);
@@ -1218,5 +1250,8 @@ rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX)
return 1;
+ if (intr_handle->type == RTE_INTR_HANDLE_VDEV)
+ return 1;
+
return 0;
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index a956bb22..ebe06833 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -64,6 +64,7 @@
#define _FILE_OFFSET_BITS 64
#include <errno.h>
#include <stdarg.h>
+#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
@@ -122,26 +123,28 @@ int rte_xen_dom0_supported(void)
static uint64_t baseaddr_offset;
-static unsigned proc_pagemap_readable;
+static bool phys_addrs_available = true;
#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
static void
-test_proc_pagemap_readable(void)
+test_phys_addrs_available(void)
{
- int fd = open("/proc/self/pagemap", O_RDONLY);
+ uint64_t tmp;
+ phys_addr_t physaddr;
- if (fd < 0) {
+ /* For dom0, phys addresses can always be available */
+ if (rte_xen_dom0_supported())
+ return;
+
+ physaddr = rte_mem_virt2phy(&tmp);
+ if (physaddr == RTE_BAD_PHYS_ADDR) {
RTE_LOG(ERR, EAL,
- "Cannot open /proc/self/pagemap: %s. "
- "virt2phys address translation will not work\n",
+ "Cannot obtain physical addresses: %s. "
+ "Only vfio will function.\n",
strerror(errno));
- return;
+ phys_addrs_available = false;
}
-
- /* Is readable */
- close(fd);
- proc_pagemap_readable = 1;
}
/* Lock page in physical memory and prevent from swapping. */
@@ -190,7 +193,7 @@ rte_mem_virt2phy(const void *virtaddr)
}
/* Cannot parse /proc/self/pagemap, no need to log errors everywhere */
- if (!proc_pagemap_readable)
+ if (!phys_addrs_available)
return RTE_BAD_PHYS_ADDR;
/* standard page size */
@@ -229,6 +232,9 @@ rte_mem_virt2phy(const void *virtaddr)
* the pfn (page frame number) are bits 0-54 (see
* pagemap.txt in linux Documentation)
*/
+ if ((page & 0x7fffffffffffffULL) == 0)
+ return RTE_BAD_PHYS_ADDR;
+
physaddr = ((page & 0x7fffffffffffffULL) * page_size)
+ ((unsigned long)virtaddr % page_size);
@@ -242,7 +248,7 @@ rte_mem_virt2phy(const void *virtaddr)
static int
find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
{
- unsigned i;
+ unsigned int i;
phys_addr_t addr;
for (i = 0; i < hpi->num_pages[0]; i++) {
@@ -255,6 +261,22 @@ find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
}
/*
+ * For each hugepage in hugepg_tbl, fill the physaddr value sequentially.
+ */
+static int
+set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
+{
+ unsigned int i;
+ static phys_addr_t addr;
+
+ for (i = 0; i < hpi->num_pages[0]; i++) {
+ hugepg_tbl[i].physaddr = addr;
+ addr += hugepg_tbl[i].size;
+ }
+ return 0;
+}
+
+/*
* Check whether address-space layout randomization is enabled in
* the kernel. This is important for multi-process as it can prevent
* two processes mapping data to the same virtual address
@@ -313,7 +335,13 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
}
do {
addr = mmap(addr,
- (*size) + hugepage_sz, PROT_READ, MAP_PRIVATE, fd, 0);
+ (*size) + hugepage_sz, PROT_READ,
+#ifdef RTE_ARCH_PPC_64
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+#else
+ MAP_PRIVATE,
+#endif
+ fd, 0);
if (addr == MAP_FAILED)
*size -= hugepage_sz;
} while (addr == MAP_FAILED && *size > 0);
@@ -592,12 +620,12 @@ static int
cmp_physaddr(const void *a, const void *b)
{
#ifndef RTE_ARCH_PPC_64
- const struct hugepage_file *p1 = (const struct hugepage_file *)a;
- const struct hugepage_file *p2 = (const struct hugepage_file *)b;
+ const struct hugepage_file *p1 = a;
+ const struct hugepage_file *p2 = b;
#else
/* PowerPC needs memory sorted in reverse order from x86 */
- const struct hugepage_file *p1 = (const struct hugepage_file *)b;
- const struct hugepage_file *p2 = (const struct hugepage_file *)a;
+ const struct hugepage_file *p1 = b;
+ const struct hugepage_file *p2 = a;
#endif
if (p1->physaddr < p2->physaddr)
return -1;
@@ -951,7 +979,7 @@ rte_eal_hugepage_init(void)
int nr_hugefiles, nr_hugepages = 0;
void *addr;
- test_proc_pagemap_readable();
+ test_phys_addrs_available();
memset(used_hp, 0, sizeof(used_hp));
@@ -1043,11 +1071,22 @@ rte_eal_hugepage_init(void)
continue;
}
- /* find physical addresses and sockets for each hugepage */
- if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0){
- RTE_LOG(DEBUG, EAL, "Failed to find phys addr for %u MB pages\n",
- (unsigned)(hpi->hugepage_sz / 0x100000));
- goto fail;
+ if (phys_addrs_available) {
+ /* find physical addresses for each hugepage */
+ if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
+ RTE_LOG(DEBUG, EAL, "Failed to find phys addr "
+ "for %u MB pages\n",
+ (unsigned int)(hpi->hugepage_sz / 0x100000));
+ goto fail;
+ }
+ } else {
+ /* set physical addresses for each hugepage */
+ if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
+ RTE_LOG(DEBUG, EAL, "Failed to set phys addr "
+ "for %u MB pages\n",
+ (unsigned int)(hpi->hugepage_sz / 0x100000));
+ goto fail;
+ }
}
if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
@@ -1289,7 +1328,7 @@ rte_eal_hugepage_attach(void)
"into secondary processes\n");
}
- test_proc_pagemap_readable();
+ test_phys_addrs_available();
if (internal_config.xen_dom0_support) {
#ifdef RTE_LIBRTE_XEN_DOM0
@@ -1330,7 +1369,13 @@ rte_eal_hugepage_attach(void)
* use mmap to get identical addresses as the primary process.
*/
base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
- PROT_READ, MAP_PRIVATE, fd_zero, 0);
+ PROT_READ,
+#ifdef RTE_ARCH_PPC_64
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+#else
+ MAP_PRIVATE,
+#endif
+ fd_zero, 0);
if (base_addr == MAP_FAILED ||
base_addr != mcfg->memseg[s].addr) {
max_seg = s;
@@ -1426,3 +1471,9 @@ error:
close(fd_hugepage);
return -1;
}
+
+bool
+rte_eal_using_phys_addrs(void)
+{
+ return phys_addrs_available;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c
index 876ba381..595622b2 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
@@ -35,6 +35,7 @@
#include <dirent.h>
#include <rte_log.h>
+#include <rte_bus.h>
#include <rte_pci.h>
#include <rte_eal_memconfig.h>
#include <rte_malloc.h>
@@ -54,44 +55,7 @@
* IGB_UIO driver (or doesn't initialize, if the device wasn't bound to it).
*/
-/* unbind kernel driver for this device */
-int
-pci_unbind_kernel_driver(struct rte_pci_device *dev)
-{
- int n;
- FILE *f;
- char filename[PATH_MAX];
- char buf[BUFSIZ];
- struct rte_pci_addr *loc = &dev->addr;
-
- /* open /sys/bus/pci/devices/AAAA:BB:CC.D/driver */
- snprintf(filename, sizeof(filename),
- "%s/" PCI_PRI_FMT "/driver/unbind", pci_get_sysfs_path(),
- loc->domain, loc->bus, loc->devid, loc->function);
-
- f = fopen(filename, "w");
- if (f == NULL) /* device was not bound */
- return 0;
-
- n = snprintf(buf, sizeof(buf), PCI_PRI_FMT "\n",
- loc->domain, loc->bus, loc->devid, loc->function);
- if ((n < 0) || (n >= (int)sizeof(buf))) {
- RTE_LOG(ERR, EAL, "%s(): snprintf failed\n", __func__);
- goto error;
- }
- if (fwrite(buf, n, 1, f) == 0) {
- RTE_LOG(ERR, EAL, "%s(): could not write to %s\n", __func__,
- filename);
- goto error;
- }
-
- fclose(f);
- return 0;
-
-error:
- fclose(f);
- return -1;
-}
+extern struct rte_pci_bus rte_pci_bus;
static int
pci_get_kernel_driver_by_path(const char *filename, char *dri_name)
@@ -124,7 +88,7 @@ pci_get_kernel_driver_by_path(const char *filename, char *dri_name)
/* Map pci device */
int
-rte_eal_pci_map_device(struct rte_pci_device *dev)
+rte_pci_map_device(struct rte_pci_device *dev)
{
int ret = -1;
@@ -138,8 +102,10 @@ rte_eal_pci_map_device(struct rte_pci_device *dev)
break;
case RTE_KDRV_IGB_UIO:
case RTE_KDRV_UIO_GENERIC:
- /* map resources for devices that use uio */
- ret = pci_uio_map_resource(dev);
+ if (rte_eal_using_phys_addrs()) {
+ /* map resources for devices that use uio */
+ ret = pci_uio_map_resource(dev);
+ }
break;
default:
RTE_LOG(DEBUG, EAL,
@@ -153,12 +119,15 @@ rte_eal_pci_map_device(struct rte_pci_device *dev)
/* Unmap pci device */
void
-rte_eal_pci_unmap_device(struct rte_pci_device *dev)
+rte_pci_unmap_device(struct rte_pci_device *dev)
{
/* try unmapping the NIC resources using VFIO if it exists */
switch (dev->kdrv) {
case RTE_KDRV_VFIO:
- RTE_LOG(ERR, EAL, "Hotplug doesn't support vfio yet\n");
+#ifdef VFIO_PRESENT
+ if (pci_vfio_is_enabled())
+ pci_vfio_unmap_resource(dev);
+#endif
break;
case RTE_KDRV_IGB_UIO:
case RTE_KDRV_UIO_GENERIC:
@@ -267,8 +236,7 @@ error:
/* Scan one pci sysfs entry, and fill the devices list from it. */
static int
-pci_scan_one(const char *dirname, uint16_t domain, uint8_t bus,
- uint8_t devid, uint8_t function)
+pci_scan_one(const char *dirname, const struct rte_pci_addr *addr)
{
char filename[PATH_MAX];
unsigned long tmp;
@@ -281,10 +249,7 @@ pci_scan_one(const char *dirname, uint16_t domain, uint8_t bus,
return -1;
memset(dev, 0, sizeof(*dev));
- dev->addr.domain = domain;
- dev->addr.bus = bus;
- dev->addr.devid = devid;
- dev->addr.function = function;
+ dev->addr = *addr;
/* get vendor id */
snprintf(filename, sizeof(filename), "%s/vendor", dirname);
@@ -359,6 +324,9 @@ pci_scan_one(const char *dirname, uint16_t domain, uint8_t bus,
dev->device.numa_node = tmp;
}
+ rte_pci_device_name(addr, dev->name, sizeof(dev->name));
+ dev->device.name = dev->name;
+
/* parse resources */
snprintf(filename, sizeof(filename), "%s/resource", dirname);
if (pci_parse_sysfs_resource(filename, dev) < 0) {
@@ -389,21 +357,19 @@ pci_scan_one(const char *dirname, uint16_t domain, uint8_t bus,
dev->kdrv = RTE_KDRV_NONE;
/* device is valid, add in list (sorted) */
- if (TAILQ_EMPTY(&pci_device_list)) {
- rte_eal_device_insert(&dev->device);
- TAILQ_INSERT_TAIL(&pci_device_list, dev, next);
+ if (TAILQ_EMPTY(&rte_pci_bus.device_list)) {
+ rte_pci_add_device(dev);
} else {
struct rte_pci_device *dev2;
int ret;
- TAILQ_FOREACH(dev2, &pci_device_list, next) {
+ TAILQ_FOREACH(dev2, &rte_pci_bus.device_list, next) {
ret = rte_eal_compare_pci_addr(&dev->addr, &dev2->addr);
if (ret > 0)
continue;
if (ret < 0) {
- TAILQ_INSERT_BEFORE(dev2, dev, next);
- rte_eal_device_insert(&dev->device);
+ rte_pci_insert_device(dev2, dev);
} else { /* already registered */
dev2->kdrv = dev->kdrv;
dev2->max_vfs = dev->max_vfs;
@@ -413,8 +379,8 @@ pci_scan_one(const char *dirname, uint16_t domain, uint8_t bus,
}
return 0;
}
- rte_eal_device_insert(&dev->device);
- TAILQ_INSERT_TAIL(&pci_device_list, dev, next);
+
+ rte_pci_add_device(dev);
}
return 0;
@@ -429,16 +395,14 @@ pci_update_device(const struct rte_pci_addr *addr)
pci_get_sysfs_path(), addr->domain, addr->bus, addr->devid,
addr->function);
- return pci_scan_one(filename, addr->domain, addr->bus, addr->devid,
- addr->function);
+ return pci_scan_one(filename, addr);
}
/*
* split up a pci address into its constituent parts.
*/
static int
-parse_pci_addr_format(const char *buf, int bufsize, uint16_t *domain,
- uint8_t *bus, uint8_t *devid, uint8_t *function)
+parse_pci_addr_format(const char *buf, int bufsize, struct rte_pci_addr *addr)
{
/* first split on ':' */
union splitaddr {
@@ -466,10 +430,10 @@ parse_pci_addr_format(const char *buf, int bufsize, uint16_t *domain,
/* now convert to int values */
errno = 0;
- *domain = (uint16_t)strtoul(splitaddr.domain, NULL, 16);
- *bus = (uint8_t)strtoul(splitaddr.bus, NULL, 16);
- *devid = (uint8_t)strtoul(splitaddr.devid, NULL, 16);
- *function = (uint8_t)strtoul(splitaddr.function, NULL, 10);
+ addr->domain = (uint16_t)strtoul(splitaddr.domain, NULL, 16);
+ addr->bus = (uint8_t)strtoul(splitaddr.bus, NULL, 16);
+ addr->devid = (uint8_t)strtoul(splitaddr.devid, NULL, 16);
+ addr->function = (uint8_t)strtoul(splitaddr.function, NULL, 10);
if (errno != 0)
goto error;
@@ -485,13 +449,16 @@ error:
* list
*/
int
-rte_eal_pci_scan(void)
+rte_pci_scan(void)
{
struct dirent *e;
DIR *dir;
char dirname[PATH_MAX];
- uint16_t domain;
- uint8_t bus, devid, function;
+ struct rte_pci_addr addr;
+
+ /* for debug purposes, PCI can be disabled */
+ if (internal_config.no_pci)
+ return 0;
dir = opendir(pci_get_sysfs_path());
if (dir == NULL) {
@@ -504,13 +471,13 @@ rte_eal_pci_scan(void)
if (e->d_name[0] == '.')
continue;
- if (parse_pci_addr_format(e->d_name, sizeof(e->d_name), &domain,
- &bus, &devid, &function) != 0)
+ if (parse_pci_addr_format(e->d_name, sizeof(e->d_name), &addr) != 0)
continue;
snprintf(dirname, sizeof(dirname), "%s/%s",
pci_get_sysfs_path(), e->d_name);
- if (pci_scan_one(dirname, domain, bus, devid, function) < 0)
+
+ if (pci_scan_one(dirname, &addr) < 0)
goto error;
}
closedir(dir);
@@ -522,8 +489,8 @@ error:
}
/* Read PCI config space. */
-int rte_eal_pci_read_config(const struct rte_pci_device *device,
- void *buf, size_t len, off_t offset)
+int rte_pci_read_config(const struct rte_pci_device *device,
+ void *buf, size_t len, off_t offset)
{
const struct rte_intr_handle *intr_handle = &device->intr_handle;
@@ -547,8 +514,8 @@ int rte_eal_pci_read_config(const struct rte_pci_device *device,
}
/* Write PCI config space. */
-int rte_eal_pci_write_config(const struct rte_pci_device *device,
- const void *buf, size_t len, off_t offset)
+int rte_pci_write_config(const struct rte_pci_device *device,
+ const void *buf, size_t len, off_t offset)
{
const struct rte_intr_handle *intr_handle = &device->intr_handle;
@@ -574,7 +541,7 @@ int rte_eal_pci_write_config(const struct rte_pci_device *device,
#if defined(RTE_ARCH_X86)
static int
pci_ioport_map(struct rte_pci_device *dev, int bar __rte_unused,
- struct rte_pci_ioport *p)
+ struct rte_pci_ioport *p)
{
uint16_t start, end;
FILE *fp;
@@ -632,8 +599,8 @@ pci_ioport_map(struct rte_pci_device *dev, int bar __rte_unused,
#endif
int
-rte_eal_pci_ioport_map(struct rte_pci_device *dev, int bar,
- struct rte_pci_ioport *p)
+rte_pci_ioport_map(struct rte_pci_device *dev, int bar,
+ struct rte_pci_ioport *p)
{
int ret = -1;
@@ -670,8 +637,8 @@ rte_eal_pci_ioport_map(struct rte_pci_device *dev, int bar,
}
void
-rte_eal_pci_ioport_read(struct rte_pci_ioport *p,
- void *data, size_t len, off_t offset)
+rte_pci_ioport_read(struct rte_pci_ioport *p,
+ void *data, size_t len, off_t offset)
{
switch (p->dev->kdrv) {
#ifdef VFIO_PRESENT
@@ -696,8 +663,8 @@ rte_eal_pci_ioport_read(struct rte_pci_ioport *p,
}
void
-rte_eal_pci_ioport_write(struct rte_pci_ioport *p,
- const void *data, size_t len, off_t offset)
+rte_pci_ioport_write(struct rte_pci_ioport *p,
+ const void *data, size_t len, off_t offset)
{
switch (p->dev->kdrv) {
#ifdef VFIO_PRESENT
@@ -722,7 +689,7 @@ rte_eal_pci_ioport_write(struct rte_pci_ioport *p,
}
int
-rte_eal_pci_ioport_unmap(struct rte_pci_ioport *p)
+rte_pci_ioport_unmap(struct rte_pci_ioport *p)
{
int ret = -1;
@@ -754,19 +721,3 @@ rte_eal_pci_ioport_unmap(struct rte_pci_ioport *p)
return ret;
}
-
-/* Init the PCI EAL subsystem */
-int
-rte_eal_pci_init(void)
-{
- /* for debug purposes, PCI can be disabled */
- if (internal_config.no_pci)
- return 0;
-
- if (rte_eal_pci_scan() < 0) {
- RTE_LOG(ERR, EAL, "%s(): Cannot scan PCI bus\n", __func__);
- return -1;
- }
-
- return 0;
-}
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_init.h b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
index 6a960d1b..ae2980d6 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_init.h
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
@@ -88,8 +88,9 @@ void pci_vfio_ioport_write(struct rte_pci_ioport *p,
const void *data, size_t len, off_t offset);
int pci_vfio_ioport_unmap(struct rte_pci_ioport *p);
-/* map VFIO resource prototype */
+/* map/unmap VFIO resource prototype */
int pci_vfio_map_resource(struct rte_pci_device *dev);
+int pci_vfio_unmap_resource(struct rte_pci_device *dev);
#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
index 3e4ffb57..fa10329f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
@@ -38,6 +38,7 @@
#include <inttypes.h>
#include <sys/stat.h>
#include <sys/mman.h>
+#include <sys/sysmacros.h>
#include <linux/pci_regs.h>
#if defined(RTE_ARCH_X86)
@@ -230,7 +231,7 @@ pci_uio_free_resource(struct rte_pci_device *dev,
close(dev->intr_handle.uio_cfg_fd);
dev->intr_handle.uio_cfg_fd = -1;
}
- if (dev->intr_handle.fd) {
+ if (dev->intr_handle.fd >= 0) {
close(dev->intr_handle.fd);
dev->intr_handle.fd = -1;
dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index 5f478c59..2be13195 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -38,6 +38,7 @@
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
+#include <stdbool.h>
#include <rte_log.h>
#include <rte_pci.h>
@@ -172,7 +173,7 @@ pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset,
/* set PCI bus mastering */
static int
-pci_vfio_set_bus_master(int dev_fd)
+pci_vfio_set_bus_master(int dev_fd, bool op)
{
uint16_t reg;
int ret;
@@ -185,8 +186,11 @@ pci_vfio_set_bus_master(int dev_fd)
return -1;
}
- /* set the master bit */
- reg |= PCI_COMMAND_MASTER;
+ if (op)
+ /* set the master bit */
+ reg |= PCI_COMMAND_MASTER;
+ else
+ reg &= ~(PCI_COMMAND_MASTER);
ret = pwrite64(dev_fd, &reg, sizeof(reg),
VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
@@ -355,7 +359,8 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
} else {
/* if we're in a secondary process, just find our tailq entry */
TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
- if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
+ if (rte_eal_compare_pci_addr(&vfio_res->pci_addr,
+ &dev->addr))
continue;
break;
}
@@ -517,7 +522,7 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
}
/* set bus mastering for the device */
- if (pci_vfio_set_bus_master(vfio_dev_fd)) {
+ if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
RTE_LOG(ERR, EAL, " %s cannot set up bus mastering!\n", pci_addr);
close(vfio_dev_fd);
rte_free(vfio_res);
@@ -535,6 +540,79 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
}
int
+pci_vfio_unmap_resource(struct rte_pci_device *dev)
+{
+ char pci_addr[PATH_MAX] = {0};
+ struct rte_pci_addr *loc = &dev->addr;
+ int i, ret;
+ struct mapped_pci_resource *vfio_res = NULL;
+ struct mapped_pci_res_list *vfio_res_list;
+
+ struct pci_map *maps;
+
+ /* store PCI address string */
+ snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+ loc->domain, loc->bus, loc->devid, loc->function);
+
+
+ if (close(dev->intr_handle.fd) < 0) {
+ RTE_LOG(INFO, EAL, "Error when closing eventfd file descriptor for %s\n",
+ pci_addr);
+ return -1;
+ }
+
+ if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, false)) {
+ RTE_LOG(ERR, EAL, " %s cannot unset bus mastering for PCI device!\n",
+ pci_addr);
+ return -1;
+ }
+
+ ret = vfio_release_device(pci_get_sysfs_path(), pci_addr,
+ dev->intr_handle.vfio_dev_fd);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL,
+ "%s(): cannot release device\n", __func__);
+ return ret;
+ }
+
+ vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
+ /* Get vfio_res */
+ TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
+ if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
+ continue;
+ break;
+ }
+ /* if we haven't found our tailq entry, something's wrong */
+ if (vfio_res == NULL) {
+ RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n",
+ pci_addr);
+ return -1;
+ }
+
+ /* unmap BARs */
+ maps = vfio_res->maps;
+
+ RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n",
+ pci_addr);
+ for (i = 0; i < (int) vfio_res->nb_maps; i++) {
+
+ /*
+ * We do not need to be aware of MSI-X table BAR mappings as
+ * when mapping. Just using current maps array is enough
+ */
+ if (maps[i].addr) {
+ RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n",
+ pci_addr, maps[i].addr);
+ pci_unmap_resource(maps[i].addr, maps[i].size);
+ }
+ }
+
+ TAILQ_REMOVE(vfio_res_list, vfio_res, next);
+
+ return 0;
+}
+
+int
pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
struct rte_pci_ioport *p)
{
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 702f7a2e..53ac725d 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -50,12 +50,15 @@
static struct vfio_config vfio_cfg;
static int vfio_type1_dma_map(int);
+static int vfio_spapr_dma_map(int);
static int vfio_noiommu_dma_map(int);
/* IOMMU types we support */
static const struct vfio_iommu_type iommu_types[] = {
/* x86 IOMMU, otherwise known as type 1 */
{ RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map},
+ /* ppc64 IOMMU, otherwise known as spapr */
+ { RTE_VFIO_SPAPR, "sPAPR", &vfio_spapr_dma_map},
/* IOMMU-less mode */
{ RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
};
@@ -65,13 +68,32 @@ vfio_get_group_fd(int iommu_group_no)
{
int i;
int vfio_group_fd;
+ int group_idx = -1;
char filename[PATH_MAX];
/* check if we already have the group descriptor open */
- for (i = 0; i < vfio_cfg.vfio_group_idx; i++)
+ for (i = 0; i < VFIO_MAX_GROUPS; i++)
if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no)
return vfio_cfg.vfio_groups[i].fd;
+ /* Lets see first if there is room for a new group */
+ if (vfio_cfg.vfio_active_groups == VFIO_MAX_GROUPS) {
+ RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+ return -1;
+ }
+
+ /* Now lets get an index for the new group */
+ for (i = 0; i < VFIO_MAX_GROUPS; i++)
+ if (vfio_cfg.vfio_groups[i].group_no == -1) {
+ group_idx = i;
+ break;
+ }
+
+ /* This should not happen */
+ if (group_idx == -1) {
+ RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
+ return -1;
+ }
/* if primary, try to open the group */
if (internal_config.process_type == RTE_PROC_PRIMARY) {
/* try regular group format */
@@ -101,14 +123,9 @@ vfio_get_group_fd(int iommu_group_no)
/* noiommu group found */
}
- /* if the fd is valid, create a new group for it */
- if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) {
- RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
- close(vfio_group_fd);
- return -1;
- }
- vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
- vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd;
+ vfio_cfg.vfio_groups[group_idx].group_no = iommu_group_no;
+ vfio_cfg.vfio_groups[group_idx].fd = vfio_group_fd;
+ vfio_cfg.vfio_active_groups++;
return vfio_group_fd;
}
/* if we're in a secondary process, request group fd from the primary
@@ -155,14 +172,115 @@ vfio_get_group_fd(int iommu_group_no)
return -1;
}
+
+static int
+get_vfio_group_idx(int vfio_group_fd)
+{
+ int i;
+ for (i = 0; i < VFIO_MAX_GROUPS; i++)
+ if (vfio_cfg.vfio_groups[i].fd == vfio_group_fd)
+ return i;
+ return -1;
+}
+
+static void
+vfio_group_device_get(int vfio_group_fd)
+{
+ int i;
+
+ i = get_vfio_group_idx(vfio_group_fd);
+ if (i < 0 || i > VFIO_MAX_GROUPS)
+ RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i);
+ else
+ vfio_cfg.vfio_groups[i].devices++;
+}
+
static void
-clear_current_group(void)
+vfio_group_device_put(int vfio_group_fd)
{
- vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = 0;
- vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = -1;
+ int i;
+
+ i = get_vfio_group_idx(vfio_group_fd);
+ if (i < 0 || i > VFIO_MAX_GROUPS)
+ RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i);
+ else
+ vfio_cfg.vfio_groups[i].devices--;
+}
+
+static int
+vfio_group_device_count(int vfio_group_fd)
+{
+ int i;
+
+ i = get_vfio_group_idx(vfio_group_fd);
+ if (i < 0 || i > VFIO_MAX_GROUPS) {
+ RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i);
+ return -1;
+ }
+
+ return vfio_cfg.vfio_groups[i].devices;
+}
+
+int
+clear_group(int vfio_group_fd)
+{
+ int i;
+ int socket_fd, ret;
+
+ if (internal_config.process_type == RTE_PROC_PRIMARY) {
+
+ i = get_vfio_group_idx(vfio_group_fd);
+ if (i < 0)
+ return -1;
+ vfio_cfg.vfio_groups[i].group_no = -1;
+ vfio_cfg.vfio_groups[i].fd = -1;
+ vfio_cfg.vfio_groups[i].devices = 0;
+ vfio_cfg.vfio_active_groups--;
+ return 0;
+ }
+
+ /* This is just for SECONDARY processes */
+ socket_fd = vfio_mp_sync_connect_to_primary();
+
+ if (socket_fd < 0) {
+ RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
+ return -1;
+ }
+
+ if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
+ RTE_LOG(ERR, EAL, " cannot request container fd!\n");
+ close(socket_fd);
+ return -1;
+ }
+
+ if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
+ RTE_LOG(ERR, EAL, " cannot send group fd!\n");
+ close(socket_fd);
+ return -1;
+ }
+
+ ret = vfio_mp_sync_receive_request(socket_fd);
+ switch (ret) {
+ case SOCKET_NO_FD:
+ RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
+ close(socket_fd);
+ break;
+ case SOCKET_OK:
+ close(socket_fd);
+ return 0;
+ case SOCKET_ERR:
+ RTE_LOG(ERR, EAL, " Socket error\n");
+ close(socket_fd);
+ break;
+ default:
+ RTE_LOG(ERR, EAL, " UNKNOWN reply, %d\n", ret);
+ close(socket_fd);
+ }
+ return -1;
}
-int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
+int
+vfio_setup_device(const char *sysfs_base, const char *dev_addr,
int *vfio_dev_fd, struct vfio_device_info *device_info)
{
struct vfio_group_status group_status = {
@@ -189,18 +307,10 @@ int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
if (vfio_group_fd < 0)
return -1;
- /* store group fd */
- vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
- vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd;
-
/* if group_fd == 0, that means the device isn't managed by VFIO */
if (vfio_group_fd == 0) {
- RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
+ RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
dev_addr);
- /* we store 0 as group fd to distinguish between existing but
- * unbound VFIO groups, and groups that don't exist at all.
- */
- vfio_cfg.vfio_group_idx++;
return 1;
}
@@ -215,12 +325,12 @@ int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
RTE_LOG(ERR, EAL, " %s cannot get group status, "
"error %i (%s)\n", dev_addr, errno, strerror(errno));
close(vfio_group_fd);
- clear_current_group();
+ clear_group(vfio_group_fd);
return -1;
} else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
RTE_LOG(ERR, EAL, " %s VFIO group is not viable!\n", dev_addr);
close(vfio_group_fd);
- clear_current_group();
+ clear_group(vfio_group_fd);
return -1;
}
@@ -234,60 +344,131 @@ int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, "
"error %i (%s)\n", dev_addr, errno, strerror(errno));
close(vfio_group_fd);
- clear_current_group();
+ clear_group(vfio_group_fd);
return -1;
}
+
/*
- * at this point we know that this group has been successfully
- * initialized, so we increment vfio_group_idx to indicate that we can
- * add new groups.
+ * pick an IOMMU type and set up DMA mappings for container
+ *
+ * needs to be done only once, only when first group is
+ * assigned to a container and only in primary process.
+ * Note this can happen several times with the hotplug
+ * functionality.
*/
- vfio_cfg.vfio_group_idx++;
- }
-
- /*
- * pick an IOMMU type and set up DMA mappings for container
- *
- * needs to be done only once, only when at least one group is assigned to
- * a container and only in primary process
- */
- if (internal_config.process_type == RTE_PROC_PRIMARY &&
- vfio_cfg.vfio_container_has_dma == 0) {
- /* select an IOMMU type which we will be using */
- const struct vfio_iommu_type *t =
+ if (internal_config.process_type == RTE_PROC_PRIMARY &&
+ vfio_cfg.vfio_active_groups == 1) {
+ /* select an IOMMU type which we will be using */
+ const struct vfio_iommu_type *t =
vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
- if (!t) {
- RTE_LOG(ERR, EAL, " %s failed to select IOMMU type\n", dev_addr);
- return -1;
- }
- ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
- if (ret) {
- RTE_LOG(ERR, EAL, " %s DMA remapping failed, "
- "error %i (%s)\n", dev_addr, errno, strerror(errno));
- return -1;
+ if (!t) {
+ RTE_LOG(ERR, EAL,
+ " %s failed to select IOMMU type\n",
+ dev_addr);
+ close(vfio_group_fd);
+ clear_group(vfio_group_fd);
+ return -1;
+ }
+ ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
+ if (ret) {
+ RTE_LOG(ERR, EAL,
+ " %s DMA remapping failed, error %i (%s)\n",
+ dev_addr, errno, strerror(errno));
+ close(vfio_group_fd);
+ clear_group(vfio_group_fd);
+ return -1;
+ }
}
- vfio_cfg.vfio_container_has_dma = 1;
}
/* get a file descriptor for the device */
*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
if (*vfio_dev_fd < 0) {
- /* if we cannot get a device fd, this simply means that this
- * particular port is not bound to VFIO
- */
- RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
+ /* if we cannot get a device fd, this implies a problem with
+ * the VFIO group or the container not having IOMMU configured.
+ */
+
+ RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n",
dev_addr);
- return 1;
+ close(vfio_group_fd);
+ clear_group(vfio_group_fd);
+ return -1;
}
/* test and setup the device */
ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
if (ret) {
RTE_LOG(ERR, EAL, " %s cannot get device info, "
- "error %i (%s)\n", dev_addr, errno, strerror(errno));
+ "error %i (%s)\n", dev_addr, errno,
+ strerror(errno));
close(*vfio_dev_fd);
+ close(vfio_group_fd);
+ clear_group(vfio_group_fd);
return -1;
}
+ vfio_group_device_get(vfio_group_fd);
+
+ return 0;
+}
+
+int
+vfio_release_device(const char *sysfs_base, const char *dev_addr,
+ int vfio_dev_fd)
+{
+ struct vfio_group_status group_status = {
+ .argsz = sizeof(group_status)
+ };
+ int vfio_group_fd;
+ int iommu_group_no;
+ int ret;
+
+ /* get group number */
+ ret = vfio_get_group_no(sysfs_base, dev_addr, &iommu_group_no);
+ if (ret <= 0) {
+ RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver\n",
+ dev_addr);
+ /* This is an error at this point. */
+ return -1;
+ }
+
+ /* get the actual group fd */
+ vfio_group_fd = vfio_get_group_fd(iommu_group_no);
+ if (vfio_group_fd <= 0) {
+ RTE_LOG(INFO, EAL, "vfio_get_group_fd failed for %s\n",
+ dev_addr);
+ return -1;
+ }
+
+ /* At this point we got an active group. Closing it will make the
+ * container detachment. If this is the last active group, VFIO kernel
+ * code will unset the container and the IOMMU mappings.
+ */
+
+ /* Closing a device */
+ if (close(vfio_dev_fd) < 0) {
+ RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n",
+ dev_addr);
+ return -1;
+ }
+
+ /* An VFIO group can have several devices attached. Just when there is
+ * no devices remaining should the group be closed.
+ */
+ vfio_group_device_put(vfio_group_fd);
+ if (!vfio_group_device_count(vfio_group_fd)) {
+
+ if (close(vfio_group_fd) < 0) {
+ RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n",
+ dev_addr);
+ return -1;
+ }
+
+ if (clear_group(vfio_group_fd) < 0) {
+ RTE_LOG(INFO, EAL, "Error when clearing group for %s\n",
+ dev_addr);
+ return -1;
+ }
+ }
return 0;
}
@@ -302,6 +483,7 @@ vfio_enable(const char *modname)
for (i = 0; i < VFIO_MAX_GROUPS; i++) {
vfio_cfg.vfio_groups[i].fd = -1;
vfio_cfg.vfio_groups[i].group_no = -1;
+ vfio_cfg.vfio_groups[i].devices = 0;
}
/* inform the user that we are probing for VFIO */
@@ -531,7 +713,8 @@ vfio_type1_dma_map(int vfio_container_fd)
if (ret) {
RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
- "error %i (%s)\n", errno, strerror(errno));
+ "error %i (%s)\n", errno,
+ strerror(errno));
return -1;
}
}
@@ -540,6 +723,93 @@ vfio_type1_dma_map(int vfio_container_fd)
}
static int
+vfio_spapr_dma_map(int vfio_container_fd)
+{
+ const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+ int i, ret;
+
+ struct vfio_iommu_spapr_register_memory reg = {
+ .argsz = sizeof(reg),
+ .flags = 0
+ };
+ struct vfio_iommu_spapr_tce_info info = {
+ .argsz = sizeof(info),
+ };
+ struct vfio_iommu_spapr_tce_create create = {
+ .argsz = sizeof(create),
+ };
+ struct vfio_iommu_spapr_tce_remove remove = {
+ .argsz = sizeof(remove),
+ };
+
+ /* query spapr iommu info */
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot get iommu info, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ /* remove default DMA of 32 bit window */
+ remove.start_addr = info.dma32_window_start;
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot remove default DMA window, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ /* calculate window size based on number of hugepages configured */
+ create.window_size = rte_eal_get_physmem_size();
+ create.page_shift = __builtin_ctzll(ms->hugepage_sz);
+ create.levels = 2;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot create new DMA window, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+ for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+ struct vfio_iommu_type1_dma_map dma_map;
+
+ if (ms[i].addr == NULL)
+ break;
+
+ reg.vaddr = (uintptr_t) ms[i].addr;
+ reg.size = ms[i].len;
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = ms[i].addr_64;
+ dma_map.size = ms[i].len;
+ dma_map.iova = ms[i].phys_addr;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ }
+
+ return 0;
+}
+
+static int
vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
{
/* No-IOMMU mode does not need DMA mapping */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 29f7f3ec..5ff63e5d 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -54,6 +54,62 @@
#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
+#ifndef VFIO_SPAPR_TCE_v2_IOMMU
+#define RTE_VFIO_SPAPR 7
+#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17)
+#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19)
+#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20)
+
+struct vfio_iommu_spapr_register_memory {
+ uint32_t argsz;
+ uint32_t flags;
+ uint64_t vaddr;
+ uint64_t size;
+};
+
+struct vfio_iommu_spapr_tce_create {
+ uint32_t argsz;
+ uint32_t flags;
+ /* in */
+ uint32_t page_shift;
+ uint32_t __resv1;
+ uint64_t window_size;
+ uint32_t levels;
+ uint32_t __resv2;
+ /* out */
+ uint64_t start_addr;
+};
+
+struct vfio_iommu_spapr_tce_remove {
+ uint32_t argsz;
+ uint32_t flags;
+ /* in */
+ uint64_t start_addr;
+};
+
+struct vfio_iommu_spapr_tce_ddw_info {
+ uint64_t pgsizes;
+ uint32_t max_dynamic_windows_supported;
+ uint32_t levels;
+};
+
+/* SPAPR_v2 is not present, but SPAPR might be */
+#ifndef VFIO_SPAPR_TCE_IOMMU
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+struct vfio_iommu_spapr_tce_info {
+ uint32_t argsz;
+ uint32_t flags;
+ uint32_t dma32_window_start;
+ uint32_t dma32_window_size;
+ struct vfio_iommu_spapr_tce_ddw_info ddw;
+};
+#endif /* VFIO_SPAPR_TCE_IOMMU */
+
+#else /* VFIO_SPAPR_TCE_v2_IOMMU */
+#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU
+#endif
+
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)
#define RTE_VFIO_NOIOMMU 8
#else
@@ -78,13 +134,13 @@ int vfio_mp_sync_connect_to_primary(void);
struct vfio_group {
int group_no;
int fd;
+ int devices;
};
struct vfio_config {
int vfio_enabled;
int vfio_container_fd;
- int vfio_container_has_dma;
- int vfio_group_idx;
+ int vfio_active_groups;
struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
};
@@ -130,6 +186,10 @@ vfio_get_group_no(const char *sysfs_base,
int
vfio_get_group_fd(int iommu_group_no);
+/* remove group fd from internal VFIO group fd array */
+int
+clear_group(int vfio_group_fd);
+
/**
* Setup vfio_cfg for the device identified by its address. It discovers
* the configured I/O MMU groups or sets a new one for the device. If a new
@@ -140,6 +200,8 @@ vfio_get_group_fd(int iommu_group_no);
int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
int *vfio_dev_fd, struct vfio_device_info *device_info);
+int vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd);
+
int vfio_enable(const char *modname);
int vfio_is_enabled(const char *modname);
@@ -150,6 +212,7 @@ int vfio_mp_sync_setup(void);
#define SOCKET_REQ_CONTAINER 0x100
#define SOCKET_REQ_GROUP 0x200
+#define SOCKET_CLR_GROUP 0x300
#define SOCKET_OK 0x0
#define SOCKET_NO_FD 0x1
#define SOCKET_ERR 0xFF
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index fb4a2f84..7e8095cb 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -267,7 +267,7 @@ vfio_mp_sync_connect_to_primary(void)
static __attribute__((noreturn)) void *
vfio_mp_sync_thread(void __rte_unused * arg)
{
- int ret, fd, vfio_group_no;
+ int ret, fd, vfio_data;
/* wait for requests on the socket */
for (;;) {
@@ -305,13 +305,13 @@ vfio_mp_sync_thread(void __rte_unused * arg)
break;
case SOCKET_REQ_GROUP:
/* wait for group number */
- vfio_group_no = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_group_no < 0) {
+ vfio_data = vfio_mp_sync_receive_request(conn_sock);
+ if (vfio_data < 0) {
close(conn_sock);
continue;
}
- fd = vfio_get_group_fd(vfio_group_no);
+ fd = vfio_get_group_fd(vfio_data);
if (fd < 0)
vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
@@ -324,6 +324,21 @@ vfio_mp_sync_thread(void __rte_unused * arg)
vfio_mp_sync_send_fd(conn_sock, fd);
}
break;
+ case SOCKET_CLR_GROUP:
+ /* wait for group fd */
+ vfio_data = vfio_mp_sync_receive_request(conn_sock);
+ if (vfio_data < 0) {
+ close(conn_sock);
+ continue;
+ }
+
+ ret = clear_group(vfio_data);
+
+ if (ret < 0)
+ vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+ else
+ vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
+ break;
default:
vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
break;
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index d459bf48..6daffebf 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -49,8 +49,9 @@ enum rte_intr_handle_type {
RTE_INTR_HANDLE_VFIO_LEGACY, /**< vfio device handle (legacy) */
RTE_INTR_HANDLE_VFIO_MSI, /**< vfio device handle (MSI) */
RTE_INTR_HANDLE_VFIO_MSIX, /**< vfio device handle (MSIX) */
- RTE_INTR_HANDLE_ALARM, /**< alarm handle */
- RTE_INTR_HANDLE_EXT, /**< external handler */
+ RTE_INTR_HANDLE_ALARM, /**< alarm handle */
+ RTE_INTR_HANDLE_EXT, /**< external handler */
+ RTE_INTR_HANDLE_VDEV, /**< virtual device */
RTE_INTR_HANDLE_MAX
};
@@ -171,6 +172,15 @@ rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
int epfd, int op, unsigned int vec, void *data);
/**
+ * It deletes registered eventfds.
+ *
+ * @param intr_handle
+ * Pointer to the interrupt handle.
+ */
+void
+rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle);
+
+/**
* It enables the packet I/O interrupt event if it's necessary.
* It creates event fd for each interrupt vector when MSIX is used,
* otherwise it multiplexes a single event fd.
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
index 09713b0c..2ac879fd 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
@@ -116,11 +116,10 @@ struct rte_kni_fifo {
struct rte_kni_mbuf {
void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE)));
uint64_t buf_physaddr;
- char pad0[2];
uint16_t data_off; /**< Start address of data in segment buffer. */
char pad1[2];
- uint8_t nb_segs; /**< Number of segments. */
- char pad4[1];
+ uint16_t nb_segs; /**< Number of segments. */
+ char pad4[2];
uint64_t ol_flags; /**< Offload features. */
char pad2[4];
uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. */
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
index 83721ba5..670bab3a 100644
--- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -6,8 +6,6 @@ DPDK_2.0 {
eal_parse_sysfs_value;
eal_timer_source;
lcore_config;
- pci_device_list;
- pci_driver_list;
per_lcore__lcore_id;
per_lcore__rte_errno;
rte_calloc;
@@ -22,12 +20,9 @@ DPDK_2.0 {
rte_dump_tailq;
rte_eal_alarm_cancel;
rte_eal_alarm_set;
- rte_eal_dev_init;
rte_eal_devargs_add;
rte_eal_devargs_dump;
rte_eal_devargs_type_count;
- rte_eal_driver_register;
- rte_eal_driver_unregister;
rte_eal_get_configuration;
rte_eal_get_lcore_state;
rte_eal_get_physmem_layout;
@@ -40,18 +35,10 @@ DPDK_2.0 {
rte_eal_mp_remote_launch;
rte_eal_mp_wait_lcore;
rte_eal_parse_devargs_str;
- rte_eal_pci_dump;
- rte_eal_pci_probe;
- rte_eal_pci_probe_one;
- rte_eal_pci_register;
- rte_eal_pci_scan;
- rte_eal_pci_unregister;
rte_eal_process_type;
rte_eal_remote_launch;
rte_eal_tailq_lookup;
rte_eal_tailq_register;
- rte_eal_vdev_init;
- rte_eal_vdev_uninit;
rte_eal_wait_lcore;
rte_exit;
rte_free;
@@ -66,11 +53,8 @@ DPDK_2.0 {
rte_intr_disable;
rte_intr_enable;
rte_log;
- rte_log_add_in_history;
rte_log_cur_msg_loglevel;
rte_log_cur_msg_logtype;
- rte_log_dump_history;
- rte_log_set_history;
rte_logs;
rte_malloc;
rte_malloc_dump_stats;
@@ -114,9 +98,6 @@ DPDK_2.0 {
DPDK_2.1 {
global:
- rte_eal_pci_detach;
- rte_eal_pci_read_config;
- rte_eal_pci_write_config;
rte_epoll_ctl;
rte_epoll_wait;
rte_intr_allow_others;
@@ -146,12 +127,6 @@ DPDK_16.04 {
global:
rte_cpu_get_flag_name;
- rte_eal_pci_ioport_map;
- rte_eal_pci_ioport_read;
- rte_eal_pci_ioport_unmap;
- rte_eal_pci_ioport_write;
- rte_eal_pci_map_device;
- rte_eal_pci_unmap_device;
rte_eal_primary_proc_alive;
} DPDK_2.2;
@@ -174,7 +149,52 @@ DPDK_16.11 {
rte_delay_us_callback_register;
rte_eal_dev_attach;
rte_eal_dev_detach;
- rte_eal_vdrv_register;
- rte_eal_vdrv_unregister;
} DPDK_16.07;
+
+DPDK_17.02 {
+ global:
+
+ rte_bus_dump;
+ rte_bus_probe;
+ rte_bus_register;
+ rte_bus_scan;
+ rte_bus_unregister;
+
+} DPDK_16.11;
+
+DPDK_17.05 {
+ global:
+
+ rte_cpu_is_supported;
+ rte_intr_free_epoll_fd;
+ rte_log_dump;
+ rte_log_get_global_level;
+ rte_log_register;
+ rte_log_set_global_level;
+ rte_log_set_level;
+ rte_log_set_level_regexp;
+ rte_pci_detach;
+ rte_pci_dump;
+ rte_pci_ioport_map;
+ rte_pci_ioport_read;
+ rte_pci_ioport_unmap;
+ rte_pci_ioport_write;
+ rte_pci_map_device;
+ rte_pci_probe;
+ rte_pci_probe_one;
+ rte_pci_read_config;
+ rte_pci_register;
+ rte_pci_scan;
+ rte_pci_unmap_device;
+ rte_pci_unregister;
+ rte_pci_write_config;
+ rte_vdev_init;
+ rte_vdev_register;
+ rte_vdev_uninit;
+ rte_vdev_unregister;
+ vfio_get_container_fd;
+ vfio_get_group_fd;
+ vfio_get_group_no;
+
+} DPDK_17.02;