aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/bus/pci/linux
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/bus/pci/linux')
-rw-r--r--drivers/bus/pci/linux/Makefile36
-rw-r--r--drivers/bus/pci/linux/pci.c856
-rw-r--r--drivers/bus/pci/linux/pci_init.h111
-rw-r--r--drivers/bus/pci/linux/pci_uio.c568
-rw-r--r--drivers/bus/pci/linux/pci_vfio.c763
5 files changed, 2334 insertions, 0 deletions
diff --git a/drivers/bus/pci/linux/Makefile b/drivers/bus/pci/linux/Makefile
new file mode 100644
index 00000000..77c5f970
--- /dev/null
+++ b/drivers/bus/pci/linux/Makefile
@@ -0,0 +1,36 @@
+# BSD LICENSE
+#
+# Copyright(c) 2017 6WIND S.A.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of 6WIND nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+SRCS += pci.c
+SRCS += pci_uio.c
+SRCS += pci_vfio.c
+
+CFLAGS += -D_GNU_SOURCE
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
new file mode 100644
index 00000000..5da6728f
--- /dev/null
+++ b/drivers/bus/pci/linux/pci.c
@@ -0,0 +1,856 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <dirent.h>
+
+#include <rte_log.h>
+#include <rte_bus.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_eal_memconfig.h>
+#include <rte_malloc.h>
+#include <rte_devargs.h>
+#include <rte_memcpy.h>
+#include <rte_vfio.h>
+
+#include "eal_private.h"
+#include "eal_filesystem.h"
+
+#include "private.h"
+#include "pci_init.h"
+
+/**
+ * @file
+ * PCI probing under linux
+ *
+ * This code is used to simulate a PCI probe by parsing information in sysfs.
+ * When a registered device matches a driver, it is then initialized with
+ * IGB_UIO driver (or doesn't initialize, if the device wasn't bound to it).
+ */
+
+extern struct rte_pci_bus rte_pci_bus;
+
+static int
+pci_get_kernel_driver_by_path(const char *filename, char *dri_name)
+{
+ int count;
+ char path[PATH_MAX];
+ char *name;
+
+ if (!filename || !dri_name)
+ return -1;
+
+ count = readlink(filename, path, PATH_MAX);
+ if (count >= PATH_MAX)
+ return -1;
+
+ /* For device does not have a driver */
+ if (count < 0)
+ return 1;
+
+ path[count] = '\0';
+
+ name = strrchr(path, '/');
+ if (name) {
+ strncpy(dri_name, name + 1, strlen(name + 1) + 1);
+ return 0;
+ }
+
+ return -1;
+}
+
+/* Map pci device */
+int
+rte_pci_map_device(struct rte_pci_device *dev)
+{
+ int ret = -1;
+
+ /* try mapping the NIC resources using VFIO if it exists */
+ switch (dev->kdrv) {
+ case RTE_KDRV_VFIO:
+#ifdef VFIO_PRESENT
+ if (pci_vfio_is_enabled())
+ ret = pci_vfio_map_resource(dev);
+#endif
+ break;
+ case RTE_KDRV_IGB_UIO:
+ case RTE_KDRV_UIO_GENERIC:
+ if (rte_eal_using_phys_addrs()) {
+ /* map resources for devices that use uio */
+ ret = pci_uio_map_resource(dev);
+ }
+ break;
+ default:
+ RTE_LOG(DEBUG, EAL,
+ " Not managed by a supported kernel driver, skipped\n");
+ ret = 1;
+ break;
+ }
+
+ return ret;
+}
+
+/* Unmap pci device */
+void
+rte_pci_unmap_device(struct rte_pci_device *dev)
+{
+ /* try unmapping the NIC resources using VFIO if it exists */
+ switch (dev->kdrv) {
+ case RTE_KDRV_VFIO:
+#ifdef VFIO_PRESENT
+ if (pci_vfio_is_enabled())
+ pci_vfio_unmap_resource(dev);
+#endif
+ break;
+ case RTE_KDRV_IGB_UIO:
+ case RTE_KDRV_UIO_GENERIC:
+ /* unmap resources for devices that use uio */
+ pci_uio_unmap_resource(dev);
+ break;
+ default:
+ RTE_LOG(DEBUG, EAL,
+ " Not managed by a supported kernel driver, skipped\n");
+ break;
+ }
+}
+
+void *
+pci_find_max_end_va(void)
+{
+ const struct rte_memseg *seg = rte_eal_get_physmem_layout();
+ const struct rte_memseg *last = seg;
+ unsigned i = 0;
+
+ for (i = 0; i < RTE_MAX_MEMSEG; i++, seg++) {
+ if (seg->addr == NULL)
+ break;
+
+ if (seg->addr > last->addr)
+ last = seg;
+
+ }
+ return RTE_PTR_ADD(last->addr, last->len);
+}
+
+/* parse one line of the "resource" sysfs file (note that the 'line'
+ * string is modified)
+ */
+int
+pci_parse_one_sysfs_resource(char *line, size_t len, uint64_t *phys_addr,
+ uint64_t *end_addr, uint64_t *flags)
+{
+ union pci_resource_info {
+ struct {
+ char *phys_addr;
+ char *end_addr;
+ char *flags;
+ };
+ char *ptrs[PCI_RESOURCE_FMT_NVAL];
+ } res_info;
+
+ if (rte_strsplit(line, len, res_info.ptrs, 3, ' ') != 3) {
+ RTE_LOG(ERR, EAL,
+ "%s(): bad resource format\n", __func__);
+ return -1;
+ }
+ errno = 0;
+ *phys_addr = strtoull(res_info.phys_addr, NULL, 16);
+ *end_addr = strtoull(res_info.end_addr, NULL, 16);
+ *flags = strtoull(res_info.flags, NULL, 16);
+ if (errno != 0) {
+ RTE_LOG(ERR, EAL,
+ "%s(): bad resource format\n", __func__);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* parse the "resource" sysfs file */
+static int
+pci_parse_sysfs_resource(const char *filename, struct rte_pci_device *dev)
+{
+ FILE *f;
+ char buf[BUFSIZ];
+ int i;
+ uint64_t phys_addr, end_addr, flags;
+
+ f = fopen(filename, "r");
+ if (f == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot open sysfs resource\n");
+ return -1;
+ }
+
+ for (i = 0; i<PCI_MAX_RESOURCE; i++) {
+
+ if (fgets(buf, sizeof(buf), f) == NULL) {
+ RTE_LOG(ERR, EAL,
+ "%s(): cannot read resource\n", __func__);
+ goto error;
+ }
+ if (pci_parse_one_sysfs_resource(buf, sizeof(buf), &phys_addr,
+ &end_addr, &flags) < 0)
+ goto error;
+
+ if (flags & IORESOURCE_MEM) {
+ dev->mem_resource[i].phys_addr = phys_addr;
+ dev->mem_resource[i].len = end_addr - phys_addr + 1;
+ /* not mapped for now */
+ dev->mem_resource[i].addr = NULL;
+ }
+ }
+ fclose(f);
+ return 0;
+
+error:
+ fclose(f);
+ return -1;
+}
+
+/* Scan one pci sysfs entry, and fill the devices list from it. */
+static int
+pci_scan_one(const char *dirname, const struct rte_pci_addr *addr)
+{
+ char filename[PATH_MAX];
+ unsigned long tmp;
+ struct rte_pci_device *dev;
+ char driver[PATH_MAX];
+ int ret;
+
+ dev = malloc(sizeof(*dev));
+ if (dev == NULL)
+ return -1;
+
+ memset(dev, 0, sizeof(*dev));
+ dev->addr = *addr;
+
+ /* get vendor id */
+ snprintf(filename, sizeof(filename), "%s/vendor", dirname);
+ if (eal_parse_sysfs_value(filename, &tmp) < 0) {
+ free(dev);
+ return -1;
+ }
+ dev->id.vendor_id = (uint16_t)tmp;
+
+ /* get device id */
+ snprintf(filename, sizeof(filename), "%s/device", dirname);
+ if (eal_parse_sysfs_value(filename, &tmp) < 0) {
+ free(dev);
+ return -1;
+ }
+ dev->id.device_id = (uint16_t)tmp;
+
+ /* get subsystem_vendor id */
+ snprintf(filename, sizeof(filename), "%s/subsystem_vendor",
+ dirname);
+ if (eal_parse_sysfs_value(filename, &tmp) < 0) {
+ free(dev);
+ return -1;
+ }
+ dev->id.subsystem_vendor_id = (uint16_t)tmp;
+
+ /* get subsystem_device id */
+ snprintf(filename, sizeof(filename), "%s/subsystem_device",
+ dirname);
+ if (eal_parse_sysfs_value(filename, &tmp) < 0) {
+ free(dev);
+ return -1;
+ }
+ dev->id.subsystem_device_id = (uint16_t)tmp;
+
+ /* get class_id */
+ snprintf(filename, sizeof(filename), "%s/class",
+ dirname);
+ if (eal_parse_sysfs_value(filename, &tmp) < 0) {
+ free(dev);
+ return -1;
+ }
+ /* the least 24 bits are valid: class, subclass, program interface */
+ dev->id.class_id = (uint32_t)tmp & RTE_CLASS_ANY_ID;
+
+ /* get max_vfs */
+ dev->max_vfs = 0;
+ snprintf(filename, sizeof(filename), "%s/max_vfs", dirname);
+ if (!access(filename, F_OK) &&
+ eal_parse_sysfs_value(filename, &tmp) == 0)
+ dev->max_vfs = (uint16_t)tmp;
+ else {
+ /* for non igb_uio driver, need kernel version >= 3.8 */
+ snprintf(filename, sizeof(filename),
+ "%s/sriov_numvfs", dirname);
+ if (!access(filename, F_OK) &&
+ eal_parse_sysfs_value(filename, &tmp) == 0)
+ dev->max_vfs = (uint16_t)tmp;
+ }
+
+ /* get numa node, default to 0 if not present */
+ snprintf(filename, sizeof(filename), "%s/numa_node",
+ dirname);
+
+ if (access(filename, F_OK) != -1) {
+ if (eal_parse_sysfs_value(filename, &tmp) == 0)
+ dev->device.numa_node = tmp;
+ else
+ dev->device.numa_node = -1;
+ } else {
+ dev->device.numa_node = 0;
+ }
+
+ pci_name_set(dev);
+
+ /* parse resources */
+ snprintf(filename, sizeof(filename), "%s/resource", dirname);
+ if (pci_parse_sysfs_resource(filename, dev) < 0) {
+ RTE_LOG(ERR, EAL, "%s(): cannot parse resource\n", __func__);
+ free(dev);
+ return -1;
+ }
+
+ /* parse driver */
+ snprintf(filename, sizeof(filename), "%s/driver", dirname);
+ ret = pci_get_kernel_driver_by_path(filename, driver);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Fail to get kernel driver\n");
+ free(dev);
+ return -1;
+ }
+
+ if (!ret) {
+ if (!strcmp(driver, "vfio-pci"))
+ dev->kdrv = RTE_KDRV_VFIO;
+ else if (!strcmp(driver, "igb_uio"))
+ dev->kdrv = RTE_KDRV_IGB_UIO;
+ else if (!strcmp(driver, "uio_pci_generic"))
+ dev->kdrv = RTE_KDRV_UIO_GENERIC;
+ else
+ dev->kdrv = RTE_KDRV_UNKNOWN;
+ } else
+ dev->kdrv = RTE_KDRV_NONE;
+
+ /* device is valid, add in list (sorted) */
+ if (TAILQ_EMPTY(&rte_pci_bus.device_list)) {
+ rte_pci_add_device(dev);
+ } else {
+ struct rte_pci_device *dev2;
+ int ret;
+
+ TAILQ_FOREACH(dev2, &rte_pci_bus.device_list, next) {
+ ret = rte_pci_addr_cmp(&dev->addr, &dev2->addr);
+ if (ret > 0)
+ continue;
+
+ if (ret < 0) {
+ rte_pci_insert_device(dev2, dev);
+ } else { /* already registered */
+ dev2->kdrv = dev->kdrv;
+ dev2->max_vfs = dev->max_vfs;
+ pci_name_set(dev2);
+ memmove(dev2->mem_resource, dev->mem_resource,
+ sizeof(dev->mem_resource));
+ free(dev);
+ }
+ return 0;
+ }
+
+ rte_pci_add_device(dev);
+ }
+
+ return 0;
+}
+
+int
+pci_update_device(const struct rte_pci_addr *addr)
+{
+ char filename[PATH_MAX];
+
+ snprintf(filename, sizeof(filename), "%s/" PCI_PRI_FMT,
+ rte_pci_get_sysfs_path(), addr->domain, addr->bus, addr->devid,
+ addr->function);
+
+ return pci_scan_one(filename, addr);
+}
+
+/*
+ * split up a pci address into its constituent parts.
+ */
+static int
+parse_pci_addr_format(const char *buf, int bufsize, struct rte_pci_addr *addr)
+{
+ /* first split on ':' */
+ union splitaddr {
+ struct {
+ char *domain;
+ char *bus;
+ char *devid;
+ char *function;
+ };
+ char *str[PCI_FMT_NVAL]; /* last element-separator is "." not ":" */
+ } splitaddr;
+
+ char *buf_copy = strndup(buf, bufsize);
+ if (buf_copy == NULL)
+ return -1;
+
+ if (rte_strsplit(buf_copy, bufsize, splitaddr.str, PCI_FMT_NVAL, ':')
+ != PCI_FMT_NVAL - 1)
+ goto error;
+ /* final split is on '.' between devid and function */
+ splitaddr.function = strchr(splitaddr.devid,'.');
+ if (splitaddr.function == NULL)
+ goto error;
+ *splitaddr.function++ = '\0';
+
+ /* now convert to int values */
+ errno = 0;
+ addr->domain = strtoul(splitaddr.domain, NULL, 16);
+ addr->bus = strtoul(splitaddr.bus, NULL, 16);
+ addr->devid = strtoul(splitaddr.devid, NULL, 16);
+ addr->function = strtoul(splitaddr.function, NULL, 10);
+ if (errno != 0)
+ goto error;
+
+ free(buf_copy); /* free the copy made with strdup */
+ return 0;
+error:
+ free(buf_copy);
+ return -1;
+}
+
+/*
+ * Scan the content of the PCI bus, and the devices in the devices
+ * list
+ */
+int
+rte_pci_scan(void)
+{
+ struct dirent *e;
+ DIR *dir;
+ char dirname[PATH_MAX];
+ struct rte_pci_addr addr;
+
+ /* for debug purposes, PCI can be disabled */
+ if (!rte_eal_has_pci())
+ return 0;
+
+#ifdef VFIO_PRESENT
+ if (!pci_vfio_is_enabled())
+ RTE_LOG(DEBUG, EAL, "VFIO PCI modules not loaded\n");
+#endif
+
+ dir = opendir(rte_pci_get_sysfs_path());
+ if (dir == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): opendir failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+
+ while ((e = readdir(dir)) != NULL) {
+ if (e->d_name[0] == '.')
+ continue;
+
+ if (parse_pci_addr_format(e->d_name, sizeof(e->d_name), &addr) != 0)
+ continue;
+
+ snprintf(dirname, sizeof(dirname), "%s/%s",
+ rte_pci_get_sysfs_path(), e->d_name);
+
+ if (pci_scan_one(dirname, &addr) < 0)
+ goto error;
+ }
+ closedir(dir);
+ return 0;
+
+error:
+ closedir(dir);
+ return -1;
+}
+
+/*
+ * Is pci device bound to any kdrv
+ */
+static inline int
+pci_one_device_is_bound(void)
+{
+ struct rte_pci_device *dev = NULL;
+ int ret = 0;
+
+ FOREACH_DEVICE_ON_PCIBUS(dev) {
+ if (dev->kdrv == RTE_KDRV_UNKNOWN ||
+ dev->kdrv == RTE_KDRV_NONE) {
+ continue;
+ } else {
+ ret = 1;
+ break;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Any one of the device bound to uio
+ */
+static inline int
+pci_one_device_bound_uio(void)
+{
+ struct rte_pci_device *dev = NULL;
+ struct rte_devargs *devargs;
+ int need_check;
+
+ FOREACH_DEVICE_ON_PCIBUS(dev) {
+ devargs = dev->device.devargs;
+
+ need_check = 0;
+ switch (rte_pci_bus.bus.conf.scan_mode) {
+ case RTE_BUS_SCAN_WHITELIST:
+ if (devargs && devargs->policy == RTE_DEV_WHITELISTED)
+ need_check = 1;
+ break;
+ case RTE_BUS_SCAN_UNDEFINED:
+ case RTE_BUS_SCAN_BLACKLIST:
+ if (devargs == NULL ||
+ devargs->policy != RTE_DEV_BLACKLISTED)
+ need_check = 1;
+ break;
+ }
+
+ if (!need_check)
+ continue;
+
+ if (dev->kdrv == RTE_KDRV_IGB_UIO ||
+ dev->kdrv == RTE_KDRV_UIO_GENERIC) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Any one of the device has iova as va
+ */
+static inline int
+pci_one_device_has_iova_va(void)
+{
+ struct rte_pci_device *dev = NULL;
+ struct rte_pci_driver *drv = NULL;
+
+ FOREACH_DRIVER_ON_PCIBUS(drv) {
+ if (drv && drv->drv_flags & RTE_PCI_DRV_IOVA_AS_VA) {
+ FOREACH_DEVICE_ON_PCIBUS(dev) {
+ if (dev->kdrv == RTE_KDRV_VFIO &&
+ rte_pci_match(drv, dev))
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+/*
+ * Get iommu class of PCI devices on the bus.
+ */
+enum rte_iova_mode
+rte_pci_get_iommu_class(void)
+{
+ bool is_bound;
+ bool is_vfio_noiommu_enabled = true;
+ bool has_iova_va;
+ bool is_bound_uio;
+ bool spapr_iommu =
+#if defined(RTE_ARCH_PPC_64)
+ true;
+#else
+ false;
+#endif
+
+ is_bound = pci_one_device_is_bound();
+ if (!is_bound)
+ return RTE_IOVA_DC;
+
+ has_iova_va = pci_one_device_has_iova_va();
+ is_bound_uio = pci_one_device_bound_uio();
+#ifdef VFIO_PRESENT
+ is_vfio_noiommu_enabled = rte_vfio_noiommu_is_enabled() == true ?
+ true : false;
+#endif
+
+ if (has_iova_va && !is_bound_uio && !is_vfio_noiommu_enabled &&
+ !spapr_iommu)
+ return RTE_IOVA_VA;
+
+ if (has_iova_va) {
+ RTE_LOG(WARNING, EAL, "Some devices want iova as va but pa will be used because.. ");
+ if (is_vfio_noiommu_enabled)
+ RTE_LOG(WARNING, EAL, "vfio-noiommu mode configured\n");
+ if (is_bound_uio)
+ RTE_LOG(WARNING, EAL, "few device bound to UIO\n");
+ if (spapr_iommu)
+ RTE_LOG(WARNING, EAL, "sPAPR IOMMU does not support IOVA as VA\n");
+ }
+
+ return RTE_IOVA_PA;
+}
+
+/* Read PCI config space. */
+int rte_pci_read_config(const struct rte_pci_device *device,
+ void *buf, size_t len, off_t offset)
+{
+ const struct rte_intr_handle *intr_handle = &device->intr_handle;
+
+ switch (intr_handle->type) {
+ case RTE_INTR_HANDLE_UIO:
+ case RTE_INTR_HANDLE_UIO_INTX:
+ return pci_uio_read_config(intr_handle, buf, len, offset);
+
+#ifdef VFIO_PRESENT
+ case RTE_INTR_HANDLE_VFIO_MSIX:
+ case RTE_INTR_HANDLE_VFIO_MSI:
+ case RTE_INTR_HANDLE_VFIO_LEGACY:
+ return pci_vfio_read_config(intr_handle, buf, len, offset);
+#endif
+ default:
+ RTE_LOG(ERR, EAL,
+ "Unknown handle type of fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+}
+
+/* Write PCI config space. */
+int rte_pci_write_config(const struct rte_pci_device *device,
+ const void *buf, size_t len, off_t offset)
+{
+ const struct rte_intr_handle *intr_handle = &device->intr_handle;
+
+ switch (intr_handle->type) {
+ case RTE_INTR_HANDLE_UIO:
+ case RTE_INTR_HANDLE_UIO_INTX:
+ return pci_uio_write_config(intr_handle, buf, len, offset);
+
+#ifdef VFIO_PRESENT
+ case RTE_INTR_HANDLE_VFIO_MSIX:
+ case RTE_INTR_HANDLE_VFIO_MSI:
+ case RTE_INTR_HANDLE_VFIO_LEGACY:
+ return pci_vfio_write_config(intr_handle, buf, len, offset);
+#endif
+ default:
+ RTE_LOG(ERR, EAL,
+ "Unknown handle type of fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+}
+
+#if defined(RTE_ARCH_X86)
+static int
+pci_ioport_map(struct rte_pci_device *dev, int bar __rte_unused,
+ struct rte_pci_ioport *p)
+{
+ uint16_t start, end;
+ FILE *fp;
+ char *line = NULL;
+ char pci_id[16];
+ int found = 0;
+ size_t linesz;
+
+ snprintf(pci_id, sizeof(pci_id), PCI_PRI_FMT,
+ dev->addr.domain, dev->addr.bus,
+ dev->addr.devid, dev->addr.function);
+
+ fp = fopen("/proc/ioports", "r");
+ if (fp == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): can't open ioports\n", __func__);
+ return -1;
+ }
+
+ while (getdelim(&line, &linesz, '\n', fp) > 0) {
+ char *ptr = line;
+ char *left;
+ int n;
+
+ n = strcspn(ptr, ":");
+ ptr[n] = 0;
+ left = &ptr[n + 1];
+
+ while (*left && isspace(*left))
+ left++;
+
+ if (!strncmp(left, pci_id, strlen(pci_id))) {
+ found = 1;
+
+ while (*ptr && isspace(*ptr))
+ ptr++;
+
+ sscanf(ptr, "%04hx-%04hx", &start, &end);
+
+ break;
+ }
+ }
+
+ free(line);
+ fclose(fp);
+
+ if (!found)
+ return -1;
+
+ dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+ p->base = start;
+ RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%x\n", start);
+
+ return 0;
+}
+#endif
+
+int
+rte_pci_ioport_map(struct rte_pci_device *dev, int bar,
+ struct rte_pci_ioport *p)
+{
+ int ret = -1;
+
+ switch (dev->kdrv) {
+#ifdef VFIO_PRESENT
+ case RTE_KDRV_VFIO:
+ if (pci_vfio_is_enabled())
+ ret = pci_vfio_ioport_map(dev, bar, p);
+ break;
+#endif
+ case RTE_KDRV_IGB_UIO:
+ ret = pci_uio_ioport_map(dev, bar, p);
+ break;
+ case RTE_KDRV_UIO_GENERIC:
+#if defined(RTE_ARCH_X86)
+ ret = pci_ioport_map(dev, bar, p);
+#else
+ ret = pci_uio_ioport_map(dev, bar, p);
+#endif
+ break;
+ case RTE_KDRV_NONE:
+#if defined(RTE_ARCH_X86)
+ ret = pci_ioport_map(dev, bar, p);
+#endif
+ break;
+ default:
+ break;
+ }
+
+ if (!ret)
+ p->dev = dev;
+
+ return ret;
+}
+
+void
+rte_pci_ioport_read(struct rte_pci_ioport *p,
+ void *data, size_t len, off_t offset)
+{
+ switch (p->dev->kdrv) {
+#ifdef VFIO_PRESENT
+ case RTE_KDRV_VFIO:
+ pci_vfio_ioport_read(p, data, len, offset);
+ break;
+#endif
+ case RTE_KDRV_IGB_UIO:
+ pci_uio_ioport_read(p, data, len, offset);
+ break;
+ case RTE_KDRV_UIO_GENERIC:
+ pci_uio_ioport_read(p, data, len, offset);
+ break;
+ case RTE_KDRV_NONE:
+#if defined(RTE_ARCH_X86)
+ pci_uio_ioport_read(p, data, len, offset);
+#endif
+ break;
+ default:
+ break;
+ }
+}
+
+void
+rte_pci_ioport_write(struct rte_pci_ioport *p,
+ const void *data, size_t len, off_t offset)
+{
+ switch (p->dev->kdrv) {
+#ifdef VFIO_PRESENT
+ case RTE_KDRV_VFIO:
+ pci_vfio_ioport_write(p, data, len, offset);
+ break;
+#endif
+ case RTE_KDRV_IGB_UIO:
+ pci_uio_ioport_write(p, data, len, offset);
+ break;
+ case RTE_KDRV_UIO_GENERIC:
+ pci_uio_ioport_write(p, data, len, offset);
+ break;
+ case RTE_KDRV_NONE:
+#if defined(RTE_ARCH_X86)
+ pci_uio_ioport_write(p, data, len, offset);
+#endif
+ break;
+ default:
+ break;
+ }
+}
+
+int
+rte_pci_ioport_unmap(struct rte_pci_ioport *p)
+{
+ int ret = -1;
+
+ switch (p->dev->kdrv) {
+#ifdef VFIO_PRESENT
+ case RTE_KDRV_VFIO:
+ if (pci_vfio_is_enabled())
+ ret = pci_vfio_ioport_unmap(p);
+ break;
+#endif
+ case RTE_KDRV_IGB_UIO:
+ ret = pci_uio_ioport_unmap(p);
+ break;
+ case RTE_KDRV_UIO_GENERIC:
+#if defined(RTE_ARCH_X86)
+ ret = 0;
+#else
+ ret = pci_uio_ioport_unmap(p);
+#endif
+ break;
+ case RTE_KDRV_NONE:
+#if defined(RTE_ARCH_X86)
+ ret = 0;
+#endif
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
diff --git a/drivers/bus/pci/linux/pci_init.h b/drivers/bus/pci/linux/pci_init.h
new file mode 100644
index 00000000..f342c47d
--- /dev/null
+++ b/drivers/bus/pci/linux/pci_init.h
@@ -0,0 +1,111 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef EAL_PCI_INIT_H_
+#define EAL_PCI_INIT_H_
+
+#include <linux/version.h>
+
+#include <rte_vfio.h>
+
+/** IO resource type: */
+#define IORESOURCE_IO 0x00000100
+#define IORESOURCE_MEM 0x00000200
+
+/*
+ * Helper function to map PCI resources right after hugepages in virtual memory
+ */
+extern void *pci_map_addr;
+void *pci_find_max_end_va(void);
+
+/* parse one line of the "resource" sysfs file (note that the 'line'
+ * string is modified)
+ */
+int pci_parse_one_sysfs_resource(char *line, size_t len, uint64_t *phys_addr,
+ uint64_t *end_addr, uint64_t *flags);
+
+int pci_uio_alloc_resource(struct rte_pci_device *dev,
+ struct mapped_pci_resource **uio_res);
+void pci_uio_free_resource(struct rte_pci_device *dev,
+ struct mapped_pci_resource *uio_res);
+int pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx,
+ struct mapped_pci_resource *uio_res, int map_idx);
+
+int pci_uio_read_config(const struct rte_intr_handle *intr_handle,
+ void *buf, size_t len, off_t offs);
+int pci_uio_write_config(const struct rte_intr_handle *intr_handle,
+ const void *buf, size_t len, off_t offs);
+
+int pci_uio_ioport_map(struct rte_pci_device *dev, int bar,
+ struct rte_pci_ioport *p);
+void pci_uio_ioport_read(struct rte_pci_ioport *p,
+ void *data, size_t len, off_t offset);
+void pci_uio_ioport_write(struct rte_pci_ioport *p,
+ const void *data, size_t len, off_t offset);
+int pci_uio_ioport_unmap(struct rte_pci_ioport *p);
+
+#ifdef VFIO_PRESENT
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0)
+#define RTE_PCI_MSIX_TABLE_BIR 0x7
+#define RTE_PCI_MSIX_TABLE_OFFSET 0xfffffff8
+#define RTE_PCI_MSIX_FLAGS_QSIZE 0x07ff
+#else
+#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR
+#define RTE_PCI_MSIX_TABLE_OFFSET PCI_MSIX_TABLE_OFFSET
+#define RTE_PCI_MSIX_FLAGS_QSIZE PCI_MSIX_FLAGS_QSIZE
+#endif
+
+/* access config space */
+int pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
+ void *buf, size_t len, off_t offs);
+int pci_vfio_write_config(const struct rte_intr_handle *intr_handle,
+ const void *buf, size_t len, off_t offs);
+
+int pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
+ struct rte_pci_ioport *p);
+void pci_vfio_ioport_read(struct rte_pci_ioport *p,
+ void *data, size_t len, off_t offset);
+void pci_vfio_ioport_write(struct rte_pci_ioport *p,
+ const void *data, size_t len, off_t offset);
+int pci_vfio_ioport_unmap(struct rte_pci_ioport *p);
+
+/* map/unmap VFIO resource prototype */
+int pci_vfio_map_resource(struct rte_pci_device *dev);
+int pci_vfio_unmap_resource(struct rte_pci_device *dev);
+
+int pci_vfio_is_enabled(void);
+
+#endif
+
+#endif /* EAL_PCI_INIT_H_ */
diff --git a/drivers/bus/pci/linux/pci_uio.c b/drivers/bus/pci/linux/pci_uio.c
new file mode 100644
index 00000000..92b7f027
--- /dev/null
+++ b/drivers/bus/pci/linux/pci_uio.c
@@ -0,0 +1,568 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <inttypes.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/sysmacros.h>
+#include <linux/pci_regs.h>
+
+#if defined(RTE_ARCH_X86)
+#include <sys/io.h>
+#endif
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_eal_memconfig.h>
+#include <rte_common.h>
+#include <rte_malloc.h>
+
+#include "eal_filesystem.h"
+#include "pci_init.h"
+
+void *pci_map_addr = NULL;
+
+#define OFF_MAX ((uint64_t)(off_t)-1)
+
+int
+pci_uio_read_config(const struct rte_intr_handle *intr_handle,
+ void *buf, size_t len, off_t offset)
+{
+ return pread(intr_handle->uio_cfg_fd, buf, len, offset);
+}
+
+int
+pci_uio_write_config(const struct rte_intr_handle *intr_handle,
+ const void *buf, size_t len, off_t offset)
+{
+ return pwrite(intr_handle->uio_cfg_fd, buf, len, offset);
+}
+
+static int
+pci_uio_set_bus_master(int dev_fd)
+{
+ uint16_t reg;
+ int ret;
+
+ ret = pread(dev_fd, &reg, sizeof(reg), PCI_COMMAND);
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL,
+ "Cannot read command from PCI config space!\n");
+ return -1;
+ }
+
+ /* return if bus mastering is already on */
+ if (reg & PCI_COMMAND_MASTER)
+ return 0;
+
+ reg |= PCI_COMMAND_MASTER;
+
+ ret = pwrite(dev_fd, &reg, sizeof(reg), PCI_COMMAND);
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL,
+ "Cannot write command to PCI config space!\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+pci_mknod_uio_dev(const char *sysfs_uio_path, unsigned uio_num)
+{
+ FILE *f;
+ char filename[PATH_MAX];
+ int ret;
+ unsigned major, minor;
+ dev_t dev;
+
+ /* get the name of the sysfs file that contains the major and minor
+ * of the uio device and read its content */
+ snprintf(filename, sizeof(filename), "%s/dev", sysfs_uio_path);
+
+ f = fopen(filename, "r");
+ if (f == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): cannot open sysfs to get major:minor\n",
+ __func__);
+ return -1;
+ }
+
+ ret = fscanf(f, "%u:%u", &major, &minor);
+ if (ret != 2) {
+ RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs to get major:minor\n",
+ __func__);
+ fclose(f);
+ return -1;
+ }
+ fclose(f);
+
+ /* create the char device "mknod /dev/uioX c major minor" */
+ snprintf(filename, sizeof(filename), "/dev/uio%u", uio_num);
+ dev = makedev(major, minor);
+ ret = mknod(filename, S_IFCHR | S_IRUSR | S_IWUSR, dev);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "%s(): mknod() failed %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+
+ return ret;
+}
+
+/*
+ * Return the uioX char device used for a pci device. On success, return
+ * the UIO number and fill dstbuf string with the path of the device in
+ * sysfs. On error, return a negative value. In this case dstbuf is
+ * invalid.
+ */
+static int
+pci_get_uio_dev(struct rte_pci_device *dev, char *dstbuf,
+ unsigned int buflen, int create)
+{
+ struct rte_pci_addr *loc = &dev->addr;
+ int uio_num = -1;
+ struct dirent *e;
+ DIR *dir;
+ char dirname[PATH_MAX];
+
+ /* depending on kernel version, uio can be located in uio/uioX
+ * or uio:uioX */
+
+ snprintf(dirname, sizeof(dirname),
+ "%s/" PCI_PRI_FMT "/uio", rte_pci_get_sysfs_path(),
+ loc->domain, loc->bus, loc->devid, loc->function);
+
+ dir = opendir(dirname);
+ if (dir == NULL) {
+ /* retry with the parent directory */
+ snprintf(dirname, sizeof(dirname),
+ "%s/" PCI_PRI_FMT, rte_pci_get_sysfs_path(),
+ loc->domain, loc->bus, loc->devid, loc->function);
+ dir = opendir(dirname);
+
+ if (dir == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot opendir %s\n", dirname);
+ return -1;
+ }
+ }
+
+ /* take the first file starting with "uio" */
+ while ((e = readdir(dir)) != NULL) {
+ /* format could be uio%d ...*/
+ int shortprefix_len = sizeof("uio") - 1;
+ /* ... or uio:uio%d */
+ int longprefix_len = sizeof("uio:uio") - 1;
+ char *endptr;
+
+ if (strncmp(e->d_name, "uio", 3) != 0)
+ continue;
+
+ /* first try uio%d */
+ errno = 0;
+ uio_num = strtoull(e->d_name + shortprefix_len, &endptr, 10);
+ if (errno == 0 && endptr != (e->d_name + shortprefix_len)) {
+ snprintf(dstbuf, buflen, "%s/uio%u", dirname, uio_num);
+ break;
+ }
+
+ /* then try uio:uio%d */
+ errno = 0;
+ uio_num = strtoull(e->d_name + longprefix_len, &endptr, 10);
+ if (errno == 0 && endptr != (e->d_name + longprefix_len)) {
+ snprintf(dstbuf, buflen, "%s/uio:uio%u", dirname, uio_num);
+ break;
+ }
+ }
+ closedir(dir);
+
+ /* No uio resource found */
+ if (e == NULL)
+ return -1;
+
+ /* create uio device if we've been asked to */
+ if (rte_eal_create_uio_dev() && create &&
+ pci_mknod_uio_dev(dstbuf, uio_num) < 0)
+ RTE_LOG(WARNING, EAL, "Cannot create /dev/uio%u\n", uio_num);
+
+ return uio_num;
+}
+
+void
+pci_uio_free_resource(struct rte_pci_device *dev,
+ struct mapped_pci_resource *uio_res)
+{
+ rte_free(uio_res);
+
+ if (dev->intr_handle.uio_cfg_fd >= 0) {
+ close(dev->intr_handle.uio_cfg_fd);
+ dev->intr_handle.uio_cfg_fd = -1;
+ }
+ if (dev->intr_handle.fd >= 0) {
+ close(dev->intr_handle.fd);
+ dev->intr_handle.fd = -1;
+ dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+ }
+}
+
+int
+pci_uio_alloc_resource(struct rte_pci_device *dev,
+ struct mapped_pci_resource **uio_res)
+{
+ char dirname[PATH_MAX];
+ char cfgname[PATH_MAX];
+ char devname[PATH_MAX]; /* contains the /dev/uioX */
+ int uio_num;
+ struct rte_pci_addr *loc;
+
+ loc = &dev->addr;
+
+ /* find uio resource */
+ uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname), 1);
+ if (uio_num < 0) {
+ RTE_LOG(WARNING, EAL, " "PCI_PRI_FMT" not managed by UIO driver, "
+ "skipping\n", loc->domain, loc->bus, loc->devid, loc->function);
+ return 1;
+ }
+ snprintf(devname, sizeof(devname), "/dev/uio%u", uio_num);
+
+ /* save fd if in primary process */
+ dev->intr_handle.fd = open(devname, O_RDWR);
+ if (dev->intr_handle.fd < 0) {
+ RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+ devname, strerror(errno));
+ goto error;
+ }
+
+ snprintf(cfgname, sizeof(cfgname),
+ "/sys/class/uio/uio%u/device/config", uio_num);
+ dev->intr_handle.uio_cfg_fd = open(cfgname, O_RDWR);
+ if (dev->intr_handle.uio_cfg_fd < 0) {
+ RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+ cfgname, strerror(errno));
+ goto error;
+ }
+
+ if (dev->kdrv == RTE_KDRV_IGB_UIO)
+ dev->intr_handle.type = RTE_INTR_HANDLE_UIO;
+ else {
+ dev->intr_handle.type = RTE_INTR_HANDLE_UIO_INTX;
+
+ /* set bus master that is not done by uio_pci_generic */
+ if (pci_uio_set_bus_master(dev->intr_handle.uio_cfg_fd)) {
+ RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
+ goto error;
+ }
+ }
+
+ /* allocate the mapping details for secondary processes*/
+ *uio_res = rte_zmalloc("UIO_RES", sizeof(**uio_res), 0);
+ if (*uio_res == NULL) {
+ RTE_LOG(ERR, EAL,
+ "%s(): cannot store uio mmap details\n", __func__);
+ goto error;
+ }
+
+ snprintf((*uio_res)->path, sizeof((*uio_res)->path), "%s", devname);
+ memcpy(&(*uio_res)->pci_addr, &dev->addr, sizeof((*uio_res)->pci_addr));
+
+ return 0;
+
+error:
+ pci_uio_free_resource(dev, *uio_res);
+ return -1;
+}
+
+int
+pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx,
+ struct mapped_pci_resource *uio_res, int map_idx)
+{
+ int fd;
+ char devname[PATH_MAX];
+ void *mapaddr;
+ struct rte_pci_addr *loc;
+ struct pci_map *maps;
+
+ loc = &dev->addr;
+ maps = uio_res->maps;
+
+ /* update devname for mmap */
+ snprintf(devname, sizeof(devname),
+ "%s/" PCI_PRI_FMT "/resource%d",
+ rte_pci_get_sysfs_path(),
+ loc->domain, loc->bus, loc->devid,
+ loc->function, res_idx);
+
+ /* allocate memory to keep path */
+ maps[map_idx].path = rte_malloc(NULL, strlen(devname) + 1, 0);
+ if (maps[map_idx].path == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memory for path: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ /*
+ * open resource file, to mmap it
+ */
+ fd = open(devname, O_RDWR);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+ devname, strerror(errno));
+ goto error;
+ }
+
+ /* try mapping somewhere close to the end of hugepages */
+ if (pci_map_addr == NULL)
+ pci_map_addr = pci_find_max_end_va();
+
+ mapaddr = pci_map_resource(pci_map_addr, fd, 0,
+ (size_t)dev->mem_resource[res_idx].len, 0);
+ close(fd);
+ if (mapaddr == MAP_FAILED)
+ goto error;
+
+ pci_map_addr = RTE_PTR_ADD(mapaddr,
+ (size_t)dev->mem_resource[res_idx].len);
+
+ maps[map_idx].phaddr = dev->mem_resource[res_idx].phys_addr;
+ maps[map_idx].size = dev->mem_resource[res_idx].len;
+ maps[map_idx].addr = mapaddr;
+ maps[map_idx].offset = 0;
+ strcpy(maps[map_idx].path, devname);
+ dev->mem_resource[res_idx].addr = mapaddr;
+
+ return 0;
+
+error:
+ rte_free(maps[map_idx].path);
+ return -1;
+}
+
+#if defined(RTE_ARCH_X86)
+int
+pci_uio_ioport_map(struct rte_pci_device *dev, int bar,
+ struct rte_pci_ioport *p)
+{
+ char dirname[PATH_MAX];
+ char filename[PATH_MAX];
+ int uio_num;
+ unsigned long start;
+
+ uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname), 0);
+ if (uio_num < 0)
+ return -1;
+
+ /* get portio start */
+ snprintf(filename, sizeof(filename),
+ "%s/portio/port%d/start", dirname, bar);
+ if (eal_parse_sysfs_value(filename, &start) < 0) {
+ RTE_LOG(ERR, EAL, "%s(): cannot parse portio start\n",
+ __func__);
+ return -1;
+ }
+ /* ensure we don't get anything funny here, read/write will cast to
+ * uin16_t */
+ if (start > UINT16_MAX)
+ return -1;
+
+ /* FIXME only for primary process ? */
+ if (dev->intr_handle.type == RTE_INTR_HANDLE_UNKNOWN) {
+
+ snprintf(filename, sizeof(filename), "/dev/uio%u", uio_num);
+ dev->intr_handle.fd = open(filename, O_RDWR);
+ if (dev->intr_handle.fd < 0) {
+ RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+ filename, strerror(errno));
+ return -1;
+ }
+ dev->intr_handle.type = RTE_INTR_HANDLE_UIO;
+ }
+
+ RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%lx\n", start);
+
+ p->base = start;
+ p->len = 0;
+ return 0;
+}
+#else
+int
+pci_uio_ioport_map(struct rte_pci_device *dev, int bar,
+ struct rte_pci_ioport *p)
+{
+ FILE *f;
+ char buf[BUFSIZ];
+ char filename[PATH_MAX];
+ uint64_t phys_addr, end_addr, flags;
+ int fd, i;
+ void *addr;
+
+ /* open and read addresses of the corresponding resource in sysfs */
+ snprintf(filename, sizeof(filename), "%s/" PCI_PRI_FMT "/resource",
+ rte_pci_get_sysfs_path(), dev->addr.domain, dev->addr.bus,
+ dev->addr.devid, dev->addr.function);
+ f = fopen(filename, "r");
+ if (f == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot open sysfs resource: %s\n",
+ strerror(errno));
+ return -1;
+ }
+ for (i = 0; i < bar + 1; i++) {
+ if (fgets(buf, sizeof(buf), f) == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot read sysfs resource\n");
+ goto error;
+ }
+ }
+ if (pci_parse_one_sysfs_resource(buf, sizeof(buf), &phys_addr,
+ &end_addr, &flags) < 0)
+ goto error;
+ if ((flags & IORESOURCE_IO) == 0) {
+ RTE_LOG(ERR, EAL, "BAR %d is not an IO resource\n", bar);
+ goto error;
+ }
+ snprintf(filename, sizeof(filename), "%s/" PCI_PRI_FMT "/resource%d",
+ rte_pci_get_sysfs_path(), dev->addr.domain, dev->addr.bus,
+ dev->addr.devid, dev->addr.function, bar);
+
+ /* mmap the pci resource */
+ fd = open(filename, O_RDWR);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+ strerror(errno));
+ goto error;
+ }
+ addr = mmap(NULL, end_addr + 1, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ close(fd);
+ if (addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "Cannot mmap IO port resource: %s\n",
+ strerror(errno));
+ goto error;
+ }
+
+ /* strangely, the base address is mmap addr + phys_addr */
+ p->base = (uintptr_t)addr + phys_addr;
+ p->len = end_addr + 1;
+ RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%"PRIx64"\n", p->base);
+ fclose(f);
+
+ return 0;
+
+error:
+ fclose(f);
+ return -1;
+}
+#endif
+
+void
+pci_uio_ioport_read(struct rte_pci_ioport *p,
+ void *data, size_t len, off_t offset)
+{
+ uint8_t *d;
+ int size;
+ uintptr_t reg = p->base + offset;
+
+ for (d = data; len > 0; d += size, reg += size, len -= size) {
+ if (len >= 4) {
+ size = 4;
+#if defined(RTE_ARCH_X86)
+ *(uint32_t *)d = inl(reg);
+#else
+ *(uint32_t *)d = *(volatile uint32_t *)reg;
+#endif
+ } else if (len >= 2) {
+ size = 2;
+#if defined(RTE_ARCH_X86)
+ *(uint16_t *)d = inw(reg);
+#else
+ *(uint16_t *)d = *(volatile uint16_t *)reg;
+#endif
+ } else {
+ size = 1;
+#if defined(RTE_ARCH_X86)
+ *d = inb(reg);
+#else
+ *d = *(volatile uint8_t *)reg;
+#endif
+ }
+ }
+}
+
+void
+pci_uio_ioport_write(struct rte_pci_ioport *p,
+ const void *data, size_t len, off_t offset)
+{
+ const uint8_t *s;
+ int size;
+ uintptr_t reg = p->base + offset;
+
+ for (s = data; len > 0; s += size, reg += size, len -= size) {
+ if (len >= 4) {
+ size = 4;
+#if defined(RTE_ARCH_X86)
+ outl_p(*(const uint32_t *)s, reg);
+#else
+ *(volatile uint32_t *)reg = *(const uint32_t *)s;
+#endif
+ } else if (len >= 2) {
+ size = 2;
+#if defined(RTE_ARCH_X86)
+ outw_p(*(const uint16_t *)s, reg);
+#else
+ *(volatile uint16_t *)reg = *(const uint16_t *)s;
+#endif
+ } else {
+ size = 1;
+#if defined(RTE_ARCH_X86)
+ outb_p(*s, reg);
+#else
+ *(volatile uint8_t *)reg = *s;
+#endif
+ }
+ }
+}
+
+int
+pci_uio_ioport_unmap(struct rte_pci_ioport *p)
+{
+#if defined(RTE_ARCH_X86)
+ RTE_SET_USED(p);
+ /* FIXME close intr fd ? */
+ return 0;
+#else
+ return munmap((void *)(uintptr_t)p->base, p->len);
+#endif
+}
diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
new file mode 100644
index 00000000..1f93fa4d
--- /dev/null
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -0,0 +1,763 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <fcntl.h>
+#include <linux/pci_regs.h>
+#include <sys/eventfd.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_eal_memconfig.h>
+#include <rte_malloc.h>
+#include <rte_vfio.h>
+
+#include "eal_filesystem.h"
+
+#include "pci_init.h"
+#include "private.h"
+
+/**
+ * @file
+ * PCI probing under linux (VFIO version)
+ *
+ * This code tries to determine if the PCI device is bound to VFIO driver,
+ * and initialize it (map BARs, set up interrupts) if that's the case.
+ *
+ * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
+ */
+
+#ifdef VFIO_PRESENT
+
+#define PAGE_SIZE (sysconf(_SC_PAGESIZE))
+#define PAGE_MASK (~(PAGE_SIZE - 1))
+
+static struct rte_tailq_elem rte_vfio_tailq = {
+ .name = "VFIO_RESOURCE_LIST",
+};
+EAL_REGISTER_TAILQ(rte_vfio_tailq)
+
+int
+pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
+ void *buf, size_t len, off_t offs)
+{
+ return pread64(intr_handle->vfio_dev_fd, buf, len,
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
+}
+
+int
+pci_vfio_write_config(const struct rte_intr_handle *intr_handle,
+ const void *buf, size_t len, off_t offs)
+{
+ return pwrite64(intr_handle->vfio_dev_fd, buf, len,
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
+}
+
+/* get PCI BAR number where MSI-X interrupts are */
+static int
+pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table)
+{
+ int ret;
+ uint32_t reg;
+ uint16_t flags;
+ uint8_t cap_id, cap_offset;
+
+ /* read PCI capability pointer from config space */
+ ret = pread64(fd, &reg, sizeof(reg),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ PCI_CAPABILITY_LIST);
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
+ "config space!\n");
+ return -1;
+ }
+
+ /* we need first byte */
+ cap_offset = reg & 0xFF;
+
+ while (cap_offset) {
+
+ /* read PCI capability ID */
+ ret = pread64(fd, &reg, sizeof(reg),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ cap_offset);
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI "
+ "config space!\n");
+ return -1;
+ }
+
+ /* we need first byte */
+ cap_id = reg & 0xFF;
+
+ /* if we haven't reached MSI-X, check next capability */
+ if (cap_id != PCI_CAP_ID_MSIX) {
+ ret = pread64(fd, &reg, sizeof(reg),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ cap_offset);
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
+ "config space!\n");
+ return -1;
+ }
+
+ /* we need second byte */
+ cap_offset = (reg & 0xFF00) >> 8;
+
+ continue;
+ }
+ /* else, read table offset */
+ else {
+ /* table offset resides in the next 4 bytes */
+ ret = pread64(fd, &reg, sizeof(reg),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ cap_offset + 4);
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config "
+ "space!\n");
+ return -1;
+ }
+
+ ret = pread64(fd, &flags, sizeof(flags),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ cap_offset + 2);
+ if (ret != sizeof(flags)) {
+ RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config "
+ "space!\n");
+ return -1;
+ }
+
+ msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR;
+ msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
+ msix_table->size =
+ 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));
+
+ return 0;
+ }
+ }
+ return 0;
+}
+
+/* set PCI bus mastering */
+static int
+pci_vfio_set_bus_master(int dev_fd, bool op)
+{
+ uint16_t reg;
+ int ret;
+
+ ret = pread64(dev_fd, &reg, sizeof(reg),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ PCI_COMMAND);
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
+ return -1;
+ }
+
+ if (op)
+ /* set the master bit */
+ reg |= PCI_COMMAND_MASTER;
+ else
+ reg &= ~(PCI_COMMAND_MASTER);
+
+ ret = pwrite64(dev_fd, &reg, sizeof(reg),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ PCI_COMMAND);
+
+ if (ret != sizeof(reg)) {
+ RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+/* set up interrupt support (but not enable interrupts) */
+static int
+pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
+{
+ int i, ret, intr_idx;
+ enum rte_intr_mode intr_mode;
+
+ /* default to invalid index */
+ intr_idx = VFIO_PCI_NUM_IRQS;
+
+ /* Get default / configured intr_mode */
+ intr_mode = rte_eal_vfio_intr_mode();
+
+ /* get interrupt type from internal config (MSI-X by default, can be
+ * overridden from the command line
+ */
+ switch (intr_mode) {
+ case RTE_INTR_MODE_MSIX:
+ intr_idx = VFIO_PCI_MSIX_IRQ_INDEX;
+ break;
+ case RTE_INTR_MODE_MSI:
+ intr_idx = VFIO_PCI_MSI_IRQ_INDEX;
+ break;
+ case RTE_INTR_MODE_LEGACY:
+ intr_idx = VFIO_PCI_INTX_IRQ_INDEX;
+ break;
+ /* don't do anything if we want to automatically determine interrupt type */
+ case RTE_INTR_MODE_NONE:
+ break;
+ default:
+ RTE_LOG(ERR, EAL, " unknown default interrupt type!\n");
+ return -1;
+ }
+
+ /* start from MSI-X interrupt type */
+ for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) {
+ struct vfio_irq_info irq = { .argsz = sizeof(irq) };
+ int fd = -1;
+
+ /* skip interrupt modes we don't want */
+ if (intr_mode != RTE_INTR_MODE_NONE &&
+ i != intr_idx)
+ continue;
+
+ irq.index = i;
+
+ ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, " cannot get IRQ info, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ /* if this vector cannot be used with eventfd, fail if we explicitly
+ * specified interrupt type, otherwise continue */
+ if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
+ if (intr_mode != RTE_INTR_MODE_NONE) {
+ RTE_LOG(ERR, EAL,
+ " interrupt vector does not support eventfd!\n");
+ return -1;
+ } else
+ continue;
+ }
+
+ /* set up an eventfd for interrupts */
+ fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, " cannot set up eventfd, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ dev->intr_handle.fd = fd;
+ dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
+
+ switch (i) {
+ case VFIO_PCI_MSIX_IRQ_INDEX:
+ intr_mode = RTE_INTR_MODE_MSIX;
+ dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
+ break;
+ case VFIO_PCI_MSI_IRQ_INDEX:
+ intr_mode = RTE_INTR_MODE_MSI;
+ dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI;
+ break;
+ case VFIO_PCI_INTX_IRQ_INDEX:
+ intr_mode = RTE_INTR_MODE_LEGACY;
+ dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY;
+ break;
+ default:
+ RTE_LOG(ERR, EAL, " unknown interrupt type!\n");
+ return -1;
+ }
+
+ return 0;
+ }
+
+ /* if we're here, we haven't found a suitable interrupt vector */
+ return -1;
+}
+
+static int
+pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index)
+{
+ uint32_t ioport_bar;
+ int ret;
+
+ ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
+ + PCI_BASE_ADDRESS_0 + bar_index*4);
+ if (ret != sizeof(ioport_bar)) {
+ RTE_LOG(ERR, EAL, "Cannot read command (%x) from config space!\n",
+ PCI_BASE_ADDRESS_0 + bar_index*4);
+ return -1;
+ }
+
+ return (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) != 0;
+}
+
+static int
+pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
+{
+ if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
+ RTE_LOG(ERR, EAL, "Error setting up interrupts!\n");
+ return -1;
+ }
+
+ /* set bus mastering for the device */
+ if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
+ RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
+ return -1;
+ }
+
+ /*
+ * Reset the device. If the device is not capable of resetting,
+ * then it updates errno as EINVAL.
+ */
+ if (ioctl(vfio_dev_fd, VFIO_DEVICE_RESET) && errno != EINVAL) {
+ RTE_LOG(ERR, EAL, "Unable to reset device! Error: %d (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
+ int bar_index, int additional_flags)
+{
+ struct memreg {
+ unsigned long offset, size;
+ } memreg[2] = {};
+ void *bar_addr;
+ struct pci_msix_table *msix_table = &vfio_res->msix_table;
+ struct pci_map *bar = &vfio_res->maps[bar_index];
+
+ if (bar->size == 0)
+ /* Skip this BAR */
+ return 0;
+
+ if (msix_table->bar_index == bar_index) {
+ /*
+ * VFIO will not let us map the MSI-X table,
+ * but we can map around it.
+ */
+ uint32_t table_start = msix_table->offset;
+ uint32_t table_end = table_start + msix_table->size;
+ table_end = (table_end + ~PAGE_MASK) & PAGE_MASK;
+ table_start &= PAGE_MASK;
+
+ if (table_start == 0 && table_end >= bar->size) {
+ /* Cannot map this BAR */
+ RTE_LOG(DEBUG, EAL, "Skipping BAR%d\n", bar_index);
+ bar->size = 0;
+ bar->addr = 0;
+ return 0;
+ }
+
+ memreg[0].offset = bar->offset;
+ memreg[0].size = table_start;
+ memreg[1].offset = bar->offset + table_end;
+ memreg[1].size = bar->size - table_end;
+
+ RTE_LOG(DEBUG, EAL,
+ "Trying to map BAR%d that contains the MSI-X "
+ "table. Trying offsets: "
+ "0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", bar_index,
+ memreg[0].offset, memreg[0].size,
+ memreg[1].offset, memreg[1].size);
+ } else {
+ memreg[0].offset = bar->offset;
+ memreg[0].size = bar->size;
+ }
+
+ /* reserve the address using an inaccessible mapping */
+ bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE |
+ MAP_ANONYMOUS | additional_flags, -1, 0);
+ if (bar_addr != MAP_FAILED) {
+ void *map_addr = NULL;
+ if (memreg[0].size) {
+ /* actual map of first part */
+ map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
+ memreg[0].offset,
+ memreg[0].size,
+ MAP_FIXED);
+ }
+
+ /* if there's a second part, try to map it */
+ if (map_addr != MAP_FAILED
+ && memreg[1].offset && memreg[1].size) {
+ void *second_addr = RTE_PTR_ADD(bar_addr,
+ memreg[1].offset -
+ (uintptr_t)bar->offset);
+ map_addr = pci_map_resource(second_addr,
+ vfio_dev_fd,
+ memreg[1].offset,
+ memreg[1].size,
+ MAP_FIXED);
+ }
+
+ if (map_addr == MAP_FAILED || !map_addr) {
+ munmap(bar_addr, bar->size);
+ bar_addr = MAP_FAILED;
+ RTE_LOG(ERR, EAL, "Failed to map pci BAR%d\n",
+ bar_index);
+ return -1;
+ }
+ } else {
+ RTE_LOG(ERR, EAL,
+ "Failed to create inaccessible mapping for BAR%d\n",
+ bar_index);
+ return -1;
+ }
+
+ bar->addr = bar_addr;
+ return 0;
+}
+
+static int
+pci_vfio_map_resource_primary(struct rte_pci_device *dev)
+{
+ struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+ char pci_addr[PATH_MAX] = {0};
+ int vfio_dev_fd;
+ struct rte_pci_addr *loc = &dev->addr;
+ int i, ret;
+ struct mapped_pci_resource *vfio_res = NULL;
+ struct mapped_pci_res_list *vfio_res_list =
+ RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
+
+ struct pci_map *maps;
+
+ dev->intr_handle.fd = -1;
+ dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+
+ /* store PCI address string */
+ snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+ loc->domain, loc->bus, loc->devid, loc->function);
+
+ ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
+ &vfio_dev_fd, &device_info);
+ if (ret)
+ return ret;
+
+ /* allocate vfio_res and get region info */
+ vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
+ if (vfio_res == NULL) {
+ RTE_LOG(ERR, EAL,
+ "%s(): cannot store uio mmap details\n", __func__);
+ goto err_vfio_dev_fd;
+ }
+ memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));
+
+ /* get number of registers (up to BAR5) */
+ vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
+ VFIO_PCI_BAR5_REGION_INDEX + 1);
+
+ /* map BARs */
+ maps = vfio_res->maps;
+
+ vfio_res->msix_table.bar_index = -1;
+ /* get MSI-X BAR, if any (we have to know where it is because we can't
+ * easily mmap it when using VFIO)
+ */
+ ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n",
+ pci_addr);
+ goto err_vfio_dev_fd;
+ }
+
+ for (i = 0; i < (int) vfio_res->nb_maps; i++) {
+ struct vfio_region_info reg = { .argsz = sizeof(reg) };
+ void *bar_addr;
+
+ reg.index = i;
+
+ ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " %s cannot get device region info "
+ "error %i (%s)\n", pci_addr, errno, strerror(errno));
+ goto err_vfio_res;
+ }
+
+ /* chk for io port region */
+ ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
+ if (ret < 0)
+ goto err_vfio_res;
+ else if (ret) {
+ RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
+ i);
+ continue;
+ }
+
+ /* skip non-mmapable BARs */
+ if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
+ continue;
+
+ /* try mapping somewhere close to the end of hugepages */
+ if (pci_map_addr == NULL)
+ pci_map_addr = pci_find_max_end_va();
+
+ bar_addr = pci_map_addr;
+ pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
+
+ maps[i].addr = bar_addr;
+ maps[i].offset = reg.offset;
+ maps[i].size = reg.size;
+ maps[i].path = NULL; /* vfio doesn't have per-resource paths */
+
+ ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n",
+ pci_addr, i, strerror(errno));
+ goto err_vfio_res;
+ }
+
+ dev->mem_resource[i].addr = maps[i].addr;
+ }
+
+ if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
+ RTE_LOG(ERR, EAL, " %s setup device failed\n", pci_addr);
+ goto err_vfio_res;
+ }
+
+ TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next);
+
+ return 0;
+err_vfio_res:
+ rte_free(vfio_res);
+err_vfio_dev_fd:
+ close(vfio_dev_fd);
+ return -1;
+}
+
+static int
+pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
+{
+ struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+ char pci_addr[PATH_MAX] = {0};
+ int vfio_dev_fd;
+ struct rte_pci_addr *loc = &dev->addr;
+ int i, ret;
+ struct mapped_pci_resource *vfio_res = NULL;
+ struct mapped_pci_res_list *vfio_res_list =
+ RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
+
+ struct pci_map *maps;
+
+ dev->intr_handle.fd = -1;
+ dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+
+ /* store PCI address string */
+ snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+ loc->domain, loc->bus, loc->devid, loc->function);
+
+ ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
+ &vfio_dev_fd, &device_info);
+ if (ret)
+ return ret;
+
+ /* if we're in a secondary process, just find our tailq entry */
+ TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
+ if (rte_pci_addr_cmp(&vfio_res->pci_addr,
+ &dev->addr))
+ continue;
+ break;
+ }
+ /* if we haven't found our tailq entry, something's wrong */
+ if (vfio_res == NULL) {
+ RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n",
+ pci_addr);
+ goto err_vfio_dev_fd;
+ }
+
+ /* map BARs */
+ maps = vfio_res->maps;
+
+ for (i = 0; i < (int) vfio_res->nb_maps; i++) {
+ ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n",
+ pci_addr, i, strerror(errno));
+ goto err_vfio_dev_fd;
+ }
+
+ dev->mem_resource[i].addr = maps[i].addr;
+ }
+
+ return 0;
+err_vfio_dev_fd:
+ close(vfio_dev_fd);
+ return -1;
+}
+
+/*
+ * map the PCI resources of a PCI device in virtual memory (VFIO version).
+ * primary and secondary processes follow almost exactly the same path
+ */
+int
+pci_vfio_map_resource(struct rte_pci_device *dev)
+{
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return pci_vfio_map_resource_primary(dev);
+ else
+ return pci_vfio_map_resource_secondary(dev);
+}
+
+int
+pci_vfio_unmap_resource(struct rte_pci_device *dev)
+{
+ char pci_addr[PATH_MAX] = {0};
+ struct rte_pci_addr *loc = &dev->addr;
+ int i, ret;
+ struct mapped_pci_resource *vfio_res = NULL;
+ struct mapped_pci_res_list *vfio_res_list;
+
+ struct pci_map *maps;
+
+ /* store PCI address string */
+ snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+ loc->domain, loc->bus, loc->devid, loc->function);
+
+
+ if (close(dev->intr_handle.fd) < 0) {
+ RTE_LOG(INFO, EAL, "Error when closing eventfd file descriptor for %s\n",
+ pci_addr);
+ return -1;
+ }
+
+ if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, false)) {
+ RTE_LOG(ERR, EAL, " %s cannot unset bus mastering for PCI device!\n",
+ pci_addr);
+ return -1;
+ }
+
+ ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
+ dev->intr_handle.vfio_dev_fd);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL,
+ "%s(): cannot release device\n", __func__);
+ return ret;
+ }
+
+ vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
+ /* Get vfio_res */
+ TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
+ if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
+ continue;
+ break;
+ }
+ /* if we haven't found our tailq entry, something's wrong */
+ if (vfio_res == NULL) {
+ RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n",
+ pci_addr);
+ return -1;
+ }
+
+ /* unmap BARs */
+ maps = vfio_res->maps;
+
+ RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n",
+ pci_addr);
+ for (i = 0; i < (int) vfio_res->nb_maps; i++) {
+
+ /*
+ * We do not need to be aware of MSI-X table BAR mappings as
+ * when mapping. Just using current maps array is enough
+ */
+ if (maps[i].addr) {
+ RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n",
+ pci_addr, maps[i].addr);
+ pci_unmap_resource(maps[i].addr, maps[i].size);
+ }
+ }
+
+ TAILQ_REMOVE(vfio_res_list, vfio_res, next);
+
+ return 0;
+}
+
+int
+pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
+ struct rte_pci_ioport *p)
+{
+ if (bar < VFIO_PCI_BAR0_REGION_INDEX ||
+ bar > VFIO_PCI_BAR5_REGION_INDEX) {
+ RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar);
+ return -1;
+ }
+
+ p->dev = dev;
+ p->base = VFIO_GET_REGION_ADDR(bar);
+ return 0;
+}
+
+void
+pci_vfio_ioport_read(struct rte_pci_ioport *p,
+ void *data, size_t len, off_t offset)
+{
+ const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;
+
+ if (pread64(intr_handle->vfio_dev_fd, data,
+ len, p->base + offset) <= 0)
+ RTE_LOG(ERR, EAL,
+ "Can't read from PCI bar (%" PRIu64 ") : offset (%x)\n",
+ VFIO_GET_REGION_IDX(p->base), (int)offset);
+}
+
+void
+pci_vfio_ioport_write(struct rte_pci_ioport *p,
+ const void *data, size_t len, off_t offset)
+{
+ const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;
+
+ if (pwrite64(intr_handle->vfio_dev_fd, data,
+ len, p->base + offset) <= 0)
+ RTE_LOG(ERR, EAL,
+ "Can't write to PCI bar (%" PRIu64 ") : offset (%x)\n",
+ VFIO_GET_REGION_IDX(p->base), (int)offset);
+}
+
+int
+pci_vfio_ioport_unmap(struct rte_pci_ioport *p)
+{
+ RTE_SET_USED(p);
+ return -1;
+}
+
+int
+pci_vfio_is_enabled(void)
+{
+ return rte_vfio_is_enabled("vfio_pci");
+}
+#endif