From a41e6ff15809d40e0f9bbc9576bf8f7f80fbec1d Mon Sep 17 00:00:00 2001 From: Ricardo Salveti Date: Mon, 18 Jul 2016 15:30:06 -0300 Subject: Imported Upstream version 16.07-rc2 Change-Id: Ie9e8ec528a2a0dace085c5e44aa7fa3b489d4ba0 Signed-off-by: Ricardo Salveti --- lib/librte_eal/common/eal_common_options.c | 9 + lib/librte_eal/common/include/rte_dev.h | 28 +- lib/librte_eal/common/include/rte_pci_dev_ids.h | 420 ---------------- lib/librte_eal/common/include/rte_version.h | 2 +- lib/librte_eal/common/malloc_elem.c | 17 +- lib/librte_eal/common/rte_malloc.c | 6 +- lib/librte_eal/linuxapp/eal/Makefile | 5 +- lib/librte_eal/linuxapp/eal/eal.c | 33 ++ lib/librte_eal/linuxapp/eal/eal_memory.c | 32 +- lib/librte_eal/linuxapp/eal/eal_pci.c | 17 +- lib/librte_eal/linuxapp/eal/eal_pci_init.h | 41 -- lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 517 +------------------ lib/librte_eal/linuxapp/eal/eal_pci_vfio_mp_sync.c | 408 --------------- lib/librte_eal/linuxapp/eal/eal_vfio.c | 547 +++++++++++++++++++++ lib/librte_eal/linuxapp/eal/eal_vfio.h | 94 ++++ lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 408 +++++++++++++++ lib/librte_eal/linuxapp/eal/eal_xen_memory.c | 4 +- lib/librte_eal/linuxapp/eal/rte_eal_version.map | 1 + lib/librte_eal/linuxapp/igb_uio/igb_uio.c | 13 - 19 files changed, 1171 insertions(+), 1431 deletions(-) delete mode 100644 lib/librte_eal/linuxapp/eal/eal_pci_vfio_mp_sync.c create mode 100644 lib/librte_eal/linuxapp/eal/eal_vfio.c create mode 100644 lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c (limited to 'lib/librte_eal') diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c index 3efc90f0..0a594d7f 100644 --- a/lib/librte_eal/common/eal_common_options.c +++ b/lib/librte_eal/common/eal_common_options.c @@ -115,6 +115,15 @@ TAILQ_HEAD_INITIALIZER(solib_list); /* Default path of external loadable drivers */ static const char *default_solib_dir = RTE_EAL_PMD_PATH; +/* + * Stringified version of solib path 
used by pmdinfo.py + * Note: PLEASE DO NOT ALTER THIS without making a corresponding + * change to tools/pmdinfo.py + */ +static const char dpdk_solib_path[] __attribute__((used)) = +"DPDK_PLUGIN_PATH=" RTE_EAL_PMD_PATH; + + static int master_lcore_parsed; static int mem_parsed; diff --git a/lib/librte_eal/common/include/rte_dev.h b/lib/librte_eal/common/include/rte_dev.h index f1b55079..95789f9d 100644 --- a/lib/librte_eal/common/include/rte_dev.h +++ b/lib/librte_eal/common/include/rte_dev.h @@ -178,12 +178,30 @@ int rte_eal_vdev_init(const char *name, const char *args); */ int rte_eal_vdev_uninit(const char *name); -#define PMD_REGISTER_DRIVER(d)\ -void devinitfn_ ##d(void);\ -void __attribute__((constructor, used)) devinitfn_ ##d(void)\ +#define DRIVER_EXPORT_NAME_ARRAY(n, idx) n##idx[] + +#define DRIVER_EXPORT_NAME(name, idx) \ +static const char DRIVER_EXPORT_NAME_ARRAY(this_pmd_name, idx) \ +__attribute__((used)) = RTE_STR(name) + +#define PMD_REGISTER_DRIVER(drv, nm)\ +void devinitfn_ ##drv(void);\ +void __attribute__((constructor, used)) devinitfn_ ##drv(void)\ {\ - rte_eal_driver_register(&d);\ -} + (drv).name = RTE_STR(nm);\ + rte_eal_driver_register(&drv);\ +} \ +DRIVER_EXPORT_NAME(nm, __COUNTER__) + +#define DRV_EXP_TAG(name, tag) __##name##_##tag + +#define DRIVER_REGISTER_PCI_TABLE(name, table) \ +static const char DRV_EXP_TAG(name, pci_tbl_export)[] __attribute__((used)) = \ +RTE_STR(table) + +#define DRIVER_REGISTER_PARAM_STRING(name, str) \ +static const char DRV_EXP_TAG(name, param_string_export)[] \ +__attribute__((used)) = str #ifdef __cplusplus } diff --git a/lib/librte_eal/common/include/rte_pci_dev_ids.h b/lib/librte_eal/common/include/rte_pci_dev_ids.h index af39fbbd..6ec8ae8c 100644 --- a/lib/librte_eal/common/include/rte_pci_dev_ids.h +++ b/lib/librte_eal/common/include/rte_pci_dev_ids.h @@ -57,50 +57,6 @@ * */ -/** - * @file - * - * This file contains a list of the PCI device IDs recognised by DPDK, which - * can be used to fill out an 
array of structures describing the devices. - * - * Currently five families of devices are recognised: those supported by the - * IGB driver, by EM driver, those supported by the IXGBE driver, those - * supported by the BNXT driver, and by virtio driver which is a para - * virtualization driver running in guest virtual machine. The inclusion of - * these in an array built using this file depends on the definition of - * RTE_PCI_DEV_ID_DECL_BNXT - * RTE_PCI_DEV_ID_DECL_EM - * RTE_PCI_DEV_ID_DECL_IGB - * RTE_PCI_DEV_ID_DECL_IGBVF - * RTE_PCI_DEV_ID_DECL_IXGBE - * RTE_PCI_DEV_ID_DECL_IXGBEVF - * RTE_PCI_DEV_ID_DECL_I40E - * RTE_PCI_DEV_ID_DECL_I40EVF - * RTE_PCI_DEV_ID_DECL_VIRTIO - * at the time when this file is included. - * - * In order to populate an array, the user of this file must define this macro: - * RTE_PCI_DEV_ID_DECL_IXGBE(vendorID, deviceID). For example: - * - * @code - * struct device { - * int vend; - * int dev; - * }; - * - * struct device devices[] = { - * #define RTE_PCI_DEV_ID_DECL_IXGBE(vendorID, deviceID) {vend, dev}, - * #include <rte_pci_dev_ids.h> - * }; - * @endcode - * - * Note that this file can be included multiple times within the same file. 
- */ - -#ifndef RTE_PCI_DEV_ID_DECL_EM -#define RTE_PCI_DEV_ID_DECL_EM(vend, dev) -#endif - #ifndef RTE_PCI_DEV_ID_DECL_IGB #define RTE_PCI_DEV_ID_DECL_IGB(vend, dev) #endif @@ -117,214 +73,11 @@ #define RTE_PCI_DEV_ID_DECL_IXGBEVF(vend, dev) #endif -#ifndef RTE_PCI_DEV_ID_DECL_I40E -#define RTE_PCI_DEV_ID_DECL_I40E(vend, dev) -#endif - -#ifndef RTE_PCI_DEV_ID_DECL_I40EVF -#define RTE_PCI_DEV_ID_DECL_I40EVF(vend, dev) -#endif - -#ifndef RTE_PCI_DEV_ID_DECL_VIRTIO -#define RTE_PCI_DEV_ID_DECL_VIRTIO(vend, dev) -#endif - -#ifndef RTE_PCI_DEV_ID_DECL_VMXNET3 -#define RTE_PCI_DEV_ID_DECL_VMXNET3(vend, dev) -#endif - -#ifndef RTE_PCI_DEV_ID_DECL_FM10K -#define RTE_PCI_DEV_ID_DECL_FM10K(vend, dev) -#endif - -#ifndef RTE_PCI_DEV_ID_DECL_FM10KVF -#define RTE_PCI_DEV_ID_DECL_FM10KVF(vend, dev) -#endif - -#ifndef RTE_PCI_DEV_ID_DECL_ENIC -#define RTE_PCI_DEV_ID_DECL_ENIC(vend, dev) -#endif - -#ifndef RTE_PCI_DEV_ID_DECL_BNX2X -#define RTE_PCI_DEV_ID_DECL_BNX2X(vend, dev) -#endif - -#ifndef RTE_PCI_DEV_ID_DECL_BNX2XVF -#define RTE_PCI_DEV_ID_DECL_BNX2XVF(vend, dev) -#endif - -#ifndef RTE_PCI_DEV_ID_DECL_BNXT -#define RTE_PCI_DEV_ID_DECL_BNXT(vend, dev) -#endif - #ifndef PCI_VENDOR_ID_INTEL /** Vendor ID used by Intel devices */ #define PCI_VENDOR_ID_INTEL 0x8086 #endif -#ifndef PCI_VENDOR_ID_QUMRANET -/** Vendor ID used by virtio devices */ -#define PCI_VENDOR_ID_QUMRANET 0x1AF4 -#endif - -#ifndef PCI_VENDOR_ID_VMWARE -/** Vendor ID used by VMware devices */ -#define PCI_VENDOR_ID_VMWARE 0x15AD -#endif - -#ifndef PCI_VENDOR_ID_CISCO -/** Vendor ID used by Cisco VIC devices */ -#define PCI_VENDOR_ID_CISCO 0x1137 -#endif - -#ifndef PCI_VENDOR_ID_BROADCOM -/** Vendor ID used by Broadcom devices */ -#define PCI_VENDOR_ID_BROADCOM 0x14E4 -#endif - -/******************** Physical EM devices from e1000_hw.h ********************/ - -#define E1000_DEV_ID_82542 0x1000 -#define E1000_DEV_ID_82543GC_FIBER 0x1001 -#define E1000_DEV_ID_82543GC_COPPER 0x1004 -#define 
E1000_DEV_ID_82544EI_COPPER 0x1008 -#define E1000_DEV_ID_82544EI_FIBER 0x1009 -#define E1000_DEV_ID_82544GC_COPPER 0x100C -#define E1000_DEV_ID_82544GC_LOM 0x100D -#define E1000_DEV_ID_82540EM 0x100E -#define E1000_DEV_ID_82540EM_LOM 0x1015 -#define E1000_DEV_ID_82540EP_LOM 0x1016 -#define E1000_DEV_ID_82540EP 0x1017 -#define E1000_DEV_ID_82540EP_LP 0x101E -#define E1000_DEV_ID_82545EM_COPPER 0x100F -#define E1000_DEV_ID_82545EM_FIBER 0x1011 -#define E1000_DEV_ID_82545GM_COPPER 0x1026 -#define E1000_DEV_ID_82545GM_FIBER 0x1027 -#define E1000_DEV_ID_82545GM_SERDES 0x1028 -#define E1000_DEV_ID_82546EB_COPPER 0x1010 -#define E1000_DEV_ID_82546EB_FIBER 0x1012 -#define E1000_DEV_ID_82546EB_QUAD_COPPER 0x101D -#define E1000_DEV_ID_82546GB_COPPER 0x1079 -#define E1000_DEV_ID_82546GB_FIBER 0x107A -#define E1000_DEV_ID_82546GB_SERDES 0x107B -#define E1000_DEV_ID_82546GB_PCIE 0x108A -#define E1000_DEV_ID_82546GB_QUAD_COPPER 0x1099 -#define E1000_DEV_ID_82546GB_QUAD_COPPER_KSP3 0x10B5 -#define E1000_DEV_ID_82541EI 0x1013 -#define E1000_DEV_ID_82541EI_MOBILE 0x1018 -#define E1000_DEV_ID_82541ER_LOM 0x1014 -#define E1000_DEV_ID_82541ER 0x1078 -#define E1000_DEV_ID_82541GI 0x1076 -#define E1000_DEV_ID_82541GI_LF 0x107C -#define E1000_DEV_ID_82541GI_MOBILE 0x1077 -#define E1000_DEV_ID_82547EI 0x1019 -#define E1000_DEV_ID_82547EI_MOBILE 0x101A -#define E1000_DEV_ID_82547GI 0x1075 -#define E1000_DEV_ID_82571EB_COPPER 0x105E -#define E1000_DEV_ID_82571EB_FIBER 0x105F -#define E1000_DEV_ID_82571EB_SERDES 0x1060 -#define E1000_DEV_ID_82571EB_SERDES_DUAL 0x10D9 -#define E1000_DEV_ID_82571EB_SERDES_QUAD 0x10DA -#define E1000_DEV_ID_82571EB_QUAD_COPPER 0x10A4 -#define E1000_DEV_ID_82571PT_QUAD_COPPER 0x10D5 -#define E1000_DEV_ID_82571EB_QUAD_FIBER 0x10A5 -#define E1000_DEV_ID_82571EB_QUAD_COPPER_LP 0x10BC -#define E1000_DEV_ID_82572EI_COPPER 0x107D -#define E1000_DEV_ID_82572EI_FIBER 0x107E -#define E1000_DEV_ID_82572EI_SERDES 0x107F -#define E1000_DEV_ID_82572EI 0x10B9 -#define 
E1000_DEV_ID_82573E 0x108B -#define E1000_DEV_ID_82573E_IAMT 0x108C -#define E1000_DEV_ID_82573L 0x109A -#define E1000_DEV_ID_82574L 0x10D3 -#define E1000_DEV_ID_82574LA 0x10F6 -#define E1000_DEV_ID_82583V 0x150C -#define E1000_DEV_ID_80003ES2LAN_COPPER_DPT 0x1096 -#define E1000_DEV_ID_80003ES2LAN_SERDES_DPT 0x1098 -#define E1000_DEV_ID_80003ES2LAN_COPPER_SPT 0x10BA -#define E1000_DEV_ID_80003ES2LAN_SERDES_SPT 0x10BB -#define E1000_DEV_ID_ICH8_82567V_3 0x1501 -#define E1000_DEV_ID_ICH8_IGP_M_AMT 0x1049 -#define E1000_DEV_ID_ICH8_IGP_AMT 0x104A -#define E1000_DEV_ID_ICH8_IGP_C 0x104B -#define E1000_DEV_ID_ICH8_IFE 0x104C -#define E1000_DEV_ID_ICH8_IFE_GT 0x10C4 -#define E1000_DEV_ID_ICH8_IFE_G 0x10C5 -#define E1000_DEV_ID_ICH8_IGP_M 0x104D -#define E1000_DEV_ID_ICH9_IGP_M 0x10BF -#define E1000_DEV_ID_ICH9_IGP_M_AMT 0x10F5 -#define E1000_DEV_ID_ICH9_IGP_M_V 0x10CB -#define E1000_DEV_ID_ICH9_IGP_AMT 0x10BD -#define E1000_DEV_ID_ICH9_BM 0x10E5 -#define E1000_DEV_ID_ICH9_IGP_C 0x294C -#define E1000_DEV_ID_ICH9_IFE 0x10C0 -#define E1000_DEV_ID_ICH9_IFE_GT 0x10C3 -#define E1000_DEV_ID_ICH9_IFE_G 0x10C2 -#define E1000_DEV_ID_ICH10_R_BM_LM 0x10CC -#define E1000_DEV_ID_ICH10_R_BM_LF 0x10CD -#define E1000_DEV_ID_ICH10_R_BM_V 0x10CE -#define E1000_DEV_ID_ICH10_D_BM_LM 0x10DE -#define E1000_DEV_ID_ICH10_D_BM_LF 0x10DF -#define E1000_DEV_ID_ICH10_D_BM_V 0x1525 - -#define E1000_DEV_ID_PCH_M_HV_LM 0x10EA -#define E1000_DEV_ID_PCH_M_HV_LC 0x10EB -#define E1000_DEV_ID_PCH_D_HV_DM 0x10EF -#define E1000_DEV_ID_PCH_D_HV_DC 0x10F0 -#define E1000_DEV_ID_PCH2_LV_LM 0x1502 -#define E1000_DEV_ID_PCH2_LV_V 0x1503 -#define E1000_DEV_ID_PCH_LPT_I217_LM 0x153A -#define E1000_DEV_ID_PCH_LPT_I217_V 0x153B -#define E1000_DEV_ID_PCH_LPTLP_I218_LM 0x155A -#define E1000_DEV_ID_PCH_LPTLP_I218_V 0x1559 -#define E1000_DEV_ID_PCH_I218_LM2 0x15A0 -#define E1000_DEV_ID_PCH_I218_V2 0x15A1 -#define E1000_DEV_ID_PCH_I218_LM3 0x15A2 -#define E1000_DEV_ID_PCH_I218_V3 0x15A3 - - -/* - * Tested (supported) on VM 
emulated HW. - */ - -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82540EM) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82545EM_COPPER) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82545EM_FIBER) - -/* - * Tested (supported) on real HW. - */ - -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82546EB_COPPER) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82546EB_FIBER) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82546EB_QUAD_COPPER) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82571EB_COPPER) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82571EB_FIBER) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82571EB_SERDES) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82571EB_SERDES_DUAL) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82571EB_SERDES_QUAD) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82571EB_QUAD_COPPER) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82571PT_QUAD_COPPER) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82571EB_QUAD_FIBER) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82571EB_QUAD_COPPER_LP) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82572EI_COPPER) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82572EI_FIBER) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82572EI_SERDES) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82572EI) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82573L) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82574L) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82574LA) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_82583V) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_PCH_LPT_I217_LM) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_PCH_LPT_I217_V) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_PCH_LPTLP_I218_LM) 
-RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_PCH_LPTLP_I218_V) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_PCH_I218_LM2) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_PCH_I218_V2) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_PCH_I218_LM3) -RTE_PCI_DEV_ID_DECL_EM(PCI_VENDOR_ID_INTEL, E1000_DEV_ID_PCH_I218_V3) - - /******************** Physical IGB devices from e1000_hw.h ********************/ #define E1000_DEV_ID_82576 0x10C9 @@ -528,60 +281,6 @@ RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_X_KR) RTE_PCI_DEV_ID_DECL_IXGBE(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_82599_BYPASS) #endif -/*************** Physical I40E devices from i40e_type.h *****************/ - -#define I40E_DEV_ID_SFP_XL710 0x1572 -#define I40E_DEV_ID_QEMU 0x1574 -#define I40E_DEV_ID_KX_B 0x1580 -#define I40E_DEV_ID_KX_C 0x1581 -#define I40E_DEV_ID_QSFP_A 0x1583 -#define I40E_DEV_ID_QSFP_B 0x1584 -#define I40E_DEV_ID_QSFP_C 0x1585 -#define I40E_DEV_ID_10G_BASE_T 0x1586 -#define I40E_DEV_ID_20G_KR2 0x1587 -#define I40E_DEV_ID_20G_KR2_A 0x1588 -#define I40E_DEV_ID_10G_BASE_T4 0x1589 -#define I40E_DEV_ID_25G_B 0x158A -#define I40E_DEV_ID_25G_SFP28 0x158B -#define I40E_DEV_ID_X722_A0 0x374C -#define I40E_DEV_ID_KX_X722 0x37CE -#define I40E_DEV_ID_QSFP_X722 0x37CF -#define I40E_DEV_ID_SFP_X722 0x37D0 -#define I40E_DEV_ID_1G_BASE_T_X722 0x37D1 -#define I40E_DEV_ID_10G_BASE_T_X722 0x37D2 -#define I40E_DEV_ID_SFP_I_X722 0x37D3 -#define I40E_DEV_ID_QSFP_I_X722 0x37D4 - -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_SFP_XL710) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_QEMU) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_KX_B) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_KX_C) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_QSFP_A) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_QSFP_B) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_QSFP_C) 
-RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_10G_BASE_T) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_20G_KR2) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_20G_KR2_A) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_10G_BASE_T4) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_25G_B) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_25G_SFP28) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_X722_A0) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_KX_X722) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_QSFP_X722) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_SFP_X722) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_1G_BASE_T_X722) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_10G_BASE_T_X722) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_SFP_I_X722) -RTE_PCI_DEV_ID_DECL_I40E(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_QSFP_I_X722) - -/*************** Physical FM10K devices from fm10k_type.h ***************/ - -#define FM10K_DEV_ID_PF 0x15A4 -#define FM10K_DEV_ID_SDI_FM10420_QDA2 0x15D0 - -RTE_PCI_DEV_ID_DECL_FM10K(PCI_VENDOR_ID_INTEL, FM10K_DEV_ID_PF) -RTE_PCI_DEV_ID_DECL_FM10K(PCI_VENDOR_ID_INTEL, FM10K_DEV_ID_SDI_FM10420_QDA2) - /****************** Virtual IGB devices from e1000_hw.h ******************/ #define E1000_DEV_ID_82576_VF 0x10CA @@ -618,129 +317,10 @@ RTE_PCI_DEV_ID_DECL_IXGBEVF(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_A_VF_HV) RTE_PCI_DEV_ID_DECL_IXGBEVF(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_X_VF) RTE_PCI_DEV_ID_DECL_IXGBEVF(PCI_VENDOR_ID_INTEL, IXGBE_DEV_ID_X550EM_X_VF_HV) -/****************** Virtual I40E devices from i40e_type.h ********************/ - -#define I40E_DEV_ID_VF 0x154C -#define I40E_DEV_ID_VF_HV 0x1571 -#define I40E_DEV_ID_X722_A0_VF 0x374D -#define I40E_DEV_ID_X722_VF 0x37CD -#define I40E_DEV_ID_X722_VF_HV 0x37D9 - -RTE_PCI_DEV_ID_DECL_I40EVF(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_VF) 
-RTE_PCI_DEV_ID_DECL_I40EVF(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_VF_HV) -RTE_PCI_DEV_ID_DECL_I40EVF(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_X722_A0_VF) -RTE_PCI_DEV_ID_DECL_I40EVF(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_X722_VF) -RTE_PCI_DEV_ID_DECL_I40EVF(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_X722_VF_HV) - -/****************** Virtio devices from virtio.h ******************/ - -#define QUMRANET_DEV_ID_VIRTIO 0x1000 - -RTE_PCI_DEV_ID_DECL_VIRTIO(PCI_VENDOR_ID_QUMRANET, QUMRANET_DEV_ID_VIRTIO) - -/****************** VMware VMXNET3 devices ******************/ - -#define VMWARE_DEV_ID_VMXNET3 0x07B0 - -RTE_PCI_DEV_ID_DECL_VMXNET3(PCI_VENDOR_ID_VMWARE, VMWARE_DEV_ID_VMXNET3) - -/*************** Virtual FM10K devices from fm10k_type.h ***************/ - -#define FM10K_DEV_ID_VF 0x15A5 - -RTE_PCI_DEV_ID_DECL_FM10KVF(PCI_VENDOR_ID_INTEL, FM10K_DEV_ID_VF) - -/****************** Cisco VIC devices ******************/ - -#define PCI_DEVICE_ID_CISCO_VIC_ENET 0x0043 /* ethernet vnic */ -#define PCI_DEVICE_ID_CISCO_VIC_ENET_VF 0x0071 /* enet SRIOV VF */ - -RTE_PCI_DEV_ID_DECL_ENIC(PCI_VENDOR_ID_CISCO, PCI_DEVICE_ID_CISCO_VIC_ENET) -RTE_PCI_DEV_ID_DECL_ENIC(PCI_VENDOR_ID_CISCO, PCI_DEVICE_ID_CISCO_VIC_ENET_VF) - -/****************** QLogic devices ******************/ - -/* Broadcom/QLogic BNX2X */ -#define BNX2X_DEV_ID_57710 0x164e -#define BNX2X_DEV_ID_57711 0x164f -#define BNX2X_DEV_ID_57711E 0x1650 -#define BNX2X_DEV_ID_57712 0x1662 -#define BNX2X_DEV_ID_57712_MF 0x1663 -#define BNX2X_DEV_ID_57712_VF 0x166f -#define BNX2X_DEV_ID_57713 0x1651 -#define BNX2X_DEV_ID_57713E 0x1652 -#define BNX2X_DEV_ID_57800 0x168a -#define BNX2X_DEV_ID_57800_MF 0x16a5 -#define BNX2X_DEV_ID_57800_VF 0x16a9 -#define BNX2X_DEV_ID_57810 0x168e -#define BNX2X_DEV_ID_57810_MF 0x16ae -#define BNX2X_DEV_ID_57810_VF 0x16af -#define BNX2X_DEV_ID_57811 0x163d -#define BNX2X_DEV_ID_57811_MF 0x163e -#define BNX2X_DEV_ID_57811_VF 0x163f - -#define BNX2X_DEV_ID_57840_OBS 0x168d -#define BNX2X_DEV_ID_57840_OBS_MF 0x16ab -#define 
BNX2X_DEV_ID_57840_4_10 0x16a1 -#define BNX2X_DEV_ID_57840_2_20 0x16a2 -#define BNX2X_DEV_ID_57840_MF 0x16a4 -#define BNX2X_DEV_ID_57840_VF 0x16ad - -RTE_PCI_DEV_ID_DECL_BNX2X(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57800) -RTE_PCI_DEV_ID_DECL_BNX2XVF(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57800_VF) -RTE_PCI_DEV_ID_DECL_BNX2X(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57711) -RTE_PCI_DEV_ID_DECL_BNX2X(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57810) -RTE_PCI_DEV_ID_DECL_BNX2XVF(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57810_VF) -RTE_PCI_DEV_ID_DECL_BNX2X(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57811) -RTE_PCI_DEV_ID_DECL_BNX2XVF(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57811_VF) -RTE_PCI_DEV_ID_DECL_BNX2X(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57840_OBS) -RTE_PCI_DEV_ID_DECL_BNX2X(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57840_4_10) -RTE_PCI_DEV_ID_DECL_BNX2X(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57840_2_20) -RTE_PCI_DEV_ID_DECL_BNX2XVF(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57840_VF) -#ifdef RTE_LIBRTE_BNX2X_MF_SUPPORT -RTE_PCI_DEV_ID_DECL_BNX2X(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57810_MF) -RTE_PCI_DEV_ID_DECL_BNX2X(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57811_MF) -RTE_PCI_DEV_ID_DECL_BNX2X(PCI_VENDOR_ID_BROADCOM, BNX2X_DEV_ID_57840_MF) -#endif - -/****************** Broadcom bnxt devices ******************/ - -#define BROADCOM_DEV_ID_57301 0x16c8 -#define BROADCOM_DEV_ID_57302 0x16c9 -#define BROADCOM_DEV_ID_57304_PF 0x16ca -#define BROADCOM_DEV_ID_57304_VF 0x16cb -#define BROADCOM_DEV_ID_57402 0x16d0 -#define BROADCOM_DEV_ID_57404 0x16d1 -#define BROADCOM_DEV_ID_57406_PF 0x16d2 -#define BROADCOM_DEV_ID_57406_VF 0x16d3 -#define BROADCOM_DEV_ID_57406_MF 0x16d4 -#define BROADCOM_DEV_ID_57314 0x16df - -RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57301) -RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57302) -RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57304_PF) -RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57304_VF) 
-RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57402) -RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57404) -RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57406_PF) -RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57406_VF) -RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57406_MF) -RTE_PCI_DEV_ID_DECL_BNXT(PCI_VENDOR_ID_BROADCOM, BROADCOM_DEV_ID_57314) - /* * Undef all RTE_PCI_DEV_ID_DECL_* here. */ -#undef RTE_PCI_DEV_ID_DECL_BNX2X -#undef RTE_PCI_DEV_ID_DECL_BNX2XVF -#undef RTE_PCI_DEV_ID_DECL_EM #undef RTE_PCI_DEV_ID_DECL_IGB #undef RTE_PCI_DEV_ID_DECL_IGBVF #undef RTE_PCI_DEV_ID_DECL_IXGBE #undef RTE_PCI_DEV_ID_DECL_IXGBEVF -#undef RTE_PCI_DEV_ID_DECL_I40E -#undef RTE_PCI_DEV_ID_DECL_I40EVF -#undef RTE_PCI_DEV_ID_DECL_VIRTIO -#undef RTE_PCI_DEV_ID_DECL_VMXNET3 -#undef RTE_PCI_DEV_ID_DECL_FM10K -#undef RTE_PCI_DEV_ID_DECL_FM10KVF -#undef RTE_PCI_DEV_ID_DECL_BNXT diff --git a/lib/librte_eal/common/include/rte_version.h b/lib/librte_eal/common/include/rte_version.h index dbe09975..37102227 100644 --- a/lib/librte_eal/common/include/rte_version.h +++ b/lib/librte_eal/common/include/rte_version.h @@ -77,7 +77,7 @@ extern "C" { * 0-15 = release candidates * 16 = release */ -#define RTE_VER_RELEASE 1 +#define RTE_VER_RELEASE 2 /** * Macro to compute a version number usable for comparisons diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c index 27e9925d..42568e1d 100644 --- a/lib/librte_eal/common/malloc_elem.c +++ b/lib/librte_eal/common/malloc_elem.c @@ -275,11 +275,14 @@ malloc_elem_free(struct malloc_elem *elem) return -1; rte_spinlock_lock(&(elem->heap->lock)); + size_t sz = elem->size - sizeof(*elem); + uint8_t *ptr = (uint8_t *)&elem[1]; struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size); if (next->state == ELEM_FREE){ /* remove from free list, join to this one */ elem_free_list_remove(next); join_elem(elem, next); + sz += 
sizeof(*elem); } /* check if previous element is free, if so join with it and return, @@ -288,15 +291,17 @@ malloc_elem_free(struct malloc_elem *elem) if (elem->prev != NULL && elem->prev->state == ELEM_FREE) { elem_free_list_remove(elem->prev); join_elem(elem->prev, elem); - malloc_elem_free_list_insert(elem->prev); - } - /* otherwise add ourselves to the free list */ - else { - malloc_elem_free_list_insert(elem); - elem->pad = 0; + sz += sizeof(*elem); + ptr -= sizeof(*elem); + elem = elem->prev; } + malloc_elem_free_list_insert(elem); + /* decrease heap's count of allocated elements */ elem->heap->alloc_count--; + + memset(ptr, 0, sz); + rte_spinlock_unlock(&(elem->heap->lock)); return 0; diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c index 47deb007..f4a88352 100644 --- a/lib/librte_eal/common/rte_malloc.c +++ b/lib/librte_eal/common/rte_malloc.c @@ -123,11 +123,7 @@ rte_malloc(const char *type, size_t size, unsigned align) void * rte_zmalloc_socket(const char *type, size_t size, unsigned align, int socket) { - void *ptr = rte_malloc_socket(type, size, align, socket); - - if (ptr != NULL) - memset(ptr, 0, size); - return ptr; + return rte_malloc_socket(type, size, align, socket); } /* diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile index 30b30f33..1a976931 100644 --- a/lib/librte_eal/linuxapp/eal/Makefile +++ b/lib/librte_eal/linuxapp/eal/Makefile @@ -66,10 +66,11 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_xen_memory.c endif SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio_mp_sync.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_pci.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_pci_uio.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_pci_vfio.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_pci_vfio_mp_sync.c 
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c @@ -110,7 +111,7 @@ CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) CFLAGS_eal.o := -D_GNU_SOURCE CFLAGS_eal_interrupts.o := -D_GNU_SOURCE -CFLAGS_eal_pci_vfio_mp_sync.o := -D_GNU_SOURCE +CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE CFLAGS_eal_timer.o := -D_GNU_SOURCE CFLAGS_eal_lcore.o := -D_GNU_SOURCE CFLAGS_eal_thread.o := -D_GNU_SOURCE diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c index 543ef869..3fb2188f 100644 --- a/lib/librte_eal/linuxapp/eal/eal.c +++ b/lib/librte_eal/linuxapp/eal/eal.c @@ -82,6 +82,7 @@ #include "eal_filesystem.h" #include "eal_hugepages.h" #include "eal_options.h" +#include "eal_vfio.h" #define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL) @@ -701,6 +702,33 @@ rte_eal_iopl_init(void) return 0; } +#ifdef VFIO_PRESENT +static int rte_eal_vfio_setup(void) +{ + int vfio_enabled = 0; + + if (!internal_config.no_pci) { + pci_vfio_enable(); + vfio_enabled |= pci_vfio_is_enabled(); + } + + if (vfio_enabled) { + + /* if we are primary process, create a thread to communicate with + * secondary processes. the thread will use a socket to wait for + * requests from secondary process to send open file descriptors, + * because VFIO does not allow multiple open descriptors on a group or + * VFIO container. + */ + if (internal_config.process_type == RTE_PROC_PRIMARY && + vfio_mp_sync_setup() < 0) + return -1; + } + + return 0; +} +#endif + /* Launch threads, called at application init(). 
*/ int rte_eal_init(int argc, char **argv) @@ -764,6 +792,11 @@ rte_eal_init(int argc, char **argv) if (rte_eal_pci_init() < 0) rte_panic("Cannot init PCI\n"); +#ifdef VFIO_PRESENT + if (rte_eal_vfio_setup() < 0) + rte_panic("Cannot init VFIO\n"); +#endif + #ifdef RTE_LIBRTE_IVSHMEM if (rte_eal_ivshmem_init() < 0) rte_panic("Cannot init IVSHMEM\n"); diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c index 5578c254..42a29faf 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memory.c +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c @@ -164,6 +164,29 @@ rte_mem_virt2phy(const void *virtaddr) int page_size; off_t offset; + /* when using dom0, /proc/self/pagemap always returns 0, check in + * dpdk memory by browsing the memsegs */ + if (rte_xen_dom0_supported()) { + struct rte_mem_config *mcfg; + struct rte_memseg *memseg; + unsigned i; + + mcfg = rte_eal_get_configuration()->mem_config; + for (i = 0; i < RTE_MAX_MEMSEG; i++) { + memseg = &mcfg->memseg[i]; + if (memseg->addr == NULL) + break; + if (virtaddr > memseg->addr && + virtaddr < RTE_PTR_ADD(memseg->addr, + memseg->len)) { + return memseg->phys_addr + + RTE_PTR_DIFF(virtaddr, memseg->addr); + } + } + + return RTE_BAD_PHYS_ADDR; + } + /* Cannot parse /proc/self/pagemap, no need to log errors everywhere */ if (!proc_pagemap_readable) return RTE_BAD_PHYS_ADDR; @@ -1136,7 +1159,7 @@ int rte_eal_hugepage_init(void) { struct rte_mem_config *mcfg; - struct hugepage_file *hugepage, *tmp_hp = NULL; + struct hugepage_file *hugepage = NULL, *tmp_hp = NULL; struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; uint64_t memory[RTE_MAX_NUMA_NODES]; @@ -1479,14 +1502,19 @@ rte_eal_hugepage_init(void) "of memory.\n", i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG), RTE_MAX_MEMSEG); - return -ENOMEM; + goto fail; } + munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); + return 0; fail: huge_recover_sigbus(); free(tmp_hp); + if (hugepage != NULL) + munmap(hugepage, nr_hugefiles * 
sizeof(struct hugepage_file)); + return -1; } diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c index f9c3efd2..cd9de7cc 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci.c @@ -754,21 +754,6 @@ rte_eal_pci_init(void) RTE_LOG(ERR, EAL, "%s(): Cannot scan PCI bus\n", __func__); return -1; } -#ifdef VFIO_PRESENT - pci_vfio_enable(); - - if (pci_vfio_is_enabled()) { - - /* if we are primary process, create a thread to communicate with - * secondary processes. the thread will use a socket to wait for - * requests from secondary process to send open file descriptors, - * because VFIO does not allow multiple open descriptors on a group or - * VFIO container. - */ - if (internal_config.process_type == RTE_PROC_PRIMARY && - pci_vfio_mp_sync_setup() < 0) - return -1; - } -#endif + return 0; } diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_init.h b/lib/librte_eal/linuxapp/eal/eal_pci_init.h index f72a2548..6a960d1b 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_init.h +++ b/lib/librte_eal/linuxapp/eal/eal_pci_init.h @@ -74,12 +74,6 @@ int pci_uio_ioport_unmap(struct rte_pci_ioport *p); #ifdef VFIO_PRESENT -#define VFIO_MAX_GROUPS 64 - -int pci_vfio_enable(void); -int pci_vfio_is_enabled(void); -int pci_vfio_mp_sync_setup(void); - /* access config space */ int pci_vfio_read_config(const struct rte_intr_handle *intr_handle, void *buf, size_t len, off_t offs); @@ -96,41 +90,6 @@ int pci_vfio_ioport_unmap(struct rte_pci_ioport *p); /* map VFIO resource prototype */ int pci_vfio_map_resource(struct rte_pci_device *dev); -int pci_vfio_get_group_fd(int iommu_group_fd); -int pci_vfio_get_container_fd(void); - -/* - * Function prototypes for VFIO multiprocess sync functions - */ -int vfio_mp_sync_send_request(int socket, int req); -int vfio_mp_sync_receive_request(int socket); -int vfio_mp_sync_send_fd(int socket, int fd); -int vfio_mp_sync_receive_fd(int socket); -int 
vfio_mp_sync_connect_to_primary(void); - -/* socket comm protocol definitions */ -#define SOCKET_REQ_CONTAINER 0x100 -#define SOCKET_REQ_GROUP 0x200 -#define SOCKET_OK 0x0 -#define SOCKET_NO_FD 0x1 -#define SOCKET_ERR 0xFF - -/* - * we don't need to store device fd's anywhere since they can be obtained from - * the group fd via an ioctl() call. - */ -struct vfio_group { - int group_no; - int fd; -}; - -struct vfio_config { - int vfio_enabled; - int vfio_container_fd; - int vfio_container_has_dma; - int vfio_group_idx; - struct vfio_group vfio_groups[VFIO_MAX_GROUPS]; -}; #endif diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c index f91b9242..46cd6831 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c @@ -43,11 +43,11 @@ #include #include #include -#include #include "eal_filesystem.h" #include "eal_pci_init.h" #include "eal_vfio.h" +#include "eal_private.h" /** * @file @@ -69,78 +69,6 @@ static struct rte_tailq_elem rte_vfio_tailq = { }; EAL_REGISTER_TAILQ(rte_vfio_tailq) -#define VFIO_DIR "/dev/vfio" -#define VFIO_CONTAINER_PATH "/dev/vfio/vfio" -#define VFIO_GROUP_FMT "/dev/vfio/%u" -#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u" -#define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL) -#define VFIO_GET_REGION_IDX(x) (x >> 40) - -/* per-process VFIO config */ -static struct vfio_config vfio_cfg; - -/* DMA mapping function prototype. - * Takes VFIO container fd as a parameter. - * Returns 0 on success, -1 on error. 
- * */ -typedef int (*vfio_dma_func_t)(int); - -struct vfio_iommu_type { - int type_id; - const char *name; - vfio_dma_func_t dma_map_func; -}; - -static int vfio_type1_dma_map(int); -static int vfio_noiommu_dma_map(int); - -/* IOMMU types we support */ -static const struct vfio_iommu_type iommu_types[] = { - /* x86 IOMMU, otherwise known as type 1 */ - { RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map}, - /* IOMMU-less mode */ - { RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map}, -}; - -int -vfio_type1_dma_map(int vfio_container_fd) -{ - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); - int i, ret; - - /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */ - for (i = 0; i < RTE_MAX_MEMSEG; i++) { - struct vfio_iommu_type1_dma_map dma_map; - - if (ms[i].addr == NULL) - break; - - memset(&dma_map, 0, sizeof(dma_map)); - dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); - dma_map.vaddr = ms[i].addr_64; - dma_map.size = ms[i].len; - dma_map.iova = ms[i].phys_addr; - dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; - - ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); - - if (ret) { - RTE_LOG(ERR, EAL, " cannot set up DMA remapping, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; - } - } - - return 0; -} - -int -vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) -{ - /* No-IOMMU mode does not need DMA mapping */ - return 0; -} - int pci_vfio_read_config(const struct rte_intr_handle *intr_handle, void *buf, size_t len, off_t offs) @@ -272,63 +200,6 @@ pci_vfio_set_bus_master(int dev_fd) return 0; } -/* pick IOMMU type. 
returns a pointer to vfio_iommu_type or NULL for error */ -static const struct vfio_iommu_type * -pci_vfio_set_iommu_type(int vfio_container_fd) { - unsigned idx; - for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { - const struct vfio_iommu_type *t = &iommu_types[idx]; - - int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, - t->type_id); - if (!ret) { - RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", - t->type_id, t->name); - return t; - } - /* not an error, there may be more supported IOMMU types */ - RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, " - "error %i (%s)\n", t->type_id, t->name, errno, - strerror(errno)); - } - /* if we didn't find a suitable IOMMU type, fail */ - return NULL; -} - -/* check if we have any supported extensions */ -static int -pci_vfio_has_supported_extensions(int vfio_container_fd) { - int ret; - unsigned idx, n_extensions = 0; - for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { - const struct vfio_iommu_type *t = &iommu_types[idx]; - - ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, - t->type_id); - if (ret < 0) { - RTE_LOG(ERR, EAL, " could not get IOMMU type, " - "error %i (%s)\n", errno, - strerror(errno)); - close(vfio_container_fd); - return -1; - } else if (ret == 1) { - /* we found a supported extension */ - n_extensions++; - } - RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n", - t->type_id, t->name, - ret ? 
"supported" : "not supported"); - } - - /* if we didn't find any supported IOMMU types, fail */ - if (!n_extensions) { - close(vfio_container_fd); - return -1; - } - - return 0; -} - /* set up interrupt support (but not enable interrupts) */ static int pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd) @@ -425,220 +296,6 @@ pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd) return -1; } -/* open container fd or get an existing one */ -int -pci_vfio_get_container_fd(void) -{ - int ret, vfio_container_fd; - - /* if we're in a primary process, try to open the container */ - if (internal_config.process_type == RTE_PROC_PRIMARY) { - vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR); - if (vfio_container_fd < 0) { - RTE_LOG(ERR, EAL, " cannot open VFIO container, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; - } - - /* check VFIO API version */ - ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION); - if (ret != VFIO_API_VERSION) { - if (ret < 0) - RTE_LOG(ERR, EAL, " could not get VFIO API version, " - "error %i (%s)\n", errno, strerror(errno)); - else - RTE_LOG(ERR, EAL, " unsupported VFIO API version!\n"); - close(vfio_container_fd); - return -1; - } - - ret = pci_vfio_has_supported_extensions(vfio_container_fd); - if (ret) { - RTE_LOG(ERR, EAL, " no supported IOMMU " - "extensions found!\n"); - return -1; - } - - return vfio_container_fd; - } else { - /* - * if we're in a secondary process, request container fd from the - * primary process via our socket - */ - int socket_fd; - - socket_fd = vfio_mp_sync_connect_to_primary(); - if (socket_fd < 0) { - RTE_LOG(ERR, EAL, " cannot connect to primary process!\n"); - return -1; - } - if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) { - RTE_LOG(ERR, EAL, " cannot request container fd!\n"); - close(socket_fd); - return -1; - } - vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd); - if (vfio_container_fd < 0) { - RTE_LOG(ERR, EAL, " cannot get 
container fd!\n"); - close(socket_fd); - return -1; - } - close(socket_fd); - return vfio_container_fd; - } - - return -1; -} - -/* open group fd or get an existing one */ -int -pci_vfio_get_group_fd(int iommu_group_no) -{ - int i; - int vfio_group_fd; - char filename[PATH_MAX]; - - /* check if we already have the group descriptor open */ - for (i = 0; i < vfio_cfg.vfio_group_idx; i++) - if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no) - return vfio_cfg.vfio_groups[i].fd; - - /* if primary, try to open the group */ - if (internal_config.process_type == RTE_PROC_PRIMARY) { - /* try regular group format */ - snprintf(filename, sizeof(filename), - VFIO_GROUP_FMT, iommu_group_no); - vfio_group_fd = open(filename, O_RDWR); - if (vfio_group_fd < 0) { - /* if file not found, it's not an error */ - if (errno != ENOENT) { - RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, - strerror(errno)); - return -1; - } - - /* special case: try no-IOMMU path as well */ - snprintf(filename, sizeof(filename), - VFIO_NOIOMMU_GROUP_FMT, iommu_group_no); - vfio_group_fd = open(filename, O_RDWR); - if (vfio_group_fd < 0) { - if (errno != ENOENT) { - RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, - strerror(errno)); - return -1; - } - return 0; - } - /* noiommu group found */ - } - - /* if the fd is valid, create a new group for it */ - if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) { - RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); - close(vfio_group_fd); - return -1; - } - vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no; - vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd; - return vfio_group_fd; - } - /* if we're in a secondary process, request group fd from the primary - * process via our socket - */ - else { - int socket_fd, ret; - - socket_fd = vfio_mp_sync_connect_to_primary(); - - if (socket_fd < 0) { - RTE_LOG(ERR, EAL, " cannot connect to primary process!\n"); - return -1; - } - if 
(vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) { - RTE_LOG(ERR, EAL, " cannot request container fd!\n"); - close(socket_fd); - return -1; - } - if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) { - RTE_LOG(ERR, EAL, " cannot send group number!\n"); - close(socket_fd); - return -1; - } - ret = vfio_mp_sync_receive_request(socket_fd); - switch (ret) { - case SOCKET_NO_FD: - close(socket_fd); - return 0; - case SOCKET_OK: - vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd); - /* if we got the fd, return it */ - if (vfio_group_fd > 0) { - close(socket_fd); - return vfio_group_fd; - } - /* fall-through on error */ - default: - RTE_LOG(ERR, EAL, " cannot get container fd!\n"); - close(socket_fd); - return -1; - } - } - return -1; -} - -/* parse IOMMU group number for a PCI device - * returns 1 on success, -1 for errors, 0 for non-existent group - */ -static int -pci_vfio_get_group_no(const char *pci_addr, int *iommu_group_no) -{ - char linkname[PATH_MAX]; - char filename[PATH_MAX]; - char *tok[16], *group_tok, *end; - int ret; - - memset(linkname, 0, sizeof(linkname)); - memset(filename, 0, sizeof(filename)); - - /* try to find out IOMMU group for this device */ - snprintf(linkname, sizeof(linkname), - "%s/%s/iommu_group", pci_get_sysfs_path(), pci_addr); - - ret = readlink(linkname, filename, sizeof(filename)); - - /* if the link doesn't exist, no VFIO for us */ - if (ret < 0) - return 0; - - ret = rte_strsplit(filename, sizeof(filename), - tok, RTE_DIM(tok), '/'); - - if (ret <= 0) { - RTE_LOG(ERR, EAL, " %s cannot get IOMMU group\n", pci_addr); - return -1; - } - - /* IOMMU group is always the last token */ - errno = 0; - group_tok = tok[ret - 1]; - end = group_tok; - *iommu_group_no = strtol(group_tok, &end, 10); - if ((end != group_tok && *end != '\0') || errno != 0) { - RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", pci_addr); - return -1; - } - - return 1; -} - -static void -clear_current_group(void) -{ - 
vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = 0; - vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = -1; -} - - /* * map the PCI resources of a PCI device in virtual memory (VFIO version). * primary and secondary processes follow almost exactly the same path @@ -646,13 +303,9 @@ clear_current_group(void) int pci_vfio_map_resource(struct rte_pci_device *dev) { - struct vfio_group_status group_status = { - .argsz = sizeof(group_status) - }; struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; - int vfio_group_fd, vfio_dev_fd; - int iommu_group_no; char pci_addr[PATH_MAX] = {0}; + int vfio_dev_fd; struct rte_pci_addr *loc = &dev->addr; int i, ret, msix_bar; struct mapped_pci_resource *vfio_res = NULL; @@ -670,127 +323,9 @@ pci_vfio_map_resource(struct rte_pci_device *dev) snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, loc->domain, loc->bus, loc->devid, loc->function); - /* get group number */ - ret = pci_vfio_get_group_no(pci_addr, &iommu_group_no); - if (ret == 0) { - RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", - pci_addr); - return 1; - } - - /* if negative, something failed */ - if (ret < 0) - return -1; - - /* get the actual group fd */ - vfio_group_fd = pci_vfio_get_group_fd(iommu_group_no); - if (vfio_group_fd < 0) - return -1; - - /* store group fd */ - vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no; - vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd; - - /* if group_fd == 0, that means the device isn't managed by VFIO */ - if (vfio_group_fd == 0) { - RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", - pci_addr); - /* we store 0 as group fd to distinguish between existing but - * unbound VFIO groups, and groups that don't exist at all. 
- */ - vfio_cfg.vfio_group_idx++; - return 1; - } - - /* - * at this point, we know at least one port on this device is bound to VFIO, - * so we can proceed to try and set this particular port up - */ - - /* check if the group is viable */ - ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status); - if (ret) { - RTE_LOG(ERR, EAL, " %s cannot get group status, " - "error %i (%s)\n", pci_addr, errno, strerror(errno)); - close(vfio_group_fd); - clear_current_group(); - return -1; - } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { - RTE_LOG(ERR, EAL, " %s VFIO group is not viable!\n", pci_addr); - close(vfio_group_fd); - clear_current_group(); - return -1; - } - - /* - * at this point, we know that this group is viable (meaning, all devices - * are either bound to VFIO or not bound to anything) - */ - - /* check if group does not have a container yet */ - if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) { - - /* add group to a container */ - ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER, - &vfio_cfg.vfio_container_fd); - if (ret) { - RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, " - "error %i (%s)\n", pci_addr, errno, strerror(errno)); - close(vfio_group_fd); - clear_current_group(); - return -1; - } - /* - * at this point we know that this group has been successfully - * initialized, so we increment vfio_group_idx to indicate that we can - * add new groups. 
- */ - vfio_cfg.vfio_group_idx++; - } - - /* - * pick an IOMMU type and set up DMA mappings for container - * - * needs to be done only once, only when at least one group is assigned to - * a container and only in primary process - */ - if (internal_config.process_type == RTE_PROC_PRIMARY && - vfio_cfg.vfio_container_has_dma == 0) { - /* select an IOMMU type which we will be using */ - const struct vfio_iommu_type *t = - pci_vfio_set_iommu_type(vfio_cfg.vfio_container_fd); - if (!t) { - RTE_LOG(ERR, EAL, " %s failed to select IOMMU type\n", pci_addr); - return -1; - } - ret = t->dma_map_func(vfio_cfg.vfio_container_fd); - if (ret) { - RTE_LOG(ERR, EAL, " %s DMA remapping failed, " - "error %i (%s)\n", pci_addr, errno, strerror(errno)); - return -1; - } - vfio_cfg.vfio_container_has_dma = 1; - } - - /* get a file descriptor for the device */ - vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, pci_addr); - if (vfio_dev_fd < 0) { - /* if we cannot get a device fd, this simply means that this - * particular port is not bound to VFIO - */ - RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", - pci_addr); - return 1; - } - - /* test and setup the device */ - ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_INFO, &device_info); - if (ret) { - RTE_LOG(ERR, EAL, " %s cannot get device info, " - "error %i (%s)\n", pci_addr, errno, strerror(errno)); - close(vfio_dev_fd); - return -1; - } + if ((ret = vfio_setup_device(pci_get_sysfs_path(), pci_addr, + &vfio_dev_fd, &device_info))) + return ret; /* get MSI-X BAR, if any (we have to know where it is because we can't * easily mmap it when using VFIO) */ @@ -1048,50 +583,12 @@ pci_vfio_ioport_unmap(struct rte_pci_ioport *p) int pci_vfio_enable(void) { - /* initialize group list */ - int i; - int vfio_available; - - for (i = 0; i < VFIO_MAX_GROUPS; i++) { - vfio_cfg.vfio_groups[i].fd = -1; - vfio_cfg.vfio_groups[i].group_no = -1; - } - - /* inform the user that we are probing for VFIO */ - RTE_LOG(INFO, EAL, 
"Probing VFIO support...\n"); - - /* check if vfio-pci module is loaded */ - vfio_available = rte_eal_check_module("vfio_pci"); - - /* return error directly */ - if (vfio_available == -1) { - RTE_LOG(INFO, EAL, "Could not get loaded module details!\n"); - return -1; - } - - /* return 0 if VFIO modules not loaded */ - if (vfio_available == 0) { - RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, " - "skipping VFIO support...\n"); - return 0; - } - - vfio_cfg.vfio_container_fd = pci_vfio_get_container_fd(); - - /* check if we have VFIO driver enabled */ - if (vfio_cfg.vfio_container_fd != -1) { - RTE_LOG(NOTICE, EAL, "VFIO support initialized\n"); - vfio_cfg.vfio_enabled = 1; - } else { - RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n"); - } - - return 0; + return vfio_enable("vfio_pci"); } int pci_vfio_is_enabled(void) { - return vfio_cfg.vfio_enabled; + return vfio_is_enabled("vfio_pci"); } #endif diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_mp_sync.c deleted file mode 100644 index d54ded88..00000000 --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio_mp_sync.c +++ /dev/null @@ -1,408 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. 
- * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include - -/* sys/un.h with __USE_MISC uses strlen, which is unsafe */ -#ifdef __USE_MISC -#define REMOVED_USE_MISC -#undef __USE_MISC -#endif -#include -/* make sure we redefine __USE_MISC only if it was previously undefined */ -#ifdef REMOVED_USE_MISC -#define __USE_MISC -#undef REMOVED_USE_MISC -#endif - -#include -#include -#include -#include - -#include "eal_filesystem.h" -#include "eal_pci_init.h" -#include "eal_thread.h" - -/** - * @file - * VFIO socket for communication between primary and secondary processes. - * - * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y". 
- */ - -#ifdef VFIO_PRESENT - -#define SOCKET_PATH_FMT "%s/.%s_mp_socket" -#define CMSGLEN (CMSG_LEN(sizeof(int))) -#define FD_TO_CMSGHDR(fd, chdr) \ - do {\ - (chdr).cmsg_len = CMSGLEN;\ - (chdr).cmsg_level = SOL_SOCKET;\ - (chdr).cmsg_type = SCM_RIGHTS;\ - memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\ - } while (0) -#define CMSGHDR_TO_FD(chdr, fd) \ - memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd)) - -static pthread_t socket_thread; -static int mp_socket_fd; - - -/* get socket path (/var/run if root, $HOME otherwise) */ -static void -get_socket_path(char *buffer, int bufsz) -{ - const char *dir = "/var/run"; - const char *home_dir = getenv("HOME"); - - if (getuid() != 0 && home_dir != NULL) - dir = home_dir; - - /* use current prefix as file path */ - snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir, - internal_config.hugefile_prefix); -} - - - -/* - * data flow for socket comm protocol: - * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP - * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number - * 2. server receives message - * 2a. in case of invalid group, SOCKET_ERR is sent back to client - * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client - * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd - * - * in case of any error, socket is closed. 
- */ - -/* send a request, return -1 on error */ -int -vfio_mp_sync_send_request(int socket, int req) -{ - struct msghdr hdr; - struct iovec iov; - int buf; - int ret; - - memset(&hdr, 0, sizeof(hdr)); - - buf = req; - - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - iov.iov_base = (char *) &buf; - iov.iov_len = sizeof(buf); - - ret = sendmsg(socket, &hdr, 0); - if (ret < 0) - return -1; - return 0; -} - -/* receive a request and return it */ -int -vfio_mp_sync_receive_request(int socket) -{ - int buf; - struct msghdr hdr; - struct iovec iov; - int ret, req; - - memset(&hdr, 0, sizeof(hdr)); - - buf = SOCKET_ERR; - - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - iov.iov_base = (char *) &buf; - iov.iov_len = sizeof(buf); - - ret = recvmsg(socket, &hdr, 0); - if (ret < 0) - return -1; - - req = buf; - - return req; -} - -/* send OK in message, fd in control message */ -int -vfio_mp_sync_send_fd(int socket, int fd) -{ - int buf; - struct msghdr hdr; - struct cmsghdr *chdr; - char chdr_buf[CMSGLEN]; - struct iovec iov; - int ret; - - chdr = (struct cmsghdr *) chdr_buf; - memset(chdr, 0, sizeof(chdr_buf)); - memset(&hdr, 0, sizeof(hdr)); - - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - iov.iov_base = (char *) &buf; - iov.iov_len = sizeof(buf); - hdr.msg_control = chdr; - hdr.msg_controllen = CMSGLEN; - - buf = SOCKET_OK; - FD_TO_CMSGHDR(fd, *chdr); - - ret = sendmsg(socket, &hdr, 0); - if (ret < 0) - return -1; - return 0; -} - -/* receive OK in message, fd in control message */ -int -vfio_mp_sync_receive_fd(int socket) -{ - int buf; - struct msghdr hdr; - struct cmsghdr *chdr; - char chdr_buf[CMSGLEN]; - struct iovec iov; - int ret, req, fd; - - buf = SOCKET_ERR; - - chdr = (struct cmsghdr *) chdr_buf; - memset(chdr, 0, sizeof(chdr_buf)); - memset(&hdr, 0, sizeof(hdr)); - - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - iov.iov_base = (char *) &buf; - iov.iov_len = sizeof(buf); - hdr.msg_control = chdr; - hdr.msg_controllen = CMSGLEN; - - ret = recvmsg(socket, &hdr, 0); - if 
(ret < 0) - return -1; - - req = buf; - - if (req != SOCKET_OK) - return -1; - - CMSGHDR_TO_FD(*chdr, fd); - - return fd; -} - -/* connect socket_fd in secondary process to the primary process's socket */ -int -vfio_mp_sync_connect_to_primary(void) -{ - struct sockaddr_un addr; - socklen_t sockaddr_len; - int socket_fd; - - /* set up a socket */ - socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0); - if (socket_fd < 0) { - RTE_LOG(ERR, EAL, "Failed to create socket!\n"); - return -1; - } - - get_socket_path(addr.sun_path, sizeof(addr.sun_path)); - addr.sun_family = AF_UNIX; - - sockaddr_len = sizeof(struct sockaddr_un); - - if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0) - return socket_fd; - - /* if connect failed */ - close(socket_fd); - return -1; -} - - - -/* - * socket listening thread for primary process - */ -static __attribute__((noreturn)) void * -pci_vfio_mp_sync_thread(void __rte_unused * arg) -{ - int ret, fd, vfio_group_no; - - /* wait for requests on the socket */ - for (;;) { - int conn_sock; - struct sockaddr_un addr; - socklen_t sockaddr_len = sizeof(addr); - - /* this is a blocking call */ - conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr, - &sockaddr_len); - - /* just restart on error */ - if (conn_sock == -1) - continue; - - /* set socket to linger after close */ - struct linger l; - l.l_onoff = 1; - l.l_linger = 60; - - if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0) - RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option " - "on listen socket (%s)\n", strerror(errno)); - - ret = vfio_mp_sync_receive_request(conn_sock); - - switch (ret) { - case SOCKET_REQ_CONTAINER: - fd = pci_vfio_get_container_fd(); - if (fd < 0) - vfio_mp_sync_send_request(conn_sock, SOCKET_ERR); - else - vfio_mp_sync_send_fd(conn_sock, fd); - break; - case SOCKET_REQ_GROUP: - /* wait for group number */ - vfio_group_no = vfio_mp_sync_receive_request(conn_sock); - if (vfio_group_no < 0) { - close(conn_sock); - continue; - } - - 
fd = pci_vfio_get_group_fd(vfio_group_no); - - if (fd < 0) - vfio_mp_sync_send_request(conn_sock, SOCKET_ERR); - /* if VFIO group exists but isn't bound to VFIO driver */ - else if (fd == 0) - vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD); - /* if group exists and is bound to VFIO driver */ - else { - vfio_mp_sync_send_request(conn_sock, SOCKET_OK); - vfio_mp_sync_send_fd(conn_sock, fd); - } - break; - default: - vfio_mp_sync_send_request(conn_sock, SOCKET_ERR); - break; - } - close(conn_sock); - } -} - -static int -vfio_mp_sync_socket_setup(void) -{ - int ret, socket_fd; - struct sockaddr_un addr; - socklen_t sockaddr_len; - - /* set up a socket */ - socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0); - if (socket_fd < 0) { - RTE_LOG(ERR, EAL, "Failed to create socket!\n"); - return -1; - } - - get_socket_path(addr.sun_path, sizeof(addr.sun_path)); - addr.sun_family = AF_UNIX; - - sockaddr_len = sizeof(struct sockaddr_un); - - unlink(addr.sun_path); - - ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len); - if (ret) { - RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno)); - close(socket_fd); - return -1; - } - - ret = listen(socket_fd, 50); - if (ret) { - RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno)); - close(socket_fd); - return -1; - } - - /* save the socket in local configuration */ - mp_socket_fd = socket_fd; - - return 0; -} - -/* - * set up a local socket and tell it to listen for incoming connections - */ -int -pci_vfio_mp_sync_setup(void) -{ - int ret; - char thread_name[RTE_MAX_THREAD_NAME_LEN]; - - if (vfio_mp_sync_socket_setup() < 0) { - RTE_LOG(ERR, EAL, "Failed to set up local socket!\n"); - return -1; - } - - ret = pthread_create(&socket_thread, NULL, - pci_vfio_mp_sync_thread, NULL); - if (ret) { - RTE_LOG(ERR, EAL, - "Failed to create thread for communication with secondary processes!\n"); - close(mp_socket_fd); - return -1; - } - - /* Set thread_name for aid in debugging. 
*/ - snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "pci-vfio-sync"); - ret = rte_thread_setname(socket_thread, thread_name); - if (ret) - RTE_LOG(DEBUG, EAL, - "Failed to set thread name for secondary processes!\n"); - - return 0; -} - -#endif diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c new file mode 100644 index 00000000..fcb0ab38 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c @@ -0,0 +1,547 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "eal_filesystem.h" +#include "eal_vfio.h" +#include "eal_private.h" + +#ifdef VFIO_PRESENT + +/* per-process VFIO config */ +static struct vfio_config vfio_cfg; + +static int vfio_type1_dma_map(int); +static int vfio_noiommu_dma_map(int); + +/* IOMMU types we support */ +static const struct vfio_iommu_type iommu_types[] = { + /* x86 IOMMU, otherwise known as type 1 */ + { RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map}, + /* IOMMU-less mode */ + { RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map}, +}; + +int +vfio_get_group_fd(int iommu_group_no) +{ + int i; + int vfio_group_fd; + char filename[PATH_MAX]; + + /* check if we already have the group descriptor open */ + for (i = 0; i < vfio_cfg.vfio_group_idx; i++) + if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no) + return vfio_cfg.vfio_groups[i].fd; + + /* if primary, try to open the group */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* try regular group format */ + snprintf(filename, sizeof(filename), + VFIO_GROUP_FMT, iommu_group_no); + vfio_group_fd = open(filename, O_RDWR); + if (vfio_group_fd < 0) { + /* if file not found, it's not an error */ + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + return -1; + } + + /* special case: try no-IOMMU path as well */ + snprintf(filename, sizeof(filename), + 
VFIO_NOIOMMU_GROUP_FMT, iommu_group_no); + vfio_group_fd = open(filename, O_RDWR); + if (vfio_group_fd < 0) { + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + return -1; + } + return 0; + } + /* noiommu group found */ + } + + /* if the fd is valid, create a new group for it */ + if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); + close(vfio_group_fd); + return -1; + } + vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no; + vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd; + return vfio_group_fd; + } + /* if we're in a secondary process, request group fd from the primary + * process via our socket + */ + else { + int socket_fd, ret; + + socket_fd = vfio_mp_sync_connect_to_primary(); + + if (socket_fd < 0) { + RTE_LOG(ERR, EAL, " cannot connect to primary process!\n"); + return -1; + } + if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) { + RTE_LOG(ERR, EAL, " cannot request container fd!\n"); + close(socket_fd); + return -1; + } + if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) { + RTE_LOG(ERR, EAL, " cannot send group number!\n"); + close(socket_fd); + return -1; + } + ret = vfio_mp_sync_receive_request(socket_fd); + switch (ret) { + case SOCKET_NO_FD: + close(socket_fd); + return 0; + case SOCKET_OK: + vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd); + /* if we got the fd, return it */ + if (vfio_group_fd > 0) { + close(socket_fd); + return vfio_group_fd; + } + /* fall-through on error */ + default: + RTE_LOG(ERR, EAL, " cannot get container fd!\n"); + close(socket_fd); + return -1; + } + } + return -1; +} + +static void +clear_current_group(void) +{ + vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = 0; + vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = -1; +} + +int vfio_setup_device(const char *sysfs_base, const char *dev_addr, + int *vfio_dev_fd, struct vfio_device_info 
*device_info) +{ + struct vfio_group_status group_status = { + .argsz = sizeof(group_status) + }; + int vfio_group_fd; + int iommu_group_no; + int ret; + + /* get group number */ + ret = vfio_get_group_no(sysfs_base, dev_addr, &iommu_group_no); + if (ret == 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + dev_addr); + return 1; + } + + /* if negative, something failed */ + if (ret < 0) + return -1; + + /* get the actual group fd */ + vfio_group_fd = vfio_get_group_fd(iommu_group_no); + if (vfio_group_fd < 0) + return -1; + + /* store group fd */ + vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no; + vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd; + + /* if group_fd == 0, that means the device isn't managed by VFIO */ + if (vfio_group_fd == 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + dev_addr); + /* we store 0 as group fd to distinguish between existing but + * unbound VFIO groups, and groups that don't exist at all. 
+ */ + vfio_cfg.vfio_group_idx++; + return 1; + } + + /* + * at this point, we know that this group is viable (meaning, all devices + * are either bound to VFIO or not bound to anything) + */ + + /* check if the group is viable */ + ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get group status, " + "error %i (%s)\n", dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + clear_current_group(); + return -1; + } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + RTE_LOG(ERR, EAL, " %s VFIO group is not viable!\n", dev_addr); + close(vfio_group_fd); + clear_current_group(); + return -1; + } + + /* check if group does not have a container yet */ + if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) { + + /* add group to a container */ + ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER, + &vfio_cfg.vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, " + "error %i (%s)\n", dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + clear_current_group(); + return -1; + } + /* + * at this point we know that this group has been successfully + * initialized, so we increment vfio_group_idx to indicate that we can + * add new groups. 
+ */ + vfio_cfg.vfio_group_idx++; + } + + /* + * pick an IOMMU type and set up DMA mappings for container + * + * needs to be done only once, only when at least one group is assigned to + * a container and only in primary process + */ + if (internal_config.process_type == RTE_PROC_PRIMARY && + vfio_cfg.vfio_container_has_dma == 0) { + /* select an IOMMU type which we will be using */ + const struct vfio_iommu_type *t = + vfio_set_iommu_type(vfio_cfg.vfio_container_fd); + if (!t) { + RTE_LOG(ERR, EAL, " %s failed to select IOMMU type\n", dev_addr); + return -1; + } + ret = t->dma_map_func(vfio_cfg.vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " %s DMA remapping failed, " + "error %i (%s)\n", dev_addr, errno, strerror(errno)); + return -1; + } + vfio_cfg.vfio_container_has_dma = 1; + } + + /* get a file descriptor for the device */ + *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr); + if (*vfio_dev_fd < 0) { + /* if we cannot get a device fd, this simply means that this + * particular port is not bound to VFIO + */ + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + dev_addr); + return 1; + } + + /* test and setup the device */ + ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get device info, " + "error %i (%s)\n", dev_addr, errno, strerror(errno)); + close(*vfio_dev_fd); + return -1; + } + + return 0; +} + +int +vfio_enable(const char *modname) +{ + /* initialize group list */ + int i; + int vfio_available; + + for (i = 0; i < VFIO_MAX_GROUPS; i++) { + vfio_cfg.vfio_groups[i].fd = -1; + vfio_cfg.vfio_groups[i].group_no = -1; + } + + /* inform the user that we are probing for VFIO */ + RTE_LOG(INFO, EAL, "Probing VFIO support...\n"); + + /* check if vfio-pci module is loaded */ + vfio_available = rte_eal_check_module(modname); + + /* return error directly */ + if (vfio_available == -1) { + RTE_LOG(INFO, EAL, "Could not get loaded module details!\n"); + return 
-1; + } + + /* return 0 if VFIO modules not loaded */ + if (vfio_available == 0) { + RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, " + "skipping VFIO support...\n"); + return 0; + } + + vfio_cfg.vfio_container_fd = vfio_get_container_fd(); + + /* check if we have VFIO driver enabled */ + if (vfio_cfg.vfio_container_fd != -1) { + RTE_LOG(NOTICE, EAL, "VFIO support initialized\n"); + vfio_cfg.vfio_enabled = 1; + } else { + RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n"); + } + + return 0; +} + +int +vfio_is_enabled(const char *modname) +{ + const int mod_available = rte_eal_check_module(modname); + return vfio_cfg.vfio_enabled && mod_available; +} + +const struct vfio_iommu_type * +vfio_set_iommu_type(int vfio_container_fd) { + unsigned idx; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, + t->type_id); + if (!ret) { + RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", + t->type_id, t->name); + return t; + } + /* not an error, there may be more supported IOMMU types */ + RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, " + "error %i (%s)\n", t->type_id, t->name, errno, + strerror(errno)); + } + /* if we didn't find a suitable IOMMU type, fail */ + return NULL; +} + +int +vfio_has_supported_extensions(int vfio_container_fd) { + int ret; + unsigned idx, n_extensions = 0; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, + t->type_id); + if (ret < 0) { + RTE_LOG(ERR, EAL, " could not get IOMMU type, " + "error %i (%s)\n", errno, + strerror(errno)); + close(vfio_container_fd); + return -1; + } else if (ret == 1) { + /* we found a supported extension */ + n_extensions++; + } + RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n", + t->type_id, t->name, + ret ? 
"supported" : "not supported"); + } + + /* if we didn't find any supported IOMMU types, fail */ + if (!n_extensions) { + close(vfio_container_fd); + return -1; + } + + return 0; +} + +int +vfio_get_container_fd(void) +{ + int ret, vfio_container_fd; + + /* if we're in a primary process, try to open the container */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR); + if (vfio_container_fd < 0) { + RTE_LOG(ERR, EAL, " cannot open VFIO container, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* check VFIO API version */ + ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION); + if (ret != VFIO_API_VERSION) { + if (ret < 0) + RTE_LOG(ERR, EAL, " could not get VFIO API version, " + "error %i (%s)\n", errno, strerror(errno)); + else + RTE_LOG(ERR, EAL, " unsupported VFIO API version!\n"); + close(vfio_container_fd); + return -1; + } + + ret = vfio_has_supported_extensions(vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " no supported IOMMU " + "extensions found!\n"); + return -1; + } + + return vfio_container_fd; + } else { + /* + * if we're in a secondary process, request container fd from the + * primary process via our socket + */ + int socket_fd; + + socket_fd = vfio_mp_sync_connect_to_primary(); + if (socket_fd < 0) { + RTE_LOG(ERR, EAL, " cannot connect to primary process!\n"); + return -1; + } + if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) { + RTE_LOG(ERR, EAL, " cannot request container fd!\n"); + close(socket_fd); + return -1; + } + vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd); + if (vfio_container_fd < 0) { + RTE_LOG(ERR, EAL, " cannot get container fd!\n"); + close(socket_fd); + return -1; + } + close(socket_fd); + return vfio_container_fd; + } + + return -1; +} + +int +vfio_get_group_no(const char *sysfs_base, + const char *dev_addr, int *iommu_group_no) +{ + char linkname[PATH_MAX]; + char filename[PATH_MAX]; + char *tok[16], 
*group_tok, *end; + int ret; + + memset(linkname, 0, sizeof(linkname)); + memset(filename, 0, sizeof(filename)); + + /* try to find out IOMMU group for this device */ + snprintf(linkname, sizeof(linkname), + "%s/%s/iommu_group", sysfs_base, dev_addr); + + ret = readlink(linkname, filename, sizeof(filename)); + + /* if the link doesn't exist, no VFIO for us */ + if (ret < 0) + return 0; + + ret = rte_strsplit(filename, sizeof(filename), + tok, RTE_DIM(tok), '/'); + + if (ret <= 0) { + RTE_LOG(ERR, EAL, " %s cannot get IOMMU group\n", dev_addr); + return -1; + } + + /* IOMMU group is always the last token */ + errno = 0; + group_tok = tok[ret - 1]; + end = group_tok; + *iommu_group_no = strtol(group_tok, &end, 10); + if ((end != group_tok && *end != '\0') || errno != 0) { + RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", dev_addr); + return -1; + } + + return 1; +} + +static int +vfio_type1_dma_map(int vfio_container_fd) +{ + const struct rte_memseg *ms = rte_eal_get_physmem_layout(); + int i, ret; + + /* map all DPDK segments for DMA. 
use 1:1 PA to IOVA mapping */ + for (i = 0; i < RTE_MAX_MEMSEG; i++) { + struct vfio_iommu_type1_dma_map dma_map; + + if (ms[i].addr == NULL) + break; + + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = ms[i].addr_64; + dma_map.size = ms[i].len; + dma_map.iova = ms[i].phys_addr; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + + if (ret) { + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + } + + return 0; +} + +static int +vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + +#endif diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h index f483bf40..29f7f3ec 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h @@ -60,6 +60,100 @@ #define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU #endif +#define VFIO_MAX_GROUPS 64 + +/* + * Function prototypes for VFIO multiprocess sync functions + */ +int vfio_mp_sync_send_request(int socket, int req); +int vfio_mp_sync_receive_request(int socket); +int vfio_mp_sync_send_fd(int socket, int fd); +int vfio_mp_sync_receive_fd(int socket); +int vfio_mp_sync_connect_to_primary(void); + +/* + * we don't need to store device fd's anywhere since they can be obtained from + * the group fd via an ioctl() call. 
+ */ +struct vfio_group { + int group_no; + int fd; +}; + +struct vfio_config { + int vfio_enabled; + int vfio_container_fd; + int vfio_container_has_dma; + int vfio_group_idx; + struct vfio_group vfio_groups[VFIO_MAX_GROUPS]; +}; + +#define VFIO_DIR "/dev/vfio" +#define VFIO_CONTAINER_PATH "/dev/vfio/vfio" +#define VFIO_GROUP_FMT "/dev/vfio/%u" +#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u" +#define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL) +#define VFIO_GET_REGION_IDX(x) (x >> 40) + +/* DMA mapping function prototype. + * Takes VFIO container fd as a parameter. + * Returns 0 on success, -1 on error. + * */ +typedef int (*vfio_dma_func_t)(int); + +struct vfio_iommu_type { + int type_id; + const char *name; + vfio_dma_func_t dma_map_func; +}; + +/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */ +const struct vfio_iommu_type * +vfio_set_iommu_type(int vfio_container_fd); + +/* check if we have any supported extensions */ +int +vfio_has_supported_extensions(int vfio_container_fd); + +/* open container fd or get an existing one */ +int +vfio_get_container_fd(void); + +/* parse IOMMU group number for a device + * returns 1 on success, -1 for errors, 0 for non-existent group + */ +int +vfio_get_group_no(const char *sysfs_base, + const char *dev_addr, int *iommu_group_no); + +/* open group fd or get an existing one */ +int +vfio_get_group_fd(int iommu_group_no); + +/** + * Setup vfio_cfg for the device identified by its address. It discovers + * the configured I/O MMU groups or sets a new one for the device. If a new + * group is assigned, the DMA mapping is performed. + * Returns 0 on success, a negative value on failure and a positive value in + * case the given device cannot be managed this way. 
+ */ +int vfio_setup_device(const char *sysfs_base, const char *dev_addr, + int *vfio_dev_fd, struct vfio_device_info *device_info); + +int vfio_enable(const char *modname); +int vfio_is_enabled(const char *modname); + +int pci_vfio_enable(void); +int pci_vfio_is_enabled(void); + +int vfio_mp_sync_setup(void); + +#define SOCKET_REQ_CONTAINER 0x100 +#define SOCKET_REQ_GROUP 0x200 +#define SOCKET_OK 0x0 +#define SOCKET_NO_FD 0x1 +#define SOCKET_ERR 0xFF + #define VFIO_PRESENT #endif /* kernel version */ #endif /* RTE_EAL_VFIO */ diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c new file mode 100644 index 00000000..00cf919b --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c @@ -0,0 +1,408 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include + +/* sys/un.h with __USE_MISC uses strlen, which is unsafe */ +#ifdef __USE_MISC +#define REMOVED_USE_MISC +#undef __USE_MISC +#endif +#include +/* make sure we redefine __USE_MISC only if it was previously undefined */ +#ifdef REMOVED_USE_MISC +#define __USE_MISC +#undef REMOVED_USE_MISC +#endif + +#include +#include +#include +#include + +#include "eal_filesystem.h" +#include "eal_pci_init.h" +#include "eal_thread.h" + +/** + * @file + * VFIO socket for communication between primary and secondary processes. + * + * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y". 
+ */ + +#ifdef VFIO_PRESENT + +#define SOCKET_PATH_FMT "%s/.%s_mp_socket" +#define CMSGLEN (CMSG_LEN(sizeof(int))) +#define FD_TO_CMSGHDR(fd, chdr) \ + do {\ + (chdr).cmsg_len = CMSGLEN;\ + (chdr).cmsg_level = SOL_SOCKET;\ + (chdr).cmsg_type = SCM_RIGHTS;\ + memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\ + } while (0) +#define CMSGHDR_TO_FD(chdr, fd) \ + memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd)) + +static pthread_t socket_thread; +static int mp_socket_fd; + + +/* get socket path (/var/run if root, $HOME otherwise) */ +static void +get_socket_path(char *buffer, int bufsz) +{ + const char *dir = "/var/run"; + const char *home_dir = getenv("HOME"); + + if (getuid() != 0 && home_dir != NULL) + dir = home_dir; + + /* use current prefix as file path */ + snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir, + internal_config.hugefile_prefix); +} + + + +/* + * data flow for socket comm protocol: + * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP + * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number + * 2. server receives message + * 2a. in case of invalid group, SOCKET_ERR is sent back to client + * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client + * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd + * + * in case of any error, socket is closed. 
+ */ + +/* send a request, return -1 on error */ +int +vfio_mp_sync_send_request(int socket, int req) +{ + struct msghdr hdr; + struct iovec iov; + int buf; + int ret; + + memset(&hdr, 0, sizeof(hdr)); + + buf = req; + + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; + iov.iov_base = (char *) &buf; + iov.iov_len = sizeof(buf); + + ret = sendmsg(socket, &hdr, 0); + if (ret < 0) + return -1; + return 0; +} + +/* receive a request and return it */ +int +vfio_mp_sync_receive_request(int socket) +{ + int buf; + struct msghdr hdr; + struct iovec iov; + int ret, req; + + memset(&hdr, 0, sizeof(hdr)); + + buf = SOCKET_ERR; + + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; + iov.iov_base = (char *) &buf; + iov.iov_len = sizeof(buf); + + ret = recvmsg(socket, &hdr, 0); + if (ret < 0) + return -1; + + req = buf; + + return req; +} + +/* send OK in message, fd in control message */ +int +vfio_mp_sync_send_fd(int socket, int fd) +{ + int buf; + struct msghdr hdr; + struct cmsghdr *chdr; + char chdr_buf[CMSGLEN]; + struct iovec iov; + int ret; + + chdr = (struct cmsghdr *) chdr_buf; + memset(chdr, 0, sizeof(chdr_buf)); + memset(&hdr, 0, sizeof(hdr)); + + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; + iov.iov_base = (char *) &buf; + iov.iov_len = sizeof(buf); + hdr.msg_control = chdr; + hdr.msg_controllen = CMSGLEN; + + buf = SOCKET_OK; + FD_TO_CMSGHDR(fd, *chdr); + + ret = sendmsg(socket, &hdr, 0); + if (ret < 0) + return -1; + return 0; +} + +/* receive OK in message, fd in control message */ +int +vfio_mp_sync_receive_fd(int socket) +{ + int buf; + struct msghdr hdr; + struct cmsghdr *chdr; + char chdr_buf[CMSGLEN]; + struct iovec iov; + int ret, req, fd; + + buf = SOCKET_ERR; + + chdr = (struct cmsghdr *) chdr_buf; + memset(chdr, 0, sizeof(chdr_buf)); + memset(&hdr, 0, sizeof(hdr)); + + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; + iov.iov_base = (char *) &buf; + iov.iov_len = sizeof(buf); + hdr.msg_control = chdr; + hdr.msg_controllen = CMSGLEN; + + ret = recvmsg(socket, &hdr, 0); + if 
(ret < 0) + return -1; + + req = buf; + + if (req != SOCKET_OK) + return -1; + + CMSGHDR_TO_FD(*chdr, fd); + + return fd; +} + +/* connect socket_fd in secondary process to the primary process's socket */ +int +vfio_mp_sync_connect_to_primary(void) +{ + struct sockaddr_un addr; + socklen_t sockaddr_len; + int socket_fd; + + /* set up a socket */ + socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0); + if (socket_fd < 0) { + RTE_LOG(ERR, EAL, "Failed to create socket!\n"); + return -1; + } + + get_socket_path(addr.sun_path, sizeof(addr.sun_path)); + addr.sun_family = AF_UNIX; + + sockaddr_len = sizeof(struct sockaddr_un); + + if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0) + return socket_fd; + + /* if connect failed */ + close(socket_fd); + return -1; +} + + + +/* + * socket listening thread for primary process + */ +static __attribute__((noreturn)) void * +vfio_mp_sync_thread(void __rte_unused * arg) +{ + int ret, fd, vfio_group_no; + + /* wait for requests on the socket */ + for (;;) { + int conn_sock; + struct sockaddr_un addr; + socklen_t sockaddr_len = sizeof(addr); + + /* this is a blocking call */ + conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr, + &sockaddr_len); + + /* just restart on error */ + if (conn_sock == -1) + continue; + + /* set socket to linger after close */ + struct linger l; + l.l_onoff = 1; + l.l_linger = 60; + + if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0) + RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option " + "on listen socket (%s)\n", strerror(errno)); + + ret = vfio_mp_sync_receive_request(conn_sock); + + switch (ret) { + case SOCKET_REQ_CONTAINER: + fd = vfio_get_container_fd(); + if (fd < 0) + vfio_mp_sync_send_request(conn_sock, SOCKET_ERR); + else + vfio_mp_sync_send_fd(conn_sock, fd); + break; + case SOCKET_REQ_GROUP: + /* wait for group number */ + vfio_group_no = vfio_mp_sync_receive_request(conn_sock); + if (vfio_group_no < 0) { + close(conn_sock); + continue; + } + + fd = 
vfio_get_group_fd(vfio_group_no); + + if (fd < 0) + vfio_mp_sync_send_request(conn_sock, SOCKET_ERR); + /* if VFIO group exists but isn't bound to VFIO driver */ + else if (fd == 0) + vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD); + /* if group exists and is bound to VFIO driver */ + else { + vfio_mp_sync_send_request(conn_sock, SOCKET_OK); + vfio_mp_sync_send_fd(conn_sock, fd); + } + break; + default: + vfio_mp_sync_send_request(conn_sock, SOCKET_ERR); + break; + } + close(conn_sock); + } +} + +static int +vfio_mp_sync_socket_setup(void) +{ + int ret, socket_fd; + struct sockaddr_un addr; + socklen_t sockaddr_len; + + /* set up a socket */ + socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0); + if (socket_fd < 0) { + RTE_LOG(ERR, EAL, "Failed to create socket!\n"); + return -1; + } + + get_socket_path(addr.sun_path, sizeof(addr.sun_path)); + addr.sun_family = AF_UNIX; + + sockaddr_len = sizeof(struct sockaddr_un); + + unlink(addr.sun_path); + + ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len); + if (ret) { + RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno)); + close(socket_fd); + return -1; + } + + ret = listen(socket_fd, 50); + if (ret) { + RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno)); + close(socket_fd); + return -1; + } + + /* save the socket in local configuration */ + mp_socket_fd = socket_fd; + + return 0; +} + +/* + * set up a local socket and tell it to listen for incoming connections + */ +int +vfio_mp_sync_setup(void) +{ + int ret; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + if (vfio_mp_sync_socket_setup() < 0) { + RTE_LOG(ERR, EAL, "Failed to set up local socket!\n"); + return -1; + } + + ret = pthread_create(&socket_thread, NULL, + vfio_mp_sync_thread, NULL); + if (ret) { + RTE_LOG(ERR, EAL, + "Failed to create thread for communication with secondary processes!\n"); + close(mp_socket_fd); + return -1; + } + + /* Set thread_name for aid in debugging. 
*/ + snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync"); + ret = rte_thread_setname(socket_thread, thread_name); + if (ret) + RTE_LOG(DEBUG, EAL, + "Failed to set thread name for secondary processes!\n"); + + return 0; +} + +#endif diff --git a/lib/librte_eal/linuxapp/eal/eal_xen_memory.c b/lib/librte_eal/linuxapp/eal/eal_xen_memory.c index 0b612bb1..bddbdb07 100644 --- a/lib/librte_eal/linuxapp/eal/eal_xen_memory.c +++ b/lib/librte_eal/linuxapp/eal/eal_xen_memory.c @@ -167,8 +167,8 @@ rte_xen_mem_phy2mch(int32_t memseg_id, const phys_addr_t phy_addr) if (memseg_id == -1) { for (i = 0; i < RTE_MAX_MEMSEG; i++) { if ((phy_addr >= memseg[i].phys_addr) && - (phys_addr < memseg[i].phys_addr + - memseg[i].size)) { + (phy_addr < memseg[i].phys_addr + + memseg[i].len)) { memseg_id = i; break; } diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map index 05134673..a617b9e4 100644 --- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map +++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map @@ -138,6 +138,7 @@ DPDK_2.2 { rte_keepalive_mark_alive; rte_keepalive_register_core; rte_xen_dom0_supported; + rte_xen_mem_phy2mch; } DPDK_2.1; diff --git a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c index 45a5720e..df41e457 100644 --- a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c +++ b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c @@ -342,16 +342,6 @@ igbuio_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) goto fail_free; } - /* - * reserve device's PCI memory regions for use by this - * module - */ - err = pci_request_regions(dev, "igb_uio"); - if (err != 0) { - dev_err(&dev->dev, "Cannot request regions\n"); - goto fail_disable; - } - /* enable bus mastering on the device */ pci_set_master(dev); @@ -441,8 +431,6 @@ fail_release_iomem: igbuio_pci_release_iomem(&udev->info); if (udev->mode == RTE_INTR_MODE_MSIX) pci_disable_msix(udev->pdev); - pci_release_regions(dev); 
-fail_disable: pci_disable_device(dev); fail_free: kfree(udev); @@ -460,7 +448,6 @@ igbuio_pci_remove(struct pci_dev *dev) igbuio_pci_release_iomem(&udev->info); if (udev->mode == RTE_INTR_MODE_MSIX) pci_disable_msix(dev); - pci_release_regions(dev); pci_disable_device(dev); pci_set_drvdata(dev, NULL); kfree(udev); -- cgit 1.2.3-korg