diff options
author | Benoît Ganne <bganne@cisco.com> | 2019-03-25 11:41:34 +0100 |
---|---|---|
committer | Damjan Marion <dmarion@me.com> | 2019-03-28 19:31:59 +0000 |
commit | fe750c248be58b76479836639fbd0c4617210aa5 (patch) | |
tree | 471a48243fb13e3eb84c95cf0be0b270607ae286 | |
parent | 6bc6fd0aebd7feb523604973bcf593bfe14bbd30 (diff) |
Add RDMA ibverbs driver plugin
RDMA ibverbs is a userspace API for efficiently receiving and transmitting
packets. This is an initial, unoptimized driver targeting Mellanox cards.
Next steps should include batching, multiqueue support and support for additional cards.
Change-Id: I0309c7a543f75f2f9317eaf63ca502ac7a093ef9
Signed-off-by: Benoît Ganne <bganne@cisco.com>
-rw-r--r-- | build/external/Makefile | 32 | ||||
-rwxr-xr-x | build/external/deb/debian/rules | 1 | ||||
-rw-r--r-- | build/external/packages.mk | 6 | ||||
-rw-r--r-- | build/external/packages/dpdk.mk | 60 | ||||
-rw-r--r-- | build/external/packages/rdma-core.mk | 46 | ||||
-rw-r--r-- | src/plugins/rdma/CMakeLists.txt | 61 | ||||
-rw-r--r-- | src/plugins/rdma/cli.c | 133 | ||||
-rw-r--r-- | src/plugins/rdma/device.c | 607 | ||||
-rw-r--r-- | src/plugins/rdma/format.c | 89 | ||||
-rw-r--r-- | src/plugins/rdma/input.c | 202 | ||||
-rw-r--r-- | src/plugins/rdma/output.c | 133 | ||||
-rw-r--r-- | src/plugins/rdma/plugin.c | 35 | ||||
-rw-r--r-- | src/plugins/rdma/rdma.h | 141 | ||||
-rw-r--r-- | src/vnet/devices/tap/tap.c | 12 | ||||
-rw-r--r-- | src/vnet/ethernet/mac_address.h | 11 |
15 files changed, 1508 insertions, 61 deletions
diff --git a/build/external/Makefile b/build/external/Makefile index a1352a69cbf..084d694e88c 100644 --- a/build/external/Makefile +++ b/build/external/Makefile @@ -31,11 +31,18 @@ include packages.mk include packages/nasm.mk include packages/ipsec-mb.mk include packages/dpdk.mk +include packages/rdma-core.mk .PHONY: clean clean: @rm -rf $(B) $(I) +.PHONY: install +install: dpdk-install rdma-core-install + +.PHONY: config +config: dpdk-config rdma-core-config + ############################################################################## # .deb packaging ############################################################################## @@ -62,11 +69,6 @@ build-deb: $(DEV_DEB) install-deb: ifneq ($(INSTALLED_VER),$(DEB_VER)-$(PKG_SUFFIX)) - @echo "==========================================================" - @echo " Out of date vpp-ext-deps package installed." - @echo " Installed: $(INSTALLED_VER)" - @echo " Needed: $(DEB_VER)-$(PKG_SUFFIX)" - @echo "==========================================================" @make $(DEV_DEB) @sudo dpkg -i $(DEV_DEB) else @@ -78,9 +80,9 @@ endif check-deb: ifneq ($(INSTALLED_VER),$(DEB_VER)-$(PKG_SUFFIX)) @echo "==========================================================" - @echo " Outdated DPDK package detected:" - @echo " Installed: vpp-ext-deps $(INSTALLED_VER)" - @echo " Current: vpp-ext-deps $(DEB_VER)-$(PKG_SUFFIX)" + @echo " Out of date vpp-ext-deps package installed." + @echo " Installed: $(INSTALLED_VER)" + @echo " Needed: $(DEB_VER)-$(PKG_SUFFIX)" @echo "" @echo " Please upgrade by invoking 'make install-ext-deps'" @echo " from the top level directory." 
@@ -115,16 +117,16 @@ ifneq ($(INSTALLED_RPM_VER),$(RPM_VER)-$(PKG_SUFFIX)) sudo rpm -Uih --force $(DEV_RPM) else @echo "==========================================================" - @echo " Up-to-date DPDK package already installed" + @echo " Up-to-date vpp-ext-deps package already installed" @echo "==========================================================" endif check-rpm: ifneq ($(INSTALLED_RPM_VER),$(RPM_VER)-$(PKG_SUFFIX)) @echo "==========================================================" - @echo " Outdated DPDK package detected:" - @echo " Installed: vpp-ext-deps $(INSTALLED_RPM_VER)" - @echo " Current: vpp-ext-deps $(RPM_VER)-$(PKG_SUFFIX)" + @echo " Out of date vpp-ext-deps package installed." + @echo " Installed: $(INSTALLED_RPM_VER)" + @echo " Needed: $(RPM_VER)-$(PKG_SUFFIX)" @echo "" @echo " Please upgrade by invoking 'make install-ext-deps'" @echo " from the top level directory." @@ -140,9 +142,9 @@ endif ebuild-build: ifeq ($(INSTALLED_VER)$(INSTALLED_RPM_VER),) @echo "==========================================================" - @echo "Building DPDK from source. Consider installing development" - @echo "package by invoking 'make install-ext-deps' from the" - @echo "top level directory" + @echo "Building vpp-ext-deps from source. 
Consider installing" + @echo "development package by invoking 'make install-ext-deps'" + @echo "from the top level directory" @echo "==========================================================" make config else diff --git a/build/external/deb/debian/rules b/build/external/deb/debian/rules index 6393f82716b..2b1157e5e48 100755 --- a/build/external/deb/debian/rules +++ b/build/external/deb/debian/rules @@ -20,7 +20,6 @@ override_dh_clean: make $(MAKE_ARGS) clean override_dh_auto_configure: - make $(MAKE_ARGS) config override_dh_install: make $(MAKE_ARGS) install diff --git a/build/external/packages.mk b/build/external/packages.mk index 4056b2f63b5..005c2a958b2 100644 --- a/build/external/packages.mk +++ b/build/external/packages.mk @@ -31,12 +31,12 @@ $1_install_log ?= $(B)/$1.install.log downloads/$($1_tarball): mkdir -p downloads @if [ -e $(DL_CACHE_DIR)/$($1_tarball) ] ; \ - then cp $(DL_CACHE_DIR)/$($1_tarball) downloads/ ; \ + then cp $(DL_CACHE_DIR)/$($1_tarball) $$@ ; \ else \ echo "Downloading $($1_url)" ; \ - curl -o downloads/$($1_tarball) -LO $($1_url) ; \ + curl -o $$@ -LO $($1_url) ; \ fi - @rm -f $(B)/.download.ok + @rm -f $(B)/.$1.download.ok $(B)/.$1.download.ok: downloads/$($1_tarball) @mkdir -p $(B) diff --git a/build/external/packages/dpdk.mk b/build/external/packages/dpdk.mk index 68c27678931..ae9d9c5df95 100644 --- a/build/external/packages/dpdk.mk +++ b/build/external/packages/dpdk.mk @@ -167,9 +167,7 @@ define set fi endef -all: build - -$(B)/custom-config: $(B)/.patch.ok Makefile +$(B)/custom-config: $(B)/.dpdk-patch.ok Makefile @echo --- generating custom config from $(DPDK_SOURCE)/config/defconfig_$(DPDK_TARGET) --- @cpp -undef -ffreestanding -x assembler-with-cpp $(DPDK_SOURCE)/config/defconfig_$(DPDK_TARGET) $@ $(call set,RTE_MACHINE,$(DPDK_MACHINE)) @@ -230,18 +228,19 @@ $(B)/custom-config: $(B)/.patch.ok Makefile $(call set,RTE_LIBRTE_DPAA_PMD,n) $(call set,RTE_LIBRTE_PMD_DPAA_SEC,n) $(call set,RTE_LIBRTE_PMD_DPAA_EVENTDEV,n) - @rm -f 
.config.ok + @rm -f .dpdk-config.ok + +DPDK_DOWNLOADS = $(CURDIR)/downloads/$(DPDK_TARBALL) -$(CURDIR)/$(DPDK_TARBALL): +$(DPDK_DOWNLOADS): + mkdir -p downloads @if [ -e $(DPDK_DOWNLOAD_DIR)/$(DPDK_TARBALL) ] ; \ - then cp $(DPDK_DOWNLOAD_DIR)/$(DPDK_TARBALL) $(CURDIR) ; \ - else curl -o $(CURDIR)/$(DPDK_TARBALL) -LO $(DPDK_TAR_URL) ; \ + then cp $(DPDK_DOWNLOAD_DIR)/$(DPDK_TARBALL) $@ ; \ + else curl -o $@ -LO $(DPDK_TAR_URL) ; \ fi - @rm -f $(B)/.download.ok + @rm -f $(B)/.dpdk-download.ok -DPDK_DOWNLOADS = $(CURDIR)/$(DPDK_TARBALL) - -$(B)/.download.ok: $(DPDK_DOWNLOADS) +$(B)/.dpdk-download.ok: $(DPDK_DOWNLOADS) @mkdir -p $(B) @openssl md5 $< | cut -f 2 -d " " - > $(B)/$(DPDK_TARBALL).md5sum @([ "$$(<$(B)/$(DPDK_TARBALL).md5sum)" = "$(DPDK_$(DPDK_VERSION)_TARBALL_MD5_CKSUM)" ] || \ @@ -249,18 +248,18 @@ $(B)/.download.ok: $(DPDK_DOWNLOADS) rm $(B)/$(DPDK_TARBALL).md5sum && false )) @touch $@ -.PHONY: download -download: $(B)/.download.ok +.PHONY: dpdk-download +dpdk-download: $(B)/.dpdk-download.ok -$(B)/.extract.ok: $(B)/.download.ok +$(B)/.dpdk-extract.ok: $(B)/.dpdk-download.ok @echo --- extracting $(DPDK_TARBALL) --- - @tar --directory $(B) --extract --file $(CURDIR)/$(DPDK_TARBALL) + @tar --directory $(B) --extract --file $(DPDK_DOWNLOADS) @touch $@ -.PHONY: extract -extract: $(B)/.extract.ok +.PHONY: dpdk-extract +dpdk-extract: $(B)/.dpdk-extract.ok -$(B)/.patch.ok: $(B)/.extract.ok +$(B)/.dpdk-patch.ok: $(B)/.dpdk-extract.ok ifneq ($(wildcard $(CURDIR)/patches/dpdk_$(DPDK_VERSION)/*.patch),) @echo --- patching --- @for f in $(CURDIR)/patches/dpdk_$(DPDK_VERSION)/*.patch ; do \ @@ -270,26 +269,23 @@ ifneq ($(wildcard $(CURDIR)/patches/dpdk_$(DPDK_VERSION)/*.patch),) endif @touch $@ -.PHONY: patch -patch: $(B)/.patch.ok +.PHONY: dpdk-patch +dpdk-patch: $(B)/.dpdk-patch.ok -$(B)/.config.ok: $(B)/.patch.ok $(B)/custom-config +$(B)/.dpdk-config.ok: $(B)/.dpdk-patch.ok $(B)/custom-config @make $(DPDK_MAKE_ARGS) config @touch $@ -.PHONY: config -config: 
$(B)/.config.ok +.PHONY: dpdk-config +dpdk-config: $(B)/.dpdk-config.ok -.PHONY: build-dpdk -build-dpdk: $(DPDK_BUILD_DEPS) - @if [ ! -e $(B)/.config.ok ] ; then echo 'Please run "make config" first' && false ; fi +$(B)/.dpdk-build.ok: dpdk-config $(DPDK_BUILD_DEPS) + @if [ ! -e $(B)/.dpdk-config.ok ] ; then echo 'Please run "make config" first' && false ; fi @make $(DPDK_MAKE_ARGS) install - -$(B)/.build.ok: build-dpdk @touch $@ -.PHONY: build -build: $(B)/.build.ok +.PHONY: dpdk-build +dpdk-build: $(B)/.dpdk-build.ok -.PHONY: install -install: $(B)/.build.ok +.PHONY: dpdk-install +dpdk-install: $(B)/.dpdk-build.ok diff --git a/build/external/packages/rdma-core.mk b/build/external/packages/rdma-core.mk new file mode 100644 index 00000000000..0e8c878034e --- /dev/null +++ b/build/external/packages/rdma-core.mk @@ -0,0 +1,46 @@ +# Copyright (c) 2018 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +rdma-core_version := 23 +rdma-core_tarball := rdma-core-$(rdma-core_version).tar.gz +rdma-core_tarball_md5sum_22.1 := dde4d30e3db20893408ae51041117034 +rdma-core_tarball_md5sum_23 := c78575735c4a71609c1a214ea16cd8dc +rdma-core_tarball_md5sum := $(rdma-core_tarball_md5sum_$(rdma-core_version)) +rdma-core_tarball_strip_dirs := 1 +rdma-core_url := http://github.com/linux-rdma/rdma-core/releases/download/v$(rdma-core_version)/$(rdma-core_tarball) + +RDMA_FILES := include/infiniband/verbs.h \ + include/infiniband/verbs_api.h \ + include/infiniband/ib_user_ioctl_verbs.h \ + include/rdma/ib_user_verbs.h \ + lib/statics/libibverbs.a \ + lib/statics/libmlx5.a + +define rdma-core_config_cmds + cd $(rdma-core_build_dir) && \ + cmake -G Ninja $(rdma-core_src_dir) \ + -DENABLE_STATIC=1 -DENABLE_RESOLVE_NEIGH=0 -DNO_PYVERBS=1 -DENABLE_VALGRIND=0 \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_C_FLAGS=-fPIC > $(rdma-core_config_log) +endef + +define rdma-core_build_cmds + cmake --build $(rdma-core_build_dir) -- libibverbs.a libmlx5.a > $(rdma-core_build_log) +endef + +define rdma-core_install_cmds + mkdir -p $(rdma-core_install_dir) + tar -C $(rdma-core_build_dir) --xform='s|/statics/|/|' -hc $(RDMA_FILES) | tar -C $(rdma-core_install_dir) -xv > $(rdma-core_install_log) +endef + +$(eval $(call package,rdma-core)) diff --git a/src/plugins/rdma/CMakeLists.txt b/src/plugins/rdma/CMakeLists.txt new file mode 100644 index 00000000000..35d43db28a1 --- /dev/null +++ b/src/plugins/rdma/CMakeLists.txt @@ -0,0 +1,61 @@ +# Copyright (c) 2018 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +message(STATUS "RDMA plugins - looking for ibverbs") + +find_path(IBVERBS_INCLUDE_DIR NAMES infiniband/verbs.h) +find_library(IBVERBS_LIB NAMES libibverbs.a) +find_library(MLX5_LIB NAMES libmlx5.a) + +if (NOT IBVERBS_LIB OR NOT MLX5_LIB) + message(WARNING "RDMA plugins - ibverbs not found - rdma_plugin disabled") + return() +endif() + +if (MLX5_LIB) + string_append(RDMA_LINK_FLAGS "-Wl,--whole-archive,${MLX5_LIB},--no-whole-archive") +endif() + +set(CMAKE_REQUIRED_FLAGS "-fPIC -shared ${IBVERBS_LIB} ${RDMA_LINK_FLAGS}") +CHECK_C_SOURCE_COMPILES("" IBVERBS_COMPILES_CHECK) + +if (NOT IBVERBS_COMPILES_CHECK) + message(WARNING "RDMA plugins - no working ibverbs found - rdma_plugin disabled") + return() +endif() + +message(STATUS "RDMA plugins - found ${IBVERBS_INCLUDE_DIR}") +message(STATUS "RDMA plugins - found ${IBVERBS_LIB}") +message(STATUS "RDMA plugins - found ${MLX5_LIB}") + +include_directories(${IBVERBS_INCLUDE_DIR}) + +add_vpp_plugin(rdma + SOURCES + cli.c + device.c + format.c + plugin.c + input.c + output.c + + MULTIARCH_SOURCES + input.c + output.c + + LINK_FLAGS + "${RDMA_LINK_FLAGS}" + + LINK_LIBRARIES + ${IBVERBS_LIB} +) diff --git a/src/plugins/rdma/cli.c b/src/plugins/rdma/cli.c new file mode 100644 index 00000000000..8919603e293 --- /dev/null +++ b/src/plugins/rdma/cli.c @@ -0,0 +1,133 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2018 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ +#include <stdint.h> +#include <net/if.h> +#include <sys/ioctl.h> +#include <inttypes.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vlib/pci/pci.h> +#include <vnet/ethernet/ethernet.h> + +#include <rdma/rdma.h> + +static clib_error_t * +rdma_create_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + rdma_create_if_args_t args; + + clib_memset (&args, 0, sizeof (rdma_create_if_args_t)); + + /* Get a line of input. 
*/ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "name %s", &args.ifname)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + unformat_free (line_input); + + rdma_create_if (vm, &args); + + vec_free (args.ifname); + + return args.error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (rdma_create_command, static) = { + .path = "create interface rdma", + .short_help = "create interface rdma <name ifname>", + .function = rdma_create_command_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +rdma_delete_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + u32 sw_if_index = ~0; + vnet_hw_interface_t *hw; + rdma_main_t *rm = &rdma_main; + rdma_device_t *rd; + vnet_main_t *vnm = vnet_get_main (); + + /* Get a line of input. 
*/ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "sw_if_index %d", &sw_if_index)) + ; + else if (unformat (line_input, "%U", unformat_vnet_sw_interface, + vnm, &sw_if_index)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + unformat_free (line_input); + + if (sw_if_index == ~0) + return clib_error_return (0, + "please specify interface name or sw_if_index"); + + hw = vnet_get_sup_hw_interface (vnm, sw_if_index); + if (hw == NULL || rdma_device_class.index != hw->dev_class_index) + return clib_error_return (0, "not an AVF interface"); + + rd = pool_elt_at_index (rm->devices, hw->dev_instance); + + rdma_delete_if (vm, rd); + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (rdma_delete_command, static) = { + .path = "delete interface rdma", + .short_help = "delete interface rdma " + "{<interface> | sw_if_index <sw_idx>}", + .function = rdma_delete_command_fn, +}; +/* *INDENT-ON* */ + +clib_error_t * +rdma_cli_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (rdma_cli_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/rdma/device.c b/src/plugins/rdma/device.c new file mode 100644 index 00000000000..31112a923d0 --- /dev/null +++ b/src/plugins/rdma/device.c @@ -0,0 +1,607 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <unistd.h> +#include <fcntl.h> +#include <net/if.h> +#include <linux/if_link.h> +#include <linux/if_ether.h> + +#include <vppinfra/linux/sysfs.h> +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vlib/pci/pci.h> +#include <vnet/ethernet/ethernet.h> + +#include <rdma/rdma.h> + +rdma_main_t rdma_main; + +#define rdma_log_debug(dev, f, ...) \ +{ \ + vlib_log(VLIB_LOG_LEVEL_DEBUG, rdma_main.log_class, "%U: " f, \ + format_vlib_pci_addr, &rd->pci_addr, ##__VA_ARGS__); \ +}; + +static u32 +rdma_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags) +{ + rdma_main_t *rm = &rdma_main; + vlib_log_warn (rm->log_class, "TODO"); + return 0; +} + +static void +rdma_update_state (vnet_main_t * vnm, rdma_device_t * rd, int port) +{ + struct ibv_port_attr attr; + u32 width = 0; + u32 speed = 0; + + if (ibv_query_port (rd->ctx, port, &attr)) + { + vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, 0); + vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0); + return; + } + + /* update state */ + switch (attr.state) + { + case IBV_PORT_ACTIVE: /* fallthrough */ + case IBV_PORT_ACTIVE_DEFER: + rd->flags |= RDMA_DEVICE_F_LINK_UP; + vnet_hw_interface_set_flags (vnm, rd->hw_if_index, + VNET_HW_INTERFACE_FLAG_LINK_UP); + break; + default: + rd->flags &= ~RDMA_DEVICE_F_LINK_UP; + vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0); + break; + } + + /* update speed */ + switch (attr.active_width) + { + case 1: + width = 1; + break; + case 2: + width = 
4; + break; + case 4: + width = 8; + break; + case 8: + width = 12; + break; + } + switch (attr.active_speed) + { + case 1: + speed = 2500000; + break; + case 2: + speed = 5000000; + break; + case 4: /* fallthrough */ + case 8: + speed = 10000000; + break; + case 16: + speed = 14000000; + break; + case 32: + speed = 25000000; + break; + } + vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, width * speed); +} + +static clib_error_t * +rdma_async_event_error_ready (clib_file_t * f) +{ + rdma_main_t *rm = &rdma_main; + rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data); + return clib_error_return (0, "RDMA async event error for device %U", + format_vlib_pci_addr, &rd->pci_addr); +} + +static clib_error_t * +rdma_async_event_read_ready (clib_file_t * f) +{ + vnet_main_t *vnm = vnet_get_main (); + rdma_main_t *rm = &rdma_main; + rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data); + int ret; + struct ibv_async_event event; + ret = ibv_get_async_event (rd->ctx, &event); + if (ret < 0) + { + return clib_error_return_unix (0, "ibv_get_async_event() failed"); + } + + switch (event.event_type) + { + case IBV_EVENT_PORT_ACTIVE: + rdma_update_state (vnm, rd, event.element.port_num); + break; + case IBV_EVENT_PORT_ERR: + rdma_update_state (vnm, rd, event.element.port_num); + break; + case IBV_EVENT_DEVICE_FATAL: + rd->flags &= ~RDMA_DEVICE_F_LINK_UP; + vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0); + vlib_log_emerg (rm->log_class, "Fatal RDMA error for device %U", + format_vlib_pci_addr, &rd->pci_addr); + break; + default: + vlib_log_warn (rm->log_class, + "Unhandeld RDMA async event %i for device %U", + event.event_type, format_vlib_pci_addr, &rd->pci_addr); + break; + } + + ibv_ack_async_event (&event); + return 0; +} + +static clib_error_t * +rdma_async_event_init (rdma_device_t * rd) +{ + clib_file_t t = { 0 }; + int ret; + + /* make RDMA async event fd non-blocking */ + ret = fcntl (rd->ctx->async_fd, F_GETFL); + if (ret < 0) 
+ { + return clib_error_return_unix (0, "fcntl(F_GETFL) failed"); + } + ret = fcntl (rd->ctx->async_fd, F_SETFL, ret | O_NONBLOCK); + if (ret < 0) + { + return clib_error_return_unix (0, "fcntl(F_SETFL, O_NONBLOCK) failed"); + } + + /* register RDMA async event fd */ + t.read_function = rdma_async_event_read_ready; + t.file_descriptor = rd->ctx->async_fd; + t.error_function = rdma_async_event_error_ready; + t.private_data = rd->dev_instance; + t.description = + format (0, "RMDA %U async event", format_vlib_pci_addr, &rd->pci_addr); + + rd->async_event_clib_file_index = clib_file_add (&file_main, &t); + + return 0; +} + +static void +rdma_async_event_cleanup (rdma_device_t * rd) +{ + clib_file_del_by_index (&file_main, rd->async_event_clib_file_index); +} + +static clib_error_t * +rdma_register_interface (vnet_main_t * vnm, rdma_device_t * rd) +{ + return ethernet_register_interface (vnm, rdma_device_class.index, + rd->dev_instance, rd->hwaddr, + &rd->hw_if_index, rdma_flag_change); +} + +static void +rdma_unregister_interface (vnet_main_t * vnm, rdma_device_t * rd) +{ + vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0); + vnet_hw_interface_unassign_rx_thread (vnm, rd->hw_if_index, 0); + ethernet_delete_interface (vnm, rd->hw_if_index); +} + +static void +rdma_dev_cleanup (rdma_device_t * rd) +{ + rdma_main_t *rm = &rdma_main; + rdma_rxq_t *rxq; + rdma_txq_t *txq; + +#define _(fn, arg) if (arg) \ + { \ + int rv; \ + if ((rv = fn (arg))) \ + rdma_log_debug (rd, #fn "() failed (rv = %d)", rv); \ + } + + _(ibv_destroy_flow, rd->flow_mcast); + _(ibv_destroy_flow, rd->flow_ucast); + _(ibv_dereg_mr, rd->mr); + vec_foreach (txq, rd->txqs) + { + _(ibv_destroy_qp, txq->qp); + _(ibv_destroy_cq, txq->cq); + } + vec_foreach (rxq, rd->rxqs) + { + _(ibv_destroy_qp, rxq->qp); + _(ibv_destroy_cq, rxq->cq); + } + _(ibv_dealloc_pd, rd->pd); + _(ibv_close_device, rd->ctx); +#undef _ + + clib_error_free (rd->error); + + vec_free (rd->rxqs); + vec_free (rd->txqs); + pool_put 
(rm->devices, rd); +} + +static clib_error_t * +rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) +{ + rdma_rxq_t *rxq; + struct ibv_qp_init_attr qpia; + struct ibv_qp_attr qpa; + int qp_flags; + + vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES); + rxq = vec_elt_at_index (rd->rxqs, qid); + rxq->size = n_desc; + + if ((rxq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0) + return clib_error_return_unix (0, "Create CQ Failed"); + + memset (&qpia, 0, sizeof (qpia)); + qpia.qp_type = IBV_QPT_RAW_PACKET; + qpia.send_cq = rxq->cq; + qpia.recv_cq = rxq->cq; + qpia.cap.max_recv_wr = n_desc; + qpia.cap.max_recv_sge = 1; + + if ((rxq->qp = ibv_create_qp (rd->pd, &qpia)) == 0) + return clib_error_return_unix (0, "Queue Pair create failed"); + + memset (&qpa, 0, sizeof (qpa)); + qp_flags = IBV_QP_STATE | IBV_QP_PORT; + qpa.qp_state = IBV_QPS_INIT; + qpa.port_num = 1; + if (ibv_modify_qp (rxq->qp, &qpa, qp_flags) != 0) + return clib_error_return_unix (0, "Modify QP (init) Failed"); + + memset (&qpa, 0, sizeof (qpa)); + qp_flags = IBV_QP_STATE; + qpa.qp_state = IBV_QPS_RTR; + if (ibv_modify_qp (rxq->qp, &qpa, qp_flags) != 0) + return clib_error_return_unix (0, "Modify QP (receive) Failed"); + + return 0; +} + +static clib_error_t * +rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) +{ + rdma_txq_t *txq; + struct ibv_qp_init_attr qpia; + struct ibv_qp_attr qpa; + int qp_flags; + + vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES); + txq = vec_elt_at_index (rd->txqs, qid); + txq->size = n_desc; + + if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0) + return clib_error_return_unix (0, "Create CQ Failed"); + + memset (&qpia, 0, sizeof (qpia)); + qpia.qp_type = IBV_QPT_RAW_PACKET; + qpia.send_cq = txq->cq; + qpia.recv_cq = txq->cq; + qpia.cap.max_send_wr = n_desc; + qpia.cap.max_send_sge = 1; + + if ((txq->qp = ibv_create_qp (rd->pd, &qpia)) == 0) + return clib_error_return_unix (0, 
"Queue Pair create failed"); + + memset (&qpa, 0, sizeof (qpa)); + qp_flags = IBV_QP_STATE | IBV_QP_PORT; + qpa.qp_state = IBV_QPS_INIT; + qpa.port_num = 1; + if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0) + return clib_error_return_unix (0, "Modify QP (init) Failed"); + + memset (&qpa, 0, sizeof (qpa)); + qp_flags = IBV_QP_STATE; + qpa.qp_state = IBV_QPS_RTR; + if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0) + return clib_error_return_unix (0, "Modify QP (receive) Failed"); + + memset (&qpa, 0, sizeof (qpa)); + qp_flags = IBV_QP_STATE; + qpa.qp_state = IBV_QPS_RTS; + if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0) + return clib_error_return_unix (0, "Modify QP (send) Failed"); + return 0; +} + +static clib_error_t * +rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd) +{ + clib_error_t *err; + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); + u16 i; + + if (rd->ctx == 0) + return clib_error_return_unix (0, "Device Open Failed"); + + if ((rd->pd = ibv_alloc_pd (rd->ctx)) == 0) + return clib_error_return_unix (0, "PD Alloc Failed"); + + if ((err = rdma_rxq_init (vm, rd, 0, 512))) + return err; + + for (i = 0; i < tm->n_vlib_mains; i++) + if ((err = rdma_txq_init (vm, rd, i, 512))) + return err; + + if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start, + bm->buffer_mem_size, + IBV_ACCESS_LOCAL_WRITE)) == 0) + return clib_error_return_unix (0, "Register MR Failed"); + + ethernet_mac_address_generate (rd->hwaddr); + + /* + * restrict packets steering to our MAC + * allows to share a single HW NIC with multiple RDMA ifaces + * and/or Linux + */ + struct raw_eth_flow_attr + { + struct ibv_flow_attr attr; + struct ibv_flow_spec_eth spec_eth; + } __attribute__ ((packed)) fa; + memset (&fa, 0, sizeof (fa)); + fa.attr.num_of_specs = 1; + fa.attr.port = 1; + fa.spec_eth.type = IBV_FLOW_SPEC_ETH; + fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth); + memcpy (fa.spec_eth.val.dst_mac, rd->hwaddr, + sizeof 
(fa.spec_eth.val.dst_mac)); + memset (fa.spec_eth.mask.dst_mac, 0xff, sizeof (fa.spec_eth.mask.dst_mac)); + if ((rd->flow_ucast = ibv_create_flow (rd->rxqs[0].qp, &fa.attr)) == 0) + return clib_error_return_unix (0, "create Flow Failed"); + + /* receive multicast packets too */ + memset (&fa, 0, sizeof (fa)); + fa.attr.num_of_specs = 1; + fa.attr.port = 1; + fa.attr.flags = IBV_FLOW_ATTR_FLAGS_DONT_TRAP; /* let others receive them too */ + fa.spec_eth.type = IBV_FLOW_SPEC_ETH; + fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth); + fa.spec_eth.val.dst_mac[0] = 1; + fa.spec_eth.mask.dst_mac[0] = 1; + if ((rd->flow_mcast = ibv_create_flow (rd->rxqs[0].qp, &fa.attr)) == 0) + return clib_error_return_unix (0, "create Flow Failed"); + + return 0; +} + +static uword +sysfs_path_to_pci_addr (char *path, vlib_pci_addr_t * addr) +{ + uword rv; + unformat_input_t in; + u8 *s; + + s = clib_sysfs_link_to_name (path); + unformat_init_string (&in, (char *) s, strlen ((char *) s)); + rv = unformat (&in, "%U", unformat_vlib_pci_addr, addr); + unformat_free (&in); + vec_free (s); + return rv; +} + +void +rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args) +{ + vnet_main_t *vnm = vnet_get_main (); + rdma_main_t *rm = &rdma_main; + rdma_device_t *rd = 0; + struct ibv_device **dev_list = 0; + int n_devs; + u8 *s = 0, *s2 = 0; + + pool_get_zero (rm->devices, rd); + rd->dev_instance = rd - rm->devices; + rd->per_interface_next_index = ~0; + + /* check if device exist and if it is bound to mlx5_core */ + s = format (s, "/sys/class/net/%s/device/driver/module%c", args->ifname, 0); + s2 = clib_sysfs_link_to_name ((char *) s); + + if (s2 == 0 || strncmp ((char *) s2, "mlx5_core", 9) != 0) + { + args->error = + clib_error_return (0, + "invalid interface (only mlx5 supported for now)"); + goto err0; + } + + /* extract PCI address */ + vec_reset_length (s); + s = format (s, "/sys/class/net/%s/device%c", args->ifname, 0); + if (sysfs_path_to_pci_addr ((char *) s, &rd->pci_addr) == 
0) + { + args->error = clib_error_return (0, "cannot find PCI address"); + goto err0; + } + + dev_list = ibv_get_device_list (&n_devs); + if (n_devs == 0) + { + args->error = + clib_error_return_unix (0, + "no RDMA devices available, errno = %d. Is the ib_uverbs module loaded?", + errno); + goto err1; + } + + for (int i = 0; i < n_devs; i++) + { + vlib_pci_addr_t addr; + + vec_reset_length (s); + s = format (s, "%s/device%c", dev_list[i]->dev_path, 0); + + if (sysfs_path_to_pci_addr ((char *) s, &addr) == 0) + continue; + + if (addr.as_u32 != rd->pci_addr.as_u32) + continue; + + if ((rd->ctx = ibv_open_device (dev_list[i]))) + break; + } + + if ((args->error = rdma_dev_init (vm, rd))) + goto err2; + + if ((args->error = rdma_register_interface (vnm, rd))) + goto err2; + + if ((args->error = rdma_async_event_init (rd))) + goto err3; + + rdma_update_state (vnm, rd, 1); + + vnet_sw_interface_t *sw = vnet_get_hw_sw_interface (vnm, rd->hw_if_index); + args->sw_if_index = rd->sw_if_index = sw->sw_if_index; + /* + * FIXME: add support for interrupt mode + * vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, rd->hw_if_index); + * hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE; + */ + vnet_hw_interface_set_input_node (vnm, rd->hw_if_index, + rdma_input_node.index); + vnet_hw_interface_assign_rx_thread (vnm, rd->hw_if_index, 0, ~0); + return; + +err3: + rdma_unregister_interface (vnm, rd); +err2: + rdma_dev_cleanup (rd); +err1: + ibv_free_device_list (dev_list); +err0: + vec_free (s2); + vec_free (s); + args->rv = VNET_API_ERROR_INVALID_INTERFACE; + vlib_log_err (rm->log_class, "%U", format_clib_error, args->error); +} + +void +rdma_delete_if (vlib_main_t * vm, rdma_device_t * rd) +{ + rdma_async_event_cleanup (rd); + rdma_unregister_interface (vnet_get_main (), rd); + rdma_dev_cleanup (rd); +} + +static clib_error_t * +rdma_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, 
hw_if_index); + rdma_main_t *rm = &rdma_main; + rdma_device_t *rd = vec_elt_at_index (rm->devices, hi->dev_instance); + uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; + + if (rd->flags & RDMA_DEVICE_F_ERROR) + return clib_error_return (0, "device is in error state"); + + if (is_up) + { + vnet_hw_interface_set_flags (vnm, rd->hw_if_index, + VNET_HW_INTERFACE_FLAG_LINK_UP); + rd->flags |= RDMA_DEVICE_F_ADMIN_UP; + } + else + { + vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0); + rd->flags &= ~RDMA_DEVICE_F_ADMIN_UP; + } + return 0; +} + +static void +rdma_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index, + u32 node_index) +{ + rdma_main_t *rm = &rdma_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + rdma_device_t *rd = pool_elt_at_index (rm->devices, hw->dev_instance); + + /* Shut off redirection */ + if (node_index == ~0) + { + rd->per_interface_next_index = node_index; + return; + } + + rd->per_interface_next_index = + vlib_node_add_next (vlib_get_main (), rdma_input_node.index, node_index); +} + +static char *rdma_tx_func_error_strings[] = { +#define _(n,s) s, + foreach_rdma_tx_func_error +#undef _ +}; + +/* *INDENT-OFF* */ +VNET_DEVICE_CLASS (rdma_device_class,) = +{ + .name = "RDMA interface", + .format_device = format_rdma_device, + .format_device_name = format_rdma_device_name, + .admin_up_down_function = rdma_interface_admin_up_down, + .rx_redirect_to_node = rdma_set_interface_next_node, + .tx_function_n_errors = RDMA_TX_N_ERROR, + .tx_function_error_strings = rdma_tx_func_error_strings, +}; +/* *INDENT-ON* */ + +clib_error_t * +rdma_init (vlib_main_t * vm) +{ + rdma_main_t *rm = &rdma_main; + + rm->log_class = vlib_log_register_class ("rdma", 0); + + return 0; +} + +VLIB_INIT_FUNCTION (rdma_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/rdma/format.c b/src/plugins/rdma/format.c new file mode 
100644 index 00000000000..7ef65d43957 --- /dev/null +++ b/src/plugins/rdma/format.c @@ -0,0 +1,89 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vlib/pci/pci.h> +#include <vnet/ethernet/ethernet.h> + +#include <rdma/rdma.h> + +u8 * +format_rdma_device_name (u8 * s, va_list * args) +{ + u32 i = va_arg (*args, u32); + rdma_main_t *rm = &rdma_main; + rdma_device_t *rd = vec_elt_at_index (rm->devices, i); + + s = format (s, "rdma-%u", rd->dev_instance); + return s; +} + +u8 * +format_rdma_device_flags (u8 * s, va_list * args) +{ + rdma_device_t *rd = va_arg (*args, rdma_device_t *); + u8 *t = 0; + +#define _(a, b, c) if (rd->flags & (1 << a)) \ +t = format (t, "%s%s", t ? 
" ":"", c); + foreach_rdma_device_flags +#undef _ + s = format (s, "%v", t); + vec_free (t); + return s; +} + +u8 * +format_rdma_device (u8 * s, va_list * args) +{ + u32 i = va_arg (*args, u32); + rdma_main_t *rm = &rdma_main; + rdma_device_t *rd = vec_elt_at_index (rm->devices, i); + u32 indent = format_get_indent (s); + + s = format (s, "flags: %U", format_rdma_device_flags, rd); + if (rd->error) + s = format (s, "\n%Uerror %U", format_white_space, indent, + format_clib_error, rd->error); + + return s; +} + +u8 * +format_rdma_input_trace (u8 * s, va_list * args) +{ + vlib_main_t *vm = va_arg (*args, vlib_main_t *); + vlib_node_t *node = va_arg (*args, vlib_node_t *); + rdma_input_trace_t *t = va_arg (*args, rdma_input_trace_t *); + vnet_main_t *vnm = vnet_get_main (); + vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, t->hw_if_index); + + s = format (s, "rdma: %v (%d) next-node %U", + hi->name, t->hw_if_index, format_vlib_next_node_name, vm, + node->index, t->next_index); + + return s; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/rdma/input.c b/src/plugins/rdma/input.c new file mode 100644 index 00000000000..001d1c5d493 --- /dev/null +++ b/src/plugins/rdma/input.c @@ -0,0 +1,202 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vlib/pci/pci.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/devices/devices.h> + +#include <rdma/rdma.h> + +#define foreach_rdma_input_error \ + _(BUFFER_ALLOC, "buffer alloc error") + +typedef enum +{ +#define _(f,s) RDMA_INPUT_ERROR_##f, + foreach_rdma_input_error +#undef _ + RDMA_INPUT_N_ERROR, +} rdma_input_error_t; + +static __clib_unused char *rdma_input_error_strings[] = { +#define _(n,s) s, + foreach_rdma_input_error +#undef _ +}; + +static_always_inline void +rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd, + rdma_rxq_t * rxq) +{ + u32 n_alloc, n; + struct ibv_sge sg_entry; + struct ibv_recv_wr wr, *bad_wr; + u32 buffers[VLIB_FRAME_SIZE]; + + if (rxq->n_enq >= rxq->size) + return; + + n_alloc = clib_min (VLIB_FRAME_SIZE, rxq->size - rxq->n_enq); + n_alloc = vlib_buffer_alloc (vm, buffers, n_alloc); + + sg_entry.length = vlib_buffer_get_default_data_size (vm); + sg_entry.lkey = rd->mr->lkey; + wr.num_sge = 1; + wr.sg_list = &sg_entry; + wr.next = NULL; + for (n = 0; n < n_alloc; n++) + { + vlib_buffer_t *b = vlib_get_buffer (vm, buffers[n]); + sg_entry.addr = vlib_buffer_get_va (b); + wr.wr_id = buffers[n]; + if (ibv_post_recv (rxq->qp, &wr, &bad_wr) != 0) + vlib_buffer_free (vm, buffers + n, 1); + else + rxq->n_enq++; + } +} + +static_always_inline uword +rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame, rdma_device_t * rd, u16 qid) +{ + vnet_main_t *vnm = vnet_get_main (); + rdma_rxq_t *rxq = vec_elt_at_index (rd->rxqs, qid); + u32 n_trace; + struct ibv_wc wc[VLIB_FRAME_SIZE]; + u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; + u32 *bi, *to_next, n_left_to_next; + int i; + u32 n_rx_packets = 0, n_rx_bytes = 0; + + n_rx_packets = 
ibv_poll_cq (rxq->cq, VLIB_FRAME_SIZE, wc); + + if (n_rx_packets <= 0) + rdma_device_input_refill (vm, rd, rxq); + + if (PREDICT_FALSE (rd->per_interface_next_index != ~0)) + next_index = rd->per_interface_next_index; + + vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next); + + for (i = 0; i < n_rx_packets; i++) + { + u32 bi = wc[i].wr_id; + vlib_buffer_t *b = vlib_get_buffer (vm, bi); + b->current_length = wc[i].byte_len; + vnet_buffer (b)->sw_if_index[VLIB_RX] = rd->sw_if_index; + vnet_buffer (b)->sw_if_index[VLIB_TX] = ~0; + to_next[i] = bi; + n_rx_bytes += wc[i].byte_len; + } + + if (PREDICT_FALSE ((n_trace = vlib_get_trace_count (vm, node)))) + { + u32 n_left = n_rx_packets, i = 0; + bi = to_next; + + while (n_trace && n_left) + { + vlib_buffer_t *b; + rdma_input_trace_t *tr; + b = vlib_get_buffer (vm, bi[0]); + vlib_trace_buffer (vm, node, next_index, b, /* follow_chain */ 0); + tr = vlib_add_trace (vm, node, b, sizeof (*tr)); + tr->next_index = next_index; + tr->hw_if_index = rd->hw_if_index; + + /* next */ + n_trace--; + n_left--; + bi++; + i++; + } + vlib_set_trace_count (vm, node, n_trace); + } + + if (PREDICT_TRUE (next_index == VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT)) + { + vlib_next_frame_t *nf; + vlib_frame_t *f; + ethernet_input_frame_t *ef; + nf = vlib_node_runtime_get_next_frame (vm, node, next_index); + f = vlib_get_frame (vm, nf->frame_index); + f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX; + + ef = vlib_frame_scalar_args (f); + ef->sw_if_index = rd->sw_if_index; + ef->hw_if_index = rd->hw_if_index; + //f->flags |= ETH_INPUT_FRAME_F_IP4_CKSUM_OK; + } + + n_left_to_next -= n_rx_packets; + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + + vlib_increment_combined_counter + (vnm->interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, vm->thread_index, + rd->hw_if_index, n_rx_packets, n_rx_bytes); + + rxq->n_enq -= n_rx_packets; + rdma_device_input_refill (vm, rd, rxq); + + return n_rx_packets; +} + 
+VLIB_NODE_FN (rdma_input_node) (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_rx = 0; + rdma_main_t *rm = &rdma_main; + vnet_device_input_runtime_t *rt = (void *) node->runtime_data; + vnet_device_and_queue_t *dq; + + foreach_device_and_queue (dq, rt->devices_and_queues) + { + rdma_device_t *rd; + rd = vec_elt_at_index (rm->devices, dq->dev_instance); + if ((rd->flags & RDMA_DEVICE_F_ADMIN_UP) == 0) + continue; + n_rx += rdma_device_input_inline (vm, node, frame, rd, dq->queue_id); + } + return n_rx; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (rdma_input_node) = { + .name = "rdma-input", + .sibling_of = "device-input", + .format_trace = format_rdma_input_trace, + .type = VLIB_NODE_TYPE_INPUT, + .state = VLIB_NODE_STATE_DISABLED, + .n_errors = RDMA_INPUT_N_ERROR, + .error_strings = rdma_input_error_strings, +}; + +/* *INDENT-ON* */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/rdma/output.c b/src/plugins/rdma/output.c new file mode 100644 index 00000000000..410784308f3 --- /dev/null +++ b/src/plugins/rdma/output.c @@ -0,0 +1,133 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vlib/pci/pci.h> +#include <vppinfra/ring.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/devices/devices.h> + +#include <rdma/rdma.h> + +static_always_inline u16 +rdma_device_output_tx (vlib_main_t * vm, rdma_device_t * rd, rdma_txq_t * txq, + u32 * buffers, u16 n_left, u32 * n_tx_packets, + u32 * n_tx_bytes) +{ + struct ibv_sge sg_entry; + struct ibv_send_wr wr, *bad_wr; + u16 i; + + for (i = 0; i < n_left; i++) + { + vlib_buffer_t *b = vlib_get_buffer (vm, buffers[i]); + sg_entry.addr = vlib_buffer_get_current_va (b); + sg_entry.length = b->current_length; + sg_entry.lkey = rd->mr->lkey; + + memset (&wr, 0, sizeof (wr)); + wr.num_sge = 1; + wr.sg_list = &sg_entry; + wr.opcode = IBV_WR_SEND; + wr.send_flags = IBV_SEND_SIGNALED; + wr.wr_id = buffers[i]; + + if (ibv_post_send (txq->qp, &wr, &bad_wr) != 0) + break; + + *n_tx_bytes += b->current_length; + } + + *n_tx_packets += i; + return i; +} + +static_always_inline void +rdma_device_output_free (vlib_main_t * vm, rdma_txq_t * txq) +{ + struct ibv_wc wc[VLIB_FRAME_SIZE]; + u32 to_free[VLIB_FRAME_SIZE]; + int n_free; + int i; + + n_free = ibv_poll_cq (txq->cq, VLIB_FRAME_SIZE, wc); + if (n_free <= 0) + return; + + for (i = 0; i < n_free; i++) + to_free[i] = wc[i].wr_id; + + vlib_buffer_free (vm, to_free, n_free); +} + +VNET_DEVICE_CLASS_TX_FN (rdma_device_class) (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + vnet_main_t *vnm = vnet_get_main (); + rdma_main_t *rm = &rdma_main; + vnet_interface_output_runtime_t *ord = (void *) node->runtime_data; + rdma_device_t *rd = pool_elt_at_index (rm->devices, ord->dev_instance); + u32 thread_index = vm->thread_index; + u8 qid = thread_index; + rdma_txq_t *txq = vec_elt_at_index (rd->txqs, qid % vec_len (rd->txqs)); + u32 *buffers = vlib_frame_vector_args (frame); + u16 n_left; + u16 n_retry = 5; + u32 
n_tx_packets = 0, n_tx_bytes = 0; + + clib_spinlock_lock_if_init (&txq->lock); + + n_left = frame->n_vectors; + + while (n_left) + { + u16 n; + rdma_device_output_free (vm, txq); + n = + rdma_device_output_tx (vm, rd, txq, buffers, n_left, &n_tx_packets, + &n_tx_bytes); + n_left -= n; + buffers += n; + + if (n_left && n_retry--) + { + vlib_buffer_free (vm, buffers, n_left); + vlib_error_count (vm, node->node_index, + RDMA_TX_ERROR_NO_FREE_SLOTS, n_left); + break; + } + } + + clib_spinlock_unlock_if_init (&txq->lock); + + vlib_increment_combined_counter + (vnm->interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_TX, thread_index, + rd->hw_if_index, n_tx_packets, n_tx_bytes); + + return frame->n_vectors - n_left; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/rdma/plugin.c b/src/plugins/rdma/plugin.c new file mode 100644 index 00000000000..f229b75129d --- /dev/null +++ b/src/plugins/rdma/plugin.c @@ -0,0 +1,35 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#include <vlib/vlib.h> +#include <vnet/plugin/plugin.h> +#include <vpp/app/version.h> + +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .description = "RDMA (ibverb) Device Plugin", +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/rdma/rdma.h b/src/plugins/rdma/rdma.h new file mode 100644 index 00000000000..860ddaba2b1 --- /dev/null +++ b/src/plugins/rdma/rdma.h @@ -0,0 +1,141 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2018 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#ifndef _RDMA_H_ +#define _RDMA_H_ + +#include <infiniband/verbs.h> +#include <vlib/log.h> + +#define foreach_rdma_device_flags \ + _(0, INITIALIZED, "initialized") \ + _(1, ERROR, "error") \ + _(2, ADMIN_UP, "admin-up") \ + _(3, VA_DMA, "vaddr-dma") \ + _(4, LINK_UP, "link-up") \ + _(5, SHARED_TXQ_LOCK, "shared-txq-lock") \ + _(6, ELOG, "elog") \ + +enum +{ +#define _(a, b, c) RDMA_DEVICE_F_##b = (1 << a), + foreach_rdma_device_flags +#undef _ +}; + +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + u32 size; + u32 n_enq; + struct ibv_cq *cq; + struct ibv_qp *qp; +} rdma_rxq_t; + +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + u32 size; + u32 n_enq; + struct ibv_cq *cq; + struct ibv_qp *qp; + clib_spinlock_t lock; +} rdma_txq_t; + +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + u32 flags; + u32 per_interface_next_index; + + u32 dev_instance; + u32 sw_if_index; + u32 hw_if_index; + + u32 async_event_clib_file_index; + + rdma_rxq_t *rxqs; + rdma_txq_t *txqs; + + u8 hwaddr[6]; + vlib_pci_addr_t pci_addr; + + struct ibv_context *ctx; + struct ibv_pd *pd; + struct ibv_mr *mr; + struct ibv_flow *flow_ucast; + struct ibv_flow *flow_mcast; + + /* error */ + clib_error_t *error; +} rdma_device_t; + +typedef struct +{ + rdma_device_t *devices; + vlib_log_class_t log_class; +} rdma_main_t; + +extern rdma_main_t rdma_main; + +typedef struct +{ + u8 *ifname; + + /* return */ + int rv; + u32 sw_if_index; + clib_error_t *error; +} rdma_create_if_args_t; + +void rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args); +void rdma_delete_if (vlib_main_t * vm, rdma_device_t * rd); + +extern vlib_node_registration_t rdma_input_node; +extern vnet_device_class_t rdma_device_class; + +/* format.c */ +format_function_t format_rdma_device; +format_function_t format_rdma_device_name; +format_function_t format_rdma_input_trace; + +typedef struct +{ + u32 
next_index; + u32 hw_if_index; +} rdma_input_trace_t; + +#define foreach_rdma_tx_func_error \ +_(NO_FREE_SLOTS, "no free tx slots") + +typedef enum +{ +#define _(f,s) RDMA_TX_ERROR_##f, + foreach_rdma_tx_func_error +#undef _ + RDMA_TX_N_ERROR, +} rdma_tx_func_error_t; + +#endif /* AVF_H */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/devices/tap/tap.c b/src/vnet/devices/tap/tap.c index 0b2ebd6b5e3..2649f68713e 100644 --- a/src/vnet/devices/tap/tap.c +++ b/src/vnet/devices/tap/tap.c @@ -347,16 +347,8 @@ tap_create_if (vlib_main_t * vm, tap_create_if_args_t * args) } if (!args->mac_addr_set) - { - f64 now = vlib_time_now (vm); - u32 rnd; - rnd = (u32) (now * 1e6); - rnd = random_u32 (&rnd); - - memcpy (args->mac_addr + 2, &rnd, sizeof (rnd)); - args->mac_addr[0] = 2; - args->mac_addr[1] = 0xfe; - } + ethernet_mac_address_generate (args->mac_addr); + vif->rx_ring_sz = args->rx_ring_sz != 0 ? args->rx_ring_sz : 256; vif->tx_ring_sz = args->tx_ring_sz != 0 ? args->tx_ring_sz : 256; clib_memcpy (vif->mac_addr, args->mac_addr, 6); diff --git a/src/vnet/ethernet/mac_address.h b/src/vnet/ethernet/mac_address.h index 87a66a242be..01fb76e002b 100644 --- a/src/vnet/ethernet/mac_address.h +++ b/src/vnet/ethernet/mac_address.h @@ -70,6 +70,17 @@ ethernet_mac_address_is_zero (const u8 * mac) return ((*((u32 *) mac) == 0) && (*((u16 *) (mac + 4)) == 0)); } +static inline void +ethernet_mac_address_generate (u8 * mac) +{ + u32 rnd = clib_cpu_time_now (); + rnd = random_u32 (&rnd); + + memcpy (mac + 2, &rnd, sizeof (rnd)); + mac[0] = 2; + mac[1] = 0xfe; +} + static inline int ethernet_mac_address_equal (const u8 * a, const u8 * b) { |