aboutsummaryrefslogtreecommitdiffstats
path: root/src/plugins
diff options
context:
space:
mode:
authorBenoît Ganne <bganne@cisco.com>2019-03-25 11:41:34 +0100
committerDamjan Marion <dmarion@me.com>2019-03-28 19:31:59 +0000
commitfe750c248be58b76479836639fbd0c4617210aa5 (patch)
tree471a48243fb13e3eb84c95cf0be0b270607ae286 /src/plugins
parent6bc6fd0aebd7feb523604973bcf593bfe14bbd30 (diff)
Add RDMA ibverb driver plugin
RDMA ibverb is a userspace API to efficiently rx/tx packets. This is an initial, unoptimized driver targeting Mellanox cards. Next steps should include batching, multiqueue and additional cards. Change-Id: I0309c7a543f75f2f9317eaf63ca502ac7a093ef9 Signed-off-by: Benoît Ganne <bganne@cisco.com>
Diffstat (limited to 'src/plugins')
-rw-r--r--src/plugins/rdma/CMakeLists.txt61
-rw-r--r--src/plugins/rdma/cli.c133
-rw-r--r--src/plugins/rdma/device.c607
-rw-r--r--src/plugins/rdma/format.c89
-rw-r--r--src/plugins/rdma/input.c202
-rw-r--r--src/plugins/rdma/output.c133
-rw-r--r--src/plugins/rdma/plugin.c35
-rw-r--r--src/plugins/rdma/rdma.h141
8 files changed, 1401 insertions, 0 deletions
diff --git a/src/plugins/rdma/CMakeLists.txt b/src/plugins/rdma/CMakeLists.txt
new file mode 100644
index 00000000000..35d43db28a1
--- /dev/null
+++ b/src/plugins/rdma/CMakeLists.txt
@@ -0,0 +1,61 @@
+# Copyright (c) 2018 Cisco and/or its affiliates.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Probe for the ibverbs header and the static libibverbs / libmlx5 archives.
+# The rdma plugin is only added when both archives are found and pass the
+# compile check below.
+message(STATUS "RDMA plugins - looking for ibverbs")
+
+find_path(IBVERBS_INCLUDE_DIR NAMES infiniband/verbs.h)
+find_library(IBVERBS_LIB NAMES libibverbs.a)
+find_library(MLX5_LIB NAMES libmlx5.a)
+
+# Both archives are required: stop processing this file if either is missing.
+if (NOT IBVERBS_LIB OR NOT MLX5_LIB)
+ message(WARNING "RDMA plugins - ibverbs not found - rdma_plugin disabled")
+ return()
+endif()
+
+# Wrap the mlx5 archive in --whole-archive / --no-whole-archive.
+# NOTE(review): MLX5_LIB is always set at this point because of the return()
+# above, so this condition looks redundant - confirm before removing.
+if (MLX5_LIB)
+ string_append(RDMA_LINK_FLAGS "-Wl,--whole-archive,${MLX5_LIB},--no-whole-archive")
+endif()
+
+# Sanity check: compile an empty source with the shared-object flags and the
+# libraries found above.
+set(CMAKE_REQUIRED_FLAGS "-fPIC -shared ${IBVERBS_LIB} ${RDMA_LINK_FLAGS}")
+CHECK_C_SOURCE_COMPILES("" IBVERBS_COMPILES_CHECK)
+
+if (NOT IBVERBS_COMPILES_CHECK)
+ message(WARNING "RDMA plugins - no working ibverbs found - rdma_plugin disabled")
+ return()
+endif()
+
+message(STATUS "RDMA plugins - found ${IBVERBS_INCLUDE_DIR}")
+message(STATUS "RDMA plugins - found ${IBVERBS_LIB}")
+message(STATUS "RDMA plugins - found ${MLX5_LIB}")
+
+include_directories(${IBVERBS_INCLUDE_DIR})
+
+# input.c and output.c appear in both SOURCES and MULTIARCH_SOURCES.
+add_vpp_plugin(rdma
+ SOURCES
+ cli.c
+ device.c
+ format.c
+ plugin.c
+ input.c
+ output.c
+
+ MULTIARCH_SOURCES
+ input.c
+ output.c
+
+ LINK_FLAGS
+ "${RDMA_LINK_FLAGS}"
+
+ LINK_LIBRARIES
+ ${IBVERBS_LIB}
+)
diff --git a/src/plugins/rdma/cli.c b/src/plugins/rdma/cli.c
new file mode 100644
index 00000000000..8919603e293
--- /dev/null
+++ b/src/plugins/rdma/cli.c
@@ -0,0 +1,133 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+#include <stdint.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <inttypes.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <rdma/rdma.h>
+
+/*
+ * CLI handler for "create interface rdma".
+ *
+ * Parses "name <ifname>" from one line of input, calls rdma_create_if()
+ * and returns the error it stored in args.error (0 on success).
+ * Returns 0 without doing anything when no input line is available.
+ */
+static clib_error_t *
+rdma_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
+                        vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  rdma_create_if_args_t args;
+  clib_error_t *err = 0;
+
+  clib_memset (&args, 0, sizeof (rdma_create_if_args_t));
+
+  /* Get a line of input. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "name %s", &args.ifname))
+        ;
+      else
+        {
+          /* report the unparsed remainder of the line we are reading */
+          err = clib_error_return (0, "unknown input `%U'",
+                                   format_unformat_error, line_input);
+          break;
+        }
+    }
+  /* release the line on every path, including the parse-error one */
+  unformat_free (line_input);
+
+  if (err == 0)
+    {
+      rdma_create_if (vm, &args);
+      err = args.error;
+    }
+
+  vec_free (args.ifname);
+
+  return err;
+}
+
+/* *INDENT-OFF* */
+/* CLI registration: "create interface rdma" is served by rdma_create_command_fn. */
+VLIB_CLI_COMMAND (rdma_create_command, static) = {
+ .path = "create interface rdma",
+ .short_help = "create interface rdma <name ifname>",
+ .function = rdma_create_command_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for "delete interface rdma".
+ *
+ * Accepts either "sw_if_index <n>" or an interface name, checks that the
+ * interface belongs to the RDMA device class and deletes it.
+ * Returns 0 on success (or when no input line is available), otherwise a
+ * clib error describing the problem.
+ */
+static clib_error_t *
+rdma_delete_command_fn (vlib_main_t * vm, unformat_input_t * input,
+                        vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  u32 sw_if_index = ~0;
+  vnet_hw_interface_t *hw;
+  rdma_main_t *rm = &rdma_main;
+  rdma_device_t *rd;
+  vnet_main_t *vnm = vnet_get_main ();
+  clib_error_t *err = 0;
+
+  /* Get a line of input. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "sw_if_index %d", &sw_if_index))
+        ;
+      else if (unformat (line_input, "%U", unformat_vnet_sw_interface,
+                         vnm, &sw_if_index))
+        ;
+      else
+        {
+          /* report the unparsed remainder of the line we are reading */
+          err = clib_error_return (0, "unknown input `%U'",
+                                   format_unformat_error, line_input);
+          break;
+        }
+    }
+  /* release the line on every path, including the parse-error one */
+  unformat_free (line_input);
+
+  if (err)
+    return err;
+
+  if (sw_if_index == ~0)
+    return clib_error_return (0,
+                              "please specify interface name or sw_if_index");
+
+  hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+  if (hw == NULL || rdma_device_class.index != hw->dev_class_index)
+    return clib_error_return (0, "not an RDMA interface");
+
+  rd = pool_elt_at_index (rm->devices, hw->dev_instance);
+
+  rdma_delete_if (vm, rd);
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+/* CLI registration: "delete interface rdma" is served by rdma_delete_command_fn. */
+VLIB_CLI_COMMAND (rdma_delete_command, static) = {
+ .path = "delete interface rdma",
+ .short_help = "delete interface rdma "
+ "{<interface> | sw_if_index <sw_idx>}",
+ .function = rdma_delete_command_fn,
+};
+/* *INDENT-ON* */
+
+/* Init hook for the RDMA CLI; there is nothing to set up, so it reports success. */
+clib_error_t *
+rdma_cli_init (vlib_main_t * vm)
+{
+  clib_error_t *no_error = 0;
+
+  return no_error;
+}
+
+VLIB_INIT_FUNCTION (rdma_cli_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/device.c b/src/plugins/rdma/device.c
new file mode 100644
index 00000000000..31112a923d0
--- /dev/null
+++ b/src/plugins/rdma/device.c
@@ -0,0 +1,607 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <net/if.h>
+#include <linux/if_link.h>
+#include <linux/if_ether.h>
+
+#include <vppinfra/linux/sysfs.h>
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <rdma/rdma.h>
+
+rdma_main_t rdma_main;
+
+#define rdma_log_debug(dev, f, ...) \
+{ \
+ vlib_log(VLIB_LOG_LEVEL_DEBUG, rdma_main.log_class, "%U: " f, \
+ format_vlib_pci_addr, &rd->pci_addr, ##__VA_ARGS__); \
+};
+
+/*
+ * Flag-change callback handed to ethernet_register_interface().
+ * Not implemented yet: logs a "TODO" warning and returns 0.
+ */
+static u32
+rdma_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags)
+{
+  vlib_log_warn (rdma_main.log_class, "TODO");
+
+  return 0;
+}
+
+/*
+ * Query `port` of the device and mirror its state into the VPP interface:
+ * link up/down flag and link speed (lane count times per-lane speed).
+ * If the query fails, speed and flags are both reset to 0.
+ */
+static void
+rdma_update_state (vnet_main_t * vnm, rdma_device_t * rd, int port)
+{
+  struct ibv_port_attr attr;
+  u32 lanes = 0;
+  u32 lane_speed = 0;
+
+  if (ibv_query_port (rd->ctx, port, &attr))
+    {
+      vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, 0);
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
+      return;
+    }
+
+  /* link state: only the two "active" port states count as up */
+  if (attr.state == IBV_PORT_ACTIVE || attr.state == IBV_PORT_ACTIVE_DEFER)
+    {
+      rd->flags |= RDMA_DEVICE_F_LINK_UP;
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index,
+                                   VNET_HW_INTERFACE_FLAG_LINK_UP);
+    }
+  else
+    {
+      rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
+    }
+
+  /* decode the width code into a lane count; unknown codes leave it at 0 */
+  if (attr.active_width == 1)
+    lanes = 1;
+  else if (attr.active_width == 2)
+    lanes = 4;
+  else if (attr.active_width == 4)
+    lanes = 8;
+  else if (attr.active_width == 8)
+    lanes = 12;
+
+  /* decode the speed code; unknown codes leave it at 0 */
+  if (attr.active_speed == 1)
+    lane_speed = 2500000;
+  else if (attr.active_speed == 2)
+    lane_speed = 5000000;
+  else if (attr.active_speed == 4 || attr.active_speed == 8)
+    lane_speed = 10000000;
+  else if (attr.active_speed == 16)
+    lane_speed = 14000000;
+  else if (attr.active_speed == 32)
+    lane_speed = 25000000;
+
+  vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index,
+                                    lanes * lane_speed);
+}
+
+/*
+ * Error callback for the async event fd: returns an error naming the
+ * device (by PCI address) stored in the file's private data.
+ */
+static clib_error_t *
+rdma_async_event_error_ready (clib_file_t * f)
+{
+  rdma_device_t *rd = vec_elt_at_index (rdma_main.devices, f->private_data);
+
+  return clib_error_return (0, "RDMA async event error for device %U",
+                            format_vlib_pci_addr, &rd->pci_addr);
+}
+
+/*
+ * Read callback for the async event fd.
+ *
+ * Fetches one async event for the device stored in the file's private data,
+ * reacts to it and acknowledges it:
+ *  - port active / port error: refresh link state and speed
+ *  - device fatal: mark the link down and log at emergency level
+ *  - anything else: log a warning
+ * Returns an error if the event cannot be fetched, 0 otherwise.
+ */
+static clib_error_t *
+rdma_async_event_read_ready (clib_file_t * f)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  rdma_main_t *rm = &rdma_main;
+  rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data);
+  int ret;
+  struct ibv_async_event event;
+
+  ret = ibv_get_async_event (rd->ctx, &event);
+  if (ret < 0)
+    return clib_error_return_unix (0, "ibv_get_async_event() failed");
+
+  switch (event.event_type)
+    {
+    case IBV_EVENT_PORT_ACTIVE:        /* fallthrough */
+    case IBV_EVENT_PORT_ERR:
+      rdma_update_state (vnm, rd, event.element.port_num);
+      break;
+    case IBV_EVENT_DEVICE_FATAL:
+      rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
+      vlib_log_emerg (rm->log_class, "Fatal RDMA error for device %U",
+                      format_vlib_pci_addr, &rd->pci_addr);
+      break;
+    default:
+      vlib_log_warn (rm->log_class,
+                     "Unhandled RDMA async event %i for device %U",
+                     event.event_type, format_vlib_pci_addr, &rd->pci_addr);
+      break;
+    }
+
+  /* every event obtained above is acknowledged */
+  ibv_ack_async_event (&event);
+  return 0;
+}
+
+/*
+ * Make the device's async event fd non-blocking and register it with the
+ * clib file poller (read and error callbacks). The resulting file index is
+ * stored in rd->async_event_clib_file_index.
+ * Returns an error if either fcntl() call fails, 0 otherwise.
+ */
+static clib_error_t *
+rdma_async_event_init (rdma_device_t * rd)
+{
+  clib_file_t t = { 0 };
+  int ret;
+
+  /* make RDMA async event fd non-blocking */
+  ret = fcntl (rd->ctx->async_fd, F_GETFL);
+  if (ret < 0)
+    return clib_error_return_unix (0, "fcntl(F_GETFL) failed");
+
+  ret = fcntl (rd->ctx->async_fd, F_SETFL, ret | O_NONBLOCK);
+  if (ret < 0)
+    return clib_error_return_unix (0, "fcntl(F_SETFL, O_NONBLOCK) failed");
+
+  /* register RDMA async event fd */
+  t.read_function = rdma_async_event_read_ready;
+  t.file_descriptor = rd->ctx->async_fd;
+  t.error_function = rdma_async_event_error_ready;
+  t.private_data = rd->dev_instance;
+  t.description =
+    format (0, "RDMA %U async event", format_vlib_pci_addr, &rd->pci_addr);
+
+  rd->async_event_clib_file_index = clib_file_add (&file_main, &t);
+
+  return 0;
+}
+
+/* Remove the file entry recorded in rd->async_event_clib_file_index. */
+static void
+rdma_async_event_cleanup (rdma_device_t * rd)
+{
+ clib_file_del_by_index (&file_main, rd->async_event_clib_file_index);
+}
+
+/*
+ * Register the device as an ethernet interface of the RDMA device class,
+ * using rd->hwaddr as MAC; the hw interface index is written to
+ * rd->hw_if_index. Returns whatever ethernet_register_interface() returns.
+ */
+static clib_error_t *
+rdma_register_interface (vnet_main_t * vnm, rdma_device_t * rd)
+{
+ return ethernet_register_interface (vnm, rdma_device_class.index,
+ rd->dev_instance, rd->hwaddr,
+ &rd->hw_if_index, rdma_flag_change);
+}
+
+/*
+ * Tear down the VPP side of the interface: clear its hw flags, unassign
+ * the rx thread for queue 0, then delete the ethernet interface.
+ */
+static void
+rdma_unregister_interface (vnet_main_t * vnm, rdma_device_t * rd)
+{
+ vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
+ vnet_hw_interface_unassign_rx_thread (vnm, rd->hw_if_index, 0);
+ ethernet_delete_interface (vnm, rd->hw_if_index);
+}
+
+/*
+ * Release every ibverbs object held by the device (flows, memory region,
+ * tx/rx queue pairs and completion queues, protection domain, context),
+ * free its error and queue vectors, and return the device to the pool.
+ * Objects that were never created (null pointers) are skipped, so this can
+ * be called on a partially initialized device.
+ */
+static void
+rdma_dev_cleanup (rdma_device_t * rd)
+{
+ rdma_main_t *rm = &rdma_main;
+ rdma_rxq_t *rxq;
+ rdma_txq_t *txq;
+
+/* call fn(arg) only when arg is non-null; log a debug message on failure */
+#define _(fn, arg) if (arg) \
+ { \
+ int rv; \
+ if ((rv = fn (arg))) \
+ rdma_log_debug (rd, #fn "() failed (rv = %d)", rv); \
+ }
+
+ _(ibv_destroy_flow, rd->flow_mcast);
+ _(ibv_destroy_flow, rd->flow_ucast);
+ _(ibv_dereg_mr, rd->mr);
+ vec_foreach (txq, rd->txqs)
+ {
+ _(ibv_destroy_qp, txq->qp);
+ _(ibv_destroy_cq, txq->cq);
+ }
+ vec_foreach (rxq, rd->rxqs)
+ {
+ _(ibv_destroy_qp, rxq->qp);
+ _(ibv_destroy_cq, rxq->cq);
+ }
+ _(ibv_dealloc_pd, rd->pd);
+ _(ibv_close_device, rd->ctx);
+#undef _
+
+ clib_error_free (rd->error);
+
+ vec_free (rd->rxqs);
+ vec_free (rd->txqs);
+ /* rd must not be used by the caller after this point */
+ pool_put (rm->devices, rd);
+}
+
+/*
+ * Create receive queue `qid` with `n_desc` descriptors: a completion queue
+ * plus a raw-packet queue pair, moved to INIT (on port 1) and then to RTR.
+ * Returns 0 on success or an error naming the step that failed; objects
+ * created before the failure stay attached to the queue.
+ */
+static clib_error_t *
+rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
+{
+  rdma_rxq_t *rxq;
+
+  vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES);
+  rxq = vec_elt_at_index (rd->rxqs, qid);
+  rxq->size = n_desc;
+
+  rxq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0);
+  if (rxq->cq == 0)
+    return clib_error_return_unix (0, "Create CQ Failed");
+
+  struct ibv_qp_init_attr create_attr = {
+    .qp_type = IBV_QPT_RAW_PACKET,
+    .send_cq = rxq->cq,
+    .recv_cq = rxq->cq,
+    .cap = {.max_recv_wr = n_desc,.max_recv_sge = 1},
+  };
+  rxq->qp = ibv_create_qp (rd->pd, &create_attr);
+  if (rxq->qp == 0)
+    return clib_error_return_unix (0, "Queue Pair create failed");
+
+  struct ibv_qp_attr to_init = {.qp_state = IBV_QPS_INIT,.port_num = 1 };
+  if (ibv_modify_qp (rxq->qp, &to_init, IBV_QP_STATE | IBV_QP_PORT) != 0)
+    return clib_error_return_unix (0, "Modify QP (init) Failed");
+
+  struct ibv_qp_attr to_rtr = {.qp_state = IBV_QPS_RTR };
+  if (ibv_modify_qp (rxq->qp, &to_rtr, IBV_QP_STATE) != 0)
+    return clib_error_return_unix (0, "Modify QP (receive) Failed");
+
+  return 0;
+}
+
+/*
+ * Create transmit queue `qid` with `n_desc` descriptors: a completion queue
+ * plus a raw-packet queue pair, moved through INIT (on port 1), RTR and RTS.
+ * Returns 0 on success or an error naming the step that failed; objects
+ * created before the failure stay attached to the queue.
+ */
+static clib_error_t *
+rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
+{
+  rdma_txq_t *txq;
+
+  vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES);
+  txq = vec_elt_at_index (rd->txqs, qid);
+  txq->size = n_desc;
+
+  txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0);
+  if (txq->cq == 0)
+    return clib_error_return_unix (0, "Create CQ Failed");
+
+  struct ibv_qp_init_attr create_attr = {
+    .qp_type = IBV_QPT_RAW_PACKET,
+    .send_cq = txq->cq,
+    .recv_cq = txq->cq,
+    .cap = {.max_send_wr = n_desc,.max_send_sge = 1},
+  };
+  txq->qp = ibv_create_qp (rd->pd, &create_attr);
+  if (txq->qp == 0)
+    return clib_error_return_unix (0, "Queue Pair create failed");
+
+  struct ibv_qp_attr to_init = {.qp_state = IBV_QPS_INIT,.port_num = 1 };
+  if (ibv_modify_qp (txq->qp, &to_init, IBV_QP_STATE | IBV_QP_PORT) != 0)
+    return clib_error_return_unix (0, "Modify QP (init) Failed");
+
+  struct ibv_qp_attr to_rtr = {.qp_state = IBV_QPS_RTR };
+  if (ibv_modify_qp (txq->qp, &to_rtr, IBV_QP_STATE) != 0)
+    return clib_error_return_unix (0, "Modify QP (receive) Failed");
+
+  struct ibv_qp_attr to_rts = {.qp_state = IBV_QPS_RTS };
+  if (ibv_modify_qp (txq->qp, &to_rts, IBV_QP_STATE) != 0)
+    return clib_error_return_unix (0, "Modify QP (send) Failed");
+
+  return 0;
+}
+
+/*
+ * Bring up the ibverbs side of a device whose context (rd->ctx) has already
+ * been opened by the caller: protection domain, one rx queue, one tx queue
+ * per vlib main, a memory region over the buffer memory, a generated MAC
+ * address, and two steering flows on rx queue 0 (unicast to our MAC, and
+ * multicast).
+ * Returns 0 on success or an error for the first step that fails; nothing
+ * is released here on failure.
+ */
+static clib_error_t *
+rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd)
+{
+ clib_error_t *err;
+ vlib_buffer_main_t *bm = vm->buffer_main;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ u16 i;
+
+ /* the caller found no matching device, or could not open it */
+ if (rd->ctx == 0)
+ return clib_error_return_unix (0, "Device Open Failed");
+
+ if ((rd->pd = ibv_alloc_pd (rd->ctx)) == 0)
+ return clib_error_return_unix (0, "PD Alloc Failed");
+
+ /* single rx queue, 512 descriptors */
+ if ((err = rdma_rxq_init (vm, rd, 0, 512)))
+ return err;
+
+ /* one tx queue per vlib main, 512 descriptors each */
+ for (i = 0; i < tm->n_vlib_mains; i++)
+ if ((err = rdma_txq_init (vm, rd, i, 512)))
+ return err;
+
+ /* register the buffer memory range with the device */
+ if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
+ bm->buffer_mem_size,
+ IBV_ACCESS_LOCAL_WRITE)) == 0)
+ return clib_error_return_unix (0, "Register MR Failed");
+
+ ethernet_mac_address_generate (rd->hwaddr);
+
+ /*
+ * Restrict packet steering to our MAC address.
+ * This makes it possible to share a single HW NIC between multiple
+ * RDMA interfaces and/or Linux.
+ */
+ struct raw_eth_flow_attr
+ {
+ struct ibv_flow_attr attr;
+ struct ibv_flow_spec_eth spec_eth;
+ } __attribute__ ((packed)) fa;
+ memset (&fa, 0, sizeof (fa));
+ fa.attr.num_of_specs = 1;
+ fa.attr.port = 1;
+ fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
+ fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth);
+ memcpy (fa.spec_eth.val.dst_mac, rd->hwaddr,
+ sizeof (fa.spec_eth.val.dst_mac));
+ /* match on every bit of the destination MAC */
+ memset (fa.spec_eth.mask.dst_mac, 0xff, sizeof (fa.spec_eth.mask.dst_mac));
+ if ((rd->flow_ucast = ibv_create_flow (rd->rxqs[0].qp, &fa.attr)) == 0)
+ return clib_error_return_unix (0, "create Flow Failed");
+
+ /* receive multicast packets too */
+ memset (&fa, 0, sizeof (fa));
+ fa.attr.num_of_specs = 1;
+ fa.attr.port = 1;
+ fa.attr.flags = IBV_FLOW_ATTR_FLAGS_DONT_TRAP; /* let others receive them too */
+ fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
+ fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth);
+ /* match only the lowest bit of the first MAC byte (the group bit) */
+ fa.spec_eth.val.dst_mac[0] = 1;
+ fa.spec_eth.mask.dst_mac[0] = 1;
+ if ((rd->flow_mcast = ibv_create_flow (rd->rxqs[0].qp, &fa.attr)) == 0)
+ return clib_error_return_unix (0, "create Flow Failed");
+
+ return 0;
+}
+
+/*
+ * Resolve the sysfs symlink `path` and parse the link's name as a PCI
+ * address into `addr`.
+ * Returns non-zero on success, 0 if the link cannot be resolved or its
+ * name is not a PCI address.
+ */
+static uword
+sysfs_path_to_pci_addr (char *path, vlib_pci_addr_t * addr)
+{
+  uword rv;
+  unformat_input_t in;
+  u8 *s;
+
+  s = clib_sysfs_link_to_name (path);
+  /* the link may not exist or may not be readable */
+  if (s == 0)
+    return 0;
+
+  unformat_init_string (&in, (char *) s, strlen ((char *) s));
+  rv = unformat (&in, "%U", unformat_vlib_pci_addr, addr);
+  unformat_free (&in);
+  vec_free (s);
+  return rv;
+}
+
+/*
+ * Create an RDMA interface on top of the Linux netdev args->ifname.
+ *
+ * Steps: check the netdev is driven by mlx5_core, find its PCI address,
+ * open the ibverbs device with the same PCI address, initialize queues and
+ * flows, register the VPP interface and hook up async events.
+ *
+ * On success args->sw_if_index holds the new interface index.
+ * On failure args->error and args->rv are set, the error is logged, and
+ * everything acquired so far (including the device pool entry) is released.
+ */
+void
+rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  rdma_main_t *rm = &rdma_main;
+  rdma_device_t *rd = 0;
+  struct ibv_device **dev_list = 0;
+  int n_devs = 0;
+  u8 *s = 0, *s2 = 0;
+
+  pool_get_zero (rm->devices, rd);
+  rd->dev_instance = rd - rm->devices;
+  rd->per_interface_next_index = ~0;
+
+  /* check if device exist and if it is bound to mlx5_core */
+  s = format (s, "/sys/class/net/%s/device/driver/module%c", args->ifname, 0);
+  s2 = clib_sysfs_link_to_name ((char *) s);
+
+  if (s2 == 0 || strncmp ((char *) s2, "mlx5_core", 9) != 0)
+    {
+      args->error =
+        clib_error_return (0,
+                           "invalid interface (only mlx5 supported for now)");
+      goto err0;
+    }
+
+  /* extract PCI address */
+  vec_reset_length (s);
+  s = format (s, "/sys/class/net/%s/device%c", args->ifname, 0);
+  if (sysfs_path_to_pci_addr ((char *) s, &rd->pci_addr) == 0)
+    {
+      args->error = clib_error_return (0, "cannot find PCI address");
+      goto err0;
+    }
+
+  dev_list = ibv_get_device_list (&n_devs);
+  if (dev_list == 0 || n_devs == 0)
+    {
+      args->error =
+        clib_error_return_unix (0,
+                                "no RDMA devices available, errno = %d. Is the ib_uverbs module loaded?",
+                                errno);
+      goto err1;
+    }
+
+  /* open the first ibverbs device that sits at the same PCI address */
+  for (int i = 0; i < n_devs; i++)
+    {
+      vlib_pci_addr_t addr;
+
+      vec_reset_length (s);
+      s = format (s, "%s/device%c", dev_list[i]->dev_path, 0);
+
+      if (sysfs_path_to_pci_addr ((char *) s, &addr) == 0)
+        continue;
+
+      if (addr.as_u32 != rd->pci_addr.as_u32)
+        continue;
+
+      if ((rd->ctx = ibv_open_device (dev_list[i])))
+        break;
+    }
+
+  /* rdma_dev_init() reports the error if no device could be opened */
+  if ((args->error = rdma_dev_init (vm, rd)))
+    goto err2;
+
+  if ((args->error = rdma_register_interface (vnm, rd)))
+    goto err2;
+
+  if ((args->error = rdma_async_event_init (rd)))
+    goto err3;
+
+  rdma_update_state (vnm, rd, 1);
+
+  vnet_sw_interface_t *sw = vnet_get_hw_sw_interface (vnm, rd->hw_if_index);
+  args->sw_if_index = rd->sw_if_index = sw->sw_if_index;
+  /*
+   * FIXME: add support for interrupt mode
+   * vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, rd->hw_if_index);
+   * hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE;
+   */
+  vnet_hw_interface_set_input_node (vnm, rd->hw_if_index,
+                                    rdma_input_node.index);
+  vnet_hw_interface_assign_rx_thread (vnm, rd->hw_if_index, 0, ~0);
+
+  /* success: the opened context stays valid after the list is freed */
+  ibv_free_device_list (dev_list);
+  vec_free (s2);
+  vec_free (s);
+  return;
+
+err3:
+  rdma_unregister_interface (vnm, rd);
+err2:
+  /* releases the ibverbs objects and returns rd to the pool */
+  rdma_dev_cleanup (rd);
+  rd = 0;
+err1:
+  if (dev_list)
+    ibv_free_device_list (dev_list);
+err0:
+  /* early failures never reached rdma_dev_cleanup(): give the slot back */
+  if (rd)
+    pool_put (rm->devices, rd);
+  vec_free (s2);
+  vec_free (s);
+  args->rv = VNET_API_ERROR_INVALID_INTERFACE;
+  vlib_log_err (rm->log_class, "%U", format_clib_error, args->error);
+}
+
+/*
+ * Delete an RDMA interface: stop async event handling, unregister the VPP
+ * interface, then release the device. rd is returned to the device pool by
+ * rdma_dev_cleanup() and must not be used afterwards.
+ */
+void
+rdma_delete_if (vlib_main_t * vm, rdma_device_t * rd)
+{
+ rdma_async_event_cleanup (rd);
+ rdma_unregister_interface (vnet_get_main (), rd);
+ rdma_dev_cleanup (rd);
+}
+
+/*
+ * Admin up/down callback of the RDMA device class.
+ * Refuses to act on a device flagged RDMA_DEVICE_F_ERROR; otherwise sets or
+ * clears the hw link-up flag and the device's RDMA_DEVICE_F_ADMIN_UP flag
+ * according to `flags`.
+ */
+static clib_error_t *
+rdma_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index);
+  rdma_device_t *rd = vec_elt_at_index (rdma_main.devices, hi->dev_instance);
+  uword want_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+
+  if (rd->flags & RDMA_DEVICE_F_ERROR)
+    return clib_error_return (0, "device is in error state");
+
+  if (!want_up)
+    {
+      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
+      rd->flags &= ~RDMA_DEVICE_F_ADMIN_UP;
+      return 0;
+    }
+
+  vnet_hw_interface_set_flags (vnm, rd->hw_if_index,
+                               VNET_HW_INTERFACE_FLAG_LINK_UP);
+  rd->flags |= RDMA_DEVICE_F_ADMIN_UP;
+  return 0;
+}
+
+/*
+ * rx-redirect callback of the RDMA device class.
+ * node_index == ~0 turns redirection off; any other value is added as a
+ * next node of rdma-input and the resulting next index is stored.
+ */
+static void
+rdma_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
+                              u32 node_index)
+{
+  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+  rdma_device_t *rd = pool_elt_at_index (rdma_main.devices, hw->dev_instance);
+
+  if (node_index == ~0)
+    {
+      /* Shut off redirection */
+      rd->per_interface_next_index = node_index;
+    }
+  else
+    {
+      rd->per_interface_next_index =
+        vlib_node_add_next (vlib_get_main (), rdma_input_node.index,
+                            node_index);
+    }
+}
+
+/* TX error strings: the second field of each foreach_rdma_tx_func_error entry. */
+static char *rdma_tx_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_rdma_tx_func_error
+#undef _
+};
+
+/* *INDENT-OFF* */
+/* Device class registration tying together the RDMA format, admin and tx-error hooks. */
+VNET_DEVICE_CLASS (rdma_device_class,) =
+{
+ .name = "RDMA interface",
+ .format_device = format_rdma_device,
+ .format_device_name = format_rdma_device_name,
+ .admin_up_down_function = rdma_interface_admin_up_down,
+ .rx_redirect_to_node = rdma_set_interface_next_node,
+ .tx_function_n_errors = RDMA_TX_N_ERROR,
+ .tx_function_error_strings = rdma_tx_func_error_strings,
+};
+/* *INDENT-ON* */
+
+/* Plugin init: register the "rdma" log class. Always returns 0. */
+clib_error_t *
+rdma_init (vlib_main_t * vm)
+{
+  rdma_main.log_class = vlib_log_register_class ("rdma", 0);
+
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (rdma_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/format.c b/src/plugins/rdma/format.c
new file mode 100644
index 00000000000..7ef65d43957
--- /dev/null
+++ b/src/plugins/rdma/format.c
@@ -0,0 +1,89 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <rdma/rdma.h>
+
+/* Format the interface name "rdma-<dev_instance>" for device index (u32). */
+u8 *
+format_rdma_device_name (u8 * s, va_list * args)
+{
+  u32 dev_index = va_arg (*args, u32);
+  rdma_device_t *rd = vec_elt_at_index (rdma_main.devices, dev_index);
+
+  return format (s, "rdma-%u", rd->dev_instance);
+}
+
+/*
+ * Format the flags set on a device (rdma_device_t *) as a space-separated
+ * list, using the third field of each foreach_rdma_device_flags entry.
+ */
+u8 *
+format_rdma_device_flags (u8 * s, va_list * args)
+{
+ rdma_device_t *rd = va_arg (*args, rdma_device_t *);
+ u8 *t = 0;
+
+/* a is the bit position tested; a separator is added once t is non-empty */
+#define _(a, b, c) if (rd->flags & (1 << a)) \
+t = format (t, "%s%s", t ? " ":"", c);
+ foreach_rdma_device_flags
+#undef _
+ s = format (s, "%v", t);
+ vec_free (t);
+ return s;
+}
+
+/*
+ * Format device details for device index (u32): its flags and, when one is
+ * recorded, its error on a following indented line.
+ */
+u8 *
+format_rdma_device (u8 * s, va_list * args)
+{
+  u32 dev_index = va_arg (*args, u32);
+  rdma_device_t *rd = vec_elt_at_index (rdma_main.devices, dev_index);
+  u32 indent = format_get_indent (s);
+
+  s = format (s, "flags: %U", format_rdma_device_flags, rd);
+  if (rd->error == 0)
+    return s;
+
+  return format (s, "\n%Uerror %U", format_white_space, indent,
+                 format_clib_error, rd->error);
+}
+
+/*
+ * Format an rdma-input trace record: interface name, hw interface index
+ * and the name of the next node.
+ * Arguments: vlib_main_t *, vlib_node_t *, rdma_input_trace_t *.
+ */
+u8 *
+format_rdma_input_trace (u8 * s, va_list * args)
+{
+  vlib_main_t *vm = va_arg (*args, vlib_main_t *);
+  vlib_node_t *node = va_arg (*args, vlib_node_t *);
+  rdma_input_trace_t *trace = va_arg (*args, rdma_input_trace_t *);
+  vnet_hw_interface_t *hi =
+    vnet_get_hw_interface (vnet_get_main (), trace->hw_if_index);
+
+  return format (s, "rdma: %v (%d) next-node %U",
+                 hi->name, trace->hw_if_index, format_vlib_next_node_name,
+                 vm, node->index, trace->next_index);
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/input.c b/src/plugins/rdma/input.c
new file mode 100644
index 00000000000..001d1c5d493
--- /dev/null
+++ b/src/plugins/rdma/input.c
@@ -0,0 +1,202 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/devices.h>
+
+#include <rdma/rdma.h>
+
+/* X-macro list of rdma-input errors: _(symbol suffix, description). */
+#define foreach_rdma_input_error \
+ _(BUFFER_ALLOC, "buffer alloc error")
+
+/* One RDMA_INPUT_ERROR_<symbol> per list entry, plus the entry count. */
+typedef enum
+{
+#define _(f,s) RDMA_INPUT_ERROR_##f,
+ foreach_rdma_input_error
+#undef _
+ RDMA_INPUT_N_ERROR,
+} rdma_input_error_t;
+
+/* Descriptions in the same order as the enum above. */
+static __clib_unused char *rdma_input_error_strings[] = {
+#define _(n,s) s,
+ foreach_rdma_input_error
+#undef _
+};
+
+/*
+ * Top up the receive queue: allocate up to VLIB_FRAME_SIZE buffers (never
+ * more than the free room in the queue) and post each one as a receive
+ * work request, using the buffer index as wr_id. Buffers that cannot be
+ * posted are freed again.
+ */
+static_always_inline void
+rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd,
+                          rdma_rxq_t * rxq)
+{
+  u32 buffers[VLIB_FRAME_SIZE];
+  struct ibv_sge sge;
+  struct ibv_recv_wr wr, *bad_wr;
+  u32 n_wanted, n_got, *bi;
+
+  /* queue already full */
+  if (rxq->n_enq >= rxq->size)
+    return;
+
+  n_wanted = clib_min (VLIB_FRAME_SIZE, rxq->size - rxq->n_enq);
+  n_got = vlib_buffer_alloc (vm, buffers, n_wanted);
+
+  /* fields shared by every work request posted below */
+  sge.length = vlib_buffer_get_default_data_size (vm);
+  sge.lkey = rd->mr->lkey;
+  wr.num_sge = 1;
+  wr.sg_list = &sge;
+  wr.next = NULL;
+
+  for (bi = buffers; bi < buffers + n_got; bi++)
+    {
+      vlib_buffer_t *b = vlib_get_buffer (vm, bi[0]);
+
+      sge.addr = vlib_buffer_get_va (b);
+      wr.wr_id = bi[0];
+      if (ibv_post_recv (rxq->qp, &wr, &bad_wr) == 0)
+        rxq->n_enq++;
+      else
+        vlib_buffer_free (vm, bi, 1);
+    }
+}
+
+/*
+ * Poll one receive queue of a device and hand the received packets to the
+ * next node (ethernet-input unless a per-interface redirect is set).
+ *
+ * Up to VLIB_FRAME_SIZE completions are polled; each completion's wr_id is
+ * the buffer index posted by rdma_device_input_refill(). Packets are
+ * optionally traced, rx counters are updated and the queue is refilled.
+ *
+ * Returns the number of packets received; 0 when the poll returned nothing
+ * or failed (the queue is still refilled in that case).
+ */
+static_always_inline uword
+rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+                          vlib_frame_t * frame, rdma_device_t * rd, u16 qid)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  rdma_rxq_t *rxq = vec_elt_at_index (rd->rxqs, qid);
+  u32 n_trace;
+  struct ibv_wc wc[VLIB_FRAME_SIZE];
+  u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+  u32 *to_next, n_left_to_next;
+  u32 n_rx_packets, n_rx_bytes = 0;
+  int n_polled;
+  u32 i;
+
+  /*
+   * ibv_poll_cq() returns an int that is negative on error: keep it signed
+   * so a failure is never mistaken for a huge packet count.
+   */
+  n_polled = ibv_poll_cq (rxq->cq, VLIB_FRAME_SIZE, wc);
+  if (n_polled <= 0)
+    {
+      rdma_device_input_refill (vm, rd, rxq);
+      return 0;
+    }
+  n_rx_packets = n_polled;
+
+  if (PREDICT_FALSE (rd->per_interface_next_index != ~0))
+    next_index = rd->per_interface_next_index;
+
+  vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+  for (i = 0; i < n_rx_packets; i++)
+    {
+      u32 buffer_index = wc[i].wr_id;
+      vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index);
+
+      b->current_length = wc[i].byte_len;
+      vnet_buffer (b)->sw_if_index[VLIB_RX] = rd->sw_if_index;
+      vnet_buffer (b)->sw_if_index[VLIB_TX] = ~0;
+      to_next[i] = buffer_index;
+      n_rx_bytes += wc[i].byte_len;
+    }
+
+  if (PREDICT_FALSE ((n_trace = vlib_get_trace_count (vm, node))))
+    {
+      u32 n_left = n_rx_packets;
+      u32 *bi = to_next;
+
+      while (n_trace && n_left)
+        {
+          vlib_buffer_t *b;
+          rdma_input_trace_t *tr;
+
+          b = vlib_get_buffer (vm, bi[0]);
+          vlib_trace_buffer (vm, node, next_index, b, /* follow_chain */ 0);
+          tr = vlib_add_trace (vm, node, b, sizeof (*tr));
+          tr->next_index = next_index;
+          tr->hw_if_index = rd->hw_if_index;
+
+          /* next */
+          n_trace--;
+          n_left--;
+          bi++;
+        }
+      vlib_set_trace_count (vm, node, n_trace);
+    }
+
+  if (PREDICT_TRUE (next_index == VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT))
+    {
+      vlib_next_frame_t *nf;
+      vlib_frame_t *f;
+      ethernet_input_frame_t *ef;
+
+      /* tell ethernet-input that the whole frame is from one interface */
+      nf = vlib_node_runtime_get_next_frame (vm, node, next_index);
+      f = vlib_get_frame (vm, nf->frame_index);
+      f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX;
+
+      ef = vlib_frame_scalar_args (f);
+      ef->sw_if_index = rd->sw_if_index;
+      ef->hw_if_index = rd->hw_if_index;
+    }
+
+  n_left_to_next -= n_rx_packets;
+  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+
+  /* NOTE(review): indexed by hw_if_index, as in the original - confirm */
+  vlib_increment_combined_counter
+    (vnm->interface_main.combined_sw_if_counters +
+     VNET_INTERFACE_COUNTER_RX, vm->thread_index,
+     rd->hw_if_index, n_rx_packets, n_rx_bytes);
+
+  /* the polled descriptors are free again: account for them and refill */
+  rxq->n_enq -= n_rx_packets;
+  rdma_device_input_refill (vm, rd, rxq);
+
+  return n_rx_packets;
+}
+
+/*
+ * rdma-input node function: poll every (device, queue) pair assigned to
+ * this node runtime, skipping devices that are not admin-up, and return
+ * the total number of packets received.
+ */
+VLIB_NODE_FN (rdma_input_node) (vlib_main_t * vm,
+                                vlib_node_runtime_t * node,
+                                vlib_frame_t * frame)
+{
+  vnet_device_input_runtime_t *rt = (void *) node->runtime_data;
+  vnet_device_and_queue_t *dq;
+  u32 total_rx = 0;
+
+  foreach_device_and_queue (dq, rt->devices_and_queues)
+  {
+    rdma_device_t *rd = vec_elt_at_index (rdma_main.devices,
+                                          dq->dev_instance);
+
+    if (rd->flags & RDMA_DEVICE_F_ADMIN_UP)
+      total_rx +=
+        rdma_device_input_inline (vm, node, frame, rd, dq->queue_id);
+  }
+
+  return total_rx;
+}
+
+/* *INDENT-OFF* */
+/* Node registration: "rdma-input" is an input node, sibling of device-input, registered disabled. */
+VLIB_REGISTER_NODE (rdma_input_node) = {
+ .name = "rdma-input",
+ .sibling_of = "device-input",
+ .format_trace = format_rdma_input_trace,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .state = VLIB_NODE_STATE_DISABLED,
+ .n_errors = RDMA_INPUT_N_ERROR,
+ .error_strings = rdma_input_error_strings,
+};
+
+/* *INDENT-ON* */
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/output.c b/src/plugins/rdma/output.c
new file mode 100644
index 00000000000..410784308f3
--- /dev/null
+++ b/src/plugins/rdma/output.c
@@ -0,0 +1,133 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vppinfra/ring.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/devices.h>
+
+#include <rdma/rdma.h>
+
+/*
+ * Post up to n_left buffers on the send queue of txq, one signaled
+ * single-segment IBV_WR_SEND work request per buffer.
+ *
+ * The buffer index is stored in wr_id so that it can be recovered from the
+ * completion.  Posting stops at the first ibv_post_send() failure.
+ *
+ * *n_tx_packets is increased by the number of buffers posted and
+ * *n_tx_bytes by the sum of their current_length.
+ *
+ * Returns the number of buffers posted.
+ */
+static_always_inline u16
+rdma_device_output_tx (vlib_main_t * vm, rdma_device_t * rd, rdma_txq_t * txq,
+                       u32 * buffers, u16 n_left, u32 * n_tx_packets,
+                       u32 * n_tx_bytes)
+{
+  u16 n_posted = 0;
+
+  while (n_posted < n_left)
+    {
+      struct ibv_send_wr send_wr, *failed_wr;
+      struct ibv_sge sge;
+      u32 bi = buffers[n_posted];
+      vlib_buffer_t *buf = vlib_get_buffer (vm, bi);
+
+      /* single scatter/gather element covering the current buffer data */
+      sge.lkey = rd->mr->lkey;
+      sge.length = buf->current_length;
+      sge.addr = vlib_buffer_get_current_va (buf);
+
+      memset (&send_wr, 0, sizeof (send_wr));
+      send_wr.wr_id = bi;
+      send_wr.opcode = IBV_WR_SEND;
+      send_wr.send_flags = IBV_SEND_SIGNALED;
+      send_wr.sg_list = &sge;
+      send_wr.num_sge = 1;
+
+      if (ibv_post_send (txq->qp, &send_wr, &failed_wr) != 0)
+        break;
+
+      *n_tx_bytes += buf->current_length;
+      n_posted++;
+    }
+
+  *n_tx_packets += n_posted;
+  return n_posted;
+}
+
+/*
+ * Reclaim transmitted buffers: poll up to VLIB_FRAME_SIZE completions from
+ * the completion queue of txq and free the buffers whose indices were stored
+ * in wr_id.  Nothing is done when the poll returns zero or a negative value.
+ */
+static_always_inline void
+rdma_device_output_free (vlib_main_t * vm, rdma_txq_t * txq)
+{
+  struct ibv_wc completions[VLIB_FRAME_SIZE];
+  u32 bis[VLIB_FRAME_SIZE];
+  int n_done;
+  int k = 0;
+
+  n_done = ibv_poll_cq (txq->cq, VLIB_FRAME_SIZE, completions);
+
+  if (n_done > 0)
+    {
+      while (k < n_done)
+        {
+          bis[k] = completions[k].wr_id;
+          k++;
+        }
+
+      vlib_buffer_free (vm, bis, n_done);
+    }
+}
+
+/*
+ * Device class TX function: transmit the buffers of 'frame' on the TX queue
+ * selected from the calling thread index.
+ *
+ * Each pass first reclaims buffers whose send completed
+ * (rdma_device_output_free) and then posts as many of the remaining buffers
+ * as the queue accepts (rdma_device_output_tx).  While buffers are still
+ * pending, the pass is repeated up to n_retry more times; only once that
+ * budget is exhausted are the leftover buffers freed and counted as
+ * RDMA_TX_ERROR_NO_FREE_SLOTS.
+ *
+ * Returns the number of buffers successfully posted.
+ */
+VNET_DEVICE_CLASS_TX_FN (rdma_device_class) (vlib_main_t * vm,
+                                             vlib_node_runtime_t * node,
+                                             vlib_frame_t * frame)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  rdma_main_t *rm = &rdma_main;
+  vnet_interface_output_runtime_t *ord = (void *) node->runtime_data;
+  rdma_device_t *rd = pool_elt_at_index (rm->devices, ord->dev_instance);
+  u32 thread_index = vm->thread_index;
+  u8 qid = thread_index;
+  /* fold the thread index onto the TX queues that actually exist */
+  rdma_txq_t *txq = vec_elt_at_index (rd->txqs, qid % vec_len (rd->txqs));
+  u32 *buffers = vlib_frame_vector_args (frame);
+  u16 n_left;
+  u16 n_retry = 5;
+  u32 n_tx_packets = 0, n_tx_bytes = 0;
+
+  clib_spinlock_lock_if_init (&txq->lock);
+
+  n_left = frame->n_vectors;
+
+  while (n_left)
+    {
+      u16 n;
+
+      /* make room first: release buffers whose send already completed */
+      rdma_device_output_free (vm, txq);
+      n =
+        rdma_device_output_tx (vm, rd, txq, buffers, n_left, &n_tx_packets,
+                               &n_tx_bytes);
+      n_left -= n;
+      buffers += n;
+
+      if (n_left == 0)
+        break;
+
+      /*
+       * Some buffers could not be posted.  Drop them only once every retry
+       * has been used; the previous test (n_left && n_retry--) was true on
+       * the very first pass and therefore never retried.
+       */
+      if (n_retry == 0)
+        {
+          vlib_buffer_free (vm, buffers, n_left);
+          vlib_error_count (vm, node->node_index,
+                            RDMA_TX_ERROR_NO_FREE_SLOTS, n_left);
+          break;
+        }
+      n_retry--;
+    }
+
+  clib_spinlock_unlock_if_init (&txq->lock);
+
+  /*
+   * NOTE(review): combined_sw_if_counters is indexed with rd->hw_if_index
+   * here -- confirm it always equals rd->sw_if_index for this device.
+   */
+  vlib_increment_combined_counter
+    (vnm->interface_main.combined_sw_if_counters +
+     VNET_INTERFACE_COUNTER_TX, thread_index,
+     rd->hw_if_index, n_tx_packets, n_tx_bytes);
+
+  return frame->n_vectors - n_left;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/plugin.c b/src/plugins/rdma/plugin.c
new file mode 100644
index 00000000000..f229b75129d
--- /dev/null
+++ b/src/plugins/rdma/plugin.c
@@ -0,0 +1,35 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/plugin/plugin.h>
+#include <vpp/app/version.h>
+
+/* *INDENT-OFF* */
+/* Plugin registration record: description string and build version. */
+VLIB_PLUGIN_REGISTER () = {
+  .description = "RDMA (ibverb) Device Plugin",
+  .version = VPP_BUILD_VER,
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/rdma/rdma.h b/src/plugins/rdma/rdma.h
new file mode 100644
index 00000000000..860ddaba2b1
--- /dev/null
+++ b/src/plugins/rdma/rdma.h
@@ -0,0 +1,141 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef _RDMA_H_
+#define _RDMA_H_
+
+#include <infiniband/verbs.h>
+#include <vlib/log.h>
+
+/*
+ * Device flags, one X-macro entry per flag:
+ *   _(bit position, symbol suffix, human-readable name)
+ * The last entry carries no line continuation, so the macro ends with the
+ * list and cannot absorb whatever line follows it.
+ */
+#define foreach_rdma_device_flags \
+  _(0, INITIALIZED, "initialized") \
+  _(1, ERROR, "error") \
+  _(2, ADMIN_UP, "admin-up") \
+  _(3, VA_DMA, "vaddr-dma") \
+  _(4, LINK_UP, "link-up") \
+  _(5, SHARED_TXQ_LOCK, "shared-txq-lock") \
+  _(6, ELOG, "elog")
+
+/* RDMA_DEVICE_F_<symbol> = (1 << bit); tested against rdma_device_t.flags */
+enum
+{
+#define _(a, b, c) RDMA_DEVICE_F_##b = (1 << a),
+  foreach_rdma_device_flags
+#undef _
+};
+
+typedef struct /* State of one receive queue (element of rdma_device_t.rxqs). */
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ u32 size; /* presumably the ring size in entries -- TODO confirm */
+ u32 n_enq; /* the input path subtracts the received packet count, then refills; presumably the number of posted RX buffers */
+ struct ibv_cq *cq; /* ibverbs completion queue of this RX queue */
+ struct ibv_qp *qp; /* ibverbs queue pair of this RX queue */
+} rdma_rxq_t;
+
+typedef struct /* State of one transmit queue (element of rdma_device_t.txqs). */
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ u32 size; /* presumably the ring size in entries -- TODO confirm */
+ u32 n_enq; /* not touched by the TX path in output.c; presumably an in-flight count -- TODO confirm */
+ struct ibv_cq *cq; /* polled by rdma_device_output_free() for send completions */
+ struct ibv_qp *qp; /* queue pair the send work requests are posted on */
+ clib_spinlock_t lock; /* taken with clib_spinlock_lock_if_init() by the TX function around queue use */
+} rdma_txq_t;
+
+typedef struct /* Per-interface state of one RDMA device (element of rdma_main_t.devices). */
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ u32 flags; /* bitmask of RDMA_DEVICE_F_* values; ADMIN_UP gates polling in rdma-input */
+ u32 per_interface_next_index; /* presumably an RX next-node override -- TODO confirm */
+
+ u32 dev_instance; /* presumably this device's index in rdma_main_t.devices -- TODO confirm */
+ u32 sw_if_index; /* copied into the ethernet-input frame scalar args on RX */
+ u32 hw_if_index; /* copied into the ethernet-input frame scalar args; also used as RX/TX counter index */
+
+ u32 async_event_clib_file_index; /* presumably the clib file watching ibverbs async events -- TODO confirm */
+
+ rdma_rxq_t *rxqs; /* receive queues, selected by queue id on RX */
+ rdma_txq_t *txqs; /* transmit queues (vector); TX picks one by thread index modulo vec_len */
+
+ u8 hwaddr[6]; /* presumably the interface MAC address -- TODO confirm */
+ vlib_pci_addr_t pci_addr; /* PCI address associated with the device */
+
+ struct ibv_context *ctx; /* ibverbs device context */
+ struct ibv_pd *pd; /* ibverbs protection domain */
+ struct ibv_mr *mr; /* memory region; its lkey is put in every TX scatter/gather entry */
+ struct ibv_flow *flow_ucast; /* presumably the unicast RX steering rule -- TODO confirm */
+ struct ibv_flow *flow_mcast; /* presumably the multicast RX steering rule -- TODO confirm */
+
+ /* error */
+ clib_error_t *error; /* presumably the last error recorded for this device -- TODO confirm */
+} rdma_device_t;
+
+typedef struct /* Plugin-wide state. */
+{
+ rdma_device_t *devices; /* all devices; looked up by dev_instance on both RX and TX */
+ vlib_log_class_t log_class; /* log class handle; presumably used for the plugin's log messages -- TODO confirm */
+} rdma_main_t;
+
+extern rdma_main_t rdma_main; /* the global instance, defined outside this header */
+
+typedef struct /* Argument block of rdma_create_if(): one input, then the result fields. */
+{
+ u8 *ifname; /* input: interface name; presumably names the underlying device -- TODO confirm */
+
+ /* return */
+ int rv; /* result code; presumably 0 on success -- TODO confirm */
+ u32 sw_if_index; /* presumably the sw_if_index of the created interface -- TODO confirm */
+ clib_error_t *error; /* error details; presumably NULL on success -- TODO confirm */
+} rdma_create_if_args_t;
+
+void rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args); /* create an interface; results presumably come back in the 'return' fields of args */
+void rdma_delete_if (vlib_main_t * vm, rdma_device_t * rd); /* delete the interface backed by rd */
+
+extern vlib_node_registration_t rdma_input_node; /* the "rdma-input" node, registered in input.c */
+extern vnet_device_class_t rdma_device_class; /* device class whose TX function lives in output.c */
+
+/* format.c */
+format_function_t format_rdma_device; /* formatter for a device; argument list not visible here */
+format_function_t format_rdma_device_name; /* formatter for a device name; argument list not visible here */
+format_function_t format_rdma_input_trace; /* trace formatter installed on the rdma-input node */
+
+typedef struct /* Per-packet trace record of the rdma-input node (presumably what format_rdma_input_trace prints). */
+{
+ u32 next_index; /* presumably the next node index the packet was sent to -- TODO confirm */
+ u32 hw_if_index; /* presumably the receiving interface -- TODO confirm */
+} rdma_input_trace_t;
+
+/* TX error counters, one entry each: _(symbol suffix, description string) */
+#define foreach_rdma_tx_func_error \
+  _(NO_FREE_SLOTS, "no free tx slots")
+
+/*
+ * RDMA_TX_ERROR_<symbol> counter indices generated from the list above;
+ * RDMA_TX_N_ERROR follows the last entry and so equals the number of counters.
+ */
+typedef enum
+{
+#define _(sym, str) RDMA_TX_ERROR_##sym,
+  foreach_rdma_tx_func_error
+#undef _
+    RDMA_TX_N_ERROR,
+} rdma_tx_func_error_t;
+
+#endif /* _RDMA_H_ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */