From 8d01b9cd70a67cdafd5b965a70420c3bd7fb3f82 Mon Sep 17 00:00:00 2001
From: Luca Boccassi
Date: Thu, 1 Nov 2018 11:59:50 +0000
Subject: New upstream version 18.11-rc1

Change-Id: Iaa71986dd6332e878d8f4bf493101b2bbc6313bb
Signed-off-by: Luca Boccassi
---
 lib/Makefile | 7 +-
 lib/librte_acl/rte_acl.c | 8 +-
 lib/librte_acl/rte_acl.h | 2 +-
 lib/librte_bpf/bpf_load.c | 2 +-
 lib/librte_bpf/rte_bpf_ethdev.h | 2 +-
 lib/librte_cmdline/Makefile | 1 -
 lib/librte_cmdline/cmdline.c | 24 -
 lib/librte_cmdline/meson.build | 4 +
 lib/librte_compressdev/rte_comp.c | 4 +-
 lib/librte_compressdev/rte_comp.h | 4 +-
 lib/librte_compressdev/rte_compressdev.c | 32 +-
 lib/librte_compressdev/rte_compressdev_pmd.c | 23 +-
 lib/librte_compressdev/rte_compressdev_pmd.h | 5 -
 lib/librte_cryptodev/Makefile | 2 +-
 lib/librte_cryptodev/meson.build | 2 +-
 lib/librte_cryptodev/rte_cryptodev.c | 37 +-
 lib/librte_cryptodev/rte_cryptodev_pmd.c | 23 +-
 lib/librte_cryptodev/rte_cryptodev_pmd.h | 3 -
 lib/librte_eal/bsdapp/eal/Makefile | 9 +-
 lib/librte_eal/bsdapp/eal/eal.c | 54 +-
 lib/librte_eal/bsdapp/eal/eal_dev.c | 14 +
 lib/librte_eal/bsdapp/eal/eal_memalloc.c | 21 +
 lib/librte_eal/bsdapp/eal/eal_memory.c | 9 +-
 lib/librte_eal/common/Makefile | 1 +
 lib/librte_eal/common/arch/arm/meson.build | 2 +-
 lib/librte_eal/common/arch/ppc_64/meson.build | 5 +
 lib/librte_eal/common/arch/x86/meson.build | 2 +-
 lib/librte_eal/common/eal_common_bus.c | 45 +-
 lib/librte_eal/common/eal_common_class.c | 2 +-
 lib/librte_eal/common/eal_common_dev.c | 347 +++-
 lib/librte_eal/common/eal_common_devargs.c | 52 +-
 lib/librte_eal/common/eal_common_fbarray.c | 5 +
 lib/librte_eal/common/eal_common_memory.c | 210 ++-
 lib/librte_eal/common/eal_common_memzone.c | 8 +-
 lib/librte_eal/common/eal_common_options.c | 42 +-
 lib/librte_eal/common/eal_common_proc.c | 10 +-
 lib/librte_eal/common/eal_common_string_fns.c | 26 +
 lib/librte_eal/common/eal_common_timer.c | 24 +
 lib/librte_eal/common/eal_filesystem.h | 15 +-
 lib/librte_eal/common/eal_internal_cfg.h | 1 +
 lib/librte_eal/common/eal_memalloc.h | 11 +
 lib/librte_eal/common/eal_options.h | 2 +
 lib/librte_eal/common/eal_private.h | 90 +-
 lib/librte_eal/common/hotplug_mp.c | 426 +++++
 lib/librte_eal/common/hotplug_mp.h | 46 +
 .../common/include/arch/arm/rte_cycles_32.h | 4 +-
 .../common/include/arch/ppc_64/meson.build | 16 +
 .../common/include/arch/ppc_64/rte_pause.h | 7 +
 lib/librte_eal/common/include/generic/rte_cycles.h | 11 +
 lib/librte_eal/common/include/rte_bitmap.h | 14 +-
 lib/librte_eal/common/include/rte_bus.h | 34 +
 lib/librte_eal/common/include/rte_common.h | 11 +
 lib/librte_eal/common/include/rte_dev.h | 123 +-
 lib/librte_eal/common/include/rte_devargs.h | 81 +-
 lib/librte_eal/common/include/rte_eal.h | 20 +-
 lib/librte_eal/common/include/rte_eal_interrupts.h | 1 +
 lib/librte_eal/common/include/rte_eal_memconfig.h | 18 +-
 lib/librte_eal/common/include/rte_malloc.h | 192 +++
 lib/librte_eal/common/include/rte_malloc_heap.h | 3 +
 lib/librte_eal/common/include/rte_memory.h | 109 ++
 lib/librte_eal/common/include/rte_option.h | 63 +
 lib/librte_eal/common/include/rte_string_fns.h | 26 +-
 lib/librte_eal/common/include/rte_version.h | 6 +-
 lib/librte_eal/common/include/rte_vfio.h | 31 +-
 lib/librte_eal/common/malloc_elem.c | 10 +-
 lib/librte_eal/common/malloc_heap.c | 340 +++-
 lib/librte_eal/common/malloc_heap.h | 17 +
 lib/librte_eal/common/malloc_mp.c | 4 +-
 lib/librte_eal/common/meson.build | 5 +
 lib/librte_eal/common/rte_malloc.c | 436 ++++-
 lib/librte_eal/common/rte_option.c | 54 +
 lib/librte_eal/linuxapp/eal/Makefile | 20 +-
 lib/librte_eal/linuxapp/eal/eal.c | 119 +-
 lib/librte_eal/linuxapp/eal/eal_dev.c | 172 +-
 lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 1 +
 lib/librte_eal/linuxapp/eal/eal_interrupts.c | 79 +
 lib/librte_eal/linuxapp/eal/eal_memalloc.c | 466 ++++-
 lib/librte_eal/linuxapp/eal/eal_memory.c | 264 ++-
 lib/librte_eal/linuxapp/eal/eal_thread.c | 4 +-
 lib/librte_eal/linuxapp/eal/eal_timer.c | 5 +-
 lib/librte_eal/linuxapp/eal/eal_vfio.c | 216 ++-
 lib/librte_eal/linuxapp/eal/eal_vfio.h | 4 +
 lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 11 +
 .../linuxapp/eal/include/exec-env/rte_kni_common.h | 6 +
 lib/librte_eal/meson.build | 3 +-
 lib/librte_eal/rte_eal_version.map | 39 +-
 lib/librte_ethdev/Makefile | 6 +-
 lib/librte_ethdev/ethdev_private.c | 121 ++
 lib/librte_ethdev/ethdev_private.h | 38 +
 lib/librte_ethdev/ethdev_profile.c | 103 +-
 lib/librte_ethdev/ethdev_profile.h | 6 +-
 lib/librte_ethdev/meson.build | 8 +-
 lib/librte_ethdev/rte_class_eth.c | 173 ++
 lib/librte_ethdev/rte_ethdev.c | 463 ++---
 lib/librte_ethdev/rte_ethdev.h | 183 +-
 lib/librte_ethdev/rte_ethdev_core.h | 50 +-
 lib/librte_ethdev/rte_ethdev_driver.h | 37 +-
 lib/librte_ethdev/rte_ethdev_pci.h | 11 -
 lib/librte_ethdev/rte_ethdev_version.map | 17 +-
 lib/librte_ethdev/rte_flow.c | 686 ++++++--
 lib/librte_ethdev/rte_flow.h | 484 +++++-
 lib/librte_ethdev/rte_tm.h | 4 +-
 lib/librte_eventdev/Makefile | 4 +-
 lib/librte_eventdev/meson.build | 8 +-
 lib/librte_eventdev/rte_event_eth_rx_adapter.c | 6 +-
 lib/librte_eventdev/rte_event_eth_rx_adapter.h | 4 -
 lib/librte_eventdev/rte_event_eth_tx_adapter.c | 1138 ++++++++++++
 lib/librte_eventdev/rte_event_eth_tx_adapter.h | 462 +++++
 lib/librte_eventdev/rte_eventdev.c | 70 +-
 lib/librte_eventdev/rte_eventdev.h | 74 +-
 lib/librte_eventdev/rte_eventdev_pmd.h | 225 ++-
 lib/librte_eventdev/rte_eventdev_version.map | 13 +
 lib/librte_flow_classify/rte_flow_classify.c | 3 +-
 lib/librte_hash/rte_cuckoo_hash.c | 1061 +++++++++--
 lib/librte_hash/rte_cuckoo_hash.h | 34 +-
 lib/librte_hash/rte_hash.h | 85 +-
 lib/librte_hash/rte_hash_version.map | 7 +
 lib/librte_ip_frag/ip_frag_common.h | 23 +-
 lib/librte_ip_frag/ip_frag_internal.c | 18 -
 lib/librte_ip_frag/rte_ip_frag.h | 19 +-
 lib/librte_ip_frag/rte_ip_frag_common.c | 21 +
 lib/librte_ip_frag/rte_ip_frag_version.map | 6 +
 lib/librte_kni/rte_kni.c | 534 +++---
 lib/librte_kni/rte_kni.h | 26 +-
 lib/librte_kni/rte_kni_fifo.h | 43 +-
 lib/librte_kni/rte_kni_version.map | 6 +
 lib/librte_kvargs/rte_kvargs.c | 17 +
 lib/librte_kvargs/rte_kvargs.h | 7 +-
 lib/librte_latencystats/rte_latencystats.c | 7 +-
 lib/librte_lpm/Makefile | 2 +-
 lib/librte_lpm/meson.build | 1 +
 lib/librte_lpm/rte_lpm6.c | 1050 +++++++++---
 lib/librte_mbuf/meson.build | 2 +-
 lib/librte_mbuf/rte_mbuf.c | 53 +-
 lib/librte_mbuf/rte_mbuf.h | 130 +-
 lib/librte_mbuf/rte_mbuf_ptype.c | 3 +
 lib/librte_mbuf/rte_mbuf_ptype.h | 22 +
 lib/librte_mempool/rte_mempool.c | 57 +-
 lib/librte_net/Makefile | 2 +-
 lib/librte_net/meson.build | 3 +-
 lib/librte_net/net_crc_sse.h | 4 +-
 lib/librte_net/rte_ether.h | 2 +
 lib/librte_net/rte_mpls.h | 42 +
 lib/librte_net/rte_net.c | 21 +-
 lib/librte_net/rte_net.h | 20 +-
 lib/librte_pdump/Makefile | 2 -
 lib/librte_pipeline/Makefile | 2 +-
 lib/librte_pipeline/meson.build | 2 +-
 lib/librte_pipeline/rte_pipeline.c | 3 +-
 lib/librte_pipeline/rte_pipeline_version.map | 1 +
 lib/librte_pipeline/rte_table_action.c | 1048 ++++++++++-
 lib/librte_pipeline/rte_table_action.h | 203 +++
 lib/librte_port/Makefile | 4 +-
 lib/librte_port/meson.build | 8 +-
 lib/librte_port/rte_port_sym_crypto.c | 552 ++++++
 lib/librte_port/rte_port_sym_crypto.h | 93 +
 lib/librte_port/rte_port_version.map | 9 +
 lib/librte_power/Makefile | 5 +-
 lib/librte_power/channel_commands.h | 5 +
 lib/librte_power/meson.build | 6 +-
 lib/librte_power/rte_power_empty_poll.c | 545 ++++++
 lib/librte_power/rte_power_empty_poll.h | 223 +++
 lib/librte_power/rte_power_version.map | 13 +
 lib/librte_rawdev/rte_rawdev.c | 10 +-
 lib/librte_rawdev/rte_rawdev_pmd.h | 2 -
 lib/librte_ring/meson.build | 1 +
 lib/librte_ring/rte_ring.h | 4 +-
 lib/librte_sched/Makefile | 2 -
 lib/librte_sched/rte_sched.c | 5 +-
 lib/librte_security/rte_security.c | 4 +
 lib/librte_security/rte_security.h | 90 +
 lib/librte_table/Makefile | 2 +
 lib/librte_table/meson.build | 2 +
 lib/librte_table/rte_table_hash_func.h | 245 +++
 lib/librte_table/rte_table_hash_func_arm64.h | 21 +
 lib/librte_telemetry/Makefile | 30 +
 lib/librte_telemetry/meson.build | 15 +
 lib/librte_telemetry/rte_telemetry.c | 1813 ++++++++++++++++++++
 lib/librte_telemetry/rte_telemetry.h | 66 +
 lib/librte_telemetry/rte_telemetry_internal.h | 81 +
 lib/librte_telemetry/rte_telemetry_parser.c | 586 +++++++
 lib/librte_telemetry/rte_telemetry_parser.h | 14 +
 lib/librte_telemetry/rte_telemetry_parser_test.c | 534 ++++++
 lib/librte_telemetry/rte_telemetry_parser_test.h | 39 +
 lib/librte_telemetry/rte_telemetry_socket_tests.h | 36 +
 lib/librte_telemetry/rte_telemetry_version.map | 10 +
 lib/librte_vhost/Makefile | 5 +-
 lib/librte_vhost/meson.build | 3 +
 lib/librte_vhost/rte_vdpa.h | 97 +-
 lib/librte_vhost/rte_vhost.h | 5 +
 lib/librte_vhost/rte_vhost_version.map | 1 +
 lib/librte_vhost/socket.c | 46 +-
 lib/librte_vhost/vdpa.c | 6 +
 lib/librte_vhost/vhost.c | 26 +-
 lib/librte_vhost/vhost.h | 34 +-
 lib/librte_vhost/vhost_crypto.c | 25 +-
 lib/librte_vhost/vhost_user.c | 683 +++++---
 lib/librte_vhost/vhost_user.h | 12 +-
 lib/librte_vhost/virtio_net.c | 52 +-
 lib/meson.build | 31 +-
 200 files changed, 17096 insertions(+), 2700 deletions(-)
 create mode 100644 lib/librte_eal/common/arch/ppc_64/meson.build
 create mode 100644 lib/librte_eal/common/hotplug_mp.c
 create mode 100644 lib/librte_eal/common/hotplug_mp.h
 create mode 100644 lib/librte_eal/common/include/arch/ppc_64/meson.build
 create mode 100644 lib/librte_eal/common/include/rte_option.h
 create mode 100644 lib/librte_eal/common/rte_option.c
 create mode 100644 lib/librte_ethdev/ethdev_private.c
 create mode 100644 lib/librte_ethdev/ethdev_private.h
 create mode 100644 lib/librte_ethdev/rte_class_eth.c
 create mode 100644 lib/librte_eventdev/rte_event_eth_tx_adapter.c
 create mode 100644 lib/librte_eventdev/rte_event_eth_tx_adapter.h
 create mode 100644 lib/librte_net/rte_mpls.h
 create mode 100644 lib/librte_port/rte_port_sym_crypto.c
 create mode 100644 lib/librte_port/rte_port_sym_crypto.h
 create mode 100644 lib/librte_power/rte_power_empty_poll.c
 create mode 100644 lib/librte_power/rte_power_empty_poll.h
 create mode 100644 lib/librte_table/rte_table_hash_func.h
 create mode 100644 lib/librte_table/rte_table_hash_func_arm64.h
 create mode 100644 lib/librte_telemetry/Makefile
 create mode 100644 lib/librte_telemetry/meson.build
 create mode 100644 lib/librte_telemetry/rte_telemetry.c
 create mode 100644 lib/librte_telemetry/rte_telemetry.h
 create mode 100644 lib/librte_telemetry/rte_telemetry_internal.h
 create mode 100644 lib/librte_telemetry/rte_telemetry_parser.c
 create mode 100644 lib/librte_telemetry/rte_telemetry_parser.h
 create mode 100644 lib/librte_telemetry/rte_telemetry_parser_test.c
 create mode 100644
lib/librte_telemetry/rte_telemetry_parser_test.h create mode 100644 lib/librte_telemetry/rte_telemetry_socket_tests.h create mode 100644 lib/librte_telemetry/rte_telemetry_version.map (limited to 'lib') diff --git a/lib/Makefile b/lib/Makefile index afa604e2..b7370ef9 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -25,6 +25,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_ETHER) += librte_ethdev DEPDIRS-librte_ethdev := librte_net librte_eal librte_mempool librte_ring DEPDIRS-librte_ethdev += librte_mbuf DEPDIRS-librte_ethdev += librte_kvargs +DEPDIRS-librte_ethdev += librte_cmdline DIRS-$(CONFIG_RTE_LIBRTE_BBDEV) += librte_bbdev DEPDIRS-librte_bbdev := librte_eal librte_mempool librte_mbuf DIRS-$(CONFIG_RTE_LIBRTE_CRYPTODEV) += librte_cryptodev @@ -50,7 +51,7 @@ DEPDIRS-librte_hash := librte_eal librte_ring DIRS-$(CONFIG_RTE_LIBRTE_EFD) += librte_efd DEPDIRS-librte_efd := librte_eal librte_ring librte_hash DIRS-$(CONFIG_RTE_LIBRTE_LPM) += librte_lpm -DEPDIRS-librte_lpm := librte_eal +DEPDIRS-librte_lpm := librte_eal librte_hash DIRS-$(CONFIG_RTE_LIBRTE_ACL) += librte_acl DEPDIRS-librte_acl := librte_eal DIRS-$(CONFIG_RTE_LIBRTE_MEMBER) += librte_member @@ -71,7 +72,7 @@ DEPDIRS-librte_bitratestats := librte_eal librte_metrics librte_ethdev DIRS-$(CONFIG_RTE_LIBRTE_LATENCY_STATS) += librte_latencystats DEPDIRS-librte_latencystats := librte_eal librte_metrics librte_ethdev librte_mbuf DIRS-$(CONFIG_RTE_LIBRTE_POWER) += librte_power -DEPDIRS-librte_power := librte_eal +DEPDIRS-librte_power := librte_eal librte_timer DIRS-$(CONFIG_RTE_LIBRTE_METER) += librte_meter DEPDIRS-librte_meter := librte_eal DIRS-$(CONFIG_RTE_LIBRTE_FLOW_CLASSIFY) += librte_flow_classify @@ -105,6 +106,8 @@ DEPDIRS-librte_gso := librte_eal librte_mbuf librte_ethdev librte_net DEPDIRS-librte_gso += librte_mempool DIRS-$(CONFIG_RTE_LIBRTE_BPF) += librte_bpf DEPDIRS-librte_bpf := librte_eal librte_mempool librte_mbuf librte_ethdev +DIRS-$(CONFIG_RTE_LIBRTE_TELEMETRY) += librte_telemetry +DEPDIRS-librte_telemetry := librte_eal librte_metrics librte_ethdev ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y) DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni diff --git a/lib/librte_acl/rte_acl.c b/lib/librte_acl/rte_acl.c index 2f1243cd..db7d3221 100644 --- a/lib/librte_acl/rte_acl.c +++ b/lib/librte_acl/rte_acl.c @@ -16,7 +16,7 @@ EAL_REGISTER_TAILQ(rte_acl_tailq) * If the compiler doesn't support AVX2 instructions, * then the dummy one would be used instead for AVX2 classify method. 
*/ -int __attribute__ ((weak)) +__rte_weak int rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx, __rte_unused const uint8_t **data, __rte_unused uint32_t *results, @@ -26,7 +26,7 @@ rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx, return -ENOTSUP; } -int __attribute__ ((weak)) +__rte_weak int rte_acl_classify_sse(__rte_unused const struct rte_acl_ctx *ctx, __rte_unused const uint8_t **data, __rte_unused uint32_t *results, @@ -36,7 +36,7 @@ rte_acl_classify_sse(__rte_unused const struct rte_acl_ctx *ctx, return -ENOTSUP; } -int __attribute__ ((weak)) +__rte_weak int rte_acl_classify_neon(__rte_unused const struct rte_acl_ctx *ctx, __rte_unused const uint8_t **data, __rte_unused uint32_t *results, @@ -46,7 +46,7 @@ rte_acl_classify_neon(__rte_unused const struct rte_acl_ctx *ctx, return -ENOTSUP; } -int __attribute__ ((weak)) +__rte_weak int rte_acl_classify_altivec(__rte_unused const struct rte_acl_ctx *ctx, __rte_unused const uint8_t **data, __rte_unused uint32_t *results, diff --git a/lib/librte_acl/rte_acl.h b/lib/librte_acl/rte_acl.h index 34c3b9c6..aa22e70c 100644 --- a/lib/librte_acl/rte_acl.h +++ b/lib/librte_acl/rte_acl.h @@ -88,7 +88,7 @@ enum { RTE_ACL_TYPE_SHIFT = 29, RTE_ACL_MAX_INDEX = RTE_LEN2MASK(RTE_ACL_TYPE_SHIFT, uint32_t), RTE_ACL_MAX_PRIORITY = RTE_ACL_MAX_INDEX, - RTE_ACL_MIN_PRIORITY = 0, + RTE_ACL_MIN_PRIORITY = 1, }; #define RTE_ACL_MASKLEN_TO_BITMASK(v, s) \ diff --git a/lib/librte_bpf/bpf_load.c b/lib/librte_bpf/bpf_load.c index 2b84fe72..d9d163b7 100644 --- a/lib/librte_bpf/bpf_load.c +++ b/lib/librte_bpf/bpf_load.c @@ -131,7 +131,7 @@ rte_bpf_load(const struct rte_bpf_prm *prm) return bpf; } -__rte_experimental __attribute__ ((weak)) struct rte_bpf * +__rte_experimental __rte_weak struct rte_bpf * rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname, const char *sname) { diff --git a/lib/librte_bpf/rte_bpf_ethdev.h b/lib/librte_bpf/rte_bpf_ethdev.h index 31731e7a..11d09cdc 100644 --- a/lib/librte_bpf/rte_bpf_ethdev.h +++ b/lib/librte_bpf/rte_bpf_ethdev.h @@ -75,7 +75,7 @@ rte_bpf_eth_tx_unload(uint16_t port, uint16_t queue); * @param prm * Parameters used to create and initialise the BPF exeution context. * @param flags - * Flags that define expected expected behavior of the loaded filter + * Flags that define expected behavior of the loaded filter * (i.e. jited/non-jited version to use). * @return * Zero on successful completion or negative error code otherwise. diff --git a/lib/librte_cmdline/Makefile b/lib/librte_cmdline/Makefile index ddae1cfd..c64142b8 100644 --- a/lib/librte_cmdline/Makefile +++ b/lib/librte_cmdline/Makefile @@ -25,7 +25,6 @@ SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_vt100.c SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_socket.c SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_portlist.c -CFLAGS += -D_GNU_SOURCE LDLIBS += -lrte_eal # install includes diff --git a/lib/librte_cmdline/cmdline.c b/lib/librte_cmdline/cmdline.c index 591b78b0..d9042f04 100644 --- a/lib/librte_cmdline/cmdline.c +++ b/lib/librte_cmdline/cmdline.c @@ -126,35 +126,11 @@ cmdline_printf(const struct cmdline *cl, const char *fmt, ...) 
if (!cl || !fmt) return; -#ifdef _GNU_SOURCE if (cl->s_out < 0) return; va_start(ap, fmt); vdprintf(cl->s_out, fmt, ap); va_end(ap); -#else - int ret; - char *buf; - - if (cl->s_out < 0) - return; - - buf = malloc(BUFSIZ); - if (buf == NULL) - return; - va_start(ap, fmt); - ret = vsnprintf(buf, BUFSIZ, fmt, ap); - va_end(ap); - if (ret < 0) { - free(buf); - return; - } - if (ret >= BUFSIZ) - ret = BUFSIZ - 1; - ret = write(cl->s_out, buf, ret); - (void)ret; - free(buf); -#endif } int diff --git a/lib/librte_cmdline/meson.build b/lib/librte_cmdline/meson.build index 5741817a..30498906 100644 --- a/lib/librte_cmdline/meson.build +++ b/lib/librte_cmdline/meson.build @@ -1,6 +1,10 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2017 Intel Corporation +# This library is processed before EAL +includes = [global_inc] +includes += include_directories('../librte_eal/common/include') + version = 2 sources = files('cmdline.c', 'cmdline_cirbuf.c', diff --git a/lib/librte_compressdev/rte_comp.c b/lib/librte_compressdev/rte_comp.c index 98ad0cfd..c663be59 100644 --- a/lib/librte_compressdev/rte_comp.c +++ b/lib/librte_compressdev/rte_comp.c @@ -83,8 +83,8 @@ struct rte_comp_op_pool_private { * @param nb_ops * Number of operations to allocate * @return - * - 0: Success - * - -ENOENT: Not enough entries in the mempool; no ops are retrieved. + * - nb_ops: Success, the nb_ops requested was allocated + * - 0: Not enough entries in the mempool; no ops are retrieved. */ static inline int rte_comp_op_raw_bulk_alloc(struct rte_mempool *mempool, diff --git a/lib/librte_compressdev/rte_comp.h b/lib/librte_compressdev/rte_comp.h index ee9056ea..395ce29f 100644 --- a/lib/librte_compressdev/rte_comp.h +++ b/lib/librte_compressdev/rte_comp.h @@ -448,8 +448,8 @@ rte_comp_op_alloc(struct rte_mempool *mempool); * @param nb_ops * Number of operations to allocate * @return - * - 0: Success - * - -ENOENT: Not enough entries in the mempool; no ops are retrieved. + * - nb_ops: Success, the nb_ops requested was allocated + * - 0: Not enough entries in the mempool; no ops are retrieved. 
*/ int __rte_experimental rte_comp_op_bulk_alloc(struct rte_mempool *mempool, diff --git a/lib/librte_compressdev/rte_compressdev.c b/lib/librte_compressdev/rte_compressdev.c index 9091dd6e..10101ebb 100644 --- a/lib/librte_compressdev/rte_compressdev.c +++ b/lib/librte_compressdev/rte_compressdev.c @@ -18,19 +18,15 @@ #define RTE_COMPRESSDEV_DETACHED (0) #define RTE_COMPRESSDEV_ATTACHED (1) -struct rte_compressdev rte_comp_devices[RTE_COMPRESS_MAX_DEVS]; - -struct rte_compressdev *rte_compressdevs = &rte_comp_devices[0]; +static struct rte_compressdev rte_comp_devices[RTE_COMPRESS_MAX_DEVS]; static struct rte_compressdev_global compressdev_globals = { - .devs = &rte_comp_devices[0], + .devs = rte_comp_devices, .data = { NULL }, .nb_devs = 0, .max_devs = RTE_COMPRESS_MAX_DEVS }; -struct rte_compressdev_global *rte_compressdev_globals = &compressdev_globals; - const struct rte_compressdev_capabilities * __rte_experimental rte_compressdev_capability_get(uint8_t dev_id, enum rte_comp_algorithm algo) @@ -78,7 +74,7 @@ rte_compressdev_get_feature_name(uint64_t flag) static struct rte_compressdev * rte_compressdev_get_dev(uint8_t dev_id) { - return &rte_compressdev_globals->devs[dev_id]; + return &compressdev_globals.devs[dev_id]; } struct rte_compressdev * __rte_experimental @@ -90,8 +86,8 @@ rte_compressdev_pmd_get_named_dev(const char *name) if (name == NULL) return NULL; - for (i = 0; i < rte_compressdev_globals->max_devs; i++) { - dev = &rte_compressdev_globals->devs[i]; + for (i = 0; i < compressdev_globals.max_devs; i++) { + dev = &compressdev_globals.devs[i]; if ((dev->attached == RTE_COMPRESSDEV_ATTACHED) && (strcmp(dev->data->name, name) == 0)) @@ -106,7 +102,7 @@ rte_compressdev_is_valid_dev(uint8_t dev_id) { struct rte_compressdev *dev = NULL; - if (dev_id >= rte_compressdev_globals->nb_devs) + if (dev_id >= compressdev_globals.nb_devs) return 0; dev = rte_compressdev_get_dev(dev_id); @@ -125,10 +121,10 @@ rte_compressdev_get_dev_id(const char *name) if (name == NULL) return -1; - for (i = 0; i < rte_compressdev_globals->nb_devs; i++) - if ((strcmp(rte_compressdev_globals->devs[i].data->name, name) + for (i = 0; i < compressdev_globals.nb_devs; i++) + if ((strcmp(compressdev_globals.devs[i].data->name, name) == 0) && - (rte_compressdev_globals->devs[i].attached == + (compressdev_globals.devs[i].attached == RTE_COMPRESSDEV_ATTACHED)) return i; @@ -138,7 +134,7 @@ rte_compressdev_get_dev_id(const char *name) uint8_t __rte_experimental rte_compressdev_count(void) { - return rte_compressdev_globals->nb_devs; + return compressdev_globals.nb_devs; } uint8_t __rte_experimental @@ -146,8 +142,8 @@ rte_compressdev_devices_get(const char *driver_name, uint8_t *devices, uint8_t nb_devices) { uint8_t i, count = 0; - struct rte_compressdev *devs = rte_compressdev_globals->devs; - uint8_t max_devs = rte_compressdev_globals->max_devs; + struct rte_compressdev *devs = compressdev_globals.devs; + uint8_t max_devs = compressdev_globals.max_devs; for (i = 0; i < max_devs && count < nb_devices; i++) { @@ -578,7 +574,7 @@ uint16_t __rte_experimental rte_compressdev_dequeue_burst(uint8_t dev_id, uint16_t qp_id, struct rte_comp_op **ops, uint16_t nb_ops) { - struct rte_compressdev *dev = &rte_compressdevs[dev_id]; + struct rte_compressdev *dev = &rte_comp_devices[dev_id]; nb_ops = (*dev->dequeue_burst) (dev->data->queue_pairs[qp_id], ops, nb_ops); @@ -590,7 +586,7 @@ uint16_t __rte_experimental rte_compressdev_enqueue_burst(uint8_t dev_id, uint16_t qp_id, struct rte_comp_op **ops, uint16_t nb_ops) { - 
struct rte_compressdev *dev = &rte_compressdevs[dev_id]; + struct rte_compressdev *dev = &rte_comp_devices[dev_id]; return (*dev->enqueue_burst)( dev->data->queue_pairs[qp_id], ops, nb_ops); diff --git a/lib/librte_compressdev/rte_compressdev_pmd.c b/lib/librte_compressdev/rte_compressdev_pmd.c index 7de4f339..95beb26a 100644 --- a/lib/librte_compressdev/rte_compressdev_pmd.c +++ b/lib/librte_compressdev/rte_compressdev_pmd.c @@ -92,24 +92,20 @@ rte_compressdev_pmd_create(const char *name, struct rte_compressdev *compressdev; if (params->name[0] != '\0') { - COMPRESSDEV_LOG(INFO, "[%s] User specified device name = %s\n", - device->driver->name, params->name); + COMPRESSDEV_LOG(INFO, "User specified device name = %s\n", + params->name); name = params->name; } - COMPRESSDEV_LOG(INFO, "[%s] - Creating compressdev %s\n", - device->driver->name, name); + COMPRESSDEV_LOG(INFO, "Creating compressdev %s\n", name); - COMPRESSDEV_LOG(INFO, - "[%s] - Init parameters - name: %s, socket id: %d", - device->driver->name, name, - params->socket_id); + COMPRESSDEV_LOG(INFO, "Init parameters - name: %s, socket id: %d", + name, params->socket_id); /* allocate device structure */ compressdev = rte_compressdev_pmd_allocate(name, params->socket_id); if (compressdev == NULL) { - COMPRESSDEV_LOG(ERR, "[%s] Failed to allocate comp device %s", - device->driver->name, name); + COMPRESSDEV_LOG(ERR, "Failed to allocate comp device %s", name); return NULL; } @@ -123,8 +119,8 @@ rte_compressdev_pmd_create(const char *name, if (compressdev->data->dev_private == NULL) { COMPRESSDEV_LOG(ERR, - "[%s] Cannot allocate memory for compressdev %s private data", - device->driver->name, name); + "Cannot allocate memory for compressdev" + " %s private data", name); rte_compressdev_pmd_release_device(compressdev); return NULL; @@ -141,8 +137,7 @@ rte_compressdev_pmd_destroy(struct rte_compressdev *compressdev) { int retval; - COMPRESSDEV_LOG(INFO, "[%s] Closing comp device %s", - compressdev->device->driver->name, + COMPRESSDEV_LOG(INFO, "Closing comp device %s", compressdev->device->name); /* free comp device */ diff --git a/lib/librte_compressdev/rte_compressdev_pmd.h b/lib/librte_compressdev/rte_compressdev_pmd.h index 38e9ea02..043353c9 100644 --- a/lib/librte_compressdev/rte_compressdev_pmd.h +++ b/lib/librte_compressdev/rte_compressdev_pmd.h @@ -51,11 +51,6 @@ struct rte_compressdev_global { uint8_t max_devs; /**< Max number of devices */ }; -/** Pointer to global array of comp devices */ -extern struct rte_compressdev *rte_compressdevs; -/** Pointer to global comp devices data structure */ -extern struct rte_compressdev_global *rte_compressdev_globals; - /** * Get the rte_compressdev structure device pointer for the named device. 
* diff --git a/lib/librte_cryptodev/Makefile b/lib/librte_cryptodev/Makefile index c1148887..a8f94c09 100644 --- a/lib/librte_cryptodev/Makefile +++ b/lib/librte_cryptodev/Makefile @@ -7,7 +7,7 @@ include $(RTE_SDK)/mk/rte.vars.mk LIB = librte_cryptodev.a # library version -LIBABIVER := 4 +LIBABIVER := 5 # build flags CFLAGS += -O3 diff --git a/lib/librte_cryptodev/meson.build b/lib/librte_cryptodev/meson.build index 295f509e..990dd3d4 100644 --- a/lib/librte_cryptodev/meson.build +++ b/lib/librte_cryptodev/meson.build @@ -1,7 +1,7 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2017 Intel Corporation -version = 4 +version = 5 sources = files('rte_cryptodev.c', 'rte_cryptodev_pmd.c') headers = files('rte_cryptodev.h', 'rte_cryptodev_pmd.h', diff --git a/lib/librte_cryptodev/rte_cryptodev.c b/lib/librte_cryptodev/rte_cryptodev.c index 63ae23f0..a52eaaa4 100644 --- a/lib/librte_cryptodev/rte_cryptodev.c +++ b/lib/librte_cryptodev/rte_cryptodev.c @@ -43,19 +43,17 @@ static uint8_t nb_drivers; -struct rte_cryptodev rte_crypto_devices[RTE_CRYPTO_MAX_DEVS]; +static struct rte_cryptodev rte_crypto_devices[RTE_CRYPTO_MAX_DEVS]; -struct rte_cryptodev *rte_cryptodevs = &rte_crypto_devices[0]; +struct rte_cryptodev *rte_cryptodevs = rte_crypto_devices; static struct rte_cryptodev_global cryptodev_globals = { - .devs = &rte_crypto_devices[0], + .devs = rte_crypto_devices, .data = { NULL }, .nb_devs = 0, .max_devs = RTE_CRYPTO_MAX_DEVS }; -struct rte_cryptodev_global *rte_cryptodev_globals = &cryptodev_globals; - /* spinlock for crypto device callbacks */ static rte_spinlock_t rte_cryptodev_cb_lock = RTE_SPINLOCK_INITIALIZER; @@ -486,7 +484,7 @@ rte_cryptodev_get_feature_name(uint64_t flag) struct rte_cryptodev * rte_cryptodev_pmd_get_dev(uint8_t dev_id) { - return &rte_cryptodev_globals->devs[dev_id]; + return &cryptodev_globals.devs[dev_id]; } struct rte_cryptodev * @@ -498,8 +496,8 @@ rte_cryptodev_pmd_get_named_dev(const char *name) if (name == NULL) return NULL; - for (i = 0; i < rte_cryptodev_globals->max_devs; i++) { - dev = &rte_cryptodev_globals->devs[i]; + for (i = 0; i < cryptodev_globals.max_devs; i++) { + dev = &cryptodev_globals.devs[i]; if ((dev->attached == RTE_CRYPTODEV_ATTACHED) && (strcmp(dev->data->name, name) == 0)) @@ -514,7 +512,7 @@ rte_cryptodev_pmd_is_valid_dev(uint8_t dev_id) { struct rte_cryptodev *dev = NULL; - if (dev_id >= rte_cryptodev_globals->nb_devs) + if (dev_id >= cryptodev_globals.nb_devs) return 0; dev = rte_cryptodev_pmd_get_dev(dev_id); @@ -533,10 +531,10 @@ rte_cryptodev_get_dev_id(const char *name) if (name == NULL) return -1; - for (i = 0; i < rte_cryptodev_globals->nb_devs; i++) - if ((strcmp(rte_cryptodev_globals->devs[i].data->name, name) + for (i = 0; i < cryptodev_globals.nb_devs; i++) + if ((strcmp(cryptodev_globals.devs[i].data->name, name) == 0) && - (rte_cryptodev_globals->devs[i].attached == + (cryptodev_globals.devs[i].attached == RTE_CRYPTODEV_ATTACHED)) return i; @@ -546,7 +544,7 @@ rte_cryptodev_get_dev_id(const char *name) uint8_t rte_cryptodev_count(void) { - return rte_cryptodev_globals->nb_devs; + return cryptodev_globals.nb_devs; } uint8_t @@ -554,9 +552,9 @@ rte_cryptodev_device_count_by_driver(uint8_t driver_id) { uint8_t i, dev_count = 0; - for (i = 0; i < rte_cryptodev_globals->max_devs; i++) - if (rte_cryptodev_globals->devs[i].driver_id == driver_id && - rte_cryptodev_globals->devs[i].attached == + for (i = 0; i < cryptodev_globals.max_devs; i++) + if (cryptodev_globals.devs[i].driver_id == driver_id && + 
cryptodev_globals.devs[i].attached == RTE_CRYPTODEV_ATTACHED) dev_count++; @@ -568,8 +566,8 @@ rte_cryptodev_devices_get(const char *driver_name, uint8_t *devices, uint8_t nb_devices) { uint8_t i, count = 0; - struct rte_cryptodev *devs = rte_cryptodev_globals->devs; - uint8_t max_devs = rte_cryptodev_globals->max_devs; + struct rte_cryptodev *devs = cryptodev_globals.devs; + uint8_t max_devs = cryptodev_globals.max_devs; for (i = 0; i < max_devs && count < nb_devices; i++) { @@ -1477,6 +1475,9 @@ rte_crypto_op_pool_create(const char *name, enum rte_crypto_op_type type, elt_size += sizeof(struct rte_crypto_sym_op); } else if (type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC) { elt_size += sizeof(struct rte_crypto_asym_op); + } else if (type == RTE_CRYPTO_OP_TYPE_UNDEFINED) { + elt_size += RTE_MAX(sizeof(struct rte_crypto_sym_op), + sizeof(struct rte_crypto_asym_op)); } else { CDEV_LOG_ERR("Invalid op_type\n"); return NULL; diff --git a/lib/librte_cryptodev/rte_cryptodev_pmd.c b/lib/librte_cryptodev/rte_cryptodev_pmd.c index 2088ac3f..f03bdbd5 100644 --- a/lib/librte_cryptodev/rte_cryptodev_pmd.c +++ b/lib/librte_cryptodev/rte_cryptodev_pmd.c @@ -93,24 +93,20 @@ rte_cryptodev_pmd_create(const char *name, struct rte_cryptodev *cryptodev; if (params->name[0] != '\0') { - CDEV_LOG_INFO("[%s] User specified device name = %s\n", - device->driver->name, params->name); + CDEV_LOG_INFO("User specified device name = %s\n", params->name); name = params->name; } - CDEV_LOG_INFO("[%s] - Creating cryptodev %s\n", - device->driver->name, name); + CDEV_LOG_INFO("Creating cryptodev %s\n", name); - CDEV_LOG_INFO("[%s] - Initialisation parameters - name: %s," + CDEV_LOG_INFO("Initialisation parameters - name: %s," "socket id: %d, max queue pairs: %u", - device->driver->name, name, - params->socket_id, params->max_nb_queue_pairs); + name, params->socket_id, params->max_nb_queue_pairs); /* allocate device structure */ cryptodev = rte_cryptodev_pmd_allocate(name, params->socket_id); if (cryptodev == NULL) { - CDEV_LOG_ERR("[%s] Failed to allocate crypto device for %s", - device->driver->name, name); + CDEV_LOG_ERR("Failed to allocate crypto device for %s", name); return NULL; } @@ -123,9 +119,8 @@ rte_cryptodev_pmd_create(const char *name, params->socket_id); if (cryptodev->data->dev_private == NULL) { - CDEV_LOG_ERR("[%s] Cannot allocate memory for " - "cryptodev %s private data", - device->driver->name, name); + CDEV_LOG_ERR("Cannot allocate memory for cryptodev %s" + " private data", name); rte_cryptodev_pmd_release_device(cryptodev); return NULL; @@ -145,9 +140,7 @@ rte_cryptodev_pmd_destroy(struct rte_cryptodev *cryptodev) { int retval; - CDEV_LOG_INFO("[%s] Closing crypto device %s", - cryptodev->device->driver->name, - cryptodev->device->name); + CDEV_LOG_INFO("Closing crypto device %s", cryptodev->device->name); /* free crypto device */ retval = rte_cryptodev_pmd_release_device(cryptodev); diff --git a/lib/librte_cryptodev/rte_cryptodev_pmd.h b/lib/librte_cryptodev/rte_cryptodev_pmd.h index 6ff49d64..1b6cafd1 100644 --- a/lib/librte_cryptodev/rte_cryptodev_pmd.h +++ b/lib/librte_cryptodev/rte_cryptodev_pmd.h @@ -71,9 +71,6 @@ struct cryptodev_driver { uint8_t id; }; -/** pointer to global crypto devices data structure. */ -extern struct rte_cryptodev_global *rte_cryptodev_globals; - /** * Get the rte_cryptodev structure device pointer for the device. Assumes a * valid device index. 
diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile index d27da3d1..bfeddaad 100644 --- a/lib/librte_eal/bsdapp/eal/Makefile +++ b/lib/librte_eal/bsdapp/eal/Makefile @@ -22,7 +22,7 @@ LDLIBS += -lrte_kvargs EXPORT_MAP := ../../rte_eal_version.map -LIBABIVER := 8 +LIBABIVER := 9 # specific to bsdapp exec-env SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) := eal.c @@ -62,10 +62,12 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_proc.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_fbarray.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_uuid.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_malloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += hotplug_mp.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_elem.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_heap.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_mp.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_keepalive.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_option.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_service.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_reciprocal.c @@ -77,11 +79,6 @@ SRCS-y += rte_cycles.c CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) -CFLAGS_eal.o := -D_GNU_SOURCE -#CFLAGS_eal_thread.o := -D_GNU_SOURCE -CFLAGS_eal_log.o := -D_GNU_SOURCE -CFLAGS_eal_common_log.o := -D_GNU_SOURCE - # workaround for a gcc bug with noreturn attribute # http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c index d7ae9d68..508cbc46 100644 --- a/lib/librte_eal/bsdapp/eal/eal.c +++ b/lib/librte_eal/bsdapp/eal/eal.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -141,7 +142,7 @@ eal_create_runtime_dir(void) } const char * -eal_get_runtime_dir(void) +rte_eal_get_runtime_dir(void) { return runtime_dir; } @@ -414,12 +415,20 @@ eal_parse_args(int argc, char **argv) argvopt = argv; optind = 1; optreset = 1; + opterr = 0; while ((opt = getopt_long(argc, argvopt, eal_short_options, eal_long_options, &option_index)) != EOF) { - /* getopt is not happy, stop right now */ + /* + * getopt didn't recognise the option, lets parse the + * registered options to see if the flag is valid + */ if (opt == '?') { + ret = rte_option_parse(argv[optind-1]); + if (ret == 0) + continue; + eal_usage(prgname); ret = -1; goto out; @@ -502,6 +511,9 @@ check_socket(const struct rte_memseg_list *msl, void *arg) { int *socket_id = arg; + if (msl->external) + return 0; + if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0) return 1; @@ -607,7 +619,7 @@ rte_eal_init(int argc, char **argv) internal_config.legacy_mem = true; if (eal_plugins_init() < 0) { - rte_eal_init_alert("Cannot init plugins\n"); + rte_eal_init_alert("Cannot init plugins"); rte_errno = EINVAL; rte_atomic32_clear(&run_once); return -1; @@ -622,7 +634,7 @@ rte_eal_init(int argc, char **argv) rte_config_init(); if (rte_eal_intr_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + rte_eal_init_alert("Cannot init interrupt-handling thread"); return -1; } @@ -630,7 +642,7 @@ rte_eal_init(int argc, char **argv) * bus through mp channel in the secondary process before the bus scan. 
*/ if (rte_mp_channel_init() < 0) { - rte_eal_init_alert("failed to init mp channel\n"); + rte_eal_init_alert("failed to init mp channel"); if (rte_eal_process_type() == RTE_PROC_PRIMARY) { rte_errno = EFAULT; return -1; @@ -638,14 +650,21 @@ rte_eal_init(int argc, char **argv) } if (rte_bus_scan()) { - rte_eal_init_alert("Cannot scan the buses for devices\n"); + rte_eal_init_alert("Cannot scan the buses for devices"); rte_errno = ENODEV; rte_atomic32_clear(&run_once); return -1; } - /* autodetect the iova mapping mode (default is iova_pa) */ - rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class(); + /* if no EAL option "--iova-mode=", use bus IOVA scheme */ + if (internal_config.iova_mode == RTE_IOVA_DC) { + /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */ + rte_eal_get_configuration()->iova_mode = + rte_bus_get_iommu_class(); + } else { + rte_eal_get_configuration()->iova_mode = + internal_config.iova_mode; + } if (internal_config.no_hugetlbfs == 0) { /* rte_config isn't initialized yet */ @@ -685,37 +704,37 @@ rte_eal_init(int argc, char **argv) * initialize memzones first. */ if (rte_eal_memzone_init() < 0) { - rte_eal_init_alert("Cannot init memzone\n"); + rte_eal_init_alert("Cannot init memzone"); rte_errno = ENODEV; return -1; } if (rte_eal_memory_init() < 0) { - rte_eal_init_alert("Cannot init memory\n"); + rte_eal_init_alert("Cannot init memory"); rte_errno = ENOMEM; return -1; } if (rte_eal_malloc_heap_init() < 0) { - rte_eal_init_alert("Cannot init malloc heap\n"); + rte_eal_init_alert("Cannot init malloc heap"); rte_errno = ENODEV; return -1; } if (rte_eal_tailqs_init() < 0) { - rte_eal_init_alert("Cannot init tail queues for objects\n"); + rte_eal_init_alert("Cannot init tail queues for objects"); rte_errno = EFAULT; return -1; } if (rte_eal_alarm_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + rte_eal_init_alert("Cannot init interrupt-handling thread"); /* rte_eal_alarm_init sets rte_errno on failure. */ return -1; } if (rte_eal_timer_init() < 0) { - rte_eal_init_alert("Cannot init HPET or TSC timers\n"); + rte_eal_init_alert("Cannot init HPET or TSC timers"); rte_errno = ENOTSUP; return -1; } @@ -765,14 +784,14 @@ rte_eal_init(int argc, char **argv) /* initialize services so vdevs register service during bus_probe. 
*/ ret = rte_service_init(); if (ret) { - rte_eal_init_alert("rte_service_init() failed\n"); + rte_eal_init_alert("rte_service_init() failed"); rte_errno = ENOEXEC; return -1; } /* Probe all the buses and devices/drivers on them */ if (rte_bus_probe()) { - rte_eal_init_alert("Cannot probe devices\n"); + rte_eal_init_alert("Cannot probe devices"); rte_errno = ENOTSUP; return -1; } @@ -788,6 +807,9 @@ rte_eal_init(int argc, char **argv) rte_eal_mcfg_complete(); + /* Call each registered callback, if enabled */ + rte_option_init(); + return fctret; } diff --git a/lib/librte_eal/bsdapp/eal/eal_dev.c b/lib/librte_eal/bsdapp/eal/eal_dev.c index 1c6c51bd..255d611e 100644 --- a/lib/librte_eal/bsdapp/eal/eal_dev.c +++ b/lib/librte_eal/bsdapp/eal/eal_dev.c @@ -19,3 +19,17 @@ rte_dev_event_monitor_stop(void) RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); return -1; } + +int __rte_experimental +rte_dev_hotplug_handle_enable(void) +{ + RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); + return -1; +} + +int __rte_experimental +rte_dev_hotplug_handle_disable(void) +{ + RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); + return -1; +} diff --git a/lib/librte_eal/bsdapp/eal/eal_memalloc.c b/lib/librte_eal/bsdapp/eal/eal_memalloc.c index f7f07abd..a5847f0b 100644 --- a/lib/librte_eal/bsdapp/eal/eal_memalloc.c +++ b/lib/librte_eal/bsdapp/eal/eal_memalloc.c @@ -4,6 +4,7 @@ #include +#include #include #include @@ -47,6 +48,26 @@ eal_memalloc_sync_with_primary(void) return -1; } +int +eal_memalloc_get_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused) +{ + return -ENOTSUP; +} + +int +eal_memalloc_set_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused, + int fd __rte_unused) +{ + return -ENOTSUP; +} + +int +eal_memalloc_get_seg_fd_offset(int list_idx __rte_unused, + int seg_idx __rte_unused, size_t *offset __rte_unused) +{ + return -ENOTSUP; +} + int eal_memalloc_init(void) { diff --git a/lib/librte_eal/bsdapp/eal/eal_memory.c b/lib/librte_eal/bsdapp/eal/eal_memory.c index 16d2bc7c..4b092e1f 100644 --- a/lib/librte_eal/bsdapp/eal/eal_memory.c +++ b/lib/librte_eal/bsdapp/eal/eal_memory.c @@ -79,6 +79,7 @@ rte_eal_hugepage_init(void) } msl->base_va = addr; msl->page_sz = page_sz; + msl->len = internal_config.memory; msl->socket_id = 0; /* populate memsegs. 
each memseg is 1 page long */ @@ -235,12 +236,15 @@ struct attach_walk_args { int seg_idx; }; static int -attach_segment(const struct rte_memseg_list *msl __rte_unused, - const struct rte_memseg *ms, void *arg) +attach_segment(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg) { struct attach_walk_args *wa = arg; void *addr; + if (msl->external) + return 0; + addr = mmap(ms->addr, ms->len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, wa->fd_hugepage, wa->seg_idx * EAL_PAGE_SIZE); @@ -370,6 +374,7 @@ alloc_va_space(struct rte_memseg_list *msl) return -1; } msl->base_va = addr; + msl->len = mem_sz; return 0; } diff --git a/lib/librte_eal/common/Makefile b/lib/librte_eal/common/Makefile index cca68826..87d8c455 100644 --- a/lib/librte_eal/common/Makefile +++ b/lib/librte_eal/common/Makefile @@ -12,6 +12,7 @@ INC += rte_tailq.h rte_interrupts.h rte_alarm.h INC += rte_string_fns.h rte_version.h INC += rte_eal_memconfig.h rte_malloc_heap.h INC += rte_hexdump.h rte_devargs.h rte_bus.h rte_dev.h rte_class.h +INC += rte_option.h INC += rte_pci_dev_feature_defs.h rte_pci_dev_features.h INC += rte_malloc.h rte_keepalive.h rte_time.h INC += rte_service.h rte_service_component.h diff --git a/lib/librte_eal/common/arch/arm/meson.build b/lib/librte_eal/common/arch/arm/meson.build index c6bd9227..79731e1a 100644 --- a/lib/librte_eal/common/arch/arm/meson.build +++ b/lib/librte_eal/common/arch/arm/meson.build @@ -2,4 +2,4 @@ # Copyright(c) 2017 Intel Corporation. eal_common_arch_sources = files('rte_cpuflags.c', - 'rte_cycles.c') + 'rte_cycles.c', 'rte_hypervisor.c') diff --git a/lib/librte_eal/common/arch/ppc_64/meson.build b/lib/librte_eal/common/arch/ppc_64/meson.build new file mode 100644 index 00000000..40b3dc53 --- /dev/null +++ b/lib/librte_eal/common/arch/ppc_64/meson.build @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Luca Boccassi + +eal_common_arch_sources = files('rte_cpuflags.c', + 'rte_cycles.c', 'rte_hypervisor.c') diff --git a/lib/librte_eal/common/arch/x86/meson.build b/lib/librte_eal/common/arch/x86/meson.build index 4e0f7790..14bf204c 100644 --- a/lib/librte_eal/common/arch/x86/meson.build +++ b/lib/librte_eal/common/arch/x86/meson.build @@ -2,4 +2,4 @@ # Copyright(c) 2017 Intel Corporation eal_common_arch_sources = files('rte_spinlock.c', 'rte_cpuflags.c', - 'rte_cycles.c') + 'rte_cycles.c', 'rte_hypervisor.c') diff --git a/lib/librte_eal/common/eal_common_bus.c b/lib/librte_eal/common/eal_common_bus.c index 0943851c..c8f1901f 100644 --- a/lib/librte_eal/common/eal_common_bus.c +++ b/lib/librte_eal/common/eal_common_bus.c @@ -37,10 +37,11 @@ #include #include #include +#include #include "eal_private.h" -struct rte_bus_list rte_bus_list = +static struct rte_bus_list rte_bus_list = TAILQ_HEAD_INITIALIZER(rte_bus_list); void @@ -242,3 +243,45 @@ rte_bus_get_iommu_class(void) } return mode; } + +static int +bus_handle_sigbus(const struct rte_bus *bus, + const void *failure_addr) +{ + int ret; + + if (!bus->sigbus_handler) + return -1; + + ret = bus->sigbus_handler(failure_addr); + + /* find bus but handle failed, keep the errno be set. */ + if (ret < 0 && rte_errno == 0) + rte_errno = ENOTSUP; + + return ret > 0; +} + +int +rte_bus_sigbus_handler(const void *failure_addr) +{ + struct rte_bus *bus; + + int ret = 0; + int old_errno = rte_errno; + + rte_errno = 0; + + bus = rte_bus_find(NULL, bus_handle_sigbus, failure_addr); + /* can not find bus. */ + if (!bus) + return 1; + /* find bus but handle failed, pass on the new errno. 
*/ + else if (rte_errno != 0) + return -1; + + /* restore the old errno. */ + rte_errno = old_errno; + + return ret; +} diff --git a/lib/librte_eal/common/eal_common_class.c b/lib/librte_eal/common/eal_common_class.c index 404a9065..d922266d 100644 --- a/lib/librte_eal/common/eal_common_class.c +++ b/lib/librte_eal/common/eal_common_class.c @@ -9,7 +9,7 @@ #include #include -struct rte_class_list rte_class_list = +static struct rte_class_list rte_class_list = TAILQ_HEAD_INITIALIZER(rte_class_list); __rte_experimental void diff --git a/lib/librte_eal/common/eal_common_dev.c b/lib/librte_eal/common/eal_common_dev.c index 678dbcac..62e9ed47 100644 --- a/lib/librte_eal/common/eal_common_dev.c +++ b/lib/librte_eal/common/eal_common_dev.c @@ -19,8 +19,10 @@ #include #include #include +#include #include "eal_private.h" +#include "hotplug_mp.h" /** * The device event callback description. @@ -74,119 +76,110 @@ static int cmp_dev_name(const struct rte_device *dev, const void *_name) return strcmp(dev->name, name); } -int rte_eal_dev_attach(const char *name, const char *devargs) +int __rte_experimental +rte_dev_is_probed(const struct rte_device *dev) { - struct rte_bus *bus; + /* The field driver should be set only when the probe is successful. */ + return dev->driver != NULL; +} - if (name == NULL || devargs == NULL) { - RTE_LOG(ERR, EAL, "Invalid device or arguments provided\n"); +/* helper function to build devargs, caller should free the memory */ +static int +build_devargs(const char *busname, const char *devname, + const char *drvargs, char **devargs) +{ + int length; + + length = snprintf(NULL, 0, "%s:%s,%s", busname, devname, drvargs); + if (length < 0) return -EINVAL; - } - bus = rte_bus_find_by_device_name(name); - if (bus == NULL) { - RTE_LOG(ERR, EAL, "Unable to find a bus for the device '%s'\n", - name); + *devargs = malloc(length + 1); + if (*devargs == NULL) + return -ENOMEM; + + length = snprintf(*devargs, length + 1, "%s:%s,%s", + busname, devname, drvargs); + if (length < 0) { + free(*devargs); return -EINVAL; } - if (strcmp(bus->name, "pci") == 0 || strcmp(bus->name, "vdev") == 0) - return rte_eal_hotplug_add(bus->name, name, devargs); - - RTE_LOG(ERR, EAL, - "Device attach is only supported for PCI and vdev devices.\n"); - return -ENOTSUP; + return 0; } -int rte_eal_dev_detach(struct rte_device *dev) +int +rte_eal_hotplug_add(const char *busname, const char *devname, + const char *drvargs) { - struct rte_bus *bus; - int ret; - if (dev == NULL) { - RTE_LOG(ERR, EAL, "Invalid device provided.\n"); - return -EINVAL; - } + char *devargs; + int ret; - bus = rte_bus_find_by_device(dev); - if (bus == NULL) { - RTE_LOG(ERR, EAL, "Cannot find bus for device (%s)\n", - dev->name); - return -EINVAL; - } + ret = build_devargs(busname, devname, drvargs, &devargs); + if (ret != 0) + return ret; - if (bus->unplug == NULL) { - RTE_LOG(ERR, EAL, "Bus function not supported\n"); - return -ENOTSUP; - } + ret = rte_dev_probe(devargs); + free(devargs); - ret = bus->unplug(dev); - if (ret) - RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n", - dev->name); return ret; } -int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devname, - const char *devargs) +/* probe device at local process. 
*/ +int +local_dev_probe(const char *devargs, struct rte_device **new_dev) { - struct rte_bus *bus; struct rte_device *dev; struct rte_devargs *da; int ret; - bus = rte_bus_find_by_name(busname); - if (bus == NULL) { - RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", busname); - return -ENOENT; - } - - if (bus->plug == NULL) { - RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n", - bus->name); - return -ENOTSUP; - } - + *new_dev = NULL; da = calloc(1, sizeof(*da)); if (da == NULL) return -ENOMEM; - ret = rte_devargs_parsef(da, "%s:%s,%s", - busname, devname, devargs); + ret = rte_devargs_parse(da, devargs); if (ret) goto err_devarg; + if (da->bus->plug == NULL) { + RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n", + da->bus->name); + ret = -ENOTSUP; + goto err_devarg; + } + ret = rte_devargs_insert(da); if (ret) goto err_devarg; - ret = bus->scan(); + ret = da->bus->scan(); if (ret) goto err_devarg; - dev = bus->find_device(NULL, cmp_dev_name, devname); + dev = da->bus->find_device(NULL, cmp_dev_name, da->name); if (dev == NULL) { RTE_LOG(ERR, EAL, "Cannot find device (%s)\n", - devname); + da->name); ret = -ENODEV; goto err_devarg; } - if (dev->driver != NULL) { - RTE_LOG(ERR, EAL, "Device is already plugged\n"); - return -EEXIST; - } - - ret = bus->plug(dev); + ret = dev->bus->plug(dev); if (ret) { + if (rte_dev_is_probed(dev)) /* if already succeeded earlier */ + return ret; /* no rollback */ RTE_LOG(ERR, EAL, "Driver cannot attach the device (%s)\n", dev->name); goto err_devarg; } + + *new_dev = dev; return 0; err_devarg: - if (rte_devargs_remove(busname, devname)) { + if (rte_devargs_remove(da) != 0) { free(da->args); free(da); } @@ -194,40 +187,235 @@ err_devarg: } int __rte_experimental -rte_eal_hotplug_remove(const char *busname, const char *devname) +rte_dev_probe(const char *devargs) { - struct rte_bus *bus; + struct eal_dev_mp_req req; struct rte_device *dev; int ret; + memset(&req, 0, sizeof(req)); + req.t = EAL_DEV_REQ_TYPE_ATTACH; + strlcpy(req.devargs, devargs, EAL_DEV_MP_DEV_ARGS_MAX_LEN); + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) { + /** + * If in secondary process, just send IPC request to + * primary process. + */ + ret = eal_dev_hotplug_request_to_primary(&req); + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send hotplug request to primary\n"); + return -ENOMSG; + } + if (req.result != 0) + RTE_LOG(ERR, EAL, + "Failed to hotplug add device\n"); + return req.result; + } + + /* attach a shared device from primary start from here: */ + + /* primary attach the new device itself. */ + ret = local_dev_probe(devargs, &dev); + + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to attach device on primary process\n"); + + /** + * it is possible that secondary process failed to attached a + * device that primary process have during initialization, + * so for -EEXIST case, we still need to sync with secondary + * process. + */ + if (ret != -EEXIST) + return ret; + } + + /* primary send attach sync request to secondary. */ + ret = eal_dev_hotplug_request_to_secondary(&req); + + /* if any communication error, we need to rollback. */ + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send hotplug add request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + + /** + * if any secondary failed to attach, we need to consider if rollback + * is necessary. + */ + if (req.result != 0) { + RTE_LOG(ERR, EAL, + "Failed to attach device on secondary process\n"); + ret = req.result; + + /* for -EEXIST, we don't need to rollback. 
*/ + if (ret == -EEXIST) + return ret; + goto rollback; + } + + return 0; + +rollback: + req.t = EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK; + + /* primary send rollback request to secondary. */ + if (eal_dev_hotplug_request_to_secondary(&req) != 0) + RTE_LOG(WARNING, EAL, + "Failed to rollback device attach on secondary." + "Devices in secondary may not sync with primary\n"); + + /* primary rollback itself. */ + if (local_dev_remove(dev) != 0) + RTE_LOG(WARNING, EAL, + "Failed to rollback device attach on primary." + "Devices in secondary may not sync with primary\n"); + + return ret; +} + +int +rte_eal_hotplug_remove(const char *busname, const char *devname) +{ + struct rte_device *dev; + struct rte_bus *bus; + bus = rte_bus_find_by_name(busname); if (bus == NULL) { RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", busname); return -ENOENT; } - if (bus->unplug == NULL) { - RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n", - bus->name); - return -ENOTSUP; - } - dev = bus->find_device(NULL, cmp_dev_name, devname); if (dev == NULL) { RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", devname); return -EINVAL; } - if (dev->driver == NULL) { - RTE_LOG(ERR, EAL, "Device is already unplugged\n"); - return -ENOENT; + return rte_dev_remove(dev); +} + +/* remove device at local process. */ +int +local_dev_remove(struct rte_device *dev) +{ + int ret; + + if (dev->bus->unplug == NULL) { + RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n", + dev->bus->name); + return -ENOTSUP; } - ret = bus->unplug(dev); - if (ret) + ret = dev->bus->unplug(dev); + if (ret) { RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n", dev->name); - rte_devargs_remove(busname, devname); + return ret; + } + + return 0; +} + +int __rte_experimental +rte_dev_remove(struct rte_device *dev) +{ + struct eal_dev_mp_req req; + char *devargs; + int ret; + + if (!rte_dev_is_probed(dev)) { + RTE_LOG(ERR, EAL, "Device is not probed\n"); + return -ENOENT; + } + + ret = build_devargs(dev->bus->name, dev->name, "", &devargs); + if (ret != 0) + return ret; + + memset(&req, 0, sizeof(req)); + req.t = EAL_DEV_REQ_TYPE_DETACH; + strlcpy(req.devargs, devargs, EAL_DEV_MP_DEV_ARGS_MAX_LEN); + free(devargs); + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) { + /** + * If in secondary process, just send IPC request to + * primary process. + */ + ret = eal_dev_hotplug_request_to_primary(&req); + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send hotplug request to primary\n"); + return -ENOMSG; + } + if (req.result != 0) + RTE_LOG(ERR, EAL, + "Failed to hotplug remove device\n"); + return req.result; + } + + /* detach a device from primary start from here: */ + + /* primary send detach sync request to secondary */ + ret = eal_dev_hotplug_request_to_secondary(&req); + + /** + * if communication error, we need to rollback, because it is possible + * part of the secondary processes still detached it successfully. + */ + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send device detach request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + + /** + * if any secondary failed to detach, we need to consider if rollback + * is necessary. + */ + if (req.result != 0) { + RTE_LOG(ERR, EAL, + "Failed to detach device on secondary process\n"); + ret = req.result; + /** + * if -ENOENT, we don't need to rollback, since devices is + * already detached on secondary process. + */ + if (ret != -ENOENT) + goto rollback; + } + + /* primary detach the device itself. 
*/ + ret = local_dev_remove(dev); + + /* if primary failed, still need to consider if rollback is necessary */ + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to detach device on primary process\n"); + /* if -ENOENT, we don't need to rollback */ + if (ret == -ENOENT) + return ret; + goto rollback; + } + + return 0; + +rollback: + req.t = EAL_DEV_REQ_TYPE_DETACH_ROLLBACK; + + /* primary send rollback request to secondary. */ + if (eal_dev_hotplug_request_to_secondary(&req) != 0) + RTE_LOG(WARNING, EAL, + "Failed to rollback device detach on secondary." + "Devices in secondary may not sync with primary\n"); + return ret; } @@ -342,8 +530,9 @@ rte_dev_event_callback_unregister(const char *device_name, return ret; } -void -dev_callback_process(char *device_name, enum rte_dev_event_type event) +void __rte_experimental +rte_dev_event_callback_process(const char *device_name, + enum rte_dev_event_type event) { struct dev_event_callback *cb_lst; diff --git a/lib/librte_eal/common/eal_common_devargs.c b/lib/librte_eal/common/eal_common_devargs.c index dac2402a..b7b9cb69 100644 --- a/lib/librte_eal/common/eal_common_devargs.c +++ b/lib/librte_eal/common/eal_common_devargs.c @@ -4,9 +4,6 @@ /* This file manages the list of devices and their arguments, as given * by the user at startup - * - * Code here should not call rte_log since the EAL environment - * may not be initialized. */ #include @@ -28,39 +25,9 @@ TAILQ_HEAD(rte_devargs_list, rte_devargs); /** Global list of user devices */ -struct rte_devargs_list devargs_list = +static struct rte_devargs_list devargs_list = TAILQ_HEAD_INITIALIZER(devargs_list); -int -rte_eal_parse_devargs_str(const char *devargs_str, - char **drvname, char **drvargs) -{ - char *sep; - - if ((devargs_str) == NULL || (drvname) == NULL || (drvargs == NULL)) - return -1; - - *drvname = strdup(devargs_str); - if (*drvname == NULL) - return -1; - - /* set the first ',' to '\0' to split name and arguments */ - sep = strchr(*drvname, ','); - if (sep != NULL) { - sep[0] = '\0'; - *drvargs = strdup(sep + 1); - } else { - *drvargs = strdup(""); - } - - if (*drvargs == NULL) { - free(*drvname); - *drvname = NULL; - return -1; - } - return 0; -} - static size_t devargs_layer_count(const char *s) { @@ -270,6 +237,7 @@ rte_devargs_parsef(struct rte_devargs *da, const char *format, ...) va_list ap; size_t len; char *dev; + int ret; if (da == NULL) return -EINVAL; @@ -288,7 +256,10 @@ rte_devargs_parsef(struct rte_devargs *da, const char *format, ...) 
vsnprintf(dev, len + 1, format, ap); va_end(ap); - return rte_devargs_parse(da, dev); + ret = rte_devargs_parse(da, dev); + + free(dev); + return ret; } int __rte_experimental @@ -296,7 +267,7 @@ rte_devargs_insert(struct rte_devargs *da) { int ret; - ret = rte_devargs_remove(da->bus->name, da->name); + ret = rte_devargs_remove(da); if (ret < 0) return ret; TAILQ_INSERT_TAIL(&devargs_list, da, next); @@ -342,14 +313,17 @@ fail: } int __rte_experimental -rte_devargs_remove(const char *busname, const char *devname) +rte_devargs_remove(struct rte_devargs *devargs) { struct rte_devargs *d; void *tmp; + if (devargs == NULL || devargs->bus == NULL) + return -1; + TAILQ_FOREACH_SAFE(d, &devargs_list, next, tmp) { - if (strcmp(d->bus->name, busname) == 0 && - strcmp(d->name, devname) == 0) { + if (strcmp(d->bus->name, devargs->bus->name) == 0 && + strcmp(d->name, devargs->name) == 0) { TAILQ_REMOVE(&devargs_list, d, next); free(d->args); free(d); diff --git a/lib/librte_eal/common/eal_common_fbarray.c b/lib/librte_eal/common/eal_common_fbarray.c index 43caf3ce..ea0735cb 100644 --- a/lib/librte_eal/common/eal_common_fbarray.c +++ b/lib/librte_eal/common/eal_common_fbarray.c @@ -2,6 +2,7 @@ * Copyright(c) 2017-2018 Intel Corporation */ +#include #include #include #include @@ -878,6 +879,10 @@ rte_fbarray_destroy(struct rte_fbarray *arr) if (ret) return ret; + /* with no shconf, there were never any files to begin with */ + if (internal_config.no_shconf) + return 0; + /* try deleting the file */ eal_get_fbarray_path(path, sizeof(path), arr->name); diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c index fbfb1b05..12dcedf5 100644 --- a/lib/librte_eal/common/eal_common_memory.c +++ b/lib/librte_eal/common/eal_common_memory.c @@ -2,6 +2,7 @@ * Copyright(c) 2010-2014 Intel Corporation */ +#include #include #include #include @@ -37,6 +38,23 @@ static void *next_baseaddr; static uint64_t system_page_sz; +#ifdef RTE_ARCH_64 +/* + * Linux kernel uses a really high address as starting address for serving + * mmaps calls. If there exists addressing limitations and IOVA mode is VA, + * this starting address is likely too high for those devices. However, it + * is possible to use a lower address in the process virtual address space + * as with 64 bits there is a lot of available space. + * + * Current known limitations are 39 or 40 bits. Setting the starting address + * at 4GB implies there are 508GB or 1020GB for mapping the available + * hugepages. This is likely enough for most systems, although a device with + * addressing limitations should call rte_eal_check_dma_mask for ensuring all + * memory is within supported range. 
+ */ +static uint64_t baseaddr = 0x100000000; +#endif + void * eal_get_virtual_area(void *requested_addr, size_t *size, size_t page_sz, int flags, int mmap_flags) @@ -60,6 +78,11 @@ eal_get_virtual_area(void *requested_addr, size_t *size, rte_eal_process_type() == RTE_PROC_PRIMARY) next_baseaddr = (void *) internal_config.base_virtaddr; +#ifdef RTE_ARCH_64 + if (next_baseaddr == NULL && internal_config.base_virtaddr == 0 && + rte_eal_process_type() == RTE_PROC_PRIMARY) + next_baseaddr = (void *) baseaddr; +#endif if (requested_addr == NULL && next_baseaddr != NULL) { requested_addr = next_baseaddr; requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz); @@ -91,7 +114,17 @@ eal_get_virtual_area(void *requested_addr, size_t *size, mmap_flags, -1, 0); if (mapped_addr == MAP_FAILED && allow_shrink) *size -= page_sz; - } while (allow_shrink && mapped_addr == MAP_FAILED && *size > 0); + + if (mapped_addr != MAP_FAILED && addr_is_hint && + mapped_addr != requested_addr) { + /* hint was not used. Try with another offset */ + munmap(mapped_addr, map_sz); + mapped_addr = MAP_FAILED; + next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz); + requested_addr = next_baseaddr; + } + } while ((allow_shrink || addr_is_hint) && + mapped_addr == MAP_FAILED && *size > 0); /* align resulting address - if map failed, we will ignore the value * anyway, so no need to add additional checks. @@ -171,7 +204,7 @@ virt2memseg(const void *addr, const struct rte_memseg_list *msl) /* a memseg list was specified, check if it's the right one */ start = msl->base_va; - end = RTE_PTR_ADD(start, (size_t)msl->page_sz * msl->memseg_arr.len); + end = RTE_PTR_ADD(start, msl->len); if (addr < start || addr >= end) return NULL; @@ -194,8 +227,7 @@ virt2memseg_list(const void *addr) msl = &mcfg->memsegs[msl_idx]; start = msl->base_va; - end = RTE_PTR_ADD(start, - (size_t)msl->page_sz * msl->memseg_arr.len); + end = RTE_PTR_ADD(start, msl->len); if (addr >= start && addr < end) break; } @@ -273,6 +305,9 @@ physmem_size(const struct rte_memseg_list *msl, void *arg) { uint64_t *total_len = arg; + if (msl->external) + return 0; + *total_len += msl->memseg_arr.count * msl->page_sz; return 0; @@ -294,7 +329,7 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms, void *arg) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int msl_idx, ms_idx; + int msl_idx, ms_idx, fd; FILE *f = arg; msl_idx = msl - mcfg->memsegs; @@ -305,10 +340,11 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms, if (ms_idx < 0) return -1; + fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx); fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, " "virt:%p, socket_id:%"PRId32", " "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", " - "nrank:%"PRIx32"\n", + "nrank:%"PRIx32" fd:%i\n", msl_idx, ms_idx, ms->iova, ms->len, @@ -316,7 +352,8 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms, ms->socket_id, ms->hugepage_sz, ms->nchannel, - ms->nrank); + ms->nrank, + fd); return 0; } @@ -383,6 +420,66 @@ rte_dump_physmem_layout(FILE *f) rte_memseg_walk(dump_memseg, f); } +static int +check_iova(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + uint64_t *mask = arg; + rte_iova_t iova; + + /* higher address within segment */ + iova = (ms->iova + ms->len) - 1; + if (!(iova & *mask)) + return 0; + + RTE_LOG(DEBUG, EAL, "memseg iova %"PRIx64", len %zx, out of range\n", + ms->iova, ms->len); + + RTE_LOG(DEBUG, EAL, "\tusing dma mask %"PRIx64"\n", 
*mask); + return 1; +} + +#if defined(RTE_ARCH_64) +#define MAX_DMA_MASK_BITS 63 +#else +#define MAX_DMA_MASK_BITS 31 +#endif + +/* check memseg iovas are within the required range based on dma mask */ +int __rte_experimental +rte_eal_check_dma_mask(uint8_t maskbits) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + uint64_t mask; + + /* sanity check */ + if (maskbits > MAX_DMA_MASK_BITS) { + RTE_LOG(ERR, EAL, "wrong dma mask size %u (Max: %u)\n", + maskbits, MAX_DMA_MASK_BITS); + return -1; + } + + /* create dma mask */ + mask = ~((1ULL << maskbits) - 1); + + if (rte_memseg_walk(check_iova, &mask)) + /* + * Dma mask precludes hugepage usage. + * This device can not be used and we do not need to keep + * the dma mask. + */ + return 1; + + /* + * we need to keep the more restricted maskbit for checking + * potential dynamic memory allocation in the future. + */ + mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits : + RTE_MIN(mcfg->dma_maskbits, maskbits); + + return 0; +} + /* return the number of memory channels */ unsigned rte_memory_get_nchannel(void) { @@ -548,6 +645,105 @@ rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg) return ret; } +int __rte_experimental +rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + int msl_idx, seg_idx, ret; + + if (ms == NULL) { + rte_errno = EINVAL; + return -1; + } + + msl = rte_mem_virt2memseg_list(ms->addr); + if (msl == NULL) { + rte_errno = EINVAL; + return -1; + } + arr = &msl->memseg_arr; + + msl_idx = msl - mcfg->memsegs; + seg_idx = rte_fbarray_find_idx(arr, ms); + + if (!rte_fbarray_is_used(arr, seg_idx)) { + rte_errno = ENOENT; + return -1; + } + + ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx); + if (ret < 0) { + rte_errno = -ret; + ret = -1; + } + return ret; +} + +int __rte_experimental +rte_memseg_get_fd(const struct rte_memseg *ms) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int ret; + + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + ret = rte_memseg_get_fd_thread_unsafe(ms); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int __rte_experimental +rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms, + size_t *offset) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + int msl_idx, seg_idx, ret; + + if (ms == NULL || offset == NULL) { + rte_errno = EINVAL; + return -1; + } + + msl = rte_mem_virt2memseg_list(ms->addr); + if (msl == NULL) { + rte_errno = EINVAL; + return -1; + } + arr = &msl->memseg_arr; + + msl_idx = msl - mcfg->memsegs; + seg_idx = rte_fbarray_find_idx(arr, ms); + + if (!rte_fbarray_is_used(arr, seg_idx)) { + rte_errno = ENOENT; + return -1; + } + + ret = eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset); + if (ret < 0) { + rte_errno = -ret; + ret = -1; + } + return ret; +} + +int __rte_experimental +rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int ret; + + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + /* init memory subsystem */ int rte_eal_memory_init(void) diff --git a/lib/librte_eal/common/eal_common_memzone.c 
b/lib/librte_eal/common/eal_common_memzone.c index 7300fe05..b7081afb 100644 --- a/lib/librte_eal/common/eal_common_memzone.c +++ b/lib/librte_eal/common/eal_common_memzone.c @@ -120,13 +120,15 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, return NULL; } - if ((socket_id != SOCKET_ID_ANY) && - (socket_id >= RTE_MAX_NUMA_NODES || socket_id < 0)) { + if ((socket_id != SOCKET_ID_ANY) && socket_id < 0) { rte_errno = EINVAL; return NULL; } - if (!rte_eal_has_hugepages()) + /* only set socket to SOCKET_ID_ANY if we aren't allocating for an + * external heap. + */ + if (!rte_eal_has_hugepages() && socket_id < RTE_MAX_NUMA_NODES) socket_id = SOCKET_ID_ANY; contig = (flags & RTE_MEMZONE_IOVA_CONTIG) != 0; diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c index dd5f9740..b82f3ddd 100644 --- a/lib/librte_eal/common/eal_common_options.c +++ b/lib/librte_eal/common/eal_common_options.c @@ -58,6 +58,7 @@ eal_long_options[] = { {OPT_HELP, 0, NULL, OPT_HELP_NUM }, {OPT_HUGE_DIR, 1, NULL, OPT_HUGE_DIR_NUM }, {OPT_HUGE_UNLINK, 0, NULL, OPT_HUGE_UNLINK_NUM }, + {OPT_IOVA_MODE, 1, NULL, OPT_IOVA_MODE_NUM }, {OPT_LCORES, 1, NULL, OPT_LCORES_NUM }, {OPT_LOG_LEVEL, 1, NULL, OPT_LOG_LEVEL_NUM }, {OPT_MASTER_LCORE, 1, NULL, OPT_MASTER_LCORE_NUM }, @@ -205,6 +206,7 @@ eal_reset_internal_config(struct internal_config *internal_cfg) #endif internal_cfg->vmware_tsc_map = 0; internal_cfg->create_uio_dev = 0; + internal_cfg->iova_mode = RTE_IOVA_DC; internal_cfg->user_mbuf_pool_ops_name = NULL; internal_cfg->init_complete = 0; } @@ -1075,6 +1077,25 @@ eal_parse_proc_type(const char *arg) return RTE_PROC_INVALID; } +static int +eal_parse_iova_mode(const char *name) +{ + int mode; + + if (name == NULL) + return -1; + + if (!strcmp("pa", name)) + mode = RTE_IOVA_PA; + else if (!strcmp("va", name)) + mode = RTE_IOVA_VA; + else + return -1; + + internal_config.iova_mode = mode; + return 0; +} + int eal_parse_common_option(int opt, const char *optarg, struct internal_config *conf) @@ -1281,6 +1302,13 @@ eal_parse_common_option(int opt, const char *optarg, case OPT_SINGLE_FILE_SEGMENTS_NUM: conf->single_file_segments = 1; break; + case OPT_IOVA_MODE_NUM: + if (eal_parse_iova_mode(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_IOVA_MODE "\n"); + return -1; + } + break; /* don't know what to do, leave this to caller */ default: @@ -1384,10 +1412,16 @@ eal_check_common_options(struct internal_config *internal_cfg) " is only supported in non-legacy memory mode\n"); } if (internal_cfg->single_file_segments && - internal_cfg->hugepage_unlink) { + internal_cfg->hugepage_unlink && + !internal_cfg->in_memory) { RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE_SEGMENTS" is " - "not compatible with neither --"OPT_IN_MEMORY" nor " - "--"OPT_HUGE_UNLINK"\n"); + "not compatible with --"OPT_HUGE_UNLINK"\n"); + return -1; + } + if (internal_cfg->legacy_mem && + internal_cfg->in_memory) { + RTE_LOG(ERR, EAL, "Option --"OPT_LEGACY_MEM" is not compatible " + "with --"OPT_IN_MEMORY"\n"); return -1; } @@ -1428,6 +1462,8 @@ eal_common_usage(void) " --"OPT_VDEV" Add a virtual device.\n" " The argument format is [,key=val,...]\n" " (ex: --vdev=net_pcap0,iface=eth2).\n" + " --"OPT_IOVA_MODE" Set IOVA mode. 
'pa' for IOVA_PA\n" + " 'va' for IOVA_VA\n" " -d LIB.so|DIR Add a driver or driver directory\n" " (can be used multiple times)\n" " --"OPT_VMWARE_TSC_MAP" Use VMware TSC map instead of native RDTSC\n" diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c index 9fcb9121..97663d3b 100644 --- a/lib/librte_eal/common/eal_common_proc.c +++ b/lib/librte_eal/common/eal_common_proc.c @@ -939,13 +939,17 @@ rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply, if (check_input(req) == false) return -1; + reply->nb_sent = 0; + reply->nb_received = 0; + reply->msgs = NULL; + if (internal_config.no_shconf) { RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); return 0; } if (gettimeofday(&now, NULL) < 0) { - RTE_LOG(ERR, EAL, "Faile to get current time\n"); + RTE_LOG(ERR, EAL, "Failed to get current time\n"); rte_errno = errno; return -1; } @@ -954,10 +958,6 @@ rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply, end.tv_sec = now.tv_sec + ts->tv_sec + (now.tv_usec * 1000 + ts->tv_nsec) / 1000000000; - reply->nb_sent = 0; - reply->nb_received = 0; - reply->msgs = NULL; - /* for secondary process, send request to the primary process only */ if (rte_eal_process_type() == RTE_PROC_SECONDARY) { pthread_mutex_lock(&pending_requests.lock); diff --git a/lib/librte_eal/common/eal_common_string_fns.c b/lib/librte_eal/common/eal_common_string_fns.c index 6ac5f828..60c5dd66 100644 --- a/lib/librte_eal/common/eal_common_string_fns.c +++ b/lib/librte_eal/common/eal_common_string_fns.c @@ -38,3 +38,29 @@ einval_error: errno = EINVAL; return -1; } + +/* Copy src string into dst. + * + * Return negative value and NUL-terminate if dst is too short, + * Otherwise return number of bytes copied. + */ +ssize_t +rte_strscpy(char *dst, const char *src, size_t dsize) +{ + size_t nleft = dsize; + size_t res = 0; + + /* Copy as many bytes as will fit. */ + while (nleft != 0) { + dst[res] = src[res]; + if (src[res] == '\0') + return res; + res++; + nleft--; + } + + /* Not enough room in dst, set NUL and return error. */ + if (res != 0) + dst[res - 1] = '\0'; + return -E2BIG; +} diff --git a/lib/librte_eal/common/eal_common_timer.c b/lib/librte_eal/common/eal_common_timer.c index 2e2b770f..dcf26bfe 100644 --- a/lib/librte_eal/common/eal_common_timer.c +++ b/lib/librte_eal/common/eal_common_timer.c @@ -7,9 +7,11 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -31,6 +33,28 @@ rte_delay_us_block(unsigned int us) rte_pause(); } +void __rte_experimental +rte_delay_us_sleep(unsigned int us) +{ + struct timespec wait[2]; + int ind = 0; + + wait[0].tv_sec = 0; + if (us >= US_PER_S) { + wait[0].tv_sec = us / US_PER_S; + us -= wait[0].tv_sec * US_PER_S; + } + wait[0].tv_nsec = 1000 * us; + + while (nanosleep(&wait[ind], &wait[1 - ind]) && errno == EINTR) { + /* + * Sleep was interrupted. Flip the index, so the 'remainder' + * will become the 'request' for a next call. 
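A small usage sketch (the wrapper name below is hypothetical): callers that do not need busy-waiting can call the new sleeping delay directly, or install it as the default delay implementation through the existing rte_delay_us_callback_register():

#include <rte_cycles.h>

/* Sketch: make EAL delays yield the CPU instead of spinning. */
static void
example_use_sleeping_delays(void)
{
	rte_delay_us_callback_register(rte_delay_us_sleep);
	rte_delay_us(1500);	/* now sleeps ~1.5 ms rather than busy-waiting */
}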
+ */ + ind = 1 - ind; + } +} + uint64_t rte_get_tsc_hz(void) { diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h index de05febf..b3e8ae5e 100644 --- a/lib/librte_eal/common/eal_filesystem.h +++ b/lib/librte_eal/common/eal_filesystem.h @@ -27,7 +27,7 @@ eal_create_runtime_dir(void); /* returns runtime dir */ const char * -eal_get_runtime_dir(void); +rte_eal_get_runtime_dir(void); #define RUNTIME_CONFIG_FNAME "config" static inline const char * @@ -35,7 +35,7 @@ eal_runtime_config_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(), RUNTIME_CONFIG_FNAME); return buffer; } @@ -47,7 +47,7 @@ eal_mp_socket_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(), MP_SOCKET_FNAME); return buffer; } @@ -55,7 +55,8 @@ eal_mp_socket_path(void) #define FBARRAY_NAME_FMT "%s/fbarray_%s" static inline const char * eal_get_fbarray_path(char *buffer, size_t buflen, const char *name) { - snprintf(buffer, buflen, FBARRAY_NAME_FMT, eal_get_runtime_dir(), name); + snprintf(buffer, buflen, FBARRAY_NAME_FMT, rte_eal_get_runtime_dir(), + name); return buffer; } @@ -66,7 +67,7 @@ eal_hugepage_info_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(), HUGEPAGE_INFO_FNAME); return buffer; } @@ -78,7 +79,7 @@ eal_hugepage_data_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(), HUGEPAGE_DATA_FNAME); return buffer; } @@ -99,7 +100,7 @@ eal_get_hugefile_path(char *buffer, size_t buflen, const char *hugedir, int f_id static inline const char * eal_get_hugefile_lock_path(char *buffer, size_t buflen, int f_id) { - snprintf(buffer, buflen, HUGEFILE_LOCK_FMT, eal_get_runtime_dir(), + snprintf(buffer, buflen, HUGEFILE_LOCK_FMT, rte_eal_get_runtime_dir(), f_id); buffer[buflen - 1] = '\0'; return buffer; diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h index 00ee6e06..737f17e3 100644 --- a/lib/librte_eal/common/eal_internal_cfg.h +++ b/lib/librte_eal/common/eal_internal_cfg.h @@ -70,6 +70,7 @@ struct internal_config { /**< user defined mbuf pool ops name */ unsigned num_hugepage_sizes; /**< how many sizes on this system */ struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES]; + enum rte_iova_mode iova_mode ; /**< Set IOVA mode on this system */ volatile unsigned int init_complete; /**< indicates whether EAL has completed initialization */ }; diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h index 36bb1a02..af917c2f 100644 --- a/lib/librte_eal/common/eal_memalloc.h +++ b/lib/librte_eal/common/eal_memalloc.h @@ -76,6 +76,17 @@ eal_memalloc_mem_alloc_validator_unregister(const char *name, int socket_id); int eal_memalloc_mem_alloc_validate(int socket_id, size_t new_len); +/* returns fd or -errno */ +int +eal_memalloc_get_seg_fd(int list_idx, int seg_idx); + +/* returns 0 or -errno */ +int +eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd); + +int 
+eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset); + int eal_memalloc_init(void); diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h index 96e16678..5271f944 100644 --- a/lib/librte_eal/common/eal_options.h +++ b/lib/librte_eal/common/eal_options.h @@ -63,6 +63,8 @@ enum { OPT_LEGACY_MEM_NUM, #define OPT_SINGLE_FILE_SEGMENTS "single-file-segments" OPT_SINGLE_FILE_SEGMENTS_NUM, +#define OPT_IOVA_MODE "iova-mode" + OPT_IOVA_MODE_NUM, OPT_LONG_MAX_NUM }; diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h index 4f809a83..442c6dc4 100644 --- a/lib/librte_eal/common/eal_private.h +++ b/lib/librte_eal/common/eal_private.h @@ -258,18 +258,6 @@ struct rte_bus *rte_bus_find_by_device_name(const char *str); int rte_mp_channel_init(void); -/** - * Internal Executes all the user application registered callbacks for - * the specific device. It is for DPDK internal user only. User - * application should not call it directly. - * - * @param device_name - * The device name. - * @param event - * the device event type. - */ -void dev_callback_process(char *device_name, enum rte_dev_event_type event); - /** * @internal * Parse a device string and store its information in an @@ -304,4 +292,82 @@ int rte_devargs_layers_parse(struct rte_devargs *devargs, const char *devstr); +/* + * probe a device at local process. + * + * @param devargs + * Device arguments including bus, class and driver properties. + * @param new_dev + * new device be probed as output. + * @return + * 0 on success, negative on error. + */ +int local_dev_probe(const char *devargs, struct rte_device **new_dev); + +/** + * Hotplug remove a given device from a specific bus at local process. + * + * @param dev + * Data structure of the device to remove. + * @return + * 0 on success, negative on error. + */ +int local_dev_remove(struct rte_device *dev); + +/** + * Iterate over all buses to find the corresponding bus to handle the sigbus + * error. + * @param failure_addr + * Pointer of the fault address of the sigbus error. + * + * @return + * 0 success to handle the sigbus. + * -1 failed to handle the sigbus + * 1 no bus can handler the sigbus + */ +int rte_bus_sigbus_handler(const void *failure_addr); + +/** + * @internal + * Register the sigbus handler. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +dev_sigbus_handler_register(void); + +/** + * @internal + * Unregister the sigbus handler. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +dev_sigbus_handler_unregister(void); + +/** + * Check if the option is registered. + * + * @param option + * The option to be parsed. + * + * @return + * 0 on success + * @return + * -1 on fail + */ +int +rte_option_parse(const char *opt); + +/** + * Iterate through the registered options and execute the associated + * callback if enabled. 
+ */ +void +rte_option_init(void); + #endif /* _EAL_PRIVATE_H_ */ diff --git a/lib/librte_eal/common/hotplug_mp.c b/lib/librte_eal/common/hotplug_mp.c new file mode 100644 index 00000000..84f59d95 --- /dev/null +++ b/lib/librte_eal/common/hotplug_mp.c @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ +#include + +#include +#include +#include +#include + +#include "hotplug_mp.h" +#include "eal_private.h" + +#define MP_TIMEOUT_S 5 /**< 5 seconds timeouts */ + +struct mp_reply_bundle { + struct rte_mp_msg msg; + void *peer; +}; + +static int cmp_dev_name(const struct rte_device *dev, const void *_name) +{ + const char *name = _name; + + return strcmp(dev->name, name); +} + +/** + * Secondary to primary request. + * start from function eal_dev_hotplug_request_to_primary. + * + * device attach on secondary: + * a) secondary send sync request to the primary. + * b) primary receive the request and attach the new device if + * failed goto i). + * c) primary forward attach sync request to all secondary. + * d) secondary receive the request and attach the device and send a reply. + * e) primary check the reply if all success goes to j). + * f) primary send attach rollback sync request to all secondary. + * g) secondary receive the request and detach the device and send a reply. + * h) primary receive the reply and detach device as rollback action. + * i) send attach fail to secondary as a reply of step a), goto k). + * j) send attach success to secondary as a reply of step a). + * k) secondary receive reply and return. + * + * device detach on secondary: + * a) secondary send sync request to the primary. + * b) primary send detach sync request to all secondary. + * c) secondary detach the device and send a reply. + * d) primary check the reply if all success goes to g). + * e) primary send detach rollback sync request to all secondary. + * f) secondary receive the request and attach back device. goto h). + * g) primary detach the device if success goto i), else goto e). + * h) primary send detach fail to secondary as a reply of step a), goto j). + * i) primary send detach success to secondary as a reply of step a). + * j) secondary receive reply and return. 
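To make the attach flow above concrete, here is a rough sketch of step a) as a secondary process would perform it (illustration only; applications use rte_dev_probe()/rte_dev_remove(), which wrap this internally):

#include <string.h>
#include <rte_string_fns.h>

#include "hotplug_mp.h"

/* Sketch of step a): pack an attach request and send it synchronously
 * to the primary, which then drives steps b)..j) described above. */
static int
example_secondary_attach(const char *devargs)
{
	struct eal_dev_mp_req req;

	memset(&req, 0, sizeof(req));
	req.t = EAL_DEV_REQ_TYPE_ATTACH;
	strlcpy(req.devargs, devargs, sizeof(req.devargs));

	if (eal_dev_hotplug_request_to_primary(&req) != 0)
		return -1;
	return req.result;	/* 0 only if every process attached the device */
}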
+ */ + +static int +send_response_to_secondary(const struct eal_dev_mp_req *req, + int result, + const void *peer) +{ + struct rte_mp_msg mp_resp; + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_resp.param; + int ret; + + memset(&mp_resp, 0, sizeof(mp_resp)); + mp_resp.len_param = sizeof(*resp); + strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name)); + memcpy(resp, req, sizeof(*req)); + resp->result = result; + + ret = rte_mp_reply(&mp_resp, peer); + if (ret != 0) + RTE_LOG(ERR, EAL, "failed to send response to secondary\n"); + + return ret; +} + +static void +__handle_secondary_request(void *param) +{ + struct mp_reply_bundle *bundle = param; + const struct rte_mp_msg *msg = &bundle->msg; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + struct eal_dev_mp_req tmp_req; + struct rte_devargs *da; + struct rte_device *dev; + struct rte_bus *bus; + int ret = 0; + + tmp_req = *req; + + if (req->t == EAL_DEV_REQ_TYPE_ATTACH) { + ret = local_dev_probe(req->devargs, &dev); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to hotplug add device on primary\n"); + if (ret != -EEXIST) + goto finish; + } + ret = eal_dev_hotplug_request_to_secondary(&tmp_req); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to send hotplug request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + if (tmp_req.result != 0) { + ret = tmp_req.result; + RTE_LOG(ERR, EAL, "Failed to hotplug add device on secondary\n"); + if (ret != -EEXIST) + goto rollback; + } + } else if (req->t == EAL_DEV_REQ_TYPE_DETACH) { + da = calloc(1, sizeof(*da)); + if (da == NULL) { + ret = -ENOMEM; + goto finish; + } + + ret = rte_devargs_parse(da, req->devargs); + if (ret != 0) + goto finish; + + ret = eal_dev_hotplug_request_to_secondary(&tmp_req); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to send hotplug request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + + bus = rte_bus_find_by_name(da->bus->name); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", da->bus->name); + ret = -ENOENT; + goto finish; + } + + dev = bus->find_device(NULL, cmp_dev_name, da->name); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", da->name); + ret = -ENOENT; + goto finish; + } + + if (tmp_req.result != 0) { + RTE_LOG(ERR, EAL, "Failed to hotplug remove device on secondary\n"); + ret = tmp_req.result; + if (ret != -ENOENT) + goto rollback; + } + + ret = local_dev_remove(dev); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to hotplug remove device on primary\n"); + if (ret != -ENOENT) + goto rollback; + } + } else { + RTE_LOG(ERR, EAL, "unsupported secondary to primary request\n"); + ret = -ENOTSUP; + } + goto finish; + +rollback: + if (req->t == EAL_DEV_REQ_TYPE_ATTACH) { + tmp_req.t = EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK; + eal_dev_hotplug_request_to_secondary(&tmp_req); + local_dev_remove(dev); + } else { + tmp_req.t = EAL_DEV_REQ_TYPE_DETACH_ROLLBACK; + eal_dev_hotplug_request_to_secondary(&tmp_req); + } + +finish: + ret = send_response_to_secondary(&tmp_req, ret, bundle->peer); + if (ret) + RTE_LOG(ERR, EAL, "failed to send response to secondary\n"); + + free(bundle->peer); + free(bundle); +} + +static int +handle_secondary_request(const struct rte_mp_msg *msg, const void *peer) +{ + struct mp_reply_bundle *bundle; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + int ret = 0; + + bundle = malloc(sizeof(*bundle)); + if (bundle == NULL) { + RTE_LOG(ERR, EAL, "not enough memory\n"); + return 
send_response_to_secondary(req, -ENOMEM, peer); + } + + bundle->msg = *msg; + /** + * We need to send reply on interrupt thread, but peer can't be + * parsed directly, so this is a temporal hack, need to be fixed + * when it is ready. + */ + bundle->peer = strdup(peer); + + /** + * We are at IPC callback thread, sync IPC is not allowed due to + * dead lock, so we delegate the task to interrupt thread. + */ + ret = rte_eal_alarm_set(1, __handle_secondary_request, bundle); + if (ret != 0) { + RTE_LOG(ERR, EAL, "failed to add mp task\n"); + return send_response_to_secondary(req, ret, peer); + } + return 0; +} + +static void __handle_primary_request(void *param) +{ + struct mp_reply_bundle *bundle = param; + struct rte_mp_msg *msg = &bundle->msg; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + struct rte_mp_msg mp_resp; + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_resp.param; + struct rte_devargs *da; + struct rte_device *dev; + struct rte_bus *bus; + int ret = 0; + + memset(&mp_resp, 0, sizeof(mp_resp)); + + switch (req->t) { + case EAL_DEV_REQ_TYPE_ATTACH: + case EAL_DEV_REQ_TYPE_DETACH_ROLLBACK: + ret = local_dev_probe(req->devargs, &dev); + break; + case EAL_DEV_REQ_TYPE_DETACH: + case EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK: + da = calloc(1, sizeof(*da)); + if (da == NULL) { + ret = -ENOMEM; + goto quit; + } + + ret = rte_devargs_parse(da, req->devargs); + if (ret != 0) + goto quit; + + bus = rte_bus_find_by_name(da->bus->name); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", da->bus->name); + ret = -ENOENT; + goto quit; + } + + dev = bus->find_device(NULL, cmp_dev_name, da->name); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", da->name); + ret = -ENOENT; + goto quit; + } + + ret = local_dev_remove(dev); +quit: + break; + default: + ret = -EINVAL; + } + + strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name)); + mp_resp.len_param = sizeof(*req); + memcpy(resp, req, sizeof(*resp)); + resp->result = ret; + if (rte_mp_reply(&mp_resp, bundle->peer) < 0) + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + + free(bundle->peer); + free(bundle); +} + +static int +handle_primary_request(const struct rte_mp_msg *msg, const void *peer) +{ + struct rte_mp_msg mp_resp; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_resp.param; + struct mp_reply_bundle *bundle; + int ret = 0; + + memset(&mp_resp, 0, sizeof(mp_resp)); + strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name)); + mp_resp.len_param = sizeof(*req); + memcpy(resp, req, sizeof(*resp)); + + bundle = calloc(1, sizeof(*bundle)); + if (bundle == NULL) { + resp->result = -ENOMEM; + ret = rte_mp_reply(&mp_resp, peer); + if (ret) + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + return ret; + } + + bundle->msg = *msg; + /** + * We need to send reply on interrupt thread, but peer can't be + * parsed directly, so this is a temporal hack, need to be fixed + * when it is ready. + */ + bundle->peer = (void *)strdup(peer); + + /** + * We are at IPC callback thread, sync IPC is not allowed due to + * dead lock, so we delegate the task to interrupt thread. 
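The deferral pattern used here is the generic EAL alarm API; a minimal standalone sketch (names are hypothetical) of handing work to the interrupt/alarm thread rather than doing synchronous IPC from the IPC callback thread:

#include <rte_alarm.h>
#include <rte_log.h>

static char example_ctx[] = "hotplug request";

static void
example_deferred_task(void *arg)
{
	RTE_LOG(INFO, EAL, "running on the interrupt thread: %s\n",
		(const char *)arg);
}

/* Sketch: schedule the task 1 us from now; doing the sync IPC directly
 * in the callback thread would deadlock, as the comment above notes. */
static int
example_defer_work(void)
{
	return rte_eal_alarm_set(1, example_deferred_task, example_ctx);
}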
+ */ + ret = rte_eal_alarm_set(1, __handle_primary_request, bundle); + if (ret != 0) { + resp->result = ret; + ret = rte_mp_reply(&mp_resp, peer); + if (ret != 0) { + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + return ret; + } + } + return 0; +} + +int eal_dev_hotplug_request_to_primary(struct eal_dev_mp_req *req) +{ + struct rte_mp_msg mp_req; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = MP_TIMEOUT_S, .tv_nsec = 0}; + struct eal_dev_mp_req *resp; + int ret; + + memset(&mp_req, 0, sizeof(mp_req)); + memcpy(mp_req.param, req, sizeof(*req)); + mp_req.len_param = sizeof(*req); + strlcpy(mp_req.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_req.name)); + + ret = rte_mp_request_sync(&mp_req, &mp_reply, &ts); + if (ret || mp_reply.nb_received != 1) { + RTE_LOG(ERR, EAL, "cannot send request to primary"); + if (!ret) + return -1; + return ret; + } + + resp = (struct eal_dev_mp_req *)mp_reply.msgs[0].param; + req->result = resp->result; + + return ret; +} + +int eal_dev_hotplug_request_to_secondary(struct eal_dev_mp_req *req) +{ + struct rte_mp_msg mp_req; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = MP_TIMEOUT_S, .tv_nsec = 0}; + int ret; + int i; + + memset(&mp_req, 0, sizeof(mp_req)); + memcpy(mp_req.param, req, sizeof(*req)); + mp_req.len_param = sizeof(*req); + strlcpy(mp_req.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_req.name)); + + ret = rte_mp_request_sync(&mp_req, &mp_reply, &ts); + if (ret != 0) { + RTE_LOG(ERR, EAL, "rte_mp_request_sync failed\n"); + return ret; + } + + if (mp_reply.nb_sent != mp_reply.nb_received) { + RTE_LOG(ERR, EAL, "not all secondary reply\n"); + return -1; + } + + req->result = 0; + for (i = 0; i < mp_reply.nb_received; i++) { + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_reply.msgs[i].param; + if (resp->result != 0) { + req->result = resp->result; + if (req->t == EAL_DEV_REQ_TYPE_ATTACH && + req->result != -EEXIST) + break; + if (req->t == EAL_DEV_REQ_TYPE_DETACH && + req->result != -ENOENT) + break; + } + } + + return 0; +} + +int rte_mp_dev_hotplug_init(void) +{ + int ret; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + ret = rte_mp_action_register(EAL_DEV_MP_ACTION_REQUEST, + handle_secondary_request); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + EAL_DEV_MP_ACTION_REQUEST); + return ret; + } + } else { + ret = rte_mp_action_register(EAL_DEV_MP_ACTION_REQUEST, + handle_primary_request); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + EAL_DEV_MP_ACTION_REQUEST); + return ret; + } + } + + return 0; +} diff --git a/lib/librte_eal/common/hotplug_mp.h b/lib/librte_eal/common/hotplug_mp.h new file mode 100644 index 00000000..597fde3d --- /dev/null +++ b/lib/librte_eal/common/hotplug_mp.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _HOTPLUG_MP_H_ +#define _HOTPLUG_MP_H_ + +#include "rte_dev.h" +#include "rte_bus.h" + +#define EAL_DEV_MP_ACTION_REQUEST "eal_dev_mp_request" +#define EAL_DEV_MP_ACTION_RESPONSE "eal_dev_mp_response" + +#define EAL_DEV_MP_DEV_NAME_MAX_LEN RTE_DEV_NAME_MAX_LEN +#define EAL_DEV_MP_BUS_NAME_MAX_LEN 32 +#define EAL_DEV_MP_DEV_ARGS_MAX_LEN 128 + +enum eal_dev_req_type { + EAL_DEV_REQ_TYPE_ATTACH, + EAL_DEV_REQ_TYPE_DETACH, + EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK, + EAL_DEV_REQ_TYPE_DETACH_ROLLBACK, +}; + +struct eal_dev_mp_req { + enum eal_dev_req_type t; + char devargs[EAL_DEV_MP_DEV_ARGS_MAX_LEN]; + int result; +}; + +/** + * This is a 
synchronous wrapper for secondary process send + * request to primary process, this is invoked when an attach + * or detach request is issued from primary process. + */ +int eal_dev_hotplug_request_to_primary(struct eal_dev_mp_req *req); + +/** + * this is a synchronous wrapper for primary process send + * request to secondary process, this is invoked when an attach + * or detach request issued from secondary process. + */ +int eal_dev_hotplug_request_to_secondary(struct eal_dev_mp_req *req); + + +#endif /* _HOTPLUG_MP_H_ */ diff --git a/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h b/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h index c4f974fe..859b0974 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h +++ b/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h @@ -29,8 +29,8 @@ extern "C" { #ifndef RTE_ARM_EAL_RDTSC_USE_PMU /** - * This call is easily portable to any ARM architecture, however, - * it may be damn slow and inprecise for some tasks. + * This call is easily portable to any architecture, however, + * it may require a system call and inprecise for some tasks. */ static inline uint64_t __rte_rdtsc_syscall(void) diff --git a/lib/librte_eal/common/include/arch/ppc_64/meson.build b/lib/librte_eal/common/include/arch/ppc_64/meson.build new file mode 100644 index 00000000..00f96117 --- /dev/null +++ b/lib/librte_eal/common/include/arch/ppc_64/meson.build @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Luca Boccassi + +install_headers( + 'rte_atomic.h', + 'rte_byteorder.h', + 'rte_cpuflags.h', + 'rte_cycles.h', + 'rte_io.h', + 'rte_memcpy.h', + 'rte_pause.h', + 'rte_prefetch.h', + 'rte_rwlock.h', + 'rte_spinlock.h', + 'rte_vect.h', + subdir: get_option('include_subdir_arch')) diff --git a/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h b/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h index 8bd83576..16e47ce2 100644 --- a/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h +++ b/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h @@ -9,10 +9,17 @@ extern "C" { #endif +#include "rte_atomic.h" + #include "generic/rte_pause.h" static inline void rte_pause(void) { + /* Set hardware multi-threading low priority */ + asm volatile("or 1,1,1"); + /* Set hardware multi-threading medium priority */ + asm volatile("or 2,2,2"); + rte_compiler_barrier(); } #ifdef __cplusplus diff --git a/lib/librte_eal/common/include/generic/rte_cycles.h b/lib/librte_eal/common/include/generic/rte_cycles.h index 0ff1af50..ac379e87 100644 --- a/lib/librte_eal/common/include/generic/rte_cycles.h +++ b/lib/librte_eal/common/include/generic/rte_cycles.h @@ -13,6 +13,7 @@ */ #include +#include #include #include @@ -157,6 +158,16 @@ rte_delay_ms(unsigned ms) */ void rte_delay_us_block(unsigned int us); +/** + * Delay function that uses system sleep. + * Does not block the CPU core. + * + * @param us + * Number of microseconds to wait. + */ +void __rte_experimental +rte_delay_us_sleep(unsigned int us); + /** * Replace rte_delay_us with user defined function. 
* diff --git a/lib/librte_eal/common/include/rte_bitmap.h b/lib/librte_eal/common/include/rte_bitmap.h index d9facc64..7a36ce73 100644 --- a/lib/librte_eal/common/include/rte_bitmap.h +++ b/lib/librte_eal/common/include/rte_bitmap.h @@ -88,7 +88,7 @@ __rte_bitmap_index1_inc(struct rte_bitmap *bmp) static inline uint64_t __rte_bitmap_mask1_get(struct rte_bitmap *bmp) { - return (~1lu) << bmp->offset1; + return (~1llu) << bmp->offset1; } static inline void @@ -317,7 +317,7 @@ rte_bitmap_get(struct rte_bitmap *bmp, uint32_t pos) index2 = pos >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2; offset2 = pos & RTE_BITMAP_SLAB_BIT_MASK; slab2 = bmp->array2 + index2; - return (*slab2) & (1lu << offset2); + return (*slab2) & (1llu << offset2); } /** @@ -342,8 +342,8 @@ rte_bitmap_set(struct rte_bitmap *bmp, uint32_t pos) slab2 = bmp->array2 + index2; slab1 = bmp->array1 + index1; - *slab2 |= 1lu << offset2; - *slab1 |= 1lu << offset1; + *slab2 |= 1llu << offset2; + *slab1 |= 1llu << offset1; } /** @@ -370,7 +370,7 @@ rte_bitmap_set_slab(struct rte_bitmap *bmp, uint32_t pos, uint64_t slab) slab1 = bmp->array1 + index1; *slab2 |= slab; - *slab1 |= 1lu << offset1; + *slab1 |= 1llu << offset1; } static inline uint64_t @@ -408,7 +408,7 @@ rte_bitmap_clear(struct rte_bitmap *bmp, uint32_t pos) slab2 = bmp->array2 + index2; /* Return if array2 slab is not all-zeros */ - *slab2 &= ~(1lu << offset2); + *slab2 &= ~(1llu << offset2); if (*slab2){ return; } @@ -424,7 +424,7 @@ rte_bitmap_clear(struct rte_bitmap *bmp, uint32_t pos) index1 = pos >> (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 + RTE_BITMAP_CL_BIT_SIZE_LOG2); offset1 = (pos >> RTE_BITMAP_CL_BIT_SIZE_LOG2) & RTE_BITMAP_SLAB_BIT_MASK; slab1 = bmp->array1 + index1; - *slab1 &= ~(1lu << offset1); + *slab1 &= ~(1llu << offset1); return; } diff --git a/lib/librte_eal/common/include/rte_bus.h b/lib/librte_eal/common/include/rte_bus.h index b7b5b084..6be4b5ca 100644 --- a/lib/librte_eal/common/include/rte_bus.h +++ b/lib/librte_eal/common/include/rte_bus.h @@ -167,6 +167,35 @@ typedef int (*rte_bus_unplug_t)(struct rte_device *dev); */ typedef int (*rte_bus_parse_t)(const char *name, void *addr); +/** + * Implement a specific hot-unplug handler, which is responsible for + * handle the failure when device be hot-unplugged. When the event of + * hot-unplug be detected, it could call this function to handle + * the hot-unplug failure and avoid app crash. + * @param dev + * Pointer of the device structure. + * + * @return + * 0 on success. + * !0 on error. + */ +typedef int (*rte_bus_hot_unplug_handler_t)(struct rte_device *dev); + +/** + * Implement a specific sigbus handler, which is responsible for handling + * the sigbus error which is either original memory error, or specific memory + * error that caused of device be hot-unplugged. When sigbus error be captured, + * it could call this function to handle sigbus error. + * @param failure_addr + * Pointer of the fault address of the sigbus error. + * + * @return + * 0 for success handle the sigbus for hot-unplug. + * 1 for not process it, because it is a generic sigbus error. + * -1 for failed to handle the sigbus for hot-unplug. + */ +typedef int (*rte_bus_sigbus_handler_t)(const void *failure_addr); + /** * Bus scan policies */ @@ -212,6 +241,11 @@ struct rte_bus { struct rte_bus_conf conf; /**< Bus configuration */ rte_bus_get_iommu_class_t get_iommu_class; /**< Get iommu class */ rte_dev_iterate_t dev_iterate; /**< Device iterator. 
*/ + rte_bus_hot_unplug_handler_t hot_unplug_handler; + /**< handle hot-unplug failure on the bus */ + rte_bus_sigbus_handler_t sigbus_handler; + /**< handle sigbus error on the bus */ + }; /** diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h index 069c13ec..cba7bbc1 100644 --- a/lib/librte_eal/common/include/rte_common.h +++ b/lib/librte_eal/common/include/rte_common.h @@ -68,6 +68,11 @@ typedef uint16_t unaligned_uint16_t; /******* Macro to mark functions and fields scheduled for removal *****/ #define __rte_deprecated __attribute__((__deprecated__)) +/** + * Mark a function or variable to a weak reference. + */ +#define __rte_weak __attribute__((__weak__)) + /*********** Macros to eliminate unused variable warnings ********/ /** @@ -164,6 +169,12 @@ static void __attribute__((destructor(RTE_PRIO(prio)), used)) func(void) */ #define RTE_PTR_DIFF(ptr1, ptr2) ((uintptr_t)(ptr1) - (uintptr_t)(ptr2)) +/** + * Workaround to cast a const field of a structure to non-const type. + */ +#define RTE_CAST_FIELD(var, field, type) \ + (*(type *)((uintptr_t)(var) + offsetof(typeof(*(var)), field))) + /*********** Macros/static functions for doing alignment ********/ diff --git a/lib/librte_eal/common/include/rte_dev.h b/lib/librte_eal/common/include/rte_dev.h index b80a8059..cd6c187c 100644 --- a/lib/librte_eal/common/include/rte_dev.h +++ b/lib/librte_eal/common/include/rte_dev.h @@ -39,7 +39,7 @@ struct rte_dev_event { char *devname; /**< device name */ }; -typedef void (*rte_dev_event_cb_fn)(char *device_name, +typedef void (*rte_dev_event_cb_fn)(const char *device_name, enum rte_dev_event_type event, void *cb_arg); @@ -156,63 +156,67 @@ struct rte_driver { struct rte_device { TAILQ_ENTRY(rte_device) next; /**< Next device */ const char *name; /**< Device name */ - const struct rte_driver *driver;/**< Associated driver */ + const struct rte_driver *driver; /**< Driver assigned after probing */ + const struct rte_bus *bus; /**< Bus handle assigned on scan */ int numa_node; /**< NUMA node connection */ - struct rte_devargs *devargs; /**< Device user arguments */ + struct rte_devargs *devargs; /**< Arguments for latest probing */ }; /** - * Attach a device to a registered driver. + * @warning + * @b EXPERIMENTAL: this API may change without prior notice * - * @param name - * The device name, that refers to a pci device (or some private - * way of designating a vdev device). Based on this device name, eal - * will identify a driver capable of handling it and pass it to the - * driver probing function. - * @param devargs - * Device arguments to be passed to the driver. - * @return - * 0 on success, negative on error. - */ -__rte_deprecated -int rte_eal_dev_attach(const char *name, const char *devargs); - -/** - * Detach a device from its driver. + * Query status of a device. * * @param dev - * A pointer to a rte_device structure. + * Generic device pointer. * @return - * 0 on success, negative on error. + * (int)true if already probed successfully, 0 otherwise. */ -__rte_deprecated -int rte_eal_dev_detach(struct rte_device *dev); +__rte_experimental +int rte_dev_is_probed(const struct rte_device *dev); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Hotplug add a given device to a specific bus. * + * In multi-process, it will request other processes to add the same device. + * A failure, in any process, will rollback the action + * * @param busname * The bus name the device is added to. 
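A partial, hypothetical sketch of how a bus driver might populate the two new bus callbacks added above (only the new members are shown; a real bus also fills scan, probe and the other mandatory hooks):

#include <rte_common.h>
#include <rte_bus.h>

static int
example_hot_unplug_handler(struct rte_device *dev __rte_unused)
{
	/* e.g. remap the device's BARs to dummy memory so that stray
	 * accesses after removal no longer fault the application. */
	return 0;
}

static int
example_sigbus_handler(const void *failure_addr __rte_unused)
{
	/* 0: handled, 1: not this bus's address, -1: failed to handle */
	return 1;
}

static struct rte_bus example_bus = {
	.hot_unplug_handler = example_hot_unplug_handler,
	.sigbus_handler = example_sigbus_handler,
};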
* @param devname * The device name. Based on this device name, eal will identify a driver * capable of handling it and pass it to the driver probing function. - * @param devargs + * @param drvargs * Device arguments to be passed to the driver. * @return * 0 on success, negative on error. */ -int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devname, - const char *devargs); +int rte_eal_hotplug_add(const char *busname, const char *devname, + const char *drvargs); /** * @warning * @b EXPERIMENTAL: this API may change without prior notice * + * Add matching devices. + * + * In multi-process, it will request other processes to add the same device. + * A failure, in any process, will rollback the action + * + * @param devargs + * Device arguments including bus, class and driver properties. + * @return + * 0 on success, negative on error. + */ +int __rte_experimental rte_dev_probe(const char *devargs); + +/** * Hotplug remove a given device from a specific bus. * + * In multi-process, it will request other processes to remove the same device. + * A failure, in any process, will rollback the action + * * @param busname * The bus name the device is removed from. * @param devname @@ -220,8 +224,23 @@ int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devn * @return * 0 on success, negative on error. */ -int __rte_experimental rte_eal_hotplug_remove(const char *busname, - const char *devname); +int rte_eal_hotplug_remove(const char *busname, const char *devname); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Remove one device. + * + * In multi-process, it will request other processes to remove the same device. + * A failure, in any process, will rollback the action + * + * @param dev + * Data structure of the device to remove. + * @return + * 0 on success, negative on error. + */ +int __rte_experimental rte_dev_remove(struct rte_device *dev); /** * Device comparison function. @@ -434,6 +453,22 @@ rte_dev_event_callback_unregister(const char *device_name, rte_dev_event_cb_fn cb_fn, void *cb_arg); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Executes all the user application registered callbacks for + * the specific device. + * + * @param device_name + * The device name. + * @param event + * the device event type. + */ +void __rte_experimental +rte_dev_event_callback_process(const char *device_name, + enum rte_dev_event_type event); + /** * @warning * @b EXPERIMENTAL: this API may change without prior notice @@ -460,4 +495,30 @@ rte_dev_event_monitor_start(void); int __rte_experimental rte_dev_event_monitor_stop(void); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Enable hotplug handling for devices. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int __rte_experimental +rte_dev_hotplug_handle_enable(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Disable hotplug handling for devices. + * + * @return + * - On success, zero. + * - On failure, a negative value. 
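Putting the event-monitoring pieces together, a hedged sketch (callback body and function names are assumptions) of enabling hotplug handling and subscribing to add/remove notifications for all devices:

#include <rte_common.h>
#include <rte_dev.h>
#include <rte_log.h>

/* Hypothetical callback reacting to device add/remove notifications. */
static void
example_on_dev_event(const char *device_name, enum rte_dev_event_type event,
		     void *cb_arg __rte_unused)
{
	if (event == RTE_DEV_EVENT_REMOVE)
		RTE_LOG(WARNING, EAL, "device %s removed\n", device_name);
	else if (event == RTE_DEV_EVENT_ADD)
		RTE_LOG(INFO, EAL, "device %s added\n", device_name);
}

/* Sketch: enable hotplug handling, start the monitor and register a
 * callback for every device (a NULL name matches all devices). */
static int
example_enable_hotplug_monitoring(void)
{
	if (rte_dev_hotplug_handle_enable() != 0)
		return -1;
	if (rte_dev_event_monitor_start() != 0)
		return -1;
	return rte_dev_event_callback_register(NULL, example_on_dev_event, NULL);
}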
+ */ +int __rte_experimental +rte_dev_hotplug_handle_disable(void); + #endif /* _RTE_DEV_H_ */ diff --git a/lib/librte_eal/common/include/rte_devargs.h b/lib/librte_eal/common/include/rte_devargs.h index 097a4ce7..b1f121f8 100644 --- a/lib/librte_eal/common/include/rte_devargs.h +++ b/lib/librte_eal/common/include/rte_devargs.h @@ -66,36 +66,6 @@ struct rte_devargs { const char *data; /**< Device string storage. */ }; -/** - * @deprecated - * Parse a devargs string. - * - * For PCI devices, the format of arguments string is "PCI_ADDR" or - * "PCI_ADDR,key=val,key2=val2,...". Examples: "08:00.1", "0000:5:00.0", - * "04:00.0,arg=val". - * - * For virtual devices, the format of arguments string is "DRIVER_NAME*" - * or "DRIVER_NAME*,key=val,key2=val2,...". Examples: "net_ring", - * "net_ring0", "net_pmdAnything,arg=0:arg2=1". - * - * The function parses the arguments string to get driver name and driver - * arguments. - * - * @param devargs_str - * The arguments as given by the user. - * @param drvname - * The pointer to the string to store parsed driver name. - * @param drvargs - * The pointer to the string to store parsed driver arguments. - * - * @return - * - 0 on success - * - A negative value on error - */ -__rte_deprecated -int rte_eal_parse_devargs_str(const char *devargs_str, - char **drvname, char **drvargs); - /** * Parse a device string. * @@ -201,33 +171,13 @@ rte_devargs_insert(struct rte_devargs *da); __rte_experimental int rte_devargs_add(enum rte_devtype devtype, const char *devargs_str); -/** - * @deprecated - * Add a device to the user device list - * See rte_devargs_parse() for details. - * - * @param devtype - * The type of the device. - * @param devargs_str - * The arguments as given by the user. - * - * @return - * - 0 on success - * - A negative value on error - */ -__rte_deprecated -int rte_eal_devargs_add(enum rte_devtype devtype, const char *devargs_str); - /** * Remove a device from the user device list. * Its resources are freed. * If the devargs cannot be found, nothing happens. * - * @param busname - * bus name of the devargs to remove. - * - * @param devname - * device name of the devargs to remove. + * @param devargs + * The instance or a copy of devargs to remove. * * @return * 0 on success. @@ -235,8 +185,7 @@ int rte_eal_devargs_add(enum rte_devtype devtype, const char *devargs_str); * >0 if the devargs was not within the user device list. */ __rte_experimental -int rte_devargs_remove(const char *busname, - const char *devname); +int rte_devargs_remove(struct rte_devargs *devargs); /** * Count the number of user devices of a specified type @@ -251,20 +200,6 @@ __rte_experimental unsigned int rte_devargs_type_count(enum rte_devtype devtype); -/** - * @deprecated - * Count the number of user devices of a specified type - * - * @param devtype - * The type of the devices to counted. - * - * @return - * The number of devices. - */ -__rte_deprecated -unsigned int -rte_eal_devargs_type_count(enum rte_devtype devtype); - /** * This function dumps the list of user device and their arguments. * @@ -274,16 +209,6 @@ rte_eal_devargs_type_count(enum rte_devtype devtype); __rte_experimental void rte_devargs_dump(FILE *f); -/** - * @deprecated - * This function dumps the list of user device and their arguments. - * - * @param f - * A pointer to a file for output - */ -__rte_deprecated -void rte_eal_devargs_dump(FILE *f); - /** * Find next rte_devargs matching the provided bus name. 
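Since rte_devargs_remove() now takes the devargs structure itself rather than bus and device name strings, a caller holding only a device string would first parse it; a minimal sketch (cleanup of the parsed copy is omitted):

#include <string.h>
#include <rte_devargs.h>

/* Sketch: remove a stored devargs entry identified by a device string. */
static int
example_remove_devargs(const char *devstr)
{
	struct rte_devargs da;

	memset(&da, 0, sizeof(da));
	if (rte_devargs_parse(&da, devstr) != 0)
		return -1;

	return rte_devargs_remove(&da);	/* 0 on success, >0 if not listed */
}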
* diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h index e114dcbd..a0cedd57 100644 --- a/lib/librte_eal/common/include/rte_eal.h +++ b/lib/librte_eal/common/include/rte_eal.h @@ -316,7 +316,7 @@ rte_mp_sendmsg(struct rte_mp_msg *msg); * * @param reply * The reply argument will be for storing all the replied messages; - * the caller is responsible for free reply->replies. + * the caller is responsible for free reply->msgs. * * @param ts * The ts argument specifies how long we can wait for the peer(s) to reply. @@ -377,6 +377,15 @@ rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts, int __rte_experimental rte_mp_reply(struct rte_mp_msg *msg, const char *peer); +/** + * Register all mp action callbacks for hotplug. + * + * @return + * 0 on success, negative on error. + */ +int __rte_experimental +rte_mp_dev_hotplug_init(void); + /** * Usage function typedef used by the application usage function. * @@ -498,6 +507,15 @@ enum rte_iova_mode rte_eal_iova_mode(void); const char * rte_eal_mbuf_user_pool_ops(void); +/** + * Get the runtime directory of DPDK + * + * @return + * The runtime directory path of DPDK + */ +const char * +rte_eal_get_runtime_dir(void); + #ifdef __cplusplus } #endif diff --git a/lib/librte_eal/common/include/rte_eal_interrupts.h b/lib/librte_eal/common/include/rte_eal_interrupts.h index 6eb49327..9d302f41 100644 --- a/lib/librte_eal/common/include/rte_eal_interrupts.h +++ b/lib/librte_eal/common/include/rte_eal_interrupts.h @@ -35,6 +35,7 @@ enum rte_intr_handle_type { RTE_INTR_HANDLE_EXT, /**< external handler */ RTE_INTR_HANDLE_VDEV, /**< virtual device */ RTE_INTR_HANDLE_DEV_EVENT, /**< device event handle */ + RTE_INTR_HANDLE_VFIO_REQ, /**< VFIO request handle */ RTE_INTR_HANDLE_MAX /**< count of elements */ }; diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h index aff0688d..84aabe36 100644 --- a/lib/librte_eal/common/include/rte_eal_memconfig.h +++ b/lib/librte_eal/common/include/rte_eal_memconfig.h @@ -30,9 +30,11 @@ struct rte_memseg_list { uint64_t addr_64; /**< Makes sure addr is always 64-bits */ }; - int socket_id; /**< Socket ID for all memsegs in this list. */ uint64_t page_sz; /**< Page size for all memsegs in this list. */ + int socket_id; /**< Socket ID for all memsegs in this list. */ volatile uint32_t version; /**< version number for multiprocess sync. */ + size_t len; /**< Length of memory area covered by this memseg list. */ + unsigned int external; /**< 1 if this list points to external memory */ struct rte_fbarray memseg_arr; }; @@ -70,13 +72,23 @@ struct rte_mem_config { struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */ - /* Heaps of Malloc per socket */ - struct malloc_heap malloc_heaps[RTE_MAX_NUMA_NODES]; + /* Heaps of Malloc */ + struct malloc_heap malloc_heaps[RTE_MAX_HEAPS]; + + /* next socket ID for external malloc heap */ + int next_socket_id; /* address of mem_config in primary process. used to map shared config into * exact same address the primary process maps it. 
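Regarding the corrected ownership note on rte_mp_request_sync() above, a short sketch (the action name is hypothetical) showing the caller freeing reply.msgs:

#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <rte_eal.h>
#include <rte_string_fns.h>

/* Sketch: issue a synchronous IPC request and release the reply buffer. */
static int
example_mp_request(void)
{
	struct rte_mp_msg req;
	struct rte_mp_reply reply;
	struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
	int ret;

	memset(&req, 0, sizeof(req));
	strlcpy(req.name, "example_action", sizeof(req.name));

	ret = rte_mp_request_sync(&req, &reply, &ts);
	if (ret == 0)
		free(reply.msgs);	/* caller-owned; may be NULL if none received */
	return ret;
}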
*/ uint64_t mem_cfg_addr; + + /* legacy mem and single file segments options are shared */ + uint32_t legacy_mem; + uint32_t single_file_segments; + + /* keeps the more restricted dma mask */ + uint8_t dma_maskbits; } __attribute__((__packed__)); diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h index a9fb7e45..7249e6aa 100644 --- a/lib/librte_eal/common/include/rte_malloc.h +++ b/lib/librte_eal/common/include/rte_malloc.h @@ -263,6 +263,198 @@ int rte_malloc_get_socket_stats(int socket, struct rte_malloc_socket_stats *socket_stats); +/** + * Add memory chunk to a heap with specified name. + * + * @note Multiple memory chunks can be added to the same heap + * + * @note Before accessing this memory in other processes, it needs to be + * attached in each of those processes by calling + * ``rte_malloc_heap_memory_attach`` in each other process. + * + * @note Memory must be previously allocated for DPDK to be able to use it as a + * malloc heap. Failing to do so will result in undefined behavior, up to and + * including segmentation faults. + * + * @note Calling this function will erase any contents already present at the + * supplied memory address. + * + * @param heap_name + * Name of the heap to add memory chunk to + * @param va_addr + * Start of virtual area to add to the heap + * @param len + * Length of virtual area to add to the heap + * @param iova_addrs + * Array of page IOVA addresses corresponding to each page in this memory + * area. Can be NULL, in which case page IOVA addresses will be set to + * RTE_BAD_IOVA. + * @param n_pages + * Number of elements in the iova_addrs array. Ignored if ``iova_addrs`` + * is NULL. + * @param page_sz + * Page size of the underlying memory + * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to add memory to a reserved heap + * ENOSPC - no more space in internal config to store a new memory chunk + */ +int __rte_experimental +rte_malloc_heap_memory_add(const char *heap_name, void *va_addr, size_t len, + rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz); + +/** + * Remove memory chunk from heap with specified name. + * + * @note Memory chunk being removed must be the same as one that was added; + * partially removing memory chunks is not supported + * + * @note Memory area must not contain any allocated elements to allow its + * removal from the heap + * + * @note All other processes must detach from the memory chunk prior to it being + * removed from the heap. + * + * @param heap_name + * Name of the heap to remove memory from + * @param va_addr + * Virtual address to remove from the heap + * @param len + * Length of virtual area to remove from the heap + * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to remove memory from a reserved heap + * ENOENT - heap or memory chunk was not found + * EBUSY - memory chunk still contains data + */ +int __rte_experimental +rte_malloc_heap_memory_remove(const char *heap_name, void *va_addr, size_t len); + +/** + * Attach to an already existing chunk of external memory in another process. + * + * @note This function must be called before any attempt is made to use an + * already existing external memory chunk. 
This function does *not* need to + * be called if a call to ``rte_malloc_heap_memory_add`` was made in the + * current process. + * + * @param heap_name + * Heap name to which this chunk of memory belongs + * @param va_addr + * Start address of memory chunk to attach to + * @param len + * Length of memory chunk to attach to + * @return + * 0 on successful attach + * -1 on unsuccessful attach, with rte_errno set to indicate cause for error: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to attach memory to a reserved heap + * ENOENT - heap or memory chunk was not found + */ +int __rte_experimental +rte_malloc_heap_memory_attach(const char *heap_name, void *va_addr, size_t len); + +/** + * Detach from a chunk of external memory in secondary process. + * + * @note This function must be called in before any attempt is made to remove + * external memory from the heap in another process. This function does *not* + * need to be called if a call to ``rte_malloc_heap_memory_remove`` will be + * called in current process. + * + * @param heap_name + * Heap name to which this chunk of memory belongs + * @param va_addr + * Start address of memory chunk to attach to + * @param len + * Length of memory chunk to attach to + * @return + * 0 on successful detach + * -1 on unsuccessful detach, with rte_errno set to indicate cause for error: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to detach memory from a reserved heap + * ENOENT - heap or memory chunk was not found + */ +int __rte_experimental +rte_malloc_heap_memory_detach(const char *heap_name, void *va_addr, size_t len); + +/** + * Creates a new empty malloc heap with a specified name. + * + * @note Heaps created via this call will automatically get assigned a unique + * socket ID, which can be found using ``rte_malloc_heap_get_socket()`` + * + * @param heap_name + * Name of the heap to create. + * + * @return + * - 0 on successful creation + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - ``heap_name`` was NULL, empty or too long + * EEXIST - heap by name of ``heap_name`` already exists + * ENOSPC - no more space in internal config to store a new heap + */ +int __rte_experimental +rte_malloc_heap_create(const char *heap_name); + +/** + * Destroys a previously created malloc heap with specified name. + * + * @note This function will return a failure result if not all memory allocated + * from the heap has been freed back to the heap + * + * @note This function will return a failure result if not all memory segments + * were removed from the heap prior to its destruction + * + * @param heap_name + * Name of the heap to create. + * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - ``heap_name`` was NULL, empty or too long + * ENOENT - heap by the name of ``heap_name`` was not found + * EPERM - attempting to destroy reserved heap + * EBUSY - heap still contains data + */ +int __rte_experimental +rte_malloc_heap_destroy(const char *heap_name); + +/** + * Find socket ID corresponding to a named heap. 
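A condensed sketch of the external-heap workflow these functions enable (heap name, sizes and the anonymous mapping are illustrative; error unwinding is omitted):

#include <stddef.h>
#include <sys/mman.h>
#include <rte_malloc.h>

/* Sketch: create a named heap, donate a 2 MB anonymous mapping to it
 * (no IOVA table, so IOVAs become RTE_BAD_IOVA), then allocate from it. */
static void *
example_alloc_from_external_heap(void)
{
	const size_t len = 2 * 1024 * 1024;
	const size_t page_sz = 4096;
	void *va;
	int socket;

	if (rte_malloc_heap_create("example_heap") != 0)
		return NULL;

	va = mmap(NULL, len, PROT_READ | PROT_WRITE,
		  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (va == MAP_FAILED)
		return NULL;

	if (rte_malloc_heap_memory_add("example_heap", va, len,
				       NULL, 0, page_sz) != 0)
		return NULL;

	socket = rte_malloc_heap_get_socket("example_heap");
	return rte_malloc_socket("example", 4096, 0, socket);
}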
+ * + * @param name + * Heap name to find socket ID for + * @return + * Socket ID in case of success (a non-negative number) + * -1 in case of error, with rte_errno set to one of the following: + * EINVAL - ``name`` was NULL + * ENOENT - heap identified by the name ``name`` was not found + */ +int __rte_experimental +rte_malloc_heap_get_socket(const char *name); + +/** + * Check if a given socket ID refers to externally allocated memory. + * + * @note Passing SOCKET_ID_ANY will return 0. + * + * @param socket_id + * Socket ID to check + * @return + * 1 if socket ID refers to externally allocated memory + * 0 if socket ID refers to internal DPDK memory + * -1 if socket ID is invalid + */ +int __rte_experimental +rte_malloc_heap_socket_is_external(int socket_id); + /** * Dump statistics. * diff --git a/lib/librte_eal/common/include/rte_malloc_heap.h b/lib/librte_eal/common/include/rte_malloc_heap.h index d43fa909..4a7e0eb1 100644 --- a/lib/librte_eal/common/include/rte_malloc_heap.h +++ b/lib/librte_eal/common/include/rte_malloc_heap.h @@ -12,6 +12,7 @@ /* Number of free lists per heap, grouped by size. */ #define RTE_HEAP_NUM_FREELISTS 13 +#define RTE_HEAP_NAME_MAX_LEN 32 /* dummy definition, for pointers */ struct malloc_elem; @@ -26,7 +27,9 @@ struct malloc_heap { struct malloc_elem *volatile last; unsigned alloc_count; + unsigned int socket_id; size_t total_size; + char name[RTE_HEAP_NAME_MAX_LEN]; } __rte_cache_aligned; #endif /* _RTE_MALLOC_HEAP_H_ */ diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h index c4b7f4cf..ce937058 100644 --- a/lib/librte_eal/common/include/rte_memory.h +++ b/lib/librte_eal/common/include/rte_memory.h @@ -215,6 +215,9 @@ typedef int (*rte_memseg_list_walk_t)(const struct rte_memseg_list *msl, * @note This function read-locks the memory hotplug subsystem, and thus cannot * be used within memory-related callback functions. * + * @note This function will also walk through externally allocated segments. It + * is up to the user to decide whether to skip through these segments. + * * @param func * Iterator function * @param arg @@ -233,6 +236,9 @@ rte_memseg_walk(rte_memseg_walk_t func, void *arg); * @note This function read-locks the memory hotplug subsystem, and thus cannot * be used within memory-related callback functions. * + * @note This function will also walk through externally allocated segments. It + * is up to the user to decide whether to skip through these segments. + * * @param func * Iterator function * @param arg @@ -251,6 +257,9 @@ rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg); * @note This function read-locks the memory hotplug subsystem, and thus cannot * be used within memory-related callback functions. * + * @note This function will also walk through externally allocated segments. It + * is up to the user to decide whether to skip through these segments. + * * @param func * Iterator function * @param arg @@ -317,6 +326,103 @@ rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg); int __rte_experimental rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg); +/** + * Return file descriptor associated with a particular memseg (if available). + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @note This returns an internal file descriptor. 
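The notes above leave it to the walk callback to decide how to treat externally allocated segments. A minimal sketch of a walker that skips them, using the external flag the memseg list gains in this release, is shown below; the accounting done in the callback is purely illustrative.

#include <rte_memory.h>

/* Illustrative walk callback: count only internal (DPDK-managed) memory and
 * skip memseg lists that describe externally allocated segments. */
static int
count_internal_mem(const struct rte_memseg_list *msl,
		const struct rte_memseg *ms, void *arg)
{
	size_t *total = arg;

	if (msl->external)	/* externally allocated segment, skip it */
		return 0;

	*total += ms->len;
	return 0;
}

/* usage: size_t total = 0; rte_memseg_walk(count_internal_mem, &total); */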
Performing any operations on + * this file descriptor is inherently dangerous, so it should be treated + * as read-only for all intents and purposes. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * + * @return + * Valid file descriptor in case of success. + * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +int __rte_experimental +rte_memseg_get_fd(const struct rte_memseg *ms); + +/** + * Return file descriptor associated with a particular memseg (if available). + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @note This returns an internal file descriptor. Performing any operations on + * this file descriptor is inherently dangerous, so it should be treated + * as read-only for all intents and purposes. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * + * @return + * Valid file descriptor in case of success. + * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +int __rte_experimental +rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms); + +/** + * Get offset into segment file descriptor associated with a particular memseg + * (if available). + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * @param offset + * A pointer to offset value where the result will be stored. + * + * @return + * Valid file descriptor in case of success. + * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - EINVAL - ``offset`` pointer was NULL + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +int __rte_experimental +rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset); + +/** + * Get offset into segment file descriptor associated with a particular memseg + * (if available). + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * @param offset + * A pointer to offset value where the result will be stored. + * + * @return + * Valid file descriptor in case of success. + * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - EINVAL - ``offset`` pointer was NULL + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +int __rte_experimental +rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms, + size_t *offset); + /** * Dump the physical memory layout to a file. 
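A short sketch of how the fd and offset accessors above might be combined, for example to hand a segment's backing file to another process, is given below; the reporting is illustrative only and assumes segment fd's are supported on the platform.

#include <stdio.h>
#include <rte_memory.h>
#include <rte_errno.h>

/* Illustrative: report the backing file descriptor and offset for the
 * memseg containing a given address. */
static int
report_seg_fd(const void *addr)
{
	const struct rte_memseg *ms = rte_mem_virt2memseg(addr, NULL);
	size_t offset;
	int fd;

	if (ms == NULL)
		return -1;

	fd = rte_memseg_get_fd(ms);
	if (fd < 0 || rte_memseg_get_fd_offset(ms, &offset) < 0) {
		printf("no fd for segment: %s\n", rte_strerror(rte_errno));
		return -1;
	}
	printf("segment at %p: fd %d, offset %zu\n", ms->addr, fd, offset);
	return 0;
}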
* @@ -357,6 +463,9 @@ unsigned rte_memory_get_nchannel(void); */ unsigned rte_memory_get_nrank(void); +/* check memsegs iovas are within a range based on dma mask */ +int __rte_experimental rte_eal_check_dma_mask(uint8_t maskbits); + /** * Drivers based on uio will not load unless physical * addresses are obtainable. It is only possible to get diff --git a/lib/librte_eal/common/include/rte_option.h b/lib/librte_eal/common/include/rte_option.h new file mode 100644 index 00000000..8957b970 --- /dev/null +++ b/lib/librte_eal/common/include/rte_option.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation. + */ + +#ifndef __INCLUDE_RTE_OPTION_H__ +#define __INCLUDE_RTE_OPTION_H__ + +/** + * @file + * + * This API offers the ability to register options to the EAL command line and + * map those options to functions that will be executed at the end of EAL + * initialization. These options will be available as part of the EAL command + * line of applications and are dynamically managed. + * + * This is used primarily by DPDK libraries offering command line options. + * Currently, this API is limited to registering options without argument. + * + * The register API can be used to resolve circular dependency issues + * between EAL and the library. The library uses EAL, but is also initialized + * by EAL. Hence, EAL depends on the init function of the library. The API + * introduced in rte_option allows us to register the library init with EAL + * (passing a function pointer) and avoid the circular dependency. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int (*rte_option_cb)(void); + +/* + * Structure describing the EAL command line option being registered. + */ +struct rte_option { + TAILQ_ENTRY(rte_option) next; /**< Next entry in the list. */ + char *opt_str; /**< The option name. */ + rte_option_cb cb; /**< Function called when option is used. */ + int enabled; /**< Set when the option is used. */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Register an option to the EAL command line. + * When recognized, the associated function will be executed at the end of EAL + * initialization. + * + * The associated structure must be available the whole time this option is + * registered (i.e. not stack memory). + * + * @param opt + * Structure describing the option to parse. + */ +void __rte_experimental +rte_option_register(struct rte_option *opt); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/librte_eal/common/include/rte_string_fns.h b/lib/librte_eal/common/include/rte_string_fns.h index 97597a14..9a2a1ff9 100644 --- a/lib/librte_eal/common/include/rte_string_fns.h +++ b/lib/librte_eal/common/include/rte_string_fns.h @@ -16,6 +16,7 @@ extern "C" { #endif #include +#include /** * Takes string "string" parameter and splits it at character "delim" @@ -60,12 +61,10 @@ rte_strlcpy(char *dst, const char *src, size_t size) /* pull in a strlcpy function */ #ifdef RTE_EXEC_ENV_BSDAPP -#include #ifndef __BSD_VISIBLE /* non-standard functions are hidden */ #define strlcpy(dst, src, size) rte_strlcpy(dst, src, size) #endif - #else /* non-BSD platforms */ #ifdef RTE_USE_LIBBSD #include @@ -76,6 +75,29 @@ rte_strlcpy(char *dst, const char *src, size_t size) #endif /* RTE_USE_LIBBSD */ #endif /* BSDAPP */ +/** + * Copy string src to buffer dst of size dsize. + * At most dsize-1 chars will be copied. + * Always NUL-terminates, unless (dsize == 0). 
+ * Returns number of bytes copied (terminating NUL-byte excluded) on success ; + * negative errno on error. + * + * @param dst + * The destination string. + * + * @param src + * The input string to be copied. + * + * @param dsize + * Length in bytes of the destination buffer. + * + * @return + * The number of bytes copied on success + * -E2BIG if the destination buffer is too small. + */ +ssize_t +rte_strscpy(char *dst, const char *src, size_t dsize); + #ifdef __cplusplus } #endif diff --git a/lib/librte_eal/common/include/rte_version.h b/lib/librte_eal/common/include/rte_version.h index 7c6714a2..412ed2db 100644 --- a/lib/librte_eal/common/include/rte_version.h +++ b/lib/librte_eal/common/include/rte_version.h @@ -32,7 +32,7 @@ extern "C" { /** * Minor version/month number i.e. the mm in yy.mm.z */ -#define RTE_VER_MONTH 8 +#define RTE_VER_MONTH 11 /** * Patch level number i.e. the z in yy.mm.z @@ -42,14 +42,14 @@ extern "C" { /** * Extra string to be appended to version number */ -#define RTE_VER_SUFFIX "" +#define RTE_VER_SUFFIX "-rc" /** * Patch release number * 0-15 = release candidates * 16 = release */ -#define RTE_VER_RELEASE 16 +#define RTE_VER_RELEASE 1 /** * Macro to compute a version number usable for comparisons diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h index 5ca13fcc..cae96fab 100644 --- a/lib/librte_eal/common/include/rte_vfio.h +++ b/lib/librte_eal/common/include/rte_vfio.h @@ -14,6 +14,8 @@ extern "C" { #endif +#include + /* * determine if VFIO is present on the system */ @@ -22,6 +24,9 @@ extern "C" { #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) #define VFIO_PRESENT #endif /* kernel version >= 3.6.0 */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0) +#define HAVE_VFIO_DEV_REQ_INTERFACE +#endif /* kernel version >= 4.0.0 */ #endif /* RTE_EAL_VFIO */ #ifdef VFIO_PRESENT @@ -44,6 +49,30 @@ extern "C" { #define RTE_VFIO_NOIOMMU 8 #endif +/* + * capabilities are only supported on kernel 4.6+. there were also some API + * changes as well, so add a macro to get cap offset. + */ +#ifdef VFIO_REGION_INFO_FLAG_CAPS +#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS +#define VFIO_CAP_OFFSET(x) (x->cap_offset) +#else +#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3) +#define VFIO_CAP_OFFSET(x) (x->resv) +struct vfio_info_cap_header { + uint16_t id; + uint16_t version; + uint32_t next; +}; +#endif + +/* kernels 4.16+ can map BAR containing MSI-X table */ +#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE +#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE +#else +#define RTE_VFIO_CAP_MSIX_MAPPABLE 3 +#endif + #else /* not VFIO_PRESENT */ /* we don't need an actual definition, only pointer is used */ @@ -227,7 +256,7 @@ rte_vfio_get_group_num(const char *sysfs_base, const char *dev_addr, int *iommu_group_num); /** - * Open VFIO container fd or get an existing one + * Open a new VFIO container fd * * This function is only relevant to linux and will return * an error on BSD. diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c index e0a8ed15..1a74660d 100644 --- a/lib/librte_eal/common/malloc_elem.c +++ b/lib/librte_eal/common/malloc_elem.c @@ -39,10 +39,14 @@ malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align) contig_seg_start = RTE_PTR_ALIGN_CEIL(data_start, align); /* if we're in IOVA as VA mode, or if we're in legacy mode with - * hugepages, all elements are IOVA-contiguous. + * hugepages, all elements are IOVA-contiguous. 
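A short usage sketch for the new rte_strscpy() helper declared above; the field size, strings and log message are arbitrary choices for illustration.

#include <rte_log.h>
#include <rte_string_fns.h>

/* Illustrative: copy a user-supplied name into a fixed-size field and
 * detect truncation instead of silently losing data. */
static int
set_name(char dst[32], const char *src)
{
	ssize_t n = rte_strscpy(dst, src, 32);

	if (n < 0) {	/* -E2BIG: src did not fit into dst */
		RTE_LOG(WARNING, EAL, "name '%s' too long, not copied\n", src);
		return -1;
	}
	return 0;
}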
however, we can only + * make these assumptions about internal memory - externally allocated + * segments have to be checked. */ - if (rte_eal_iova_mode() == RTE_IOVA_VA || - (internal_config.legacy_mem && rte_eal_has_hugepages())) + if (!elem->msl->external && + (rte_eal_iova_mode() == RTE_IOVA_VA || + (internal_config.legacy_mem && + rte_eal_has_hugepages()))) return RTE_PTR_DIFF(data_end, contig_seg_start); cur_page = RTE_PTR_ALIGN_FLOOR(contig_seg_start, page_sz); diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c index 12aaf2d7..1973b6e6 100644 --- a/lib/librte_eal/common/malloc_heap.c +++ b/lib/librte_eal/common/malloc_heap.c @@ -29,6 +29,10 @@ #include "malloc_heap.h" #include "malloc_mp.h" +/* start external socket ID's at a very high number */ +#define CONST_MAX(a, b) (a > b ? a : b) /* RTE_MAX is not a constant */ +#define EXTERNAL_HEAP_MIN_SOCKET_ID (CONST_MAX((1 << 8), RTE_MAX_NUMA_NODES)) + static unsigned check_hugepage_sz(unsigned flags, uint64_t hugepage_sz) { @@ -66,6 +70,21 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz) return check_flag & flags; } +int +malloc_socket_to_heap_id(unsigned int socket_id) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i; + + for (i = 0; i < RTE_MAX_HEAPS; i++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[i]; + + if (heap->socket_id == socket_id) + return i; + } + return -1; +} + /* * Expand the heap with a memory area. */ @@ -93,9 +112,17 @@ malloc_add_seg(const struct rte_memseg_list *msl, struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct rte_memseg_list *found_msl; struct malloc_heap *heap; - int msl_idx; + int msl_idx, heap_idx; + + if (msl->external) + return 0; - heap = &mcfg->malloc_heaps[msl->socket_id]; + heap_idx = malloc_socket_to_heap_id(msl->socket_id); + if (heap_idx < 0) { + RTE_LOG(ERR, EAL, "Memseg list has invalid socket id\n"); + return -1; + } + heap = &mcfg->malloc_heaps[heap_idx]; /* msl is const, so find it */ msl_idx = msl - mcfg->memsegs; @@ -165,7 +192,9 @@ find_biggest_element(struct malloc_heap *heap, size_t *size, for (elem = LIST_FIRST(&heap->free_head[idx]); !!elem; elem = LIST_NEXT(elem, free_list)) { size_t cur_size; - if (!check_hugepage_sz(flags, elem->msl->page_sz)) + if ((flags & RTE_MEMZONE_SIZE_HINT_ONLY) == 0 && + !check_hugepage_sz(flags, + elem->msl->page_sz)) continue; if (contig) { cur_size = @@ -259,11 +288,13 @@ alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, int socket, unsigned int flags, size_t align, size_t bound, bool contig, struct rte_memseg **ms, int n_segs) { + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct rte_memseg_list *msl; struct malloc_elem *elem = NULL; size_t alloc_sz; int allocd_pages; void *ret, *map_addr; + uint64_t mask; alloc_sz = (size_t)pg_sz * n_segs; @@ -291,6 +322,16 @@ alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, goto fail; } + if (mcfg->dma_maskbits) { + mask = ~((1ULL << mcfg->dma_maskbits) - 1); + if (rte_eal_check_dma_mask(mask)) { + RTE_LOG(ERR, EAL, + "%s(): couldn't allocate memory due to DMA mask\n", + __func__); + goto fail; + } + } + /* add newly minted memsegs to malloc heap */ elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz); @@ -326,11 +367,9 @@ try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz, /* we can't know in advance how many pages we'll need, so we malloc */ ms = malloc(sizeof(*ms) * n_segs); - - memset(ms, 0, 
sizeof(*ms) * n_segs); - if (ms == NULL) return -1; + memset(ms, 0, sizeof(*ms) * n_segs); elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align, bound, contig, ms, n_segs); @@ -560,12 +599,14 @@ alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket, /* this will try lower page sizes first */ static void * -heap_alloc_on_socket(const char *type, size_t size, int socket, - unsigned int flags, size_t align, size_t bound, bool contig) +malloc_heap_alloc_on_heap_id(const char *type, size_t size, + unsigned int heap_id, unsigned int flags, size_t align, + size_t bound, bool contig) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct malloc_heap *heap = &mcfg->malloc_heaps[socket]; + struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id]; unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY; + int socket_id; void *ret; rte_spinlock_lock(&(heap->lock)); @@ -583,12 +624,28 @@ heap_alloc_on_socket(const char *type, size_t size, int socket, * we may still be able to allocate memory from appropriate page sizes, * we just need to request more memory first. */ + + socket_id = rte_socket_id_by_idx(heap_id); + /* + * if socket ID is negative, we cannot find a socket ID for this heap - + * which means it's an external heap. those can have unexpected page + * sizes, so if the user asked to allocate from there - assume user + * knows what they're doing, and allow allocating from there with any + * page size flags. + */ + if (socket_id < 0) + size_flags |= RTE_MEMZONE_SIZE_HINT_ONLY; + ret = heap_alloc(heap, type, size, size_flags, align, bound, contig); if (ret != NULL) goto alloc_unlock; - if (!alloc_more_mem_on_socket(heap, size, socket, flags, align, bound, - contig)) { + /* if socket ID is invalid, this is an external heap */ + if (socket_id < 0) + goto alloc_unlock; + + if (!alloc_more_mem_on_socket(heap, size, socket_id, flags, align, + bound, contig)) { ret = heap_alloc(heap, type, size, flags, align, bound, contig); /* this should have succeeded */ @@ -604,14 +661,14 @@ void * malloc_heap_alloc(const char *type, size_t size, int socket_arg, unsigned int flags, size_t align, size_t bound, bool contig) { - int socket, i, cur_socket; + int socket, heap_id, i; void *ret; /* return NULL if size is 0 or alignment is not power-of-2 */ if (size == 0 || (align && !rte_is_power_of_2(align))) return NULL; - if (!rte_eal_has_hugepages()) + if (!rte_eal_has_hugepages() && socket_arg < RTE_MAX_NUMA_NODES) socket_arg = SOCKET_ID_ANY; if (socket_arg == SOCKET_ID_ANY) @@ -619,22 +676,25 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg, else socket = socket_arg; - /* Check socket parameter */ - if (socket >= RTE_MAX_NUMA_NODES) + /* turn socket ID into heap ID */ + heap_id = malloc_socket_to_heap_id(socket); + /* if heap id is negative, socket ID was invalid */ + if (heap_id < 0) return NULL; - ret = heap_alloc_on_socket(type, size, socket, flags, align, bound, - contig); + ret = malloc_heap_alloc_on_heap_id(type, size, heap_id, flags, align, + bound, contig); if (ret != NULL || socket_arg != SOCKET_ID_ANY) return ret; - /* try other heaps */ + /* try other heaps. we are only iterating through native DPDK sockets, + * so external heaps won't be included. 
+ */ for (i = 0; i < (int) rte_socket_count(); i++) { - cur_socket = rte_socket_id_by_idx(i); - if (cur_socket == socket) + if (i == heap_id) continue; - ret = heap_alloc_on_socket(type, size, cur_socket, flags, - align, bound, contig); + ret = malloc_heap_alloc_on_heap_id(type, size, i, flags, align, + bound, contig); if (ret != NULL) return ret; } @@ -642,11 +702,11 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg, } static void * -heap_alloc_biggest_on_socket(const char *type, int socket, unsigned int flags, - size_t align, bool contig) +heap_alloc_biggest_on_heap_id(const char *type, unsigned int heap_id, + unsigned int flags, size_t align, bool contig) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct malloc_heap *heap = &mcfg->malloc_heaps[socket]; + struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id]; void *ret; rte_spinlock_lock(&(heap->lock)); @@ -664,7 +724,7 @@ void * malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags, size_t align, bool contig) { - int socket, i, cur_socket; + int socket, i, cur_socket, heap_id; void *ret; /* return NULL if align is not power-of-2 */ @@ -679,11 +739,13 @@ malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags, else socket = socket_arg; - /* Check socket parameter */ - if (socket >= RTE_MAX_NUMA_NODES) + /* turn socket ID into heap ID */ + heap_id = malloc_socket_to_heap_id(socket); + /* if heap id is negative, socket ID was invalid */ + if (heap_id < 0) return NULL; - ret = heap_alloc_biggest_on_socket(type, socket, flags, align, + ret = heap_alloc_biggest_on_heap_id(type, heap_id, flags, align, contig); if (ret != NULL || socket_arg != SOCKET_ID_ANY) return ret; @@ -693,8 +755,8 @@ malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags, cur_socket = rte_socket_id_by_idx(i); if (cur_socket == socket) continue; - ret = heap_alloc_biggest_on_socket(type, cur_socket, flags, - align, contig); + ret = heap_alloc_biggest_on_heap_id(type, i, flags, align, + contig); if (ret != NULL) return ret; } @@ -756,8 +818,10 @@ malloc_heap_free(struct malloc_elem *elem) /* anything after this is a bonus */ ret = 0; - /* ...of which we can't avail if we are in legacy mode */ - if (internal_config.legacy_mem) + /* ...of which we can't avail if we are in legacy mode, or if this is an + * externally allocated segment. 
+ */ + if (internal_config.legacy_mem || (msl->external > 0)) goto free_unlock; /* check if we can free any memory back to the system */ @@ -914,7 +978,7 @@ malloc_heap_resize(struct malloc_elem *elem, size_t size) } /* - * Function to retrieve data for heap on given socket + * Function to retrieve data for a given heap */ int malloc_heap_get_stats(struct malloc_heap *heap, @@ -952,7 +1016,7 @@ malloc_heap_get_stats(struct malloc_heap *heap, } /* - * Function to retrieve data for heap on given socket + * Function to retrieve data for a given heap */ void malloc_heap_dump(struct malloc_heap *heap, FILE *f) @@ -973,10 +1037,216 @@ malloc_heap_dump(struct malloc_heap *heap, FILE *f) rte_spinlock_unlock(&heap->lock); } +static int +destroy_seg(struct malloc_elem *elem, size_t len) +{ + struct malloc_heap *heap = elem->heap; + struct rte_memseg_list *msl; + + msl = elem->msl; + + /* notify all subscribers that a memory area is going to be removed */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, elem, len); + + /* this element can be removed */ + malloc_elem_free_list_remove(elem); + malloc_elem_hide_region(elem, elem, len); + + heap->total_size -= len; + + memset(elem, 0, sizeof(*elem)); + + /* destroy the fbarray backing this memory */ + if (rte_fbarray_destroy(&msl->memseg_arr) < 0) + return -1; + + /* reset the memseg list */ + memset(msl, 0, sizeof(*msl)); + + return 0; +} + +int +malloc_heap_add_external_memory(struct malloc_heap *heap, void *va_addr, + rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + char fbarray_name[RTE_FBARRAY_NAME_LEN]; + struct rte_memseg_list *msl = NULL; + struct rte_fbarray *arr; + size_t seg_len = n_pages * page_sz; + unsigned int i; + + /* first, find a free memseg list */ + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *tmp = &mcfg->memsegs[i]; + if (tmp->base_va == NULL) { + msl = tmp; + break; + } + } + if (msl == NULL) { + RTE_LOG(ERR, EAL, "Couldn't find empty memseg list\n"); + rte_errno = ENOSPC; + return -1; + } + + snprintf(fbarray_name, sizeof(fbarray_name) - 1, "%s_%p", + heap->name, va_addr); + + /* create the backing fbarray */ + if (rte_fbarray_init(&msl->memseg_arr, fbarray_name, n_pages, + sizeof(struct rte_memseg)) < 0) { + RTE_LOG(ERR, EAL, "Couldn't create fbarray backing the memseg list\n"); + return -1; + } + arr = &msl->memseg_arr; + + /* fbarray created, fill it up */ + for (i = 0; i < n_pages; i++) { + struct rte_memseg *ms; + + rte_fbarray_set_used(arr, i); + ms = rte_fbarray_get(arr, i); + ms->addr = RTE_PTR_ADD(va_addr, i * page_sz); + ms->iova = iova_addrs == NULL ? RTE_BAD_IOVA : iova_addrs[i]; + ms->hugepage_sz = page_sz; + ms->len = page_sz; + ms->nchannel = rte_memory_get_nchannel(); + ms->nrank = rte_memory_get_nrank(); + ms->socket_id = heap->socket_id; + } + + /* set up the memseg list */ + msl->base_va = va_addr; + msl->page_sz = page_sz; + msl->socket_id = heap->socket_id; + msl->len = seg_len; + msl->version = 0; + msl->external = 1; + + /* erase contents of new memory */ + memset(va_addr, 0, seg_len); + + /* now, add newly minted memory to the malloc heap */ + malloc_heap_add_memory(heap, msl, va_addr, seg_len); + + heap->total_size += seg_len; + + /* all done! 
*/ + RTE_LOG(DEBUG, EAL, "Added segment for heap %s starting at %p\n", + heap->name, va_addr); + + /* notify all subscribers that a new memory area has been added */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + va_addr, seg_len); + + return 0; +} + +int +malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr, + size_t len) +{ + struct malloc_elem *elem = heap->first; + + /* find element with specified va address */ + while (elem != NULL && elem != va_addr) { + elem = elem->next; + /* stop if we've blown past our VA */ + if (elem > (struct malloc_elem *)va_addr) { + rte_errno = ENOENT; + return -1; + } + } + /* check if element was found */ + if (elem == NULL || elem->msl->len != len) { + rte_errno = ENOENT; + return -1; + } + /* if element's size is not equal to segment len, segment is busy */ + if (elem->state == ELEM_BUSY || elem->size != len) { + rte_errno = EBUSY; + return -1; + } + return destroy_seg(elem, len); +} + +int +malloc_heap_create(struct malloc_heap *heap, const char *heap_name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + uint32_t next_socket_id = mcfg->next_socket_id; + + /* prevent overflow. did you really create 2 billion heaps??? */ + if (next_socket_id > INT32_MAX) { + RTE_LOG(ERR, EAL, "Cannot assign new socket ID's\n"); + rte_errno = ENOSPC; + return -1; + } + + /* initialize empty heap */ + heap->alloc_count = 0; + heap->first = NULL; + heap->last = NULL; + LIST_INIT(heap->free_head); + rte_spinlock_init(&heap->lock); + heap->total_size = 0; + heap->socket_id = next_socket_id; + + /* we hold a global mem hotplug writelock, so it's safe to increment */ + mcfg->next_socket_id++; + + /* set up name */ + strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN); + return 0; +} + +int +malloc_heap_destroy(struct malloc_heap *heap) +{ + if (heap->alloc_count != 0) { + RTE_LOG(ERR, EAL, "Heap is still in use\n"); + rte_errno = EBUSY; + return -1; + } + if (heap->first != NULL || heap->last != NULL) { + RTE_LOG(ERR, EAL, "Heap still contains memory segments\n"); + rte_errno = EBUSY; + return -1; + } + if (heap->total_size != 0) + RTE_LOG(ERR, EAL, "Total size not zero, heap is likely corrupt\n"); + + /* after this, the lock will be dropped */ + memset(heap, 0, sizeof(*heap)); + + return 0; +} + int rte_eal_malloc_heap_init(void) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int i; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* assign min socket ID to external heaps */ + mcfg->next_socket_id = EXTERNAL_HEAP_MIN_SOCKET_ID; + + /* assign names to default DPDK heaps */ + for (i = 0; i < rte_socket_count(); i++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[i]; + char heap_name[RTE_HEAP_NAME_MAX_LEN]; + int socket_id = rte_socket_id_by_idx(i); + + snprintf(heap_name, sizeof(heap_name) - 1, + "socket_%i", socket_id); + strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN); + heap->socket_id = socket_id; + } + } + if (register_mp_requests()) { RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n"); diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h index f52cb555..e48996d5 100644 --- a/lib/librte_eal/common/malloc_heap.h +++ b/lib/librte_eal/common/malloc_heap.h @@ -33,6 +33,20 @@ void * malloc_heap_alloc_biggest(const char *type, int socket, unsigned int flags, size_t align, bool contig); +int +malloc_heap_create(struct malloc_heap *heap, const char *heap_name); + +int +malloc_heap_destroy(struct malloc_heap *heap); + 
+int +malloc_heap_add_external_memory(struct malloc_heap *heap, void *va_addr, + rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz); + +int +malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr, + size_t len); + int malloc_heap_free(struct malloc_elem *elem); @@ -46,6 +60,9 @@ malloc_heap_get_stats(struct malloc_heap *heap, void malloc_heap_dump(struct malloc_heap *heap, FILE *f); +int +malloc_socket_to_heap_id(unsigned int socket_id); + int rte_eal_malloc_heap_init(void); diff --git a/lib/librte_eal/common/malloc_mp.c b/lib/librte_eal/common/malloc_mp.c index 931c14bc..5f2d4e0b 100644 --- a/lib/librte_eal/common/malloc_mp.c +++ b/lib/librte_eal/common/malloc_mp.c @@ -194,13 +194,11 @@ handle_alloc_request(const struct malloc_mp_req *m, /* we can't know in advance how many pages we'll need, so we malloc */ ms = malloc(sizeof(*ms) * n_segs); - - memset(ms, 0, sizeof(*ms) * n_segs); - if (ms == NULL) { RTE_LOG(ERR, EAL, "Couldn't allocate memory for request state\n"); goto fail; } + memset(ms, 0, sizeof(*ms) * n_segs); elem = alloc_pages_on_heap(heap, ar->page_sz, ar->elt_size, ar->socket, ar->flags, ar->align, ar->bound, ar->contig, ms, diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build index 56005bea..2a10d57d 100644 --- a/lib/librte_eal/common/meson.build +++ b/lib/librte_eal/common/meson.build @@ -14,6 +14,7 @@ common_sources = files( 'eal_common_errno.c', 'eal_common_fbarray.c', 'eal_common_hexdump.c', + 'eal_common_hypervisor.c', 'eal_common_launch.c', 'eal_common_lcore.c', 'eal_common_log.c', @@ -27,11 +28,13 @@ common_sources = files( 'eal_common_thread.c', 'eal_common_timer.c', 'eal_common_uuid.c', + 'hotplug_mp.c', 'malloc_elem.c', 'malloc_heap.c', 'malloc_mp.c', 'rte_keepalive.c', 'rte_malloc.c', + 'rte_option.c', 'rte_reciprocal.c', 'rte_service.c' ) @@ -59,6 +62,7 @@ common_headers = files( 'include/rte_errno.h', 'include/rte_fbarray.h', 'include/rte_hexdump.h', + 'include/rte_hypervisor.h', 'include/rte_interrupts.h', 'include/rte_keepalive.h', 'include/rte_launch.h', @@ -68,6 +72,7 @@ common_headers = files( 'include/rte_malloc_heap.h', 'include/rte_memory.h', 'include/rte_memzone.h', + 'include/rte_option.h', 'include/rte_pci_dev_feature_defs.h', 'include/rte_pci_dev_features.h', 'include/rte_per_lcore.h', diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c index b51a6d11..9e61dc41 100644 --- a/lib/librte_eal/common/rte_malloc.c +++ b/lib/librte_eal/common/rte_malloc.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -23,6 +24,7 @@ #include #include "malloc_elem.h" #include "malloc_heap.h" +#include "eal_memalloc.h" /* Free the memory space back to heap */ @@ -44,13 +46,15 @@ rte_malloc_socket(const char *type, size_t size, unsigned int align, if (size == 0 || (align && !rte_is_power_of_2(align))) return NULL; - if (!rte_eal_has_hugepages()) + /* if there are no hugepages and if we are not allocating from an + * external heap, use memory from any socket available. checking for + * socket being external may return -1 in case of invalid socket, but + * that's OK - if there are no hugepages, it doesn't matter. + */ + if (rte_malloc_heap_socket_is_external(socket_arg) != 1 && + !rte_eal_has_hugepages()) socket_arg = SOCKET_ID_ANY; - /* Check socket parameter */ - if (socket_arg >= RTE_MAX_NUMA_NODES) - return NULL; - return malloc_heap_alloc(type, size, socket_arg, 0, align == 0 ? 
1 : align, 0, false); } @@ -152,11 +156,20 @@ rte_malloc_get_socket_stats(int socket, struct rte_malloc_socket_stats *socket_stats) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int heap_idx, ret = -1; - if (socket >= RTE_MAX_NUMA_NODES || socket < 0) - return -1; + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + + heap_idx = malloc_socket_to_heap_id(socket); + if (heap_idx < 0) + goto unlock; - return malloc_heap_get_stats(&mcfg->malloc_heaps[socket], socket_stats); + ret = malloc_heap_get_stats(&mcfg->malloc_heaps[heap_idx], + socket_stats); +unlock: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; } /* @@ -168,12 +181,75 @@ rte_malloc_dump_heaps(FILE *f) struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; unsigned int idx; - for (idx = 0; idx < rte_socket_count(); idx++) { - unsigned int socket = rte_socket_id_by_idx(idx); - fprintf(f, "Heap on socket %i:\n", socket); - malloc_heap_dump(&mcfg->malloc_heaps[socket], f); + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + + for (idx = 0; idx < RTE_MAX_HEAPS; idx++) { + fprintf(f, "Heap id: %u\n", idx); + malloc_heap_dump(&mcfg->malloc_heaps[idx], f); + } + + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); +} + +int +rte_malloc_heap_get_socket(const char *name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + unsigned int idx; + int ret; + + if (name == NULL || + strnlen(name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + for (idx = 0; idx < RTE_MAX_HEAPS; idx++) { + struct malloc_heap *tmp = &mcfg->malloc_heaps[idx]; + + if (!strncmp(name, tmp->name, RTE_HEAP_NAME_MAX_LEN)) { + heap = tmp; + break; + } + } + + if (heap != NULL) { + ret = heap->socket_id; + } else { + rte_errno = ENOENT; + ret = -1; + } + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int +rte_malloc_heap_socket_is_external(int socket_id) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int idx; + int ret = -1; + + if (socket_id == SOCKET_ID_ANY) + return 0; + + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + for (idx = 0; idx < RTE_MAX_HEAPS; idx++) { + struct malloc_heap *tmp = &mcfg->malloc_heaps[idx]; + + if ((int)tmp->socket_id == socket_id) { + /* external memory always has large socket ID's */ + ret = tmp->socket_id >= RTE_MAX_NUMA_NODES; + break; + } } + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return ret; } /* @@ -182,14 +258,20 @@ rte_malloc_dump_heaps(FILE *f) void rte_malloc_dump_stats(FILE *f, __rte_unused const char *type) { - unsigned int socket; + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int heap_id; struct rte_malloc_socket_stats sock_stats; + + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + /* Iterate through all initialised heaps */ - for (socket=0; socket< RTE_MAX_NUMA_NODES; socket++) { - if ((rte_malloc_get_socket_stats(socket, &sock_stats) < 0)) - continue; + for (heap_id = 0; heap_id < RTE_MAX_HEAPS; heap_id++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id]; + + malloc_heap_get_stats(heap, &sock_stats); - fprintf(f, "Socket:%u\n", socket); + fprintf(f, "Heap id:%u\n", heap_id); + fprintf(f, "\tHeap name:%s\n", heap->name); fprintf(f, "\tHeap_size:%zu,\n", sock_stats.heap_totalsz_bytes); fprintf(f, "\tFree_size:%zu,\n", 
sock_stats.heap_freesz_bytes); fprintf(f, "\tAlloc_size:%zu,\n", sock_stats.heap_allocsz_bytes); @@ -198,6 +280,7 @@ rte_malloc_dump_stats(FILE *f, __rte_unused const char *type) fprintf(f, "\tAlloc_count:%u,\n",sock_stats.alloc_count); fprintf(f, "\tFree_count:%u,\n", sock_stats.free_count); } + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); return; } @@ -223,7 +306,7 @@ rte_malloc_virt2iova(const void *addr) if (elem == NULL) return RTE_BAD_IOVA; - if (rte_eal_iova_mode() == RTE_IOVA_VA) + if (!elem->msl->external && rte_eal_iova_mode() == RTE_IOVA_VA) return (uintptr_t) addr; ms = rte_mem_virt2memseg(addr, elem->msl); @@ -235,3 +318,320 @@ rte_malloc_virt2iova(const void *addr) return ms->iova + RTE_PTR_DIFF(addr, ms->addr); } + +static struct malloc_heap * +find_named_heap(const char *name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int i; + + for (i = 0; i < RTE_MAX_HEAPS; i++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[i]; + + if (!strncmp(name, heap->name, RTE_HEAP_NAME_MAX_LEN)) + return heap; + } + return NULL; +} + +int +rte_malloc_heap_memory_add(const char *heap_name, void *va_addr, size_t len, + rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + unsigned int n; + int ret; + + if (heap_name == NULL || va_addr == NULL || + page_sz == 0 || !rte_is_power_of_2(page_sz) || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + ret = -1; + goto unlock; + } + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + + /* find our heap */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + /* cannot add memory to internal heaps */ + rte_errno = EPERM; + ret = -1; + goto unlock; + } + n = len / page_sz; + if (n != n_pages && iova_addrs != NULL) { + rte_errno = EINVAL; + ret = -1; + goto unlock; + } + + rte_spinlock_lock(&heap->lock); + ret = malloc_heap_add_external_memory(heap, va_addr, iova_addrs, n, + page_sz); + rte_spinlock_unlock(&heap->lock); + +unlock: + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int +rte_malloc_heap_memory_remove(const char *heap_name, void *va_addr, size_t len) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + int ret; + + if (heap_name == NULL || va_addr == NULL || len == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + /* find our heap */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + /* cannot remove memory from internal heaps */ + rte_errno = EPERM; + ret = -1; + goto unlock; + } + + rte_spinlock_lock(&heap->lock); + ret = malloc_heap_remove_external_memory(heap, va_addr, len); + rte_spinlock_unlock(&heap->lock); + +unlock: + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +struct sync_mem_walk_arg { + void *va_addr; + size_t len; + int result; + bool attach; +}; + +static int +sync_mem_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct rte_mem_config *mcfg = 
rte_eal_get_configuration()->mem_config; + struct sync_mem_walk_arg *wa = arg; + size_t len = msl->page_sz * msl->memseg_arr.len; + + if (msl->base_va == wa->va_addr && + len == wa->len) { + struct rte_memseg_list *found_msl; + int msl_idx, ret; + + /* msl is const */ + msl_idx = msl - mcfg->memsegs; + found_msl = &mcfg->memsegs[msl_idx]; + + if (wa->attach) { + ret = rte_fbarray_attach(&found_msl->memseg_arr); + } else { + /* notify all subscribers that a memory area is about to + * be removed + */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + msl->base_va, msl->len); + ret = rte_fbarray_detach(&found_msl->memseg_arr); + } + + if (ret < 0) { + wa->result = -rte_errno; + } else { + /* notify all subscribers that a new memory area was + * added + */ + if (wa->attach) + eal_memalloc_mem_event_notify( + RTE_MEM_EVENT_ALLOC, + msl->base_va, msl->len); + wa->result = 0; + } + return 1; + } + return 0; +} + +static int +sync_memory(const char *heap_name, void *va_addr, size_t len, bool attach) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + struct sync_mem_walk_arg wa; + int ret; + + if (heap_name == NULL || va_addr == NULL || len == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + + /* find our heap */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + /* we shouldn't be able to sync to internal heaps */ + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + rte_errno = EPERM; + ret = -1; + goto unlock; + } + + /* find corresponding memseg list to sync to */ + wa.va_addr = va_addr; + wa.len = len; + wa.result = -ENOENT; /* fail unless explicitly told to succeed */ + wa.attach = attach; + + /* we're already holding a read lock */ + rte_memseg_list_walk_thread_unsafe(sync_mem_walk, &wa); + + if (wa.result < 0) { + rte_errno = -wa.result; + ret = -1; + } else { + /* notify all subscribers that a new memory area was added */ + if (attach) + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + va_addr, len); + ret = 0; + } +unlock: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return ret; +} + +int +rte_malloc_heap_memory_attach(const char *heap_name, void *va_addr, size_t len) +{ + return sync_memory(heap_name, va_addr, len, true); +} + +int +rte_malloc_heap_memory_detach(const char *heap_name, void *va_addr, size_t len) +{ + return sync_memory(heap_name, va_addr, len, false); +} + +int +rte_malloc_heap_create(const char *heap_name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + int i, ret; + + if (heap_name == NULL || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + /* check if there is space in the heap list, or if heap with this name + * already exists. 
+ */ + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + + for (i = 0; i < RTE_MAX_HEAPS; i++) { + struct malloc_heap *tmp = &mcfg->malloc_heaps[i]; + /* existing heap */ + if (strncmp(heap_name, tmp->name, + RTE_HEAP_NAME_MAX_LEN) == 0) { + RTE_LOG(ERR, EAL, "Heap %s already exists\n", + heap_name); + rte_errno = EEXIST; + ret = -1; + goto unlock; + } + /* empty heap */ + if (strnlen(tmp->name, RTE_HEAP_NAME_MAX_LEN) == 0) { + heap = tmp; + break; + } + } + if (heap == NULL) { + RTE_LOG(ERR, EAL, "Cannot create new heap: no space\n"); + rte_errno = ENOSPC; + ret = -1; + goto unlock; + } + + /* we're sure that we can create a new heap, so do it */ + ret = malloc_heap_create(heap, heap_name); +unlock: + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int +rte_malloc_heap_destroy(const char *heap_name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + int ret; + + if (heap_name == NULL || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + + /* start from non-socket heaps */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + RTE_LOG(ERR, EAL, "Heap %s not found\n", heap_name); + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + /* we shouldn't be able to destroy internal heaps */ + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + rte_errno = EPERM; + ret = -1; + goto unlock; + } + /* sanity checks done, now we can destroy the heap */ + rte_spinlock_lock(&heap->lock); + ret = malloc_heap_destroy(heap); + + /* if we failed, lock is still active */ + if (ret < 0) + rte_spinlock_unlock(&heap->lock); +unlock: + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} diff --git a/lib/librte_eal/common/rte_option.c b/lib/librte_eal/common/rte_option.c new file mode 100644 index 00000000..02d59a86 --- /dev/null +++ b/lib/librte_eal/common/rte_option.c @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation. 
+ */ + +#include +#include + +#include +#include + +#include "eal_private.h" + +TAILQ_HEAD(rte_option_list, rte_option); + +struct rte_option_list rte_option_list = + TAILQ_HEAD_INITIALIZER(rte_option_list); + +static struct rte_option *option; + +int +rte_option_parse(const char *opt) +{ + /* Check if the option is registered */ + TAILQ_FOREACH(option, &rte_option_list, next) { + if (strcmp(opt, option->opt_str) == 0) { + option->enabled = 1; + return 0; + } + } + + return -1; +} + +void __rte_experimental +rte_option_register(struct rte_option *opt) +{ + TAILQ_FOREACH(option, &rte_option_list, next) { + if (strcmp(opt->opt_str, option->opt_str) == 0) + RTE_LOG(INFO, EAL, "Option %s has already been registered.", + opt->opt_str); + return; + } + + TAILQ_INSERT_HEAD(&rte_option_list, opt, next); +} + +void +rte_option_init(void) +{ + TAILQ_FOREACH(option, &rte_option_list, next) { + if (option->enabled) + option->cb(); + } +} diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile index fd92c75c..51deb579 100644 --- a/lib/librte_eal/linuxapp/eal/Makefile +++ b/lib/librte_eal/linuxapp/eal/Makefile @@ -10,7 +10,7 @@ ARCH_DIR ?= $(RTE_ARCH) EXPORT_MAP := ../../rte_eal_version.map VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR) -LIBABIVER := 8 +LIBABIVER := 9 VPATH += $(RTE_SDK)/lib/librte_eal/common @@ -70,10 +70,12 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_uuid.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += hotplug_mp.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_option.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c @@ -85,22 +87,6 @@ SRCS-y += rte_cycles.c CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) -CFLAGS_eal.o := -D_GNU_SOURCE -CFLAGS_eal_interrupts.o := -D_GNU_SOURCE -CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE -CFLAGS_eal_timer.o := -D_GNU_SOURCE -CFLAGS_eal_lcore.o := -D_GNU_SOURCE -CFLAGS_eal_memalloc.o := -D_GNU_SOURCE -CFLAGS_eal_thread.o := -D_GNU_SOURCE -CFLAGS_eal_log.o := -D_GNU_SOURCE -CFLAGS_eal_common_log.o := -D_GNU_SOURCE -CFLAGS_eal_hugepage_info.o := -D_GNU_SOURCE -CFLAGS_eal_common_whitelist.o := -D_GNU_SOURCE -CFLAGS_eal_common_options.o := -D_GNU_SOURCE -CFLAGS_eal_common_thread.o := -D_GNU_SOURCE -CFLAGS_eal_common_lcore.o := -D_GNU_SOURCE -CFLAGS_rte_cycles.o := -D_GNU_SOURCE - # workaround for a gcc bug with noreturn attribute # http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c index e59ac657..361744d4 100644 --- a/lib/librte_eal/linuxapp/eal/eal.c +++ b/lib/librte_eal/linuxapp/eal/eal.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "eal_private.h" #include "eal_thread.h" @@ -149,7 +150,7 @@ eal_create_runtime_dir(void) } const char * -eal_get_runtime_dir(void) +rte_eal_get_runtime_dir(void) { return runtime_dir; } @@ -263,6 +264,8 @@ rte_eal_config_create(void) * processes could later map the config into this exact location */ rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr; + 
rte_config.mem_config->dma_maskbits = 0; + } /* attach to an existing shared memory config */ @@ -352,6 +355,24 @@ eal_proc_type_detect(void) return ptype; } +/* copies data from internal config to shared config */ +static void +eal_update_mem_config(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + mcfg->legacy_mem = internal_config.legacy_mem; + mcfg->single_file_segments = internal_config.single_file_segments; +} + +/* copies data from shared config to internal config */ +static void +eal_update_internal_config(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + internal_config.legacy_mem = mcfg->legacy_mem; + internal_config.single_file_segments = mcfg->single_file_segments; +} + /* Sets up rte_config structure with the pointer to shared memory config.*/ static void rte_config_init(void) @@ -361,11 +382,13 @@ rte_config_init(void) switch (rte_config.process_type){ case RTE_PROC_PRIMARY: rte_eal_config_create(); + eal_update_mem_config(); break; case RTE_PROC_SECONDARY: rte_eal_config_attach(); rte_eal_mcfg_wait_complete(rte_config.mem_config); rte_eal_config_reattach(); + eal_update_internal_config(); break; case RTE_PROC_AUTO: case RTE_PROC_INVALID: @@ -580,12 +603,20 @@ eal_parse_args(int argc, char **argv) argvopt = argv; optind = 1; + opterr = 0; while ((opt = getopt_long(argc, argvopt, eal_short_options, eal_long_options, &option_index)) != EOF) { - /* getopt is not happy, stop right now */ + /* + * getopt didn't recognise the option, lets parse the + * registered options to see if the flag is valid + */ if (opt == '?') { + ret = rte_option_parse(argv[optind-1]); + if (ret == 0) + continue; + eal_usage(prgname); ret = -1; goto out; @@ -725,6 +756,9 @@ check_socket(const struct rte_memseg_list *msl, void *arg) { int *socket_id = arg; + if (msl->external) + return 0; + return *socket_id == msl->socket_id; } @@ -793,7 +827,8 @@ rte_eal_init(int argc, char **argv) int i, fctret, ret; pthread_t thread_id; static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); - const char *logid; + const char *p; + static char logid[PATH_MAX]; char cpuset[RTE_CPU_AFFINITY_STR_LEN]; char thread_name[RTE_MAX_THREAD_NAME_LEN]; @@ -810,9 +845,8 @@ rte_eal_init(int argc, char **argv) return -1; } - logid = strrchr(argv[0], '/'); - logid = strdup(logid ? logid + 1: argv[0]); - + p = strrchr(argv[0], '/'); + strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid)); thread_id = pthread_self(); eal_reset_internal_config(&internal_config); @@ -835,7 +869,7 @@ rte_eal_init(int argc, char **argv) } if (eal_plugins_init() < 0) { - rte_eal_init_alert("Cannot init plugins\n"); + rte_eal_init_alert("Cannot init plugins"); rte_errno = EINVAL; rte_atomic32_clear(&run_once); return -1; @@ -850,7 +884,7 @@ rte_eal_init(int argc, char **argv) rte_config_init(); if (rte_eal_intr_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + rte_eal_init_alert("Cannot init interrupt-handling thread"); return -1; } @@ -858,30 +892,43 @@ rte_eal_init(int argc, char **argv) * bus through mp channel in the secondary process before the bus scan. 
*/ if (rte_mp_channel_init() < 0) { - rte_eal_init_alert("failed to init mp channel\n"); + rte_eal_init_alert("failed to init mp channel"); if (rte_eal_process_type() == RTE_PROC_PRIMARY) { rte_errno = EFAULT; return -1; } } + /* register multi-process action callbacks for hotplug */ + if (rte_mp_dev_hotplug_init() < 0) { + rte_eal_init_alert("failed to register mp callback for hotplug"); + return -1; + } + if (rte_bus_scan()) { - rte_eal_init_alert("Cannot scan the buses for devices\n"); + rte_eal_init_alert("Cannot scan the buses for devices"); rte_errno = ENODEV; rte_atomic32_clear(&run_once); return -1; } - /* autodetect the iova mapping mode (default is iova_pa) */ - rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class(); - - /* Workaround for KNI which requires physical address to work */ - if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA && - rte_eal_check_module("rte_kni") == 1) { - rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA; - RTE_LOG(WARNING, EAL, - "Some devices want IOVA as VA but PA will be used because.. " - "KNI module inserted\n"); + /* if no EAL option "--iova-mode=", use bus IOVA scheme */ + if (internal_config.iova_mode == RTE_IOVA_DC) { + /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */ + rte_eal_get_configuration()->iova_mode = + rte_bus_get_iommu_class(); + + /* Workaround for KNI which requires physical address to work */ + if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA && + rte_eal_check_module("rte_kni") == 1) { + rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA; + RTE_LOG(WARNING, EAL, + "Some devices want IOVA as VA but PA will be used because.. " + "KNI module inserted\n"); + } + } else { + rte_eal_get_configuration()->iova_mode = + internal_config.iova_mode; } if (internal_config.no_hugetlbfs == 0) { @@ -924,7 +971,7 @@ rte_eal_init(int argc, char **argv) #ifdef VFIO_PRESENT if (rte_eal_vfio_setup() < 0) { - rte_eal_init_alert("Cannot init VFIO\n"); + rte_eal_init_alert("Cannot init VFIO"); rte_errno = EAGAIN; rte_atomic32_clear(&run_once); return -1; @@ -935,13 +982,13 @@ rte_eal_init(int argc, char **argv) * initialize memzones first. */ if (rte_eal_memzone_init() < 0) { - rte_eal_init_alert("Cannot init memzone\n"); + rte_eal_init_alert("Cannot init memzone"); rte_errno = ENODEV; return -1; } if (rte_eal_memory_init() < 0) { - rte_eal_init_alert("Cannot init memory\n"); + rte_eal_init_alert("Cannot init memory"); rte_errno = ENOMEM; return -1; } @@ -950,25 +997,25 @@ rte_eal_init(int argc, char **argv) eal_hugedirs_unlock(); if (rte_eal_malloc_heap_init() < 0) { - rte_eal_init_alert("Cannot init malloc heap\n"); + rte_eal_init_alert("Cannot init malloc heap"); rte_errno = ENODEV; return -1; } if (rte_eal_tailqs_init() < 0) { - rte_eal_init_alert("Cannot init tail queues for objects\n"); + rte_eal_init_alert("Cannot init tail queues for objects"); rte_errno = EFAULT; return -1; } if (rte_eal_alarm_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + rte_eal_init_alert("Cannot init interrupt-handling thread"); /* rte_eal_alarm_init sets rte_errno on failure. 
*/ return -1; } if (rte_eal_timer_init() < 0) { - rte_eal_init_alert("Cannot init HPET or TSC timers\n"); + rte_eal_init_alert("Cannot init HPET or TSC timers"); rte_errno = ENOTSUP; return -1; } @@ -979,8 +1026,8 @@ rte_eal_init(int argc, char **argv) ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); - RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n", - rte_config.master_lcore, (int)thread_id, cpuset, + RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", + rte_config.master_lcore, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "..."); RTE_LCORE_FOREACH_SLAVE(i) { @@ -1022,14 +1069,14 @@ rte_eal_init(int argc, char **argv) /* initialize services so vdevs register service during bus_probe. */ ret = rte_service_init(); if (ret) { - rte_eal_init_alert("rte_service_init() failed\n"); + rte_eal_init_alert("rte_service_init() failed"); rte_errno = ENOEXEC; return -1; } /* Probe all the buses and devices/drivers on them */ if (rte_bus_probe()) { - rte_eal_init_alert("Cannot probe devices\n"); + rte_eal_init_alert("Cannot probe devices"); rte_errno = ENOTSUP; return -1; } @@ -1051,6 +1098,9 @@ rte_eal_init(int argc, char **argv) rte_eal_mcfg_complete(); + /* Call each registered callback, if enabled */ + rte_option_init(); + return fctret; } @@ -1059,7 +1109,12 @@ mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms, void *arg __rte_unused) { /* ms is const, so find this memseg */ - struct rte_memseg *found = rte_mem_virt2memseg(ms->addr, msl); + struct rte_memseg *found; + + if (msl->external) + return 0; + + found = rte_mem_virt2memseg(ms->addr, msl); found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE; diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c b/lib/librte_eal/linuxapp/eal/eal_dev.c index 1cf6aebf..d589c692 100644 --- a/lib/librte_eal/linuxapp/eal/eal_dev.c +++ b/lib/librte_eal/linuxapp/eal/eal_dev.c @@ -4,6 +4,8 @@ #include #include +#include +#include #include #include @@ -14,15 +16,32 @@ #include #include #include +#include +#include +#include +#include #include "eal_private.h" static struct rte_intr_handle intr_handle = {.fd = -1 }; static bool monitor_started; +static bool hotplug_handle; #define EAL_UEV_MSG_LEN 4096 #define EAL_UEV_MSG_ELEM_LEN 128 +/* + * spinlock for device hot-unplug failure handling. If it try to access bus or + * device, such as handle sigbus on bus or handle memory failure for device + * just need to use this lock. It could protect the bus and the device to avoid + * race condition. + */ +static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER; + +static struct sigaction sigbus_action_old; + +static int sigbus_need_recover; + static void dev_uev_handler(__rte_unused void *param); /* identify the system layer which reports this event. 
*/ @@ -33,6 +52,55 @@ enum eal_dev_event_subsystem { EAL_DEV_EVENT_SUBSYSTEM_MAX }; +static void +sigbus_action_recover(void) +{ + if (sigbus_need_recover) { + sigaction(SIGBUS, &sigbus_action_old, NULL); + sigbus_need_recover = 0; + } +} + +static void sigbus_handler(int signum, siginfo_t *info, + void *ctx __rte_unused) +{ + int ret; + + RTE_LOG(DEBUG, EAL, "Thread[%d] catch SIGBUS, fault address:%p\n", + (int)pthread_self(), info->si_addr); + + rte_spinlock_lock(&failure_handle_lock); + ret = rte_bus_sigbus_handler(info->si_addr); + rte_spinlock_unlock(&failure_handle_lock); + if (ret == -1) { + rte_exit(EXIT_FAILURE, + "Failed to handle SIGBUS for hot-unplug, " + "(rte_errno: %s)!", strerror(rte_errno)); + } else if (ret == 1) { + if (sigbus_action_old.sa_flags == SA_SIGINFO + && sigbus_action_old.sa_sigaction) { + (*(sigbus_action_old.sa_sigaction))(signum, + info, ctx); + } else if (sigbus_action_old.sa_flags != SA_SIGINFO + && sigbus_action_old.sa_handler) { + (*(sigbus_action_old.sa_handler))(signum); + } else { + rte_exit(EXIT_FAILURE, + "Failed to handle generic SIGBUS!"); + } + } + + RTE_LOG(DEBUG, EAL, "Success to handle SIGBUS for hot-unplug!\n"); +} + +static int cmp_dev_name(const struct rte_device *dev, + const void *_name) +{ + const char *name = _name; + + return strcmp(dev->name, name); +} + static int dev_uev_socket_fd_create(void) { @@ -147,6 +215,9 @@ dev_uev_handler(__rte_unused void *param) struct rte_dev_event uevent; int ret; char buf[EAL_UEV_MSG_LEN]; + struct rte_bus *bus; + struct rte_device *dev; + const char *busname = ""; memset(&uevent, 0, sizeof(struct rte_dev_event)); memset(buf, 0, EAL_UEV_MSG_LEN); @@ -171,8 +242,43 @@ dev_uev_handler(__rte_unused void *param) RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n", uevent.devname, uevent.type, uevent.subsystem); - if (uevent.devname) - dev_callback_process(uevent.devname, uevent.type); + switch (uevent.subsystem) { + case EAL_DEV_EVENT_SUBSYSTEM_PCI: + case EAL_DEV_EVENT_SUBSYSTEM_UIO: + busname = "pci"; + break; + default: + break; + } + + if (uevent.devname) { + if (uevent.type == RTE_DEV_EVENT_REMOVE && hotplug_handle) { + rte_spinlock_lock(&failure_handle_lock); + bus = rte_bus_find_by_name(busname); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", + busname); + return; + } + + dev = bus->find_device(NULL, cmp_dev_name, + uevent.devname); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find device (%s) on " + "bus (%s)\n", uevent.devname, busname); + return; + } + + ret = bus->hot_unplug_handler(dev); + rte_spinlock_unlock(&failure_handle_lock); + if (ret) { + RTE_LOG(ERR, EAL, "Can not handle hot-unplug " + "for device (%s)\n", dev->name); + return; + } + } + rte_dev_event_callback_process(uevent.devname, uevent.type); + } } int __rte_experimental @@ -220,5 +326,67 @@ rte_dev_event_monitor_stop(void) close(intr_handle.fd); intr_handle.fd = -1; monitor_started = false; + return 0; } + +int +dev_sigbus_handler_register(void) +{ + sigset_t mask; + struct sigaction action; + + rte_errno = 0; + + if (sigbus_need_recover) + return 0; + + sigemptyset(&mask); + sigaddset(&mask, SIGBUS); + action.sa_flags = SA_SIGINFO; + action.sa_mask = mask; + action.sa_sigaction = sigbus_handler; + sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old); + + return rte_errno; +} + +int +dev_sigbus_handler_unregister(void) +{ + rte_errno = 0; + + sigbus_action_recover(); + + return rte_errno; +} + +int __rte_experimental +rte_dev_hotplug_handle_enable(void) +{ + int ret = 
0; + + ret = dev_sigbus_handler_register(); + if (ret < 0) + RTE_LOG(ERR, EAL, + "fail to register sigbus handler for devices.\n"); + + hotplug_handle = true; + + return ret; +} + +int __rte_experimental +rte_dev_hotplug_handle_disable(void) +{ + int ret = 0; + + ret = dev_sigbus_handler_unregister(); + if (ret < 0) + RTE_LOG(ERR, EAL, + "fail to unregister sigbus handler for devices.\n"); + + hotplug_handle = false; + + return ret; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c index 3a7d4b22..0eab1cf7 100644 --- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c +++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c index 4076c6d6..39252a88 100644 --- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c +++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "eal_private.h" #include "eal_vfio.h" @@ -308,6 +309,66 @@ vfio_disable_msix(const struct rte_intr_handle *intr_handle) { return ret; } + +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE +/* enable req notifier */ +static int +vfio_enable_req(const struct rte_intr_handle *intr_handle) +{ + int len, ret; + char irq_set_buf[IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | + VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +/* disable req notifier */ +static int +vfio_disable_req(const struct rte_intr_handle *intr_handle) +{ + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n", + intr_handle->fd); + + return ret; +} +#endif #endif static int @@ -556,6 +617,12 @@ rte_intr_enable(const struct rte_intr_handle *intr_handle) if (vfio_enable_intx(intr_handle)) return -1; break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + if (vfio_enable_req(intr_handle)) + return -1; + break; +#endif #endif /* not used at this moment */ case RTE_INTR_HANDLE_DEV_EVENT: @@ -606,6 +673,12 @@ rte_intr_disable(const struct rte_intr_handle *intr_handle) if (vfio_disable_intx(intr_handle)) return -1; break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + if (vfio_disable_req(intr_handle)) + return -1; + break; +#endif #endif /* not used at this moment */ case RTE_INTR_HANDLE_DEV_EVENT: @@ -672,6 +745,12 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds) case RTE_INTR_HANDLE_VFIO_LEGACY: bytes_read = sizeof(buf.vfio_intr_count); break; +#ifdef 
HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + bytes_read = 0; + call = true; + break; +#endif #endif case RTE_INTR_HANDLE_VDEV: case RTE_INTR_HANDLE_EXT: diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c index aa95551a..48b9c736 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c +++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -51,31 +52,56 @@ const int anonymous_hugepages_supported = #define RTE_MAP_HUGE_SHIFT 26 #endif +/* + * we don't actually care if memfd itself is supported - we only need to check + * if memfd supports hugetlbfs, as that already implies memfd support. + * + * also, this is not a constant, because while we may be *compiled* with memfd + * hugetlbfs support, we might not be *running* on a system that supports memfd + * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at + * runtime, and fall back to anonymous memory. + */ +static int memfd_create_supported = +#ifdef MFD_HUGETLB +#define MEMFD_SUPPORTED + 1; +#else + 0; +#endif + /* * not all kernel version support fallocate on hugetlbfs, so fall back to * ftruncate and disallow deallocation if fallocate is not supported. */ static int fallocate_supported = -1; /* unknown */ -/* for single-file segments, we need some kind of mechanism to keep track of +/* + * we have two modes - single file segments, and file-per-page mode. + * + * for single-file segments, we need some kind of mechanism to keep track of * which hugepages can be freed back to the system, and which cannot. we cannot * use flock() because they don't allow locking parts of a file, and we cannot * use fcntl() due to issues with their semantics, so we will have to rely on a - * bunch of lockfiles for each page. + * bunch of lockfiles for each page. so, we will use 'fds' array to keep track + * of per-page lockfiles. we will store the actual segment list fd in the + * 'memseg_list_fd' field. + * + * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd' + * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's. * * we cannot know how many pages a system will have in advance, but we do know * that they come in lists, and we know lengths of these lists. so, simply store * a malloc'd array of fd's indexed by list and segment index. * * they will be initialized at startup, and filled as we allocate/deallocate - * segments. also, use this to track memseg list proper fd. + * segments. 
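 * In other words (sketch only, mirroring eal_memalloc_get_seg_fd()
 * further down), looking up the fd backing a given segment becomes:
 *
 *     fd = internal_config.single_file_segments ?
 *             fd_list[list_idx].memseg_list_fd :
 *             fd_list[list_idx].fds[seg_idx];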
*/ static struct { int *fds; /**< dynamically allocated array of segment lock fd's */ int memseg_list_fd; /**< memseg list fd */ int len; /**< total length of the array */ int count; /**< entries used in an array */ -} lock_fds[RTE_MAX_MEMSEG_LISTS]; +} fd_list[RTE_MAX_MEMSEG_LISTS]; /** local copy of a memory map, used to synchronize memory hotplug in MP */ static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS]; @@ -182,6 +208,31 @@ get_file_size(int fd) return st.st_size; } +static inline uint32_t +bsf64(uint64_t v) +{ + return (uint32_t)__builtin_ctzll(v); +} + +static inline uint32_t +log2_u64(uint64_t v) +{ + if (v == 0) + return 0; + v = rte_align64pow2(v); + return bsf64(v); +} + +static int +pagesz_flags(uint64_t page_sz) +{ + /* as per mmap() manpage, all page sizes are log2 of page size + * shifted by MAP_HUGE_SHIFT + */ + int log2 = log2_u64(page_sz); + return log2 << RTE_MAP_HUGE_SHIFT; +} + /* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */ static int lock(int fd, int type) { @@ -209,12 +260,12 @@ static int get_segment_lock_fd(int list_idx, int seg_idx) char path[PATH_MAX] = {0}; int fd; - if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds)) + if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list)) return -1; - if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len) + if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len) return -1; - fd = lock_fds[list_idx].fds[seg_idx]; + fd = fd_list[list_idx].fds[seg_idx]; /* does this lock already exist? */ if (fd >= 0) return fd; @@ -236,8 +287,8 @@ static int get_segment_lock_fd(int list_idx, int seg_idx) return -1; } /* store it for future reference */ - lock_fds[list_idx].fds[seg_idx] = fd; - lock_fds[list_idx].count++; + fd_list[list_idx].fds[seg_idx] = fd; + fd_list[list_idx].count++; return fd; } @@ -245,12 +296,12 @@ static int unlock_segment(int list_idx, int seg_idx) { int fd, ret; - if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds)) + if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list)) return -1; - if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len) + if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len) return -1; - fd = lock_fds[list_idx].fds[seg_idx]; + fd = fd_list[list_idx].fds[seg_idx]; /* upgrade lock to exclusive to see if we can remove the lockfile */ ret = lock(fd, LOCK_EX); @@ -270,25 +321,77 @@ static int unlock_segment(int list_idx, int seg_idx) * and remove it from list anyway. 
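 * (For context, a rough sketch of the intended teardown, assuming lock()
 *  is the non-blocking flock()-based helper used throughout this file;
 *  "lockfile_path" is a hypothetical name for the per-segment lock file:)
 *
 *     if (lock(fd, LOCK_EX) == 1)          /* nobody else holds it */
 *         unlink(lockfile_path);
 *     close(fd);                           /* drops our lock either way */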
*/ close(fd); - lock_fds[list_idx].fds[seg_idx] = -1; - lock_fds[list_idx].count--; + fd_list[list_idx].fds[seg_idx] = -1; + fd_list[list_idx].count--; if (ret < 0) return -1; return 0; } +static int +get_seg_memfd(struct hugepage_info *hi __rte_unused, + unsigned int list_idx __rte_unused, + unsigned int seg_idx __rte_unused) +{ +#ifdef MEMFD_SUPPORTED + int fd; + char segname[250]; /* as per manpage, limit is 249 bytes plus null */ + + if (internal_config.single_file_segments) { + fd = fd_list[list_idx].memseg_list_fd; + + if (fd < 0) { + int flags = MFD_HUGETLB | pagesz_flags(hi->hugepage_sz); + + snprintf(segname, sizeof(segname), "seg_%i", list_idx); + fd = memfd_create(segname, flags); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", + __func__, strerror(errno)); + return -1; + } + fd_list[list_idx].memseg_list_fd = fd; + } + } else { + fd = fd_list[list_idx].fds[seg_idx]; + + if (fd < 0) { + int flags = MFD_HUGETLB | pagesz_flags(hi->hugepage_sz); + + snprintf(segname, sizeof(segname), "seg_%i-%i", + list_idx, seg_idx); + fd = memfd_create(segname, flags); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", + __func__, strerror(errno)); + return -1; + } + fd_list[list_idx].fds[seg_idx] = fd; + } + } + return fd; +#endif + return -1; +} + static int get_seg_fd(char *path, int buflen, struct hugepage_info *hi, unsigned int list_idx, unsigned int seg_idx) { int fd; + /* for in-memory mode, we only make it here when we're sure we support + * memfd, and this is a special case. + */ + if (internal_config.in_memory) + return get_seg_memfd(hi, list_idx, seg_idx); + if (internal_config.single_file_segments) { /* create a hugepage file path */ eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx); - fd = lock_fds[list_idx].memseg_list_fd; + fd = fd_list[list_idx].memseg_list_fd; if (fd < 0) { fd = open(path, O_CREAT | O_RDWR, 0600); @@ -304,24 +407,30 @@ get_seg_fd(char *path, int buflen, struct hugepage_info *hi, close(fd); return -1; } - lock_fds[list_idx].memseg_list_fd = fd; + fd_list[list_idx].memseg_list_fd = fd; } } else { /* create a hugepage file path */ eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); - fd = open(path, O_CREAT | O_RDWR, 0600); + + fd = fd_list[list_idx].fds[seg_idx]; + if (fd < 0) { - RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, - strerror(errno)); - return -1; - } - /* take out a read lock */ - if (lock(fd, LOCK_SH) < 0) { - RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", - __func__, strerror(errno)); - close(fd); - return -1; + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", + __func__, strerror(errno)); + return -1; + } + /* take out a read lock */ + if (lock(fd, LOCK_SH) < 0) { + RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", + __func__, strerror(errno)); + close(fd); + return -1; + } + fd_list[list_idx].fds[seg_idx] = fd; } } return fd; @@ -332,6 +441,33 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx, uint64_t fa_offset, uint64_t page_sz, bool grow) { bool again = false; + + /* in-memory mode is a special case, because we don't need to perform + * any locking, and we can be sure that fallocate() is supported. + */ + if (internal_config.in_memory) { + int flags = grow ? 
0 : FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_KEEP_SIZE; + int ret; + + /* grow or shrink the file */ + ret = fallocate(fd, flags, fa_offset, page_sz); + + if (ret < 0) { + RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", + __func__, + strerror(errno)); + return -1; + } + /* increase/decrease total segment count */ + fd_list[list_idx].count += (grow ? 1 : -1); + if (!grow && fd_list[list_idx].count == 0) { + close(fd_list[list_idx].memseg_list_fd); + fd_list[list_idx].memseg_list_fd = -1; + } + return 0; + } + do { if (fallocate_supported == 0) { /* we cannot deallocate memory if fallocate() is not @@ -410,9 +546,9 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx, * page file fd, so that one of the processes * could then delete the file after shrinking. */ - if (ret < 1 && lock_fds[list_idx].count == 0) { + if (ret < 1 && fd_list[list_idx].count == 0) { close(fd); - lock_fds[list_idx].memseg_list_fd = -1; + fd_list[list_idx].memseg_list_fd = -1; } if (ret < 0) { @@ -448,13 +584,13 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx, * more segments active in this segment list, * and remove the file if there aren't. */ - if (lock_fds[list_idx].count == 0) { + if (fd_list[list_idx].count == 0) { if (unlink(path)) RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n", __func__, path, strerror(errno)); close(fd); - lock_fds[list_idx].memseg_list_fd = -1; + fd_list[list_idx].memseg_list_fd = -1; } } } @@ -481,26 +617,34 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, void *new_addr; alloc_sz = hi->hugepage_sz; - if (!internal_config.single_file_segments && - internal_config.in_memory && - anonymous_hugepages_supported) { - int log2, flags; - - log2 = rte_log2_u32(alloc_sz); - /* as per mmap() manpage, all page sizes are log2 of page size - * shifted by MAP_HUGE_SHIFT - */ - flags = (log2 << RTE_MAP_HUGE_SHIFT) | MAP_HUGETLB | MAP_FIXED | + + /* these are checked at init, but code analyzers don't know that */ + if (internal_config.in_memory && !anonymous_hugepages_supported) { + RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n"); + return -1; + } + if (internal_config.in_memory && !memfd_create_supported && + internal_config.single_file_segments) { + RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n"); + return -1; + } + + /* in-memory without memfd is a special case */ + int mmap_flags; + + if (internal_config.in_memory && !memfd_create_supported) { + int pagesz_flag, flags; + + pagesz_flag = pagesz_flags(alloc_sz); + flags = pagesz_flag | MAP_HUGETLB | MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS; fd = -1; - va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, flags, -1, 0); - - /* single-file segments codepath will never be active because - * in-memory mode is incompatible with it and it's stopped at - * EAL initialization stage, however the compiler doesn't know - * that and complains about map_offset being used uninitialized - * on failure codepaths while having in-memory mode enabled. so, - * assign a value here. + mmap_flags = flags; + + /* single-file segments codepath will never be active + * here because in-memory mode is incompatible with the + * fallback path, and it's stopped at EAL initialization + * stage. 
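 * As a concrete illustration of the flags computed above: for 2 MB pages
 * pagesz_flags() returns log2(2 MB) << MAP_HUGE_SHIFT, i.e. 21 << 26,
 * which is the value of MAP_HUGE_2MB, so the anonymous fallback mapping
 * is effectively (sketch):
 *
 *     va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
 *               MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
 *               MAP_HUGE_2MB | MAP_FIXED, -1, 0);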
*/ map_offset = 0; } else { @@ -524,7 +668,8 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, __func__, strerror(errno)); goto resized; } - if (internal_config.hugepage_unlink) { + if (internal_config.hugepage_unlink && + !internal_config.in_memory) { if (unlink(path)) { RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n", __func__, strerror(errno)); @@ -532,16 +677,16 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, } } } - - /* - * map the segment, and populate page tables, the kernel fills - * this segment with zeros if it's a new page. - */ - va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, - map_offset); + mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED; } + /* + * map the segment, and populate page tables, the kernel fills + * this segment with zeros if it's a new page. + */ + va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd, + map_offset); + if (va == MAP_FAILED) { RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__, strerror(errno)); @@ -593,10 +738,6 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, goto mapped; } #endif - /* for non-single file segments that aren't in-memory, we can close fd - * here */ - if (!internal_config.single_file_segments && !internal_config.in_memory) - close(fd); ms->addr = addr; ms->hugepage_sz = alloc_sz; @@ -626,7 +767,10 @@ unmapped: RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n"); } resized: - /* in-memory mode will never be single-file-segments mode */ + /* some codepaths will return negative fd, so exit early */ + if (fd < 0) + return -1; + if (internal_config.single_file_segments) { resize_hugefile(fd, path, list_idx, seg_idx, map_offset, alloc_sz, false); @@ -638,6 +782,7 @@ resized: lock(fd, LOCK_EX) == 1) unlink(path); close(fd); + fd_list[list_idx].fds[seg_idx] = -1; } return -1; } @@ -648,7 +793,8 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi, { uint64_t map_offset; char path[PATH_MAX]; - int fd, ret; + int fd, ret = 0; + bool exit_early; /* erase page data */ memset(ms->addr, 0, ms->len); @@ -660,8 +806,17 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi, return -1; } + exit_early = false; + + /* if we're using anonymous hugepages, nothing to be done */ + if (internal_config.in_memory && !memfd_create_supported) + exit_early = true; + /* if we've already unlinked the page, nothing needs to be done */ - if (internal_config.hugepage_unlink) { + if (!internal_config.in_memory && internal_config.hugepage_unlink) + exit_early = true; + + if (exit_early) { memset(ms, 0, sizeof(*ms)); return 0; } @@ -684,14 +839,17 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi, /* if we're able to take out a write lock, we're the last one * holding onto this page. 
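 * (Sketch of what that check boils down to, assuming the lock() helper in
 *  this file keeps wrapping non-blocking flock() as before; a return of 1
 *  means the exclusive lock was obtained:)
 *
 *     if (flock(fd, LOCK_EX | LOCK_NB) == 0)
 *         unlink(path);   /* no other process still maps this page */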
*/ - ret = lock(fd, LOCK_EX); - if (ret >= 0) { - /* no one else is using this page */ - if (ret == 1) - unlink(path); + if (!internal_config.in_memory) { + ret = lock(fd, LOCK_EX); + if (ret >= 0) { + /* no one else is using this page */ + if (ret == 1) + unlink(path); + } } /* closing fd will drop the lock */ close(fd); + fd_list[list_idx].fds[seg_idx] = -1; } memset(ms, 0, sizeof(*ms)); @@ -828,7 +986,7 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg) int msl_idx, seg_idx, ret, dir_fd = -1; start_addr = (uintptr_t) msl->base_va; - end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz; + end_addr = start_addr + msl->len; if ((uintptr_t)wa->ms->addr < start_addr || (uintptr_t)wa->ms->addr >= end_addr) @@ -1250,6 +1408,9 @@ sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused) unsigned int i; int msl_idx; + if (msl->external) + return 0; + msl_idx = msl - mcfg->memsegs; primary_msl = &mcfg->memsegs[msl_idx]; local_msl = &local_memsegs[msl_idx]; @@ -1298,6 +1459,9 @@ secondary_msl_create_walk(const struct rte_memseg_list *msl, char name[PATH_MAX]; int msl_idx, ret; + if (msl->external) + return 0; + msl_idx = msl - mcfg->memsegs; primary_msl = &mcfg->memsegs[msl_idx]; local_msl = &local_memsegs[msl_idx]; @@ -1314,50 +1478,176 @@ secondary_msl_create_walk(const struct rte_memseg_list *msl, return -1; } local_msl->base_va = primary_msl->base_va; + local_msl->len = primary_msl->len; return 0; } static int -secondary_lock_list_create_walk(const struct rte_memseg_list *msl, - void *arg __rte_unused) +alloc_list(int list_idx, int len) { - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - unsigned int i, len; - int msl_idx; int *data; + int i; - msl_idx = msl - mcfg->memsegs; - len = msl->memseg_arr.len; - - /* ensure we have space to store lock fd per each possible segment */ + /* ensure we have space to store fd per each possible segment */ data = malloc(sizeof(int) * len); if (data == NULL) { - RTE_LOG(ERR, EAL, "Unable to allocate space for lock descriptors\n"); + RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n"); return -1; } /* set all fd's as invalid */ for (i = 0; i < len; i++) data[i] = -1; - lock_fds[msl_idx].fds = data; - lock_fds[msl_idx].len = len; - lock_fds[msl_idx].count = 0; - lock_fds[msl_idx].memseg_list_fd = -1; + fd_list[list_idx].fds = data; + fd_list[list_idx].len = len; + fd_list[list_idx].count = 0; + fd_list[list_idx].memseg_list_fd = -1; + + return 0; +} + +static int +fd_list_create_walk(const struct rte_memseg_list *msl, + void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int len; + int msl_idx; + + if (msl->external) + return 0; + + msl_idx = msl - mcfg->memsegs; + len = msl->memseg_arr.len; + + return alloc_list(msl_idx, len); +} + +int +eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + /* if list is not allocated, allocate it */ + if (fd_list[list_idx].len == 0) { + int len = mcfg->memsegs[list_idx].memseg_arr.len; + + if (alloc_list(list_idx, len) < 0) + return -ENOMEM; + } + fd_list[list_idx].fds[seg_idx] = fd; return 0; } +int +eal_memalloc_get_seg_fd(int list_idx, int seg_idx) +{ + int fd; + if (internal_config.single_file_segments) { + fd = fd_list[list_idx].memseg_list_fd; + } else if (fd_list[list_idx].len == 0) { + /* list not initialized */ + fd = -1; + } else { + fd = fd_list[list_idx].fds[seg_idx]; + } + if (fd < 0) + return 
-ENODEV; + return fd; +} + +static int +test_memfd_create(void) +{ +#ifdef MEMFD_SUPPORTED + unsigned int i; + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz; + int pagesz_flag = pagesz_flags(pagesz); + int flags; + + flags = pagesz_flag | MFD_HUGETLB; + int fd = memfd_create("test", flags); + if (fd < 0) { + /* we failed - let memalloc know this isn't working */ + if (errno == EINVAL) { + memfd_create_supported = 0; + return 0; /* not supported */ + } + + /* we got other error - something's wrong */ + return -1; /* error */ + } + close(fd); + return 1; /* supported */ + } +#endif + return 0; /* not supported */ +} + +int +eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + /* fd_list not initialized? */ + if (fd_list[list_idx].len == 0) + return -ENODEV; + if (internal_config.single_file_segments) { + size_t pgsz = mcfg->memsegs[list_idx].page_sz; + + /* segment not active? */ + if (fd_list[list_idx].memseg_list_fd < 0) + return -ENOENT; + *offset = pgsz * seg_idx; + } else { + /* segment not active? */ + if (fd_list[list_idx].fds[seg_idx] < 0) + return -ENOENT; + *offset = 0; + } + return 0; +} + int eal_memalloc_init(void) { if (rte_eal_process_type() == RTE_PROC_SECONDARY) if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0) return -1; + if (rte_eal_process_type() == RTE_PROC_PRIMARY && + internal_config.in_memory) { + int mfd_res = test_memfd_create(); - /* initialize all of the lock fd lists */ - if (internal_config.single_file_segments) - if (rte_memseg_list_walk(secondary_lock_list_create_walk, NULL)) + if (mfd_res < 0) { + RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n"); + return -1; + } + if (mfd_res == 1) + RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n"); + else + RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n"); + + /* we only support single-file segments mode with in-memory mode + * if we support hugetlbfs with memfd_create. this code will + * test if we do. 
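 * The probe amounts to roughly the following (sketch; MFD_HUGETLB needs
 * Linux 4.14+ and matching headers, and the real code derives the
 * page-size flag from each detected hugepage size rather than hardcoding
 * 2 MB):
 *
 *     int fd = memfd_create("test", MFD_HUGETLB | MFD_HUGE_2MB);
 *     if (fd < 0 && errno == EINVAL)
 *         memfd_create_supported = 0;   /* fall back to anon hugepages */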
+ */ + if (internal_config.single_file_segments && + mfd_res != 1) { + RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n"); return -1; + } + /* this cannot ever happen but better safe than sorry */ + if (!anonymous_hugepages_supported) { + RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n"); + return -1; + } + } + + /* initialize all of the fd lists */ + if (rte_memseg_list_walk(fd_list_create_walk, NULL)) + return -1; return 0; } diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c index dbf19499..fce86fda 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memory.c +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c @@ -5,6 +5,7 @@ #define _FILE_OFFSET_BITS 64 #include +#include #include #include #include @@ -17,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -263,7 +265,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, int node_id = -1; int essential_prev = 0; int oldpolicy; - struct bitmask *oldmask = numa_allocate_nodemask(); + struct bitmask *oldmask = NULL; bool have_numa = true; unsigned long maxnode = 0; @@ -275,6 +277,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, if (have_numa) { RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); + oldmask = numa_allocate_nodemask(); if (get_mempolicy(&oldpolicy, oldmask->maskp, oldmask->size + 1, 0, 0) < 0) { RTE_LOG(ERR, EAL, @@ -402,7 +405,8 @@ out: numa_set_localalloc(); } } - numa_free_cpumask(oldmask); + if (oldmask != NULL) + numa_free_cpumask(oldmask); #endif return i; } @@ -584,7 +588,7 @@ unlink_hugepage_files(struct hugepage_file *hugepg_tbl, for (page = 0; page < nrpages; page++) { struct hugepage_file *hp = &hugepg_tbl[page]; - if (hp->final_va != NULL && unlink(hp->filepath)) { + if (hp->orig_va != NULL && unlink(hp->filepath)) { RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n", __func__, hp->filepath, strerror(errno)); } @@ -771,7 +775,10 @@ remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end) rte_fbarray_set_used(arr, ms_idx); - close(fd); + /* store segment fd internally */ + if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) + RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", + rte_strerror(rte_errno)); } RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n", (seg_len * page_sz) >> 20, socket_id); @@ -857,6 +864,7 @@ alloc_va_space(struct rte_memseg_list *msl) return -1; } msl->base_va = addr; + msl->len = mem_sz; return 0; } @@ -1365,6 +1373,7 @@ eal_legacy_hugepage_init(void) msl->base_va = addr; msl->page_sz = page_sz; msl->socket_id = 0; + msl->len = internal_config.memory; /* populate memsegs. 
each memseg is one page long */ for (cur_seg = 0; cur_seg < n_segs; cur_seg++) { @@ -1611,7 +1620,7 @@ eal_legacy_hugepage_init(void) if (msl->memseg_arr.count > 0) continue; /* this is an unused list, deallocate it */ - mem_sz = (size_t)msl->page_sz * msl->memseg_arr.len; + mem_sz = msl->len; munmap(msl->base_va, mem_sz); msl->base_va = NULL; @@ -1770,6 +1779,7 @@ getFileSize(int fd) static int eal_legacy_hugepage_attach(void) { + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct hugepage_file *hp = NULL; unsigned int num_hp = 0; unsigned int i = 0; @@ -1813,6 +1823,9 @@ eal_legacy_hugepage_attach(void) struct hugepage_file *hf = &hp[i]; size_t map_sz = hf->size; void *map_addr = hf->final_va; + int msl_idx, ms_idx; + struct rte_memseg_list *msl; + struct rte_memseg *ms; /* if size is zero, no more pages left */ if (map_sz == 0) @@ -1830,25 +1843,50 @@ eal_legacy_hugepage_attach(void) if (map_addr == MAP_FAILED) { RTE_LOG(ERR, EAL, "Could not map %s: %s\n", hf->filepath, strerror(errno)); - close(fd); - goto error; + goto fd_error; } /* set shared lock on the file. */ if (flock(fd, LOCK_SH) < 0) { RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n", __func__, strerror(errno)); - close(fd); - goto error; + goto fd_error; } - close(fd); + /* find segment data */ + msl = rte_mem_virt2memseg_list(map_addr); + if (msl == NULL) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n", + __func__); + goto fd_error; + } + ms = rte_mem_virt2memseg(map_addr, msl); + if (ms == NULL) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n", + __func__); + goto fd_error; + } + + msl_idx = msl - mcfg->memsegs; + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + if (ms_idx < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n", + __func__); + goto fd_error; + } + + /* store segment fd internally */ + if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) + RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", + rte_strerror(rte_errno)); } /* unmap the hugepage config file, since we are done using it */ munmap(hp, size); close(fd_hugepage); return 0; +fd_error: + close(fd); error: /* map all segments into memory to make sure we get the addrs */ cur_seg = 0; @@ -2093,18 +2131,65 @@ static int __rte_unused memseg_primary_init(void) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int i, socket_id, hpi_idx, msl_idx = 0; + struct memtype { + uint64_t page_sz; + int socket_id; + } *memtypes = NULL; + int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */ struct rte_memseg_list *msl; - uint64_t max_mem, total_mem; + uint64_t max_mem, max_mem_per_type; + unsigned int max_seglists_per_type; + unsigned int n_memtypes, cur_type; /* no-huge does not need this at all */ if (internal_config.no_hugetlbfs) return 0; - max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; - total_mem = 0; + /* + * figuring out amount of memory we're going to have is a long and very + * involved process. the basic element we're operating with is a memory + * type, defined as a combination of NUMA node ID and page size (so that + * e.g. 2 sockets with 2 page sizes yield 4 memory types in total). + * + * deciding amount of memory going towards each memory type is a + * balancing act between maximum segments per type, maximum memory per + * type, and number of detected NUMA nodes. the goal is to make sure + * each memory type gets at least one memseg list. + * + * the total amount of memory is limited by RTE_MAX_MEM_MB value. 
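 * (Illustrative numbers only; the concrete limits are build-time
 * configuration options.)  On a box with 2 NUMA nodes and both 2 MB and
 * 1 GB hugepages there are 4 memory types; with a hypothetical
 * RTE_MAX_MEM_MB of 512 GB, at most 512 GB of address space is reserved
 * overall, and the per-type and per-list limits described below then
 * split that budget across the 4 types.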
+ * + * the total amount of memory per type is limited by either + * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number + * of detected NUMA nodes. additionally, maximum number of segments per + * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for + * smaller page sizes, it can take hundreds of thousands of segments to + * reach the above specified per-type memory limits. + * + * additionally, each type may have multiple memseg lists associated + * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger + * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones. + * + * the number of memseg lists per type is decided based on the above + * limits, and also taking number of detected NUMA nodes, to make sure + * that we don't run out of memseg lists before we populate all NUMA + * nodes with memory. + * + * we do this in three stages. first, we collect the number of types. + * then, we figure out memory constraints and populate the list of + * would-be memseg lists. then, we go ahead and allocate the memseg + * lists. + */ - /* create memseg lists */ + /* create space for mem types */ + n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count(); + memtypes = calloc(n_memtypes, sizeof(*memtypes)); + if (memtypes == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n"); + return -1; + } + + /* populate mem types */ + cur_type = 0; for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; hpi_idx++) { struct hugepage_info *hpi; @@ -2113,62 +2198,114 @@ memseg_primary_init(void) hpi = &internal_config.hugepage_info[hpi_idx]; hugepage_sz = hpi->hugepage_sz; - for (i = 0; i < (int) rte_socket_count(); i++) { - uint64_t max_type_mem, total_type_mem = 0; - int type_msl_idx, max_segs, total_segs = 0; - - socket_id = rte_socket_id_by_idx(i); + for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) { + int socket_id = rte_socket_id_by_idx(i); #ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES if (socket_id > 0) break; #endif + memtypes[cur_type].page_sz = hugepage_sz; + memtypes[cur_type].socket_id = socket_id; - if (total_mem >= max_mem) - break; - - max_type_mem = RTE_MIN(max_mem - total_mem, - (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20); - max_segs = RTE_MAX_MEMSEG_PER_TYPE; + RTE_LOG(DEBUG, EAL, "Detected memory type: " + "socket_id:%u hugepage_sz:%" PRIu64 "\n", + socket_id, hugepage_sz); + } + } - type_msl_idx = 0; - while (total_type_mem < max_type_mem && - total_segs < max_segs) { - uint64_t cur_max_mem, cur_mem; - unsigned int n_segs; + /* set up limits for types */ + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20, + max_mem / n_memtypes); + /* + * limit maximum number of segment lists per type to ensure there's + * space for memseg lists for all NUMA nodes with all page sizes + */ + max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes; - if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { - RTE_LOG(ERR, EAL, - "No more space in memseg lists, please increase %s\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); - return -1; - } + if (max_seglists_per_type == 0) { + RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + goto out; + } - msl = &mcfg->memsegs[msl_idx++]; + /* go through all mem types and create segment lists */ + msl_idx = 0; + for (cur_type = 0; cur_type < n_memtypes; cur_type++) { + unsigned int cur_seglist, n_seglists, n_segs; + unsigned int max_segs_per_type, max_segs_per_list; + 
struct memtype *type = &memtypes[cur_type]; + uint64_t max_mem_per_list, pagesz; + int socket_id; - cur_max_mem = max_type_mem - total_type_mem; + pagesz = type->page_sz; + socket_id = type->socket_id; - cur_mem = get_mem_amount(hugepage_sz, - cur_max_mem); - n_segs = cur_mem / hugepage_sz; + /* + * we need to create segment lists for this type. we must take + * into account the following things: + * + * 1. total amount of memory we can use for this memory type + * 2. total amount of memory per memseg list allowed + * 3. number of segments needed to fit the amount of memory + * 4. number of segments allowed per type + * 5. number of segments allowed per memseg list + * 6. number of memseg lists we are allowed to take up + */ - if (alloc_memseg_list(msl, hugepage_sz, n_segs, - socket_id, type_msl_idx)) - return -1; + /* calculate how much segments we will need in total */ + max_segs_per_type = max_mem_per_type / pagesz; + /* limit number of segments to maximum allowed per type */ + max_segs_per_type = RTE_MIN(max_segs_per_type, + (unsigned int)RTE_MAX_MEMSEG_PER_TYPE); + /* limit number of segments to maximum allowed per list */ + max_segs_per_list = RTE_MIN(max_segs_per_type, + (unsigned int)RTE_MAX_MEMSEG_PER_LIST); + + /* calculate how much memory we can have per segment list */ + max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz, + (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20); + + /* calculate how many segments each segment list will have */ + n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz); + + /* calculate how many segment lists we can have */ + n_seglists = RTE_MIN(max_segs_per_type / n_segs, + max_mem_per_type / max_mem_per_list); + + /* limit number of segment lists according to our maximum */ + n_seglists = RTE_MIN(n_seglists, max_seglists_per_type); + + RTE_LOG(DEBUG, EAL, "Creating %i segment lists: " + "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n", + n_seglists, n_segs, socket_id, pagesz); + + /* create all segment lists */ + for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) { + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + goto out; + } + msl = &mcfg->memsegs[msl_idx++]; - total_segs += msl->memseg_arr.len; - total_type_mem = total_segs * hugepage_sz; - type_msl_idx++; + if (alloc_memseg_list(msl, pagesz, n_segs, + socket_id, cur_seglist)) + goto out; - if (alloc_va_space(msl)) { - RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); - return -1; - } + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); + goto out; } - total_mem += total_type_mem; } } - return 0; + /* we're successful */ + ret = 0; +out: + free(memtypes); + return ret; } static int @@ -2204,6 +2341,25 @@ memseg_secondary_init(void) int rte_eal_memseg_init(void) { + /* increase rlimit to maximum */ + struct rlimit lim; + + if (getrlimit(RLIMIT_NOFILE, &lim) == 0) { + /* set limit to maximum */ + lim.rlim_cur = lim.rlim_max; + + if (setrlimit(RLIMIT_NOFILE, &lim) < 0) { + RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n", + strerror(errno)); + } else { + RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %" + PRIu64 "\n", + (uint64_t)lim.rlim_cur); + } + } else { + RTE_LOG(ERR, EAL, "Cannot get current resource limits\n"); + } + return rte_eal_process_type() == RTE_PROC_PRIMARY ? 
#ifndef RTE_ARCH_64 memseg_primary_init_32() : diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c index b496fc71..379773b6 100644 --- a/lib/librte_eal/linuxapp/eal/eal_thread.c +++ b/lib/librte_eal/linuxapp/eal/eal_thread.c @@ -121,8 +121,8 @@ eal_thread_loop(__attribute__((unused)) void *arg) ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); - RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%x;cpuset=[%s%s])\n", - lcore_id, (int)thread_id, cpuset, ret == 0 ? "" : "..."); + RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", + lcore_id, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "..."); /* read on our pipe to get commands */ while (1) { diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c index 2766bd78..bc8f0519 100644 --- a/lib/librte_eal/linuxapp/eal/eal_timer.c +++ b/lib/librte_eal/linuxapp/eal/eal_timer.c @@ -87,7 +87,7 @@ static pthread_t msb_inc_thread_id; * containing used to process MSB of the HPET (unfortunately, we need * this because hpet is 32 bits by default under linux). */ -static void +static void * hpet_msb_inc(__attribute__((unused)) void *arg) { uint32_t t; @@ -98,6 +98,7 @@ hpet_msb_inc(__attribute__((unused)) void *arg) eal_hpet_msb ++; sleep(10); } + return NULL; } uint64_t @@ -178,7 +179,7 @@ rte_eal_hpet_init(int make_default) /* create a thread that will increment a global variable for * msb (hpet is 32 bits by default under linux) */ ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL, - (void *(*)(void *))hpet_msb_inc, NULL); + hpet_msb_inc, NULL); if (ret != 0) { RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n"); internal_config.no_hpet = 1; diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c index c68dc38e..0516b159 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c @@ -345,46 +345,13 @@ get_vfio_cfg_by_group_num(int iommu_group_num) return NULL; } -static struct vfio_config * -get_vfio_cfg_by_group_fd(int vfio_group_fd) -{ - struct vfio_config *vfio_cfg; - int i, j; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - vfio_cfg = &vfio_cfgs[i]; - for (j = 0; j < VFIO_MAX_GROUPS; j++) - if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) - return vfio_cfg; - } - - return NULL; -} - -static struct vfio_config * -get_vfio_cfg_by_container_fd(int container_fd) -{ - int i; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - if (vfio_cfgs[i].vfio_container_fd == container_fd) - return &vfio_cfgs[i]; - } - - return NULL; -} - -int -rte_vfio_get_group_fd(int iommu_group_num) +static int +vfio_get_group_fd(struct vfio_config *vfio_cfg, + int iommu_group_num) { int i; int vfio_group_fd; struct vfio_group *cur_grp; - struct vfio_config *vfio_cfg; - - /* get the vfio_config it belongs to */ - vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); - vfio_cfg = vfio_cfg ? 
vfio_cfg : default_vfio_cfg; /* check if we already have the group descriptor open */ for (i = 0; i < VFIO_MAX_GROUPS; i++) @@ -423,6 +390,47 @@ rte_vfio_get_group_fd(int iommu_group_num) return vfio_group_fd; } +static struct vfio_config * +get_vfio_cfg_by_group_fd(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) + if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) + return vfio_cfg; + } + + return NULL; +} + +static struct vfio_config * +get_vfio_cfg_by_container_fd(int container_fd) +{ + int i; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + if (vfio_cfgs[i].vfio_container_fd == container_fd) + return &vfio_cfgs[i]; + } + + return NULL; +} + +int +rte_vfio_get_group_fd(int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; + + return vfio_get_group_fd(vfio_cfg, iommu_group_num); +} + static int get_vfio_group_idx(int vfio_group_fd) { @@ -509,7 +517,7 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, msl = rte_mem_virt2memseg_list(addr); /* for IOVA as VA mode, no need to care for IOVA addresses */ - if (rte_eal_iova_mode() == RTE_IOVA_VA) { + if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) { uint64_t vfio_va = (uint64_t)(uintptr_t)addr; if (type == RTE_MEM_EVENT_ALLOC) vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, @@ -523,13 +531,19 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, /* memsegs are contiguous in memory */ ms = rte_mem_virt2memseg(addr, msl); while (cur_len < len) { + /* some memory segments may have invalid IOVA */ + if (ms->iova == RTE_BAD_IOVA) { + RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n", + ms->addr); + goto next; + } if (type == RTE_MEM_EVENT_ALLOC) vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, ms->iova, ms->len, 1); else vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, ms->iova, ms->len, 0); - +next: cur_len += ms->len; ++ms; } @@ -896,7 +910,15 @@ rte_vfio_enable(const char *modname) return 0; } - default_vfio_cfg->vfio_container_fd = rte_vfio_get_container_fd(); + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* open a new container */ + default_vfio_cfg->vfio_container_fd = + rte_vfio_get_container_fd(); + } else { + /* get the default container from the primary process */ + default_vfio_cfg->vfio_container_fd = + vfio_get_default_container_fd(); + } /* check if we have VFIO driver enabled */ if (default_vfio_cfg->vfio_container_fd != -1) { @@ -916,6 +938,45 @@ rte_vfio_is_enabled(const char *modname) return default_vfio_cfg->vfio_enabled && mod_available; } +int +vfio_get_default_container_fd(void) +{ + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + + if (default_vfio_cfg->vfio_enabled) + return default_vfio_cfg->vfio_container_fd; + + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* if we were secondary process we would try requesting + * container fd from the primary, but we're the primary + * process so just exit here + */ + return -1; + } + + p->req = SOCKET_REQ_DEFAULT_CONTAINER; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + if 
(rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + free(mp_reply.msgs); + return mp_rep->fds[0]; + } + free(mp_reply.msgs); + } + + RTE_LOG(ERR, EAL, " cannot request default container fd\n"); + return -1; +} + const struct vfio_iommu_type * vfio_set_iommu_type(int vfio_container_fd) { @@ -1028,8 +1089,9 @@ rte_vfio_get_container_fd(void) mp_rep = &mp_reply.msgs[0]; p = (struct vfio_mp_param *)mp_rep->param; if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + vfio_container_fd = mp_rep->fds[0]; free(mp_reply.msgs); - return mp_rep->fds[0]; + return vfio_container_fd; } free(mp_reply.msgs); } @@ -1082,11 +1144,14 @@ rte_vfio_get_group_num(const char *sysfs_base, } static int -type1_map(const struct rte_memseg_list *msl __rte_unused, - const struct rte_memseg *ms, void *arg) +type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg) { int *vfio_container_fd = arg; + if (msl->external) + return 0; + return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, ms->len, 1); } @@ -1145,8 +1210,22 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, struct vfio_iommu_type1_dma_map dma_map; struct vfio_iommu_type1_dma_unmap dma_unmap; int ret; + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0 + }; + reg.vaddr = (uintptr_t) vaddr; + reg.size = len; if (do_map != 0) { + ret = ioctl(vfio_container_fd, + VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); + if (ret) { + RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + memset(&dma_map, 0, sizeof(dma_map)); dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); dma_map.vaddr = vaddr; @@ -1163,13 +1242,6 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, } } else { - struct vfio_iommu_spapr_register_memory reg = { - .argsz = sizeof(reg), - .flags = 0 - }; - reg.vaddr = (uintptr_t) vaddr; - reg.size = len; - ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); if (ret) { @@ -1196,12 +1268,15 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, } static int -vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused, +vfio_spapr_map_walk(const struct rte_memseg_list *msl, const struct rte_memseg *ms, void *arg) { int *vfio_container_fd = arg; - return vfio_spapr_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, + if (msl->external) + return 0; + + return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova, ms->len, 1); } @@ -1210,12 +1285,15 @@ struct spapr_walk_param { uint64_t hugepage_sz; }; static int -vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused, +vfio_spapr_window_size_walk(const struct rte_memseg_list *msl, const struct rte_memseg *ms, void *arg) { struct spapr_walk_param *param = arg; uint64_t max = ms->iova + ms->len; + if (msl->external) + return 0; + if (max > param->window_size) { param->hugepage_sz = ms->hugepage_sz; param->window_size = max; @@ -1670,9 +1748,6 @@ int rte_vfio_container_group_bind(int container_fd, int iommu_group_num) { struct vfio_config *vfio_cfg; - struct vfio_group *cur_grp; - int vfio_group_fd; - int i; vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); if (vfio_cfg == NULL) { @@ -1680,36 +1755,7 @@ rte_vfio_container_group_bind(int container_fd, int 
iommu_group_num) return -1; } - /* Check room for new group */ - if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) { - RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); - return -1; - } - - /* Get an index for the new group */ - for (i = 0; i < VFIO_MAX_GROUPS; i++) - if (vfio_cfg->vfio_groups[i].group_num == -1) { - cur_grp = &vfio_cfg->vfio_groups[i]; - break; - } - - /* This should not happen */ - if (i == VFIO_MAX_GROUPS) { - RTE_LOG(ERR, EAL, "No VFIO group free slot found\n"); - return -1; - } - - vfio_group_fd = vfio_open_group_fd(iommu_group_num); - if (vfio_group_fd < 0) { - RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num); - return -1; - } - cur_grp->group_num = iommu_group_num; - cur_grp->fd = vfio_group_fd; - cur_grp->devices = 0; - vfio_cfg->vfio_active_groups++; - - return vfio_group_fd; + return vfio_get_group_fd(vfio_cfg, iommu_group_num); } int diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h index 68d4750a..63ae115c 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h @@ -115,6 +115,9 @@ struct vfio_iommu_type { vfio_dma_func_t dma_map_func; }; +/* get the vfio container that devices are bound to by default */ +int vfio_get_default_container_fd(void); + /* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */ const struct vfio_iommu_type * vfio_set_iommu_type(int vfio_container_fd); @@ -129,6 +132,7 @@ int vfio_mp_sync_setup(void); #define SOCKET_REQ_CONTAINER 0x100 #define SOCKET_REQ_GROUP 0x200 +#define SOCKET_REQ_DEFAULT_CONTAINER 0x400 #define SOCKET_OK 0x0 #define SOCKET_NO_FD 0x1 #define SOCKET_ERR 0xFF diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c index 680a24aa..a1e8c834 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c +++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c @@ -66,6 +66,17 @@ vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer) reply.fds[0] = fd; } break; + case SOCKET_REQ_DEFAULT_CONTAINER: + r->req = SOCKET_REQ_DEFAULT_CONTAINER; + fd = vfio_get_default_container_fd(); + if (fd < 0) + r->result = SOCKET_ERR; + else { + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; + } + break; default: RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); return -1; diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h index cfa9448b..5afa0871 100644 --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h @@ -8,6 +8,7 @@ #ifdef __KERNEL__ #include +#include #define RTE_STD_C11 #else #include @@ -54,8 +55,13 @@ struct rte_kni_request { * Writing should never overwrite the read position */ struct rte_kni_fifo { +#ifdef RTE_USE_C11_MEM_MODEL + unsigned write; /**< Next position to be written*/ + unsigned read; /**< Next position to be read */ +#else volatile unsigned write; /**< Next position to be written*/ volatile unsigned read; /**< Next position to be read */ +#endif unsigned len; /**< Circular buffer length */ unsigned elem_size; /**< Pointer size - for 32/64 bit OS */ void *volatile buffer[]; /**< The buffer contains mbuf pointers */ diff --git a/lib/librte_eal/meson.build b/lib/librte_eal/meson.build index e1fde15d..a18f3a82 100644 --- a/lib/librte_eal/meson.build +++ b/lib/librte_eal/meson.build @@ -21,11 +21,10 @@ else error('unsupported system type 
"@0@"'.format(host_machine.system())) endif -version = 8 # the version of the EAL API +version = 9 # the version of the EAL API allow_experimental_apis = true deps += 'compat' deps += 'kvargs' -cflags += '-D_GNU_SOURCE' sources = common_sources + env_sources objs = common_objs + env_objs headers = common_headers + env_headers diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map index 344a43d3..04f62424 100644 --- a/lib/librte_eal/rte_eal_version.map +++ b/lib/librte_eal/rte_eal_version.map @@ -19,9 +19,6 @@ DPDK_2.0 { rte_dump_tailq; rte_eal_alarm_cancel; rte_eal_alarm_set; - rte_eal_devargs_add; - rte_eal_devargs_dump; - rte_eal_devargs_type_count; rte_eal_get_configuration; rte_eal_get_lcore_state; rte_eal_get_physmem_size; @@ -32,7 +29,6 @@ DPDK_2.0 { rte_eal_lcore_role; rte_eal_mp_remote_launch; rte_eal_mp_wait_lcore; - rte_eal_parse_devargs_str; rte_eal_process_type; rte_eal_remote_launch; rte_eal_tailq_lookup; @@ -134,8 +130,6 @@ DPDK_16.11 { rte_delay_us_block; rte_delay_us_callback_register; - rte_eal_dev_attach; - rte_eal_dev_detach; } DPDK_16.07; @@ -262,6 +256,16 @@ DPDK_18.08 { } DPDK_18.05; +DPDK_18.11 { + global: + + rte_eal_get_runtime_dir; + rte_eal_hotplug_add; + rte_eal_hotplug_remove; + rte_strscpy; + +} DPDK_18.08; + EXPERIMENTAL { global: @@ -270,12 +274,19 @@ EXPERIMENTAL { rte_class_register; rte_class_unregister; rte_ctrl_thread_create; + rte_delay_us_sleep; + rte_dev_event_callback_process; rte_dev_event_callback_register; rte_dev_event_callback_unregister; rte_dev_event_monitor_start; rte_dev_event_monitor_stop; + rte_dev_hotplug_handle_disable; + rte_dev_hotplug_handle_enable; + rte_dev_is_probed; rte_dev_iterator_init; rte_dev_iterator_next; + rte_dev_probe; + rte_dev_remove; rte_devargs_add; rte_devargs_dump; rte_devargs_insert; @@ -284,9 +295,8 @@ EXPERIMENTAL { rte_devargs_parsef; rte_devargs_remove; rte_devargs_type_count; + rte_eal_check_dma_mask; rte_eal_cleanup; - rte_eal_hotplug_add; - rte_eal_hotplug_remove; rte_fbarray_attach; rte_fbarray_destroy; rte_fbarray_detach; @@ -311,6 +321,14 @@ EXPERIMENTAL { rte_fbarray_set_used; rte_log_register_type_and_pick_level; rte_malloc_dump_heaps; + rte_malloc_heap_create; + rte_malloc_heap_destroy; + rte_malloc_heap_get_socket; + rte_malloc_heap_memory_add; + rte_malloc_heap_memory_attach; + rte_malloc_heap_memory_detach; + rte_malloc_heap_memory_remove; + rte_malloc_heap_socket_is_external; rte_mem_alloc_validator_register; rte_mem_alloc_validator_unregister; rte_mem_event_callback_register; @@ -320,6 +338,10 @@ EXPERIMENTAL { rte_mem_virt2memseg_list; rte_memseg_contig_walk; rte_memseg_contig_walk_thread_unsafe; + rte_memseg_get_fd; + rte_memseg_get_fd_offset; + rte_memseg_get_fd_thread_unsafe; + rte_memseg_get_fd_offset_thread_unsafe; rte_memseg_list_walk; rte_memseg_list_walk_thread_unsafe; rte_memseg_walk; @@ -330,6 +352,7 @@ EXPERIMENTAL { rte_mp_request_sync; rte_mp_request_async; rte_mp_sendmsg; + rte_option_register; rte_service_lcore_attr_get; rte_service_lcore_attr_reset_all; rte_service_may_be_active; diff --git a/lib/librte_ethdev/Makefile b/lib/librte_ethdev/Makefile index 0935a275..3e27ae46 100644 --- a/lib/librte_ethdev/Makefile +++ b/lib/librte_ethdev/Makefile @@ -12,13 +12,15 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API CFLAGS += -O3 CFLAGS += $(WERROR_FLAGS) LDLIBS += -lrte_net -lrte_eal -lrte_mempool -lrte_ring -LDLIBS += -lrte_mbuf +LDLIBS += -lrte_mbuf -lrte_kvargs -lrte_cmdline EXPORT_MAP := rte_ethdev_version.map -LIBABIVER := 10 +LIBABIVER := 11 +SRCS-y += 
ethdev_private.c SRCS-y += rte_ethdev.c +SRCS-y += rte_class_eth.c SRCS-y += rte_flow.c SRCS-y += rte_tm.c SRCS-y += rte_mtr.c diff --git a/lib/librte_ethdev/ethdev_private.c b/lib/librte_ethdev/ethdev_private.c new file mode 100644 index 00000000..162a502f --- /dev/null +++ b/lib/librte_ethdev/ethdev_private.c @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Gaëtan Rivet + */ + +#include "rte_ethdev.h" +#include "rte_ethdev_driver.h" +#include "ethdev_private.h" + +uint16_t +eth_dev_to_id(const struct rte_eth_dev *dev) +{ + if (dev == NULL) + return RTE_MAX_ETHPORTS; + return dev - rte_eth_devices; +} + +struct rte_eth_dev * +eth_find_device(const struct rte_eth_dev *start, rte_eth_cmp_t cmp, + const void *data) +{ + struct rte_eth_dev *edev; + ptrdiff_t idx; + + /* Avoid Undefined Behaviour */ + if (start != NULL && + (start < &rte_eth_devices[0] || + start > &rte_eth_devices[RTE_MAX_ETHPORTS])) + return NULL; + if (start != NULL) + idx = eth_dev_to_id(start) + 1; + else + idx = 0; + for (; idx < RTE_MAX_ETHPORTS; idx++) { + edev = &rte_eth_devices[idx]; + if (cmp(edev, data) == 0) + return edev; + } + return NULL; +} + +int +rte_eth_devargs_parse_list(char *str, rte_eth_devargs_callback_t callback, + void *data) +{ + char *str_start; + int state; + int result; + + if (*str != '[') + /* Single element, not a list */ + return callback(str, data); + + /* Sanity check, then strip the brackets */ + str_start = &str[strlen(str) - 1]; + if (*str_start != ']') { + RTE_LOG(ERR, EAL, "(%s): List does not end with ']'\n", str); + return -EINVAL; + } + str++; + *str_start = '\0'; + + /* Process list elements */ + state = 0; + while (1) { + if (state == 0) { + if (*str == '\0') + break; + if (*str != ',') { + str_start = str; + state = 1; + } + } else if (state == 1) { + if (*str == ',' || *str == '\0') { + if (str > str_start) { + /* Non-empty string fragment */ + *str = '\0'; + result = callback(str_start, data); + if (result < 0) + return result; + } + state = 0; + } + } + str++; + } + return 0; +} + +static int +rte_eth_devargs_process_range(char *str, uint16_t *list, uint16_t *len_list, + const uint16_t max_list) +{ + uint16_t lo, hi, val; + int result; + + result = sscanf(str, "%hu-%hu", &lo, &hi); + if (result == 1) { + if (*len_list >= max_list) + return -ENOMEM; + list[(*len_list)++] = lo; + } else if (result == 2) { + if (lo >= hi || lo > RTE_MAX_ETHPORTS || hi > RTE_MAX_ETHPORTS) + return -EINVAL; + for (val = lo; val <= hi; val++) { + if (*len_list >= max_list) + return -ENOMEM; + list[(*len_list)++] = val; + } + } else + return -EINVAL; + return 0; +} + +int +rte_eth_devargs_parse_representor_ports(char *str, void *data) +{ + struct rte_eth_devargs *eth_da = data; + + return rte_eth_devargs_process_range(str, eth_da->representor_ports, + ð_da->nb_representor_ports, RTE_MAX_ETHPORTS); +} diff --git a/lib/librte_ethdev/ethdev_private.h b/lib/librte_ethdev/ethdev_private.h new file mode 100644 index 00000000..7b787bf9 --- /dev/null +++ b/lib/librte_ethdev/ethdev_private.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Gaëtan Rivet + */ + +#ifndef _RTE_ETH_PRIVATE_H_ +#define _RTE_ETH_PRIVATE_H_ + +#include "rte_ethdev.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Convert rte_eth_dev pointer to port id. + * NULL will be translated to RTE_MAX_ETHPORTS. + */ +uint16_t eth_dev_to_id(const struct rte_eth_dev *dev); + +/* Generic rte_eth_dev comparison function. 
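The representor list syntax handled by rte_eth_devargs_parse_list() and rte_eth_devargs_process_range() above ends up in struct rte_eth_devargs, which drivers fill through rte_eth_devargs_parse(). A minimal, hedged sketch of that driver-side call follows; the sample devargs string, the main() wrapper and the assumption that the declaration is reachable through rte_ethdev_driver.h are illustrative only, while representor_ports/nb_representor_ports are the fields the parser above writes to.

/* Hedged sketch, not part of the patch. Build with -DALLOW_EXPERIMENTAL_API:
 * rte_eth_devargs_parse() is experimental in this release. */
#include <stdio.h>
#include <string.h>
#include <rte_ethdev_driver.h>

int
main(void)
{
	struct rte_eth_devargs da;
	uint16_t i;

	memset(&da, 0, sizeof(da));
	/* "[0,2-4]" expands to representor ports 0, 2, 3 and 4 */
	if (rte_eth_devargs_parse("representor=[0,2-4]", &da) != 0)
		return 1;
	for (i = 0; i < da.nb_representor_ports; i++)
		printf("representor port %u\n", da.representor_ports[i]);
	return 0;
}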
*/ +typedef int (*rte_eth_cmp_t)(const struct rte_eth_dev *, const void *); + +/* Generic rte_eth_dev iterator. */ +struct rte_eth_dev * +eth_find_device(const struct rte_eth_dev *_start, rte_eth_cmp_t cmp, + const void *data); + +/* Parse devargs value for representor parameter. */ +typedef int (*rte_eth_devargs_callback_t)(char *str, void *data); +int rte_eth_devargs_parse_list(char *str, rte_eth_devargs_callback_t callback, + void *data); +int rte_eth_devargs_parse_representor_ports(char *str, void *data); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ETH_PRIVATE_H_ */ diff --git a/lib/librte_ethdev/ethdev_profile.c b/lib/librte_ethdev/ethdev_profile.c index 0d1dcda3..a3c303f6 100644 --- a/lib/librte_ethdev/ethdev_profile.c +++ b/lib/librte_ethdev/ethdev_profile.c @@ -1,87 +1,33 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2017 Intel Corporation + * Copyright(c) 2010-2018 Intel Corporation */ #include "ethdev_profile.h" /** - * This conditional block enables RX queues profiling by tracking wasted - * iterations, i.e. iterations which yielded no RX packets. Profiling is - * performed using the Instrumentation and Tracing Technology (ITT) API, - * employed by the Intel (R) VTune (TM) Amplifier. + * This conditional block enables Ethernet device profiling with + * Intel (R) VTune (TM) Amplifier. */ -#ifdef RTE_ETHDEV_PROFILE_ITT_WASTED_RX_ITERATIONS - -#include - -#define ITT_MAX_NAME_LEN (100) - -/** - * Auxiliary ITT structure belonging to Ethernet device and using to: - * - track RX queue state to determine whether it is wasting loop iterations - * - begin or end ITT task using task domain and task name (handle) - */ -struct itt_profile_rx_data { - /** - * ITT domains for each queue. - */ - __itt_domain *domains[RTE_MAX_QUEUES_PER_PORT]; - /** - * ITT task names for each queue. - */ - __itt_string_handle *handles[RTE_MAX_QUEUES_PER_PORT]; - /** - * Flags indicating the queues state. Possible values: - * 1 - queue is wasting iterations, - * 0 - otherwise. - */ - uint8_t queue_state[RTE_MAX_QUEUES_PER_PORT]; -}; - -/** - * The pool of *itt_profile_rx_data* structures. - */ -struct itt_profile_rx_data itt_rx_data[RTE_MAX_ETHPORTS]; - +#ifdef RTE_ETHDEV_PROFILE_WITH_VTUNE /** - * This callback function manages ITT tasks collection on given port and queue. - * It must be registered with rte_eth_add_rx_callback() to be called from - * rte_eth_rx_burst(). To find more comments see rte_rx_callback_fn function - * type declaration. + * Hook callback to trace rte_eth_rx_burst() calls. */ -static uint16_t -collect_itt_rx_burst_cb(uint16_t port_id, uint16_t queue_id, +uint16_t +profile_hook_rx_burst_cb( + __rte_unused uint16_t port_id, __rte_unused uint16_t queue_id, __rte_unused struct rte_mbuf *pkts[], uint16_t nb_pkts, __rte_unused uint16_t max_pkts, __rte_unused void *user_param) { - if (unlikely(nb_pkts == 0)) { - if (!itt_rx_data[port_id].queue_state[queue_id]) { - __itt_task_begin( - itt_rx_data[port_id].domains[queue_id], - __itt_null, __itt_null, - itt_rx_data[port_id].handles[queue_id]); - itt_rx_data[port_id].queue_state[queue_id] = 1; - } - } else { - if (unlikely(itt_rx_data[port_id].queue_state[queue_id])) { - __itt_task_end( - itt_rx_data[port_id].domains[queue_id]); - itt_rx_data[port_id].queue_state[queue_id] = 0; - } - } return nb_pkts; } /** - * Initialization of itt_profile_rx_data for a given Ethernet device. + * Setting profiling rx callback for a given Ethernet device. * This function must be invoked when ethernet device is being configured. 
- * Result will be stored in the global array *itt_rx_data*. * * @param port_id * The port identifier of the Ethernet device. - * @param port_name - * The name of the Ethernet device. * @param rx_queue_num * The number of RX queues on specified port. * @@ -90,46 +36,27 @@ collect_itt_rx_burst_cb(uint16_t port_id, uint16_t queue_id, * - On failure, a negative value. */ static inline int -itt_profile_rx_init(uint16_t port_id, char *port_name, uint8_t rx_queue_num) +vtune_profile_rx_init(uint16_t port_id, uint8_t rx_queue_num) { uint16_t q_id; for (q_id = 0; q_id < rx_queue_num; ++q_id) { - char domain_name[ITT_MAX_NAME_LEN]; - - snprintf(domain_name, sizeof(domain_name), - "RXBurst.WastedIterations.Port_%s.Queue_%d", - port_name, q_id); - itt_rx_data[port_id].domains[q_id] - = __itt_domain_create(domain_name); - - char task_name[ITT_MAX_NAME_LEN]; - - snprintf(task_name, sizeof(task_name), - "port id: %d; queue id: %d", - port_id, q_id); - itt_rx_data[port_id].handles[q_id] - = __itt_string_handle_create(task_name); - - itt_rx_data[port_id].queue_state[q_id] = 0; - if (!rte_eth_add_rx_callback( - port_id, q_id, collect_itt_rx_burst_cb, NULL)) { + port_id, q_id, profile_hook_rx_burst_cb, NULL)) { return -rte_errno; } } return 0; } -#endif /* RTE_ETHDEV_PROFILE_ITT_WASTED_RX_ITERATIONS */ +#endif /* RTE_ETHDEV_PROFILE_WITH_VTUNE */ int -__rte_eth_profile_rx_init(__rte_unused uint16_t port_id, +__rte_eth_dev_profile_init(__rte_unused uint16_t port_id, __rte_unused struct rte_eth_dev *dev) { -#ifdef RTE_ETHDEV_PROFILE_ITT_WASTED_RX_ITERATIONS - return itt_profile_rx_init( - port_id, dev->data->name, dev->data->nb_rx_queues); +#ifdef RTE_ETHDEV_PROFILE_WITH_VTUNE + return vtune_profile_rx_init(port_id, dev->data->nb_rx_queues); #endif return 0; } diff --git a/lib/librte_ethdev/ethdev_profile.h b/lib/librte_ethdev/ethdev_profile.h index e5ea3682..65031e6f 100644 --- a/lib/librte_ethdev/ethdev_profile.h +++ b/lib/librte_ethdev/ethdev_profile.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2017 Intel Corporation + * Copyright(c) 2010-2018 Intel Corporation */ #ifndef _RTE_ETHDEV_PROFILE_H_ @@ -8,7 +8,7 @@ #include "rte_ethdev.h" /** - * Initialization of profiling RX queues for the Ethernet device. + * Initialization of the Ethernet device profiling. * Implementation of this function depends on chosen profiling method, * defined in configs. * @@ -22,6 +22,6 @@ * - On failure, a negative value. 
*/ int -__rte_eth_profile_rx_init(uint16_t port_id, struct rte_eth_dev *dev); +__rte_eth_dev_profile_init(uint16_t port_id, struct rte_eth_dev *dev); #endif diff --git a/lib/librte_ethdev/meson.build b/lib/librte_ethdev/meson.build index 596cd0f3..a4d85026 100644 --- a/lib/librte_ethdev/meson.build +++ b/lib/librte_ethdev/meson.build @@ -2,9 +2,11 @@ # Copyright(c) 2017 Intel Corporation name = 'ethdev' -version = 10 +version = 11 allow_experimental_apis = true -sources = files('ethdev_profile.c', +sources = files('ethdev_private.c', + 'ethdev_profile.c', + 'rte_class_eth.c', 'rte_ethdev.c', 'rte_flow.c', 'rte_mtr.c', @@ -24,4 +26,4 @@ headers = files('rte_ethdev.h', 'rte_tm.h', 'rte_tm_driver.h') -deps += ['net', 'kvargs'] +deps += ['net', 'kvargs', 'cmdline'] diff --git a/lib/librte_ethdev/rte_class_eth.c b/lib/librte_ethdev/rte_class_eth.c new file mode 100644 index 00000000..cb99c92e --- /dev/null +++ b/lib/librte_ethdev/rte_class_eth.c @@ -0,0 +1,173 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Gaëtan Rivet + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include "rte_ethdev.h" +#include "rte_ethdev_core.h" +#include "rte_ethdev_driver.h" +#include "ethdev_private.h" + +enum eth_params { + RTE_ETH_PARAM_MAC, + RTE_ETH_PARAM_REPRESENTOR, + RTE_ETH_PARAM_MAX, +}; + +static const char * const eth_params_keys[] = { + [RTE_ETH_PARAM_MAC] = "mac", + [RTE_ETH_PARAM_REPRESENTOR] = "representor", + [RTE_ETH_PARAM_MAX] = NULL, +}; + +struct eth_dev_match_arg { + struct rte_device *device; + struct rte_kvargs *kvlist; +}; + +#define eth_dev_match_arg(d, k) \ + (&(const struct eth_dev_match_arg) { \ + .device = (d), \ + .kvlist = (k), \ + }) + +static int +eth_mac_cmp(const char *key __rte_unused, + const char *value, void *opaque) +{ + int ret; + struct ether_addr mac; + const struct rte_eth_dev_data *data = opaque; + struct rte_eth_dev_info dev_info; + uint32_t index; + + /* Parse devargs MAC address. */ + /* + * cannot use ether_aton_r(value, &mac) + * because of include conflict with rte_ether.h + */ + ret = cmdline_parse_etheraddr(NULL, value, &mac, sizeof(mac)); + if (ret < 0) + return -1; /* invalid devargs value */ + + /* Return 0 if devargs MAC is matching one of the device MACs. */ + rte_eth_dev_info_get(data->port_id, &dev_info); + for (index = 0; index < dev_info.max_mac_addrs; index++) + if (is_same_ether_addr(&mac, &data->mac_addrs[index])) + return 0; + return -1; /* no match */ +} + +static int +eth_representor_cmp(const char *key __rte_unused, + const char *value, void *opaque) +{ + int ret; + char *values; + const struct rte_eth_dev_data *data = opaque; + struct rte_eth_devargs representors; + uint16_t index; + + if ((data->dev_flags & RTE_ETH_DEV_REPRESENTOR) == 0) + return -1; /* not a representor port */ + + /* Parse devargs representor values. */ + values = strdup(value); + if (values == NULL) + return -1; + memset(&representors, 0, sizeof(representors)); + ret = rte_eth_devargs_parse_list(values, + rte_eth_devargs_parse_representor_ports, + &representors); + free(values); + if (ret != 0) + return -1; /* invalid devargs value */ + + /* Return 0 if representor id is matching one of the values. 
*/ + for (index = 0; index < representors.nb_representor_ports; index++) + if (data->representor_id == + representors.representor_ports[index]) + return 0; + return -1; /* no match */ +} + +static int +eth_dev_match(const struct rte_eth_dev *edev, + const void *_arg) +{ + int ret; + const struct eth_dev_match_arg *arg = _arg; + const struct rte_kvargs *kvlist = arg->kvlist; + unsigned int pair; + + if (edev->state == RTE_ETH_DEV_UNUSED) + return -1; + if (arg->device != NULL && arg->device != edev->device) + return -1; + + ret = rte_kvargs_process(kvlist, + eth_params_keys[RTE_ETH_PARAM_MAC], + eth_mac_cmp, edev->data); + if (ret != 0) + return -1; + + ret = rte_kvargs_process(kvlist, + eth_params_keys[RTE_ETH_PARAM_REPRESENTOR], + eth_representor_cmp, edev->data); + if (ret != 0) + return -1; + /* search for representor key */ + for (pair = 0; pair < kvlist->count; pair++) { + ret = strcmp(kvlist->pairs[pair].key, + eth_params_keys[RTE_ETH_PARAM_REPRESENTOR]); + if (ret == 0) + break; /* there is a representor key */ + } + /* if no representor key, default is to not match representor ports */ + if (ret != 0) + if ((edev->data->dev_flags & RTE_ETH_DEV_REPRESENTOR) != 0) + return -1; /* do not match any representor */ + + return 0; +} + +static void * +eth_dev_iterate(const void *start, + const char *str, + const struct rte_dev_iterator *it) +{ + struct rte_kvargs *kvargs = NULL; + struct rte_eth_dev *edev = NULL; + const char * const *valid_keys = NULL; + + if (str != NULL) { + if (str[0] == '+') /* no validation of keys */ + str++; + else + valid_keys = eth_params_keys; + kvargs = rte_kvargs_parse(str, valid_keys); + if (kvargs == NULL) { + RTE_LOG(ERR, EAL, "cannot parse argument list\n"); + rte_errno = EINVAL; + return NULL; + } + } + edev = eth_find_device(start, eth_dev_match, + eth_dev_match_arg(it->device, kvargs)); + rte_kvargs_free(kvargs); + return edev; +} + +static struct rte_class rte_class_eth = { + .dev_iterate = eth_dev_iterate, +}; + +RTE_REGISTER_CLASS(eth, rte_class_eth); diff --git a/lib/librte_ethdev/rte_ethdev.c b/lib/librte_ethdev/rte_ethdev.c index 4c320250..9d348138 100644 --- a/lib/librte_ethdev/rte_ethdev.c +++ b/lib/librte_ethdev/rte_ethdev.c @@ -36,11 +36,13 @@ #include #include #include +#include #include "rte_ether.h" #include "rte_ethdev.h" #include "rte_ethdev_driver.h" #include "ethdev_profile.h" +#include "ethdev_private.h" int rte_eth_dev_logtype; @@ -122,11 +124,12 @@ static const struct { RTE_RX_OFFLOAD_BIT2STR(VLAN_FILTER), RTE_RX_OFFLOAD_BIT2STR(VLAN_EXTEND), RTE_RX_OFFLOAD_BIT2STR(JUMBO_FRAME), - RTE_RX_OFFLOAD_BIT2STR(CRC_STRIP), RTE_RX_OFFLOAD_BIT2STR(SCATTER), RTE_RX_OFFLOAD_BIT2STR(TIMESTAMP), RTE_RX_OFFLOAD_BIT2STR(SECURITY), RTE_RX_OFFLOAD_BIT2STR(KEEP_CRC), + RTE_RX_OFFLOAD_BIT2STR(SCTP_CKSUM), + RTE_RX_OFFLOAD_BIT2STR(OUTER_UDP_CKSUM), }; #undef RTE_RX_OFFLOAD_BIT2STR @@ -156,6 +159,10 @@ static const struct { RTE_TX_OFFLOAD_BIT2STR(MULTI_SEGS), RTE_TX_OFFLOAD_BIT2STR(MBUF_FAST_FREE), RTE_TX_OFFLOAD_BIT2STR(SECURITY), + RTE_TX_OFFLOAD_BIT2STR(UDP_TNL_TSO), + RTE_TX_OFFLOAD_BIT2STR(IP_TNL_TSO), + RTE_TX_OFFLOAD_BIT2STR(OUTER_UDP_CKSUM), + RTE_TX_OFFLOAD_BIT2STR(MATCH_METADATA), }; #undef RTE_TX_OFFLOAD_BIT2STR @@ -180,6 +187,146 @@ enum { STAT_QMAP_RX }; +int __rte_experimental +rte_eth_iterator_init(struct rte_dev_iterator *iter, const char *devargs_str) +{ + int ret; + struct rte_devargs devargs = {.args = NULL}; + const char *bus_param_key; + char *bus_str = NULL; + char *cls_str = NULL; + int str_size; + + memset(iter, 0, sizeof(*iter)); 
+ + /* + * The devargs string may use various syntaxes: + * - 0000:08:00.0,representor=[1-3] + * - pci:0000:06:00.0,representor=[0,5] + * - class=eth,mac=00:11:22:33:44:55 + * A new syntax is in development (not yet supported): + * - bus=X,paramX=x/class=Y,paramY=y/driver=Z,paramZ=z + */ + + /* + * Handle pure class filter (i.e. without any bus-level argument), + * from future new syntax. + * rte_devargs_parse() is not yet supporting the new syntax, + * that's why this simple case is temporarily parsed here. + */ +#define iter_anybus_str "class=eth," + if (strncmp(devargs_str, iter_anybus_str, + strlen(iter_anybus_str)) == 0) { + iter->cls_str = devargs_str + strlen(iter_anybus_str); + goto end; + } + + /* Split bus, device and parameters. */ + ret = rte_devargs_parse(&devargs, devargs_str); + if (ret != 0) + goto error; + + /* + * Assume parameters of old syntax can match only at ethdev level. + * Extra parameters will be ignored, thanks to "+" prefix. + */ + str_size = strlen(devargs.args) + 2; + cls_str = malloc(str_size); + if (cls_str == NULL) { + ret = -ENOMEM; + goto error; + } + ret = snprintf(cls_str, str_size, "+%s", devargs.args); + if (ret != str_size - 1) { + ret = -EINVAL; + goto error; + } + iter->cls_str = cls_str; + free(devargs.args); /* allocated by rte_devargs_parse() */ + devargs.args = NULL; + + iter->bus = devargs.bus; + if (iter->bus->dev_iterate == NULL) { + ret = -ENOTSUP; + goto error; + } + + /* Convert bus args to new syntax for use with new API dev_iterate. */ + if (strcmp(iter->bus->name, "vdev") == 0) { + bus_param_key = "name"; + } else if (strcmp(iter->bus->name, "pci") == 0) { + bus_param_key = "addr"; + } else { + ret = -ENOTSUP; + goto error; + } + str_size = strlen(bus_param_key) + strlen(devargs.name) + 2; + bus_str = malloc(str_size); + if (bus_str == NULL) { + ret = -ENOMEM; + goto error; + } + ret = snprintf(bus_str, str_size, "%s=%s", + bus_param_key, devargs.name); + if (ret != str_size - 1) { + ret = -EINVAL; + goto error; + } + iter->bus_str = bus_str; + +end: + iter->cls = rte_class_find_by_name("eth"); + return 0; + +error: + if (ret == -ENOTSUP) + RTE_LOG(ERR, EAL, "Bus %s does not support iterating.\n", + iter->bus->name); + free(devargs.args); + free(bus_str); + free(cls_str); + return ret; +} + +uint16_t __rte_experimental +rte_eth_iterator_next(struct rte_dev_iterator *iter) +{ + if (iter->cls == NULL) /* invalid ethdev iterator */ + return RTE_MAX_ETHPORTS; + + do { /* loop to try all matching rte_device */ + /* If not pure ethdev filter and */ + if (iter->bus != NULL && + /* not in middle of rte_eth_dev iteration, */ + iter->class_device == NULL) { + /* get next rte_device to try. */ + iter->device = iter->bus->dev_iterate( + iter->device, iter->bus_str, iter); + if (iter->device == NULL) + break; /* no more rte_device candidate */ + } + /* A device is matching bus part, need to check ethdev part. */ + iter->class_device = iter->cls->dev_iterate( + iter->class_device, iter->cls_str, iter); + if (iter->class_device != NULL) + return eth_dev_to_id(iter->class_device); /* match */ + } while (iter->bus != NULL); /* need to try next rte_device */ + + /* No more ethdev port to iterate. 
*/ + rte_eth_iterator_cleanup(iter); + return RTE_MAX_ETHPORTS; +} + +void __rte_experimental +rte_eth_iterator_cleanup(struct rte_dev_iterator *iter) +{ + if (iter->bus_str == NULL) + return; /* nothing to free in pure class filter */ + free(RTE_CAST_FIELD(iter, bus_str, char *)); /* workaround const */ + free(RTE_CAST_FIELD(iter, cls_str, char *)); /* workaround const */ + memset(iter, 0, sizeof(*iter)); +} + uint16_t rte_eth_find_next(uint16_t port_id) { @@ -366,13 +513,22 @@ rte_eth_dev_release_port(struct rte_eth_dev *eth_dev) rte_eth_dev_shared_data_prepare(); - _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_DESTROY, NULL); + if (eth_dev->state != RTE_ETH_DEV_UNUSED) + _rte_eth_dev_callback_process(eth_dev, + RTE_ETH_EVENT_DESTROY, NULL); rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock); eth_dev->state = RTE_ETH_DEV_UNUSED; - memset(eth_dev->data, 0, sizeof(struct rte_eth_dev_data)); + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + rte_free(eth_dev->data->rx_queues); + rte_free(eth_dev->data->tx_queues); + rte_free(eth_dev->data->mac_addrs); + rte_free(eth_dev->data->hash_mac_addrs); + rte_free(eth_dev->data->dev_private); + memset(eth_dev->data, 0, sizeof(struct rte_eth_dev_data)); + } rte_spinlock_unlock(&rte_eth_dev_shared_data->ownership_lock); @@ -393,11 +549,8 @@ static int rte_eth_is_valid_owner_id(uint64_t owner_id) { if (owner_id == RTE_ETH_DEV_NO_OWNER || - rte_eth_dev_shared_data->next_owner_id <= owner_id) { - RTE_ETHDEV_LOG(ERR, "Invalid owner_id=%016"PRIx64"\n", - owner_id); + rte_eth_dev_shared_data->next_owner_id <= owner_id) return 0; - } return 1; } @@ -444,8 +597,12 @@ _rte_eth_dev_owner_set(const uint16_t port_id, const uint64_t old_owner_id, } if (!rte_eth_is_valid_owner_id(new_owner->id) && - !rte_eth_is_valid_owner_id(old_owner_id)) + !rte_eth_is_valid_owner_id(old_owner_id)) { + RTE_ETHDEV_LOG(ERR, + "Invalid owner old_id=%016"PRIx64" new_id=%016"PRIx64"\n", + old_owner_id, new_owner->id); return -EINVAL; + } port_owner = &rte_eth_devices[port_id].data->owner; if (port_owner->id != old_owner_id) { @@ -516,9 +673,13 @@ rte_eth_dev_owner_delete(const uint64_t owner_id) if (rte_eth_devices[port_id].data->owner.id == owner_id) memset(&rte_eth_devices[port_id].data->owner, 0, sizeof(struct rte_eth_dev_owner)); - RTE_ETHDEV_LOG(ERR, + RTE_ETHDEV_LOG(NOTICE, "All port owners owned by %016"PRIx64" identifier have removed\n", owner_id); + } else { + RTE_ETHDEV_LOG(ERR, + "Invalid owner id=%016"PRIx64"\n", + owner_id); } rte_spinlock_unlock(&rte_eth_dev_shared_data->ownership_lock); @@ -642,87 +803,6 @@ eth_err(uint16_t port_id, int ret) return ret; } -/* attach the new device, then store port_id of the device */ -int -rte_eth_dev_attach(const char *devargs, uint16_t *port_id) -{ - int current = rte_eth_dev_count_total(); - struct rte_devargs da; - int ret = -1; - - memset(&da, 0, sizeof(da)); - - if ((devargs == NULL) || (port_id == NULL)) { - ret = -EINVAL; - goto err; - } - - /* parse devargs */ - if (rte_devargs_parse(&da, devargs)) - goto err; - - ret = rte_eal_hotplug_add(da.bus->name, da.name, da.args); - if (ret < 0) - goto err; - - /* no point looking at the port count if no port exists */ - if (!rte_eth_dev_count_total()) { - RTE_ETHDEV_LOG(ERR, "No port found for device (%s)\n", da.name); - ret = -1; - goto err; - } - - /* if nothing happened, there is a bug here, since some driver told us - * it did attach a device, but did not create a port. 
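A short usage sketch of the iterator API introduced here, assuming an application that only wants to print the matching port ids; the devargs string and the helper name are made up, while the init/next calls and the RTE_MAX_ETHPORTS sentinel are exactly those added above (cleanup happens automatically once the last rte_eth_iterator_next() call finds no more port).

/* Hedged sketch, not part of the patch. */
#include <stdio.h>
#include <rte_ethdev.h>

static void
list_matching_ports(const char *devargs)
{
	struct rte_dev_iterator iterator;
	uint16_t port_id;

	if (rte_eth_iterator_init(&iterator, devargs) != 0)
		return;
	for (port_id = rte_eth_iterator_next(&iterator);
	     port_id != RTE_MAX_ETHPORTS;
	     port_id = rte_eth_iterator_next(&iterator))
		printf("matched port %u\n", port_id);
	/* no explicit cleanup needed: the iterator was fully consumed */
}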
- * FIXME: race condition in case of plug-out of another device - */ - if (current == rte_eth_dev_count_total()) { - ret = -1; - goto err; - } - - *port_id = eth_dev_last_created_port; - ret = 0; - -err: - free(da.args); - return ret; -} - -/* detach the device, then store the name of the device */ -int -rte_eth_dev_detach(uint16_t port_id, char *name __rte_unused) -{ - struct rte_device *dev; - struct rte_bus *bus; - uint32_t dev_flags; - int ret = -1; - - RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); - - dev_flags = rte_eth_devices[port_id].data->dev_flags; - if (dev_flags & RTE_ETH_DEV_BONDED_SLAVE) { - RTE_ETHDEV_LOG(ERR, - "Port %"PRIu16" is bonded, cannot detach\n", port_id); - return -ENOTSUP; - } - - dev = rte_eth_devices[port_id].device; - if (dev == NULL) - return -EINVAL; - - bus = rte_bus_find_by_device(dev); - if (bus == NULL) - return -ENOENT; - - ret = rte_eal_hotplug_remove(bus->name, dev->name); - if (ret < 0) - return ret; - - rte_eth_dev_release_port(&rte_eth_devices[port_id]); - return 0; -} - static int rte_eth_dev_rx_queue_config(struct rte_eth_dev *dev, uint16_t nb_queues) { @@ -974,7 +1054,7 @@ rte_eth_speed_bitflag(uint32_t speed, int duplex) } } -const char * __rte_experimental +const char * rte_eth_dev_rx_offload_name(uint64_t offload) { const char *name = "UNKNOWN"; @@ -990,7 +1070,7 @@ rte_eth_dev_rx_offload_name(uint64_t offload) return name; } -const char * __rte_experimental +const char * rte_eth_dev_tx_offload_name(uint64_t offload) { const char *name = "UNKNOWN"; @@ -1142,14 +1222,6 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q, return -EINVAL; } - if ((local_conf.rxmode.offloads & DEV_RX_OFFLOAD_CRC_STRIP) && - (local_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)) { - RTE_ETHDEV_LOG(ERR, - "Port id=%u not allowed to set both CRC STRIP and KEEP CRC offload flags\n", - port_id); - return -EINVAL; - } - /* Check that device supports requested rss hash functions. */ if ((dev_info.flow_type_rss_offloads | dev_conf->rx_adv_conf.rss_conf.rss_hf) != @@ -1191,9 +1263,9 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q, } /* Initialize Rx profiling if enabled at compilation time. 
*/ - diag = __rte_eth_profile_rx_init(port_id, dev); + diag = __rte_eth_dev_profile_init(port_id, dev); if (diag != 0) { - RTE_ETHDEV_LOG(ERR, "Port%u __rte_eth_profile_rx_init = %d\n", + RTE_ETHDEV_LOG(ERR, "Port%u __rte_eth_dev_profile_init = %d\n", port_id, diag); rte_eth_dev_rx_queue_config(dev, 0); rte_eth_dev_tx_queue_config(dev, 0); @@ -1219,19 +1291,14 @@ _rte_eth_dev_reset(struct rte_eth_dev *dev) } static void -rte_eth_dev_config_restore(uint16_t port_id) +rte_eth_dev_mac_restore(struct rte_eth_dev *dev, + struct rte_eth_dev_info *dev_info) { - struct rte_eth_dev *dev; - struct rte_eth_dev_info dev_info; struct ether_addr *addr; uint16_t i; uint32_t pool = 0; uint64_t pool_mask; - dev = &rte_eth_devices[port_id]; - - rte_eth_dev_info_get(port_id, &dev_info); - /* replay MAC address configuration including default MAC */ addr = &dev->data->mac_addrs[0]; if (*dev->dev_ops->mac_addr_set != NULL) @@ -1240,7 +1307,7 @@ rte_eth_dev_config_restore(uint16_t port_id) (*dev->dev_ops->mac_addr_add)(dev, addr, 0, pool); if (*dev->dev_ops->mac_addr_add != NULL) { - for (i = 1; i < dev_info.max_mac_addrs; i++) { + for (i = 1; i < dev_info->max_mac_addrs; i++) { addr = &dev->data->mac_addrs[i]; /* skip zero address */ @@ -1259,6 +1326,14 @@ rte_eth_dev_config_restore(uint16_t port_id) } while (pool_mask); } } +} + +static void +rte_eth_dev_config_restore(struct rte_eth_dev *dev, + struct rte_eth_dev_info *dev_info, uint16_t port_id) +{ + if (!(*dev_info->dev_flags & RTE_ETH_DEV_NOLIVE_MAC_ADDR)) + rte_eth_dev_mac_restore(dev, dev_info); /* replay promiscuous configuration */ if (rte_eth_promiscuous_get(port_id) == 1) @@ -1277,6 +1352,7 @@ int rte_eth_dev_start(uint16_t port_id) { struct rte_eth_dev *dev; + struct rte_eth_dev_info dev_info; int diag; RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); @@ -1292,13 +1368,19 @@ rte_eth_dev_start(uint16_t port_id) return 0; } + rte_eth_dev_info_get(port_id, &dev_info); + + /* Lets restore MAC now if device does not support live change */ + if (*dev_info.dev_flags & RTE_ETH_DEV_NOLIVE_MAC_ADDR) + rte_eth_dev_mac_restore(dev, &dev_info); + diag = (*dev->dev_ops->dev_start)(dev); if (diag == 0) dev->data->dev_started = 1; else return eth_err(port_id, diag); - rte_eth_dev_config_restore(port_id); + rte_eth_dev_config_restore(dev, &dev_info, port_id); if (dev->data->dev_conf.intr_conf.lsc == 0) { RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->link_update, -ENOTSUP); @@ -1366,6 +1448,16 @@ rte_eth_dev_close(uint16_t port_id) dev->data->dev_started = 0; (*dev->dev_ops->dev_close)(dev); + /* check behaviour flag - temporary for PMD migration */ + if ((dev->data->dev_flags & RTE_ETH_DEV_CLOSE_REMOVE) != 0) { + /* new behaviour: send event + reset state + free all data */ + rte_eth_dev_release_port(dev); + return; + } + RTE_ETHDEV_LOG(DEBUG, "Port closing is using an old behaviour.\n" + "The driver %s should migrate to the new behaviour.\n", + dev->device->driver->name); + /* old behaviour: only free queue arrays */ dev->data->nb_rx_queues = 0; rte_free(dev->data->rx_queues); dev->data->rx_queues = NULL; @@ -3425,6 +3517,43 @@ rte_eth_dev_rx_intr_ctl(uint16_t port_id, int epfd, int op, void *data) return 0; } +int __rte_experimental +rte_eth_dev_rx_intr_ctl_q_get_fd(uint16_t port_id, uint16_t queue_id) +{ + struct rte_intr_handle *intr_handle; + struct rte_eth_dev *dev; + unsigned int efd_idx; + uint32_t vec; + int fd; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -1); + + dev = &rte_eth_devices[port_id]; + + if (queue_id >= dev->data->nb_rx_queues) { + 
RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", queue_id); + return -1; + } + + if (!dev->intr_handle) { + RTE_ETHDEV_LOG(ERR, "RX Intr handle unset\n"); + return -1; + } + + intr_handle = dev->intr_handle; + if (!intr_handle->intr_vec) { + RTE_ETHDEV_LOG(ERR, "RX Intr vector unset\n"); + return -1; + } + + vec = intr_handle->intr_vec[queue_id]; + efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ? + (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec; + fd = intr_handle->efds[efd_idx]; + + return fd; +} + const struct rte_memzone * rte_eth_dma_zone_reserve(const struct rte_eth_dev *dev, const char *ring_name, uint16_t queue_id, size_t size, unsigned align, @@ -3433,9 +3562,8 @@ rte_eth_dma_zone_reserve(const struct rte_eth_dev *dev, const char *ring_name, char z_name[RTE_MEMZONE_NAMESIZE]; const struct rte_memzone *mz; - snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d", - dev->device->driver->name, ring_name, - dev->data->port_id, queue_id); + snprintf(z_name, sizeof(z_name), "eth_p%d_q%d_%s", + dev->data->port_id, queue_id, ring_name); mz = rte_memzone_lookup(z_name); if (mz) @@ -3459,10 +3587,8 @@ rte_eth_dev_create(struct rte_device *device, const char *name, if (rte_eal_process_type() == RTE_PROC_PRIMARY) { ethdev = rte_eth_dev_allocate(name); - if (!ethdev) { - retval = -ENODEV; - goto probe_failed; - } + if (!ethdev) + return -ENODEV; if (priv_data_size) { ethdev->data->dev_private = rte_zmalloc_socket( @@ -3480,8 +3606,7 @@ rte_eth_dev_create(struct rte_device *device, const char *name, if (!ethdev) { RTE_LOG(ERR, EAL, "secondary process attach failed, " "ethdev doesn't exist"); - retval = -ENODEV; - goto probe_failed; + return -ENODEV; } } @@ -3505,13 +3630,9 @@ rte_eth_dev_create(struct rte_device *device, const char *name, rte_eth_dev_probing_finish(ethdev); return retval; -probe_failed: - /* free ports private data if primary process */ - if (rte_eal_process_type() == RTE_PROC_PRIMARY) - rte_free(ethdev->data->dev_private); +probe_failed: rte_eth_dev_release_port(ethdev); - return retval; } @@ -3532,11 +3653,6 @@ rte_eth_dev_destroy(struct rte_eth_dev *ethdev, return ret; } - if (rte_eal_process_type() == RTE_PROC_PRIMARY) - rte_free(ethdev->data->dev_private); - - ethdev->data->dev_private = NULL; - return rte_eth_dev_release_port(ethdev); } @@ -4195,7 +4311,7 @@ enum rte_eth_switch_domain_state { * RTE_MAX_ETHPORTS elements as there cannot be more active switch domains than * ethdev ports in a single process. 
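The new rte_eth_dev_rx_intr_ctl_q_get_fd() lets an application drive Rx interrupts from its own event loop instead of going through rte_eth_dev_rx_intr_ctl_q(). A hedged sketch follows; the epoll bookkeeping, the helper name and the packed port/queue cookie are illustrative, and Rx interrupts are assumed to be already configured on the queue.

/* Hedged sketch, not part of the patch. */
#include <sys/epoll.h>
#include <rte_ethdev.h>

static int
watch_rx_queue(int epfd, uint16_t port_id, uint16_t queue_id)
{
	struct epoll_event ev;
	int fd;

	fd = rte_eth_dev_rx_intr_ctl_q_get_fd(port_id, queue_id);
	if (fd < 0)
		return -1;	/* invalid queue or interrupts not set up */
	ev.events = EPOLLIN;
	ev.data.u32 = ((uint32_t)port_id << 16) | queue_id;
	return epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
}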
*/ -struct rte_eth_dev_switch { +static struct rte_eth_dev_switch { enum rte_eth_switch_domain_state state; } rte_eth_switch_domains[RTE_MAX_ETHPORTS]; @@ -4236,8 +4352,6 @@ rte_eth_switch_domain_free(uint16_t domain_id) return 0; } -typedef int (*rte_eth_devargs_callback_t)(char *str, void *data); - static int rte_eth_devargs_tokenise(struct rte_kvargs *arglist, const char *str_in) { @@ -4302,89 +4416,6 @@ rte_eth_devargs_tokenise(struct rte_kvargs *arglist, const char *str_in) } } -static int -rte_eth_devargs_parse_list(char *str, rte_eth_devargs_callback_t callback, - void *data) -{ - char *str_start; - int state; - int result; - - if (*str != '[') - /* Single element, not a list */ - return callback(str, data); - - /* Sanity check, then strip the brackets */ - str_start = &str[strlen(str) - 1]; - if (*str_start != ']') { - RTE_LOG(ERR, EAL, "(%s): List does not end with ']'", str); - return -EINVAL; - } - str++; - *str_start = '\0'; - - /* Process list elements */ - state = 0; - while (1) { - if (state == 0) { - if (*str == '\0') - break; - if (*str != ',') { - str_start = str; - state = 1; - } - } else if (state == 1) { - if (*str == ',' || *str == '\0') { - if (str > str_start) { - /* Non-empty string fragment */ - *str = '\0'; - result = callback(str_start, data); - if (result < 0) - return result; - } - state = 0; - } - } - str++; - } - return 0; -} - -static int -rte_eth_devargs_process_range(char *str, uint16_t *list, uint16_t *len_list, - const uint16_t max_list) -{ - uint16_t lo, hi, val; - int result; - - result = sscanf(str, "%hu-%hu", &lo, &hi); - if (result == 1) { - if (*len_list >= max_list) - return -ENOMEM; - list[(*len_list)++] = lo; - } else if (result == 2) { - if (lo >= hi || lo > RTE_MAX_ETHPORTS || hi > RTE_MAX_ETHPORTS) - return -EINVAL; - for (val = lo; val <= hi; val++) { - if (*len_list >= max_list) - return -ENOMEM; - list[(*len_list)++] = val; - } - } else - return -EINVAL; - return 0; -} - - -static int -rte_eth_devargs_parse_representor_ports(char *str, void *data) -{ - struct rte_eth_devargs *eth_da = data; - - return rte_eth_devargs_process_range(str, eth_da->representor_ports, - ð_da->nb_representor_ports, RTE_MAX_ETHPORTS); -} - int __rte_experimental rte_eth_devargs_parse(const char *dargs, struct rte_eth_devargs *eth_da) { diff --git a/lib/librte_ethdev/rte_ethdev.h b/lib/librte_ethdev/rte_ethdev.h index 7070e9ab..769a6943 100644 --- a/lib/librte_ethdev/rte_ethdev.h +++ b/lib/librte_ethdev/rte_ethdev.h @@ -166,6 +166,85 @@ extern int rte_eth_dev_logtype; struct rte_mbuf; +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Initializes a device iterator. + * + * This iterator allows accessing a list of devices matching some devargs. + * + * @param iter + * Device iterator handle initialized by the function. + * The fields bus_str and cls_str might be dynamically allocated, + * and could be freed by calling rte_eth_iterator_cleanup(). + * + * @param devargs + * Device description string. + * + * @return + * 0 on successful initialization, negative otherwise. + */ +__rte_experimental +int rte_eth_iterator_init(struct rte_dev_iterator *iter, const char *devargs); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Iterates on devices with devargs filter. + * The ownership is not checked. + * + * The next port id is returned, and the iterator is updated. + * + * @param iter + * Device iterator handle initialized by rte_eth_iterator_init(). 
+ * Some fields bus_str and cls_str might be freed when no more port is found, + * by calling rte_eth_iterator_cleanup(). + * + * @return + * A port id if found, RTE_MAX_ETHPORTS otherwise. + */ +__rte_experimental +uint16_t rte_eth_iterator_next(struct rte_dev_iterator *iter); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Free some allocated fields of the iterator. + * + * This function is automatically called by rte_eth_iterator_next() + * on the last iteration (i.e. when no more matching port is found). + * + * It is safe to call this function twice; it will do nothing more. + * + * @param iter + * Device iterator handle initialized by rte_eth_iterator_init(). + * The fields bus_str and cls_str are freed if needed. + */ +__rte_experimental +void rte_eth_iterator_cleanup(struct rte_dev_iterator *iter); + +/** + * Macro to iterate over all ethdev ports matching some devargs. + * + * If a break is done before the end of the loop, + * the function rte_eth_iterator_cleanup() must be called. + * + * @param id + * Iterated port id of type uint16_t. + * @param devargs + * Device parameters input as string of type char*. + * @param iter + * Iterator handle of type struct rte_dev_iterator, used internally. + */ +#define RTE_ETH_FOREACH_MATCHING_DEV(id, devargs, iter) \ + for (rte_eth_iterator_init(iter, devargs), \ + id = rte_eth_iterator_next(iter); \ + id != RTE_MAX_ETHPORTS; \ + id = rte_eth_iterator_next(iter)) + /** * A structure used to retrieve statistics for an Ethernet port. * Not all statistics fields in struct rte_eth_stats are supported @@ -869,12 +948,6 @@ struct rte_eth_conf { struct rte_intr_conf intr_conf; /**< Interrupt mode configuration. */ }; -/** - * A structure used to retrieve the contextual information of - * an Ethernet device, such as the controlling driver of the device, - * its PCI context, etc... - */ - /** * RX offload capabilities of a device. */ @@ -890,16 +963,13 @@ struct rte_eth_conf { #define DEV_RX_OFFLOAD_VLAN_FILTER 0x00000200 #define DEV_RX_OFFLOAD_VLAN_EXTEND 0x00000400 #define DEV_RX_OFFLOAD_JUMBO_FRAME 0x00000800 -#define DEV_RX_OFFLOAD_CRC_STRIP 0x00001000 #define DEV_RX_OFFLOAD_SCATTER 0x00002000 #define DEV_RX_OFFLOAD_TIMESTAMP 0x00004000 #define DEV_RX_OFFLOAD_SECURITY 0x00008000 - -/** - * Invalid to set both DEV_RX_OFFLOAD_CRC_STRIP and DEV_RX_OFFLOAD_KEEP_CRC - * No DEV_RX_OFFLOAD_CRC_STRIP flag means keep CRC - */ #define DEV_RX_OFFLOAD_KEEP_CRC 0x00010000 +#define DEV_RX_OFFLOAD_SCTP_CKSUM 0x00020000 +#define DEV_RX_OFFLOAD_OUTER_UDP_CKSUM 0x00040000 + #define DEV_RX_OFFLOAD_CHECKSUM (DEV_RX_OFFLOAD_IPV4_CKSUM | \ DEV_RX_OFFLOAD_UDP_CKSUM | \ DEV_RX_OFFLOAD_TCP_CKSUM) @@ -953,6 +1023,13 @@ struct rte_eth_conf { * for tunnel TSO. */ #define DEV_TX_OFFLOAD_IP_TNL_TSO 0x00080000 +/** Device supports outer UDP checksum */ +#define DEV_TX_OFFLOAD_OUTER_UDP_CKSUM 0x00100000 +/** + * Device supports match on metadata Tx offload.. + * Application must set PKT_TX_METADATA and mbuf metadata field. + */ +#define DEV_TX_OFFLOAD_MATCH_METADATA 0x00200000 #define RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP 0x00000001 /**< Device supports Rx queue setup after device started*/ @@ -1010,6 +1087,12 @@ struct rte_eth_switch_info { /** * Ethernet device information */ + +/** + * A structure used to retrieve the contextual information of + * an Ethernet device, such as the controlling driver of the + * device, etc... 
+ */ struct rte_eth_dev_info { struct rte_device *device; /** Generic device information */ const char *driver_name; /**< Device Driver name. */ @@ -1260,6 +1343,11 @@ struct rte_eth_dev_owner { char name[RTE_ETH_MAX_OWNER_NAME_LEN]; /**< The owner name. */ }; +/** + * Port is released (i.e. totally freed and data erased) on close. + * Temporary flag for PMD migration to new rte_eth_dev_close() behaviour. + */ +#define RTE_ETH_DEV_CLOSE_REMOVE 0x0001 /** Device supports link state interrupt */ #define RTE_ETH_DEV_INTR_LSC 0x0002 /** Device is a bonded slave */ @@ -1268,6 +1356,8 @@ struct rte_eth_dev_owner { #define RTE_ETH_DEV_INTR_RMV 0x0008 /** Device is port representor */ #define RTE_ETH_DEV_REPRESENTOR 0x0010 +/** Device does not support MAC change after started */ +#define RTE_ETH_DEV_NOLIVE_MAC_ADDR 0x0020 /** * Iterates over valid ethdev ports owned by a specific owner. @@ -1419,37 +1509,6 @@ uint16_t rte_eth_dev_count_avail(void); */ uint16_t __rte_experimental rte_eth_dev_count_total(void); -/** - * Attach a new Ethernet device specified by arguments. - * - * @param devargs - * A pointer to a strings array describing the new device - * to be attached. The strings should be a pci address like - * '0000:01:00.0' or virtual device name like 'net_pcap0'. - * @param port_id - * A pointer to a port identifier actually attached. - * @return - * 0 on success and port_id is filled, negative on error - */ -__rte_deprecated -int rte_eth_dev_attach(const char *devargs, uint16_t *port_id); - -/** - * Detach a Ethernet device specified by port identifier. - * This function must be called when the device is in the - * closed state. - * - * @param port_id - * The port identifier of the device to detach. - * @param devname - * A pointer to a buffer that will be filled with the device name. - * This buffer must be at least RTE_DEV_NAME_MAX_LEN long. - * @return - * 0 on success and devname is filled, negative on error - */ -__rte_deprecated -int rte_eth_dev_detach(uint16_t port_id, char *devname); - /** * Convert a numerical speed in Mbps to a bitmap flag that can be used in * the bitmap link_speeds of the struct rte_eth_conf @@ -1464,9 +1523,6 @@ int rte_eth_dev_detach(uint16_t port_id, char *devname); uint32_t rte_eth_speed_bitflag(uint32_t speed, int duplex); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Get DEV_RX_OFFLOAD_* flag name. * * @param offload @@ -1474,12 +1530,9 @@ uint32_t rte_eth_speed_bitflag(uint32_t speed, int duplex); * @return * Offload name or 'UNKNOWN' if the flag cannot be recognised. */ -const char * __rte_experimental rte_eth_dev_rx_offload_name(uint64_t offload); +const char *rte_eth_dev_rx_offload_name(uint64_t offload); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Get DEV_TX_OFFLOAD_* flag name. * * @param offload @@ -1487,7 +1540,7 @@ const char * __rte_experimental rte_eth_dev_rx_offload_name(uint64_t offload); * @return * Offload name or 'UNKNOWN' if the flag cannot be recognised. */ -const char * __rte_experimental rte_eth_dev_tx_offload_name(uint64_t offload); +const char *rte_eth_dev_tx_offload_name(uint64_t offload); /** * Configure an Ethernet device. @@ -1750,6 +1803,10 @@ int rte_eth_dev_tx_queue_stop(uint16_t port_id, uint16_t tx_queue_id); * The device start step is the last one and consists of setting the configured * offload features and in starting the transmit and the receive units of the * device. 
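RTE_ETH_FOREACH_MATCHING_DEV wraps the three iterator calls into a single loop. The sketch below assumes an application starting every port carrying a given MAC address; the function name and buffer size are illustrative, the "class=eth,mac=..." form is one of the syntaxes listed for rte_eth_iterator_init(), and an early break requires the explicit rte_eth_iterator_cleanup() shown.

/* Hedged sketch, not part of the patch. */
#include <stdio.h>
#include <rte_ethdev.h>

static void
start_ports_by_mac(const char *mac)
{
	char devargs[64];
	struct rte_dev_iterator iterator;
	uint16_t port_id;

	snprintf(devargs, sizeof(devargs), "class=eth,mac=%s", mac);
	RTE_ETH_FOREACH_MATCHING_DEV(port_id, devargs, &iterator) {
		if (rte_eth_dev_start(port_id) != 0) {
			/* leaving the loop early: free iterator resources */
			rte_eth_iterator_cleanup(&iterator);
			break;
		}
	}
}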
+ * + * Device RTE_ETH_DEV_NOLIVE_MAC_ADDR flag causes MAC address to be set before + * PMD port start callback function is invoked. + * * On success, all basic functions exported by the Ethernet API (link status, * receive/transmit, and so on) can be invoked. * @@ -1797,8 +1854,8 @@ int rte_eth_dev_set_link_down(uint16_t port_id); /** * Close a stopped Ethernet device. The device cannot be restarted! - * The function frees all resources except for needed by the - * closed state. To free these resources, call rte_eth_dev_detach(). + * The function frees all port resources if the driver supports + * the flag RTE_ETH_DEV_CLOSE_REMOVE. * * @param port_id * The port identifier of the Ethernet device. @@ -2718,6 +2775,26 @@ int rte_eth_dev_rx_intr_ctl(uint16_t port_id, int epfd, int op, void *data); int rte_eth_dev_rx_intr_ctl_q(uint16_t port_id, uint16_t queue_id, int epfd, int op, void *data); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Get interrupt fd per Rx queue. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The index of the receive queue from which to retrieve input packets. + * The value must be in the range [0, nb_rx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @return + * - (>=0) the interrupt fd associated to the requested Rx queue if + * successful. + * - (-1) on error. + */ +int __rte_experimental +rte_eth_dev_rx_intr_ctl_q_get_fd(uint16_t port_id, uint16_t queue_id); + /** * Turn on the LED on the Ethernet device. * This function turns on the LED on the Ethernet device. diff --git a/lib/librte_ethdev/rte_ethdev_core.h b/lib/librte_ethdev/rte_ethdev_core.h index 33d12b3a..8f03f83f 100644 --- a/lib/librte_ethdev/rte_ethdev_core.h +++ b/lib/librte_ethdev/rte_ethdev_core.h @@ -539,7 +539,13 @@ struct rte_eth_dev { eth_rx_burst_t rx_pkt_burst; /**< Pointer to PMD receive function. */ eth_tx_burst_t tx_pkt_burst; /**< Pointer to PMD transmit function. */ eth_tx_prep_t tx_pkt_prepare; /**< Pointer to PMD transmit prepare function. */ - struct rte_eth_dev_data *data; /**< Pointer to device data */ + /** + * Next two fields are per-device data but *data is shared between + * primary and secondary processes and *process_private is per-process + * private. The second one is managed by PMDs if necessary. + */ + struct rte_eth_dev_data *data; /**< Pointer to device data. */ + void *process_private; /**< Pointer to per-process device data. */ const struct eth_dev_ops *dev_ops; /**< Functions exported by PMD */ struct rte_device *device; /**< Backing device */ struct rte_intr_handle *intr_handle; /**< Device interrupt handle */ @@ -579,24 +585,30 @@ struct rte_eth_dev_data { struct rte_eth_dev_sriov sriov; /**< SRIOV data */ - void *dev_private; /**< PMD-specific private data */ - - struct rte_eth_link dev_link; - /**< Link-level information & status */ + void *dev_private; + /**< PMD-specific private data. + * @see rte_eth_dev_release_port() + */ + struct rte_eth_link dev_link; /**< Link-level information & status. */ struct rte_eth_conf dev_conf; /**< Configuration applied to device. */ uint16_t mtu; /**< Maximum Transmission Unit. */ - uint32_t min_rx_buf_size; - /**< Common rx buffer size handled by all queues */ + /**< Common RX buffer size handled by all queues. */ uint64_t rx_mbuf_alloc_failed; /**< RX ring mbuf allocation failures. */ - struct ether_addr* mac_addrs;/**< Device Ethernet Link address. */ + struct ether_addr *mac_addrs; + /**< Device Ethernet link address. 
+ * @see rte_eth_dev_release_port() + */ uint64_t mac_pool_sel[ETH_NUM_RECEIVE_MAC_ADDR]; - /** bitmap array of associating Ethernet MAC addresses to pools */ - struct ether_addr* hash_mac_addrs; - /** Device Ethernet MAC addresses of hash filtering. */ + /**< Bitmap associating MAC addresses to pools. */ + struct ether_addr *hash_mac_addrs; + /**< Device Ethernet MAC addresses of hash filtering. + * @see rte_eth_dev_release_port() + */ uint16_t port_id; /**< Device [external] port identifier. */ + __extension__ uint8_t promiscuous : 1, /**< RX promiscuous mode ON(1) / OFF(0). */ scattered_rx : 1, /**< RX of scattered packets is ON(1) / OFF(0) */ @@ -604,15 +616,19 @@ struct rte_eth_dev_data { dev_started : 1, /**< Device state: STARTED(1) / STOPPED(0). */ lro : 1; /**< RX LRO is ON(1) / OFF(0) */ uint8_t rx_queue_state[RTE_MAX_QUEUES_PER_PORT]; - /** Queues state: STARTED(1) / STOPPED(0) */ + /**< Queues state: STARTED(1) / STOPPED(0). */ uint8_t tx_queue_state[RTE_MAX_QUEUES_PER_PORT]; - /** Queues state: STARTED(1) / STOPPED(0) */ - uint32_t dev_flags; /**< Capabilities */ - enum rte_kernel_driver kdrv; /**< Kernel driver passthrough */ - int numa_node; /**< NUMA node connection */ + /**< Queues state: STARTED(1) / STOPPED(0). */ + uint32_t dev_flags; /**< Capabilities. */ + enum rte_kernel_driver kdrv; /**< Kernel driver passthrough. */ + int numa_node; /**< NUMA node connection. */ struct rte_vlan_filter_conf vlan_filter_conf; - /**< VLAN filter configuration. */ + /**< VLAN filter configuration. */ struct rte_eth_dev_owner owner; /**< The port owner. */ + uint16_t representor_id; + /**< Switch-specific identifier. + * Valid if RTE_ETH_DEV_REPRESENTOR in dev_flags. + */ } __rte_cache_aligned; /** diff --git a/lib/librte_ethdev/rte_ethdev_driver.h b/lib/librte_ethdev/rte_ethdev_driver.h index c6d9bc1a..c2ac2632 100644 --- a/lib/librte_ethdev/rte_ethdev_driver.h +++ b/lib/librte_ethdev/rte_ethdev_driver.h @@ -58,10 +58,17 @@ struct rte_eth_dev *rte_eth_dev_attach_secondary(const char *name); /** * @internal - * Release the specified ethdev port. + * Notify RTE_ETH_EVENT_DESTROY and release the specified ethdev port. + * + * The following PMD-managed data fields will be freed: + * - dev_private + * - mac_addrs + * - hash_mac_addrs + * If one of these fields should not be freed, + * it must be reset to NULL by the PMD, typically in dev_close method. * * @param eth_dev - * The *eth_dev* pointer is the address of the *rte_eth_dev* structure. + * Device to be detached. * @return * - 0 on success, negative on error */ @@ -324,32 +331,6 @@ typedef int (*ethdev_uninit_t)(struct rte_eth_dev *ethdev); int __rte_experimental rte_eth_dev_destroy(struct rte_eth_dev *ethdev, ethdev_uninit_t ethdev_uninit); -/** - * PMD helper function to check if keeping CRC is requested - * - * @note - * When CRC_STRIP offload flag is removed and default behavior switch to - * strip CRC, as planned, this helper function is not that useful and will be - * removed. 
In PMDs this function will be replaced with check: - * if (offloads & DEV_RX_OFFLOAD_KEEP_CRC) - * - * @param rx_offloads - * offload bits to be applied - * - * @return - * Return positive if keeping CRC is requested, - * zero if stripping CRC is requested - */ -static inline int -rte_eth_dev_must_keep_crc(uint64_t rx_offloads) -{ - if (rx_offloads & DEV_RX_OFFLOAD_CRC_STRIP) - return 0; - - /* no KEEP_CRC or CRC_STRIP offload flags means keep CRC */ - return 1; -} - #ifdef __cplusplus } #endif diff --git a/lib/librte_ethdev/rte_ethdev_pci.h b/lib/librte_ethdev/rte_ethdev_pci.h index f652596f..23257e98 100644 --- a/lib/librte_ethdev/rte_ethdev_pci.h +++ b/lib/librte_ethdev/rte_ethdev_pci.h @@ -135,17 +135,6 @@ rte_eth_dev_pci_allocate(struct rte_pci_device *dev, size_t private_data_size) static inline void rte_eth_dev_pci_release(struct rte_eth_dev *eth_dev) { - if (rte_eal_process_type() == RTE_PROC_PRIMARY) - rte_free(eth_dev->data->dev_private); - - eth_dev->data->dev_private = NULL; - - /* - * Secondary process will check the name to attach. - * Clear this field to avoid attaching a released ports. - */ - eth_dev->data->name[0] = '\0'; - eth_dev->device = NULL; eth_dev->intr_handle = NULL; diff --git a/lib/librte_ethdev/rte_ethdev_version.map b/lib/librte_ethdev/rte_ethdev_version.map index 38f117f0..3560c288 100644 --- a/lib/librte_ethdev/rte_ethdev_version.map +++ b/lib/librte_ethdev/rte_ethdev_version.map @@ -8,14 +8,12 @@ DPDK_2.2 { rte_eth_allmulticast_get; rte_eth_dev_allocate; rte_eth_dev_allocated; - rte_eth_dev_attach; rte_eth_dev_callback_register; rte_eth_dev_callback_unregister; rte_eth_dev_close; rte_eth_dev_configure; rte_eth_dev_count; rte_eth_dev_default_mac_addr_set; - rte_eth_dev_detach; rte_eth_dev_filter_supported; rte_eth_dev_flow_ctrl_get; rte_eth_dev_flow_ctrl_set; @@ -220,6 +218,14 @@ DPDK_18.08 { } DPDK_18.05; +DPDK_18.11 { + global: + + rte_eth_dev_rx_offload_name; + rte_eth_dev_tx_offload_name; + +} DPDK_18.08; + EXPERIMENTAL { global: @@ -235,10 +241,13 @@ EXPERIMENTAL { rte_eth_dev_owner_new; rte_eth_dev_owner_set; rte_eth_dev_owner_unset; - rte_eth_dev_rx_offload_name; - rte_eth_dev_tx_offload_name; + rte_eth_dev_rx_intr_ctl_q_get_fd; + rte_eth_iterator_cleanup; + rte_eth_iterator_init; + rte_eth_iterator_next; rte_eth_switch_domain_alloc; rte_eth_switch_domain_free; + rte_flow_conv; rte_flow_expand_rss; rte_mtr_capabilities_get; rte_mtr_create; diff --git a/lib/librte_ethdev/rte_flow.c b/lib/librte_ethdev/rte_flow.c index cff4b520..3277be1e 100644 --- a/lib/librte_ethdev/rte_flow.c +++ b/lib/librte_ethdev/rte_flow.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "rte_ethdev.h" #include "rte_flow_driver.h" #include "rte_flow.h" @@ -50,10 +51,15 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = { MK_FLOW_ITEM(TCP, sizeof(struct rte_flow_item_tcp)), MK_FLOW_ITEM(SCTP, sizeof(struct rte_flow_item_sctp)), MK_FLOW_ITEM(VXLAN, sizeof(struct rte_flow_item_vxlan)), - MK_FLOW_ITEM(MPLS, sizeof(struct rte_flow_item_mpls)), - MK_FLOW_ITEM(GRE, sizeof(struct rte_flow_item_gre)), MK_FLOW_ITEM(E_TAG, sizeof(struct rte_flow_item_e_tag)), MK_FLOW_ITEM(NVGRE, sizeof(struct rte_flow_item_nvgre)), + MK_FLOW_ITEM(MPLS, sizeof(struct rte_flow_item_mpls)), + MK_FLOW_ITEM(GRE, sizeof(struct rte_flow_item_gre)), + MK_FLOW_ITEM(FUZZY, sizeof(struct rte_flow_item_fuzzy)), + MK_FLOW_ITEM(GTP, sizeof(struct rte_flow_item_gtp)), + MK_FLOW_ITEM(GTPC, sizeof(struct rte_flow_item_gtp)), + MK_FLOW_ITEM(GTPU, sizeof(struct rte_flow_item_gtp)), + 
MK_FLOW_ITEM(ESP, sizeof(struct rte_flow_item_esp)), MK_FLOW_ITEM(GENEVE, sizeof(struct rte_flow_item_geneve)), MK_FLOW_ITEM(VXLAN_GPE, sizeof(struct rte_flow_item_vxlan_gpe)), MK_FLOW_ITEM(ARP_ETH_IPV4, sizeof(struct rte_flow_item_arp_eth_ipv4)), @@ -66,6 +72,8 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = { sizeof(struct rte_flow_item_icmp6_nd_opt_sla_eth)), MK_FLOW_ITEM(ICMP6_ND_OPT_TLA_ETH, sizeof(struct rte_flow_item_icmp6_nd_opt_tla_eth)), + MK_FLOW_ITEM(MARK, sizeof(struct rte_flow_item_mark)), + MK_FLOW_ITEM(META, sizeof(struct rte_flow_item_meta)), }; /** Generate flow_action[] entry. */ @@ -80,6 +88,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = { MK_FLOW_ACTION(END, 0), MK_FLOW_ACTION(VOID, 0), MK_FLOW_ACTION(PASSTHRU, 0), + MK_FLOW_ACTION(JUMP, sizeof(struct rte_flow_action_jump)), MK_FLOW_ACTION(MARK, sizeof(struct rte_flow_action_mark)), MK_FLOW_ACTION(FLAG, 0), MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)), @@ -90,6 +99,8 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = { MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)), MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)), MK_FLOW_ACTION(PORT_ID, sizeof(struct rte_flow_action_port_id)), + MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)), + MK_FLOW_ACTION(SECURITY, sizeof(struct rte_flow_action_security)), MK_FLOW_ACTION(OF_SET_MPLS_TTL, sizeof(struct rte_flow_action_of_set_mpls_ttl)), MK_FLOW_ACTION(OF_DEC_MPLS_TTL, 0), @@ -109,6 +120,29 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = { sizeof(struct rte_flow_action_of_pop_mpls)), MK_FLOW_ACTION(OF_PUSH_MPLS, sizeof(struct rte_flow_action_of_push_mpls)), + MK_FLOW_ACTION(VXLAN_ENCAP, sizeof(struct rte_flow_action_vxlan_encap)), + MK_FLOW_ACTION(VXLAN_DECAP, 0), + MK_FLOW_ACTION(NVGRE_ENCAP, sizeof(struct rte_flow_action_vxlan_encap)), + MK_FLOW_ACTION(NVGRE_DECAP, 0), + MK_FLOW_ACTION(RAW_ENCAP, sizeof(struct rte_flow_action_raw_encap)), + MK_FLOW_ACTION(RAW_DECAP, sizeof(struct rte_flow_action_raw_decap)), + MK_FLOW_ACTION(SET_IPV4_SRC, + sizeof(struct rte_flow_action_set_ipv4)), + MK_FLOW_ACTION(SET_IPV4_DST, + sizeof(struct rte_flow_action_set_ipv4)), + MK_FLOW_ACTION(SET_IPV6_SRC, + sizeof(struct rte_flow_action_set_ipv6)), + MK_FLOW_ACTION(SET_IPV6_DST, + sizeof(struct rte_flow_action_set_ipv6)), + MK_FLOW_ACTION(SET_TP_SRC, + sizeof(struct rte_flow_action_set_tp)), + MK_FLOW_ACTION(SET_TP_DST, + sizeof(struct rte_flow_action_set_tp)), + MK_FLOW_ACTION(MAC_SWAP, 0), + MK_FLOW_ACTION(DEC_TTL, 0), + MK_FLOW_ACTION(SET_TTL, sizeof(struct rte_flow_action_set_ttl)), + MK_FLOW_ACTION(SET_MAC_SRC, sizeof(struct rte_flow_action_set_mac)), + MK_FLOW_ACTION(SET_MAC_DST, sizeof(struct rte_flow_action_set_mac)), }; static int @@ -288,26 +322,41 @@ rte_flow_error_set(struct rte_flow_error *error, } /** Pattern item specification types. */ -enum item_spec_type { - ITEM_SPEC, - ITEM_LAST, - ITEM_MASK, +enum rte_flow_conv_item_spec_type { + RTE_FLOW_CONV_ITEM_SPEC, + RTE_FLOW_CONV_ITEM_LAST, + RTE_FLOW_CONV_ITEM_MASK, }; -/** Compute storage space needed by item specification and copy it. */ +/** + * Copy pattern item specification. + * + * @param[out] buf + * Output buffer. Can be NULL if @p size is zero. + * @param size + * Size of @p buf in bytes. + * @param[in] item + * Pattern item to copy specification from. + * @param type + * Specification selector for either @p spec, @p last or @p mask. 
+ * + * @return + * Number of bytes needed to store pattern item specification regardless + * of @p size. @p buf contents are truncated to @p size if not large + * enough. + */ static size_t -flow_item_spec_copy(void *buf, const struct rte_flow_item *item, - enum item_spec_type type) +rte_flow_conv_item_spec(void *buf, const size_t size, + const struct rte_flow_item *item, + enum rte_flow_conv_item_spec_type type) { - size_t size = 0; + size_t off; const void *data = - type == ITEM_SPEC ? item->spec : - type == ITEM_LAST ? item->last : - type == ITEM_MASK ? item->mask : + type == RTE_FLOW_CONV_ITEM_SPEC ? item->spec : + type == RTE_FLOW_CONV_ITEM_LAST ? item->last : + type == RTE_FLOW_CONV_ITEM_MASK ? item->mask : NULL; - if (!item->spec || !data) - goto empty; switch (item->type) { union { const struct rte_flow_item_raw *raw; @@ -324,7 +373,7 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item, union { struct rte_flow_item_raw *raw; } dst; - size_t off; + size_t tmp; case RTE_FLOW_ITEM_TYPE_RAW: spec.raw = item->spec; @@ -332,91 +381,466 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item, mask.raw = item->mask ? item->mask : &rte_flow_item_raw_mask; src.raw = data; dst.raw = buf; - off = RTE_ALIGN_CEIL(sizeof(struct rte_flow_item_raw), - sizeof(*src.raw->pattern)); - if (type == ITEM_SPEC || - (type == ITEM_MASK && + rte_memcpy(dst.raw, + (&(struct rte_flow_item_raw){ + .relative = src.raw->relative, + .search = src.raw->search, + .reserved = src.raw->reserved, + .offset = src.raw->offset, + .limit = src.raw->limit, + .length = src.raw->length, + }), + size > sizeof(*dst.raw) ? sizeof(*dst.raw) : size); + off = sizeof(*dst.raw); + if (type == RTE_FLOW_CONV_ITEM_SPEC || + (type == RTE_FLOW_CONV_ITEM_MASK && ((spec.raw->length & mask.raw->length) >= (last.raw->length & mask.raw->length)))) - size = spec.raw->length & mask.raw->length; + tmp = spec.raw->length & mask.raw->length; else - size = last.raw->length & mask.raw->length; - size = off + size * sizeof(*src.raw->pattern); - if (dst.raw) { - memcpy(dst.raw, src.raw, sizeof(*src.raw)); - dst.raw->pattern = memcpy((uint8_t *)dst.raw + off, - src.raw->pattern, - size - off); + tmp = last.raw->length & mask.raw->length; + if (tmp) { + off = RTE_ALIGN_CEIL(off, sizeof(*dst.raw->pattern)); + if (size >= off + tmp) + dst.raw->pattern = rte_memcpy + ((void *)((uintptr_t)dst.raw + off), + src.raw->pattern, tmp); + off += tmp; } break; default: - size = rte_flow_desc_item[item->type].size; - if (buf) - memcpy(buf, data, size); + off = rte_flow_desc_item[item->type].size; + rte_memcpy(buf, data, (size > off ? off : size)); break; } -empty: - return RTE_ALIGN_CEIL(size, sizeof(double)); + return off; } -/** Compute storage space needed by action configuration and copy it. */ +/** + * Copy action configuration. + * + * @param[out] buf + * Output buffer. Can be NULL if @p size is zero. + * @param size + * Size of @p buf in bytes. + * @param[in] action + * Action to copy configuration from. + * + * @return + * Number of bytes needed to store pattern item specification regardless + * of @p size. @p buf contents are truncated to @p size if not large + * enough. 
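Both helpers follow the same convention as the rte_flow_conv() entry point they serve: the return value is the number of bytes required regardless of the buffer size, so callers can query first and copy second. Below is a hedged sketch of that two-pass use through the RTE_FLOW_CONV_OP_PATTERN operation referenced in this hunk; the wrapper name and malloc-based allocation are illustrative only.

/* Hedged sketch, not part of the patch. */
#include <stdlib.h>
#include <rte_flow.h>

static struct rte_flow_item *
copy_pattern(const struct rte_flow_item *pattern, struct rte_flow_error *err)
{
	struct rte_flow_item *copy;
	int len;

	/* first pass: NULL/0 buffer only queries the required size */
	len = rte_flow_conv(RTE_FLOW_CONV_OP_PATTERN, NULL, 0, pattern, err);
	if (len < 0)
		return NULL;	/* negative errno value on failure */
	copy = malloc(len);
	if (copy == NULL)
		return NULL;
	/* second pass: same call with a buffer large enough for everything */
	if (rte_flow_conv(RTE_FLOW_CONV_OP_PATTERN, copy, len,
			  pattern, err) < 0) {
		free(copy);
		return NULL;
	}
	return copy;
}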
+ */ static size_t -flow_action_conf_copy(void *buf, const struct rte_flow_action *action) +rte_flow_conv_action_conf(void *buf, const size_t size, + const struct rte_flow_action *action) { - size_t size = 0; + size_t off; - if (!action->conf) - goto empty; switch (action->type) { union { const struct rte_flow_action_rss *rss; + const struct rte_flow_action_vxlan_encap *vxlan_encap; + const struct rte_flow_action_nvgre_encap *nvgre_encap; } src; union { struct rte_flow_action_rss *rss; + struct rte_flow_action_vxlan_encap *vxlan_encap; + struct rte_flow_action_nvgre_encap *nvgre_encap; } dst; - size_t off; + size_t tmp; + int ret; case RTE_FLOW_ACTION_TYPE_RSS: src.rss = action->conf; dst.rss = buf; - off = 0; - if (dst.rss) - *dst.rss = (struct rte_flow_action_rss){ + rte_memcpy(dst.rss, + (&(struct rte_flow_action_rss){ .func = src.rss->func, .level = src.rss->level, .types = src.rss->types, .key_len = src.rss->key_len, .queue_num = src.rss->queue_num, - }; - off += sizeof(*src.rss); + }), + size > sizeof(*dst.rss) ? sizeof(*dst.rss) : size); + off = sizeof(*dst.rss); if (src.rss->key_len) { - off = RTE_ALIGN_CEIL(off, sizeof(double)); - size = sizeof(*src.rss->key) * src.rss->key_len; - if (dst.rss) - dst.rss->key = memcpy + off = RTE_ALIGN_CEIL(off, sizeof(*dst.rss->key)); + tmp = sizeof(*src.rss->key) * src.rss->key_len; + if (size >= off + tmp) + dst.rss->key = rte_memcpy ((void *)((uintptr_t)dst.rss + off), - src.rss->key, size); - off += size; + src.rss->key, tmp); + off += tmp; } if (src.rss->queue_num) { - off = RTE_ALIGN_CEIL(off, sizeof(double)); - size = sizeof(*src.rss->queue) * src.rss->queue_num; - if (dst.rss) - dst.rss->queue = memcpy + off = RTE_ALIGN_CEIL(off, sizeof(*dst.rss->queue)); + tmp = sizeof(*src.rss->queue) * src.rss->queue_num; + if (size >= off + tmp) + dst.rss->queue = rte_memcpy ((void *)((uintptr_t)dst.rss + off), - src.rss->queue, size); - off += size; + src.rss->queue, tmp); + off += tmp; + } + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP: + src.vxlan_encap = action->conf; + dst.vxlan_encap = buf; + RTE_BUILD_BUG_ON(sizeof(*src.vxlan_encap) != + sizeof(*src.nvgre_encap) || + offsetof(struct rte_flow_action_vxlan_encap, + definition) != + offsetof(struct rte_flow_action_nvgre_encap, + definition)); + off = sizeof(*dst.vxlan_encap); + if (src.vxlan_encap->definition) { + off = RTE_ALIGN_CEIL + (off, sizeof(*dst.vxlan_encap->definition)); + ret = rte_flow_conv + (RTE_FLOW_CONV_OP_PATTERN, + (void *)((uintptr_t)dst.vxlan_encap + off), + size > off ? size - off : 0, + src.vxlan_encap->definition, NULL); + if (ret < 0) + return 0; + if (size >= off + ret) + dst.vxlan_encap->definition = + (void *)((uintptr_t)dst.vxlan_encap + + off); + off += ret; } - size = off; break; default: - size = rte_flow_desc_action[action->type].size; - if (buf) - memcpy(buf, action->conf, size); + off = rte_flow_desc_action[action->type].size; + rte_memcpy(buf, action->conf, (size > off ? off : size)); break; } -empty: - return RTE_ALIGN_CEIL(size, sizeof(double)); + return off; +} + +/** + * Copy a list of pattern items. + * + * @param[out] dst + * Destination buffer. Can be NULL if @p size is zero. + * @param size + * Size of @p dst in bytes. + * @param[in] src + * Source pattern items. + * @param num + * Maximum number of pattern items to process from @p src or 0 to process + * the entire list. In both cases, processing stops after + * RTE_FLOW_ITEM_TYPE_END is encountered. 
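Aside, not part of the patch: a sketch of converting a whole pattern with RTE_FLOW_CONV_OP_PATTERN and detecting truncation, since the conversion keeps reporting the full size even when the destination is too small. The 256-byte buffer is an arbitrary choice.

static void
pattern_dup_example(void)
{
	const struct rte_flow_item pattern[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH },
		{ .type = RTE_FLOW_ITEM_TYPE_IPV4 },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	/* Union keeps the destination suitably aligned for item structures. */
	union { struct rte_flow_item item; uint8_t raw[256]; } buf;
	struct rte_flow_error err;
	int len = rte_flow_conv(RTE_FLOW_CONV_OP_PATTERN, &buf, sizeof(buf),
				pattern, &err);

	if (len < 0)
		return; /* unknown item type or other conversion error */
	if ((size_t)len > sizeof(buf))
		return; /* truncated: retry with a buffer of at least len bytes */
	/* buf now holds a self-contained copy of the three items above. */
}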
+ * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * A positive value representing the number of bytes needed to store + * pattern items regardless of @p size on success (@p buf contents are + * truncated to @p size if not large enough), a negative errno value + * otherwise and rte_errno is set. + */ +static int +rte_flow_conv_pattern(struct rte_flow_item *dst, + const size_t size, + const struct rte_flow_item *src, + unsigned int num, + struct rte_flow_error *error) +{ + uintptr_t data = (uintptr_t)dst; + size_t off; + size_t ret; + unsigned int i; + + for (i = 0, off = 0; !num || i != num; ++i, ++src, ++dst) { + if ((size_t)src->type >= RTE_DIM(rte_flow_desc_item) || + !rte_flow_desc_item[src->type].name) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, src, + "cannot convert unknown item type"); + if (size >= off + sizeof(*dst)) + *dst = (struct rte_flow_item){ + .type = src->type, + }; + off += sizeof(*dst); + if (!src->type) + num = i + 1; + } + num = i; + src -= num; + dst -= num; + do { + if (src->spec) { + off = RTE_ALIGN_CEIL(off, sizeof(double)); + ret = rte_flow_conv_item_spec + ((void *)(data + off), + size > off ? size - off : 0, src, + RTE_FLOW_CONV_ITEM_SPEC); + if (size && size >= off + ret) + dst->spec = (void *)(data + off); + off += ret; + + } + if (src->last) { + off = RTE_ALIGN_CEIL(off, sizeof(double)); + ret = rte_flow_conv_item_spec + ((void *)(data + off), + size > off ? size - off : 0, src, + RTE_FLOW_CONV_ITEM_LAST); + if (size && size >= off + ret) + dst->last = (void *)(data + off); + off += ret; + } + if (src->mask) { + off = RTE_ALIGN_CEIL(off, sizeof(double)); + ret = rte_flow_conv_item_spec + ((void *)(data + off), + size > off ? size - off : 0, src, + RTE_FLOW_CONV_ITEM_MASK); + if (size && size >= off + ret) + dst->mask = (void *)(data + off); + off += ret; + } + ++src; + ++dst; + } while (--num); + return off; +} + +/** + * Copy a list of actions. + * + * @param[out] dst + * Destination buffer. Can be NULL if @p size is zero. + * @param size + * Size of @p dst in bytes. + * @param[in] src + * Source actions. + * @param num + * Maximum number of actions to process from @p src or 0 to process the + * entire list. In both cases, processing stops after + * RTE_FLOW_ACTION_TYPE_END is encountered. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * A positive value representing the number of bytes needed to store + * actions regardless of @p size on success (@p buf contents are truncated + * to @p size if not large enough), a negative errno value otherwise and + * rte_errno is set. + */ +static int +rte_flow_conv_actions(struct rte_flow_action *dst, + const size_t size, + const struct rte_flow_action *src, + unsigned int num, + struct rte_flow_error *error) +{ + uintptr_t data = (uintptr_t)dst; + size_t off; + size_t ret; + unsigned int i; + + for (i = 0, off = 0; !num || i != num; ++i, ++src, ++dst) { + if ((size_t)src->type >= RTE_DIM(rte_flow_desc_action) || + !rte_flow_desc_action[src->type].name) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, + src, "cannot convert unknown action type"); + if (size >= off + sizeof(*dst)) + *dst = (struct rte_flow_action){ + .type = src->type, + }; + off += sizeof(*dst); + if (!src->type) + num = i + 1; + } + num = i; + src -= num; + dst -= num; + do { + if (src->conf) { + off = RTE_ALIGN_CEIL(off, sizeof(double)); + ret = rte_flow_conv_action_conf + ((void *)(data + off), + size > off ? 
size - off : 0, src); + if (size && size >= off + ret) + dst->conf = (void *)(data + off); + off += ret; + } + ++src; + ++dst; + } while (--num); + return off; +} + +/** + * Copy flow rule components. + * + * This comprises the flow rule descriptor itself, attributes, pattern and + * actions list. NULL components in @p src are skipped. + * + * @param[out] dst + * Destination buffer. Can be NULL if @p size is zero. + * @param size + * Size of @p dst in bytes. + * @param[in] src + * Source flow rule descriptor. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * A positive value representing the number of bytes needed to store all + * components including the descriptor regardless of @p size on success + * (@p buf contents are truncated to @p size if not large enough), a + * negative errno value otherwise and rte_errno is set. + */ +static int +rte_flow_conv_rule(struct rte_flow_conv_rule *dst, + const size_t size, + const struct rte_flow_conv_rule *src, + struct rte_flow_error *error) +{ + size_t off; + int ret; + + rte_memcpy(dst, + (&(struct rte_flow_conv_rule){ + .attr = NULL, + .pattern = NULL, + .actions = NULL, + }), + size > sizeof(*dst) ? sizeof(*dst) : size); + off = sizeof(*dst); + if (src->attr_ro) { + off = RTE_ALIGN_CEIL(off, sizeof(double)); + if (size && size >= off + sizeof(*dst->attr)) + dst->attr = rte_memcpy + ((void *)((uintptr_t)dst + off), + src->attr_ro, sizeof(*dst->attr)); + off += sizeof(*dst->attr); + } + if (src->pattern_ro) { + off = RTE_ALIGN_CEIL(off, sizeof(double)); + ret = rte_flow_conv_pattern((void *)((uintptr_t)dst + off), + size > off ? size - off : 0, + src->pattern_ro, 0, error); + if (ret < 0) + return ret; + if (size && size >= off + (size_t)ret) + dst->pattern = (void *)((uintptr_t)dst + off); + off += ret; + } + if (src->actions_ro) { + off = RTE_ALIGN_CEIL(off, sizeof(double)); + ret = rte_flow_conv_actions((void *)((uintptr_t)dst + off), + size > off ? size - off : 0, + src->actions_ro, 0, error); + if (ret < 0) + return ret; + if (size >= off + (size_t)ret) + dst->actions = (void *)((uintptr_t)dst + off); + off += ret; + } + return off; +} + +/** + * Retrieve the name of a pattern item/action type. + * + * @param is_action + * Nonzero when @p src represents an action type instead of a pattern item + * type. + * @param is_ptr + * Nonzero to write string address instead of contents into @p dst. + * @param[out] dst + * Destination buffer. Can be NULL if @p size is zero. + * @param size + * Size of @p dst in bytes. + * @param[in] src + * Depending on @p is_action, source pattern item or action type cast as a + * pointer. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * A positive value representing the number of bytes needed to store the + * name or its address regardless of @p size on success (@p buf contents + * are truncated to @p size if not large enough), a negative errno value + * otherwise and rte_errno is set. 
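Aside, not from the patch: name retrieval through the public API behaves like strlcpy(), returning the full string length even when the output is truncated. A short sketch for RTE_FLOW_CONV_OP_ITEM_NAME:

	char name[32];
	struct rte_flow_error err;
	int ret = rte_flow_conv(RTE_FLOW_CONV_OP_ITEM_NAME, name, sizeof(name),
				(const void *)(uintptr_t)RTE_FLOW_ITEM_TYPE_GENEVE,
				&err);

	if (ret < 0)
		return;
	/* name now holds "GENEVE"; ret is its length, terminator excluded. */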
+ */ +static int +rte_flow_conv_name(int is_action, + int is_ptr, + char *dst, + const size_t size, + const void *src, + struct rte_flow_error *error) +{ + struct desc_info { + const struct rte_flow_desc_data *data; + size_t num; + }; + static const struct desc_info info_rep[2] = { + { rte_flow_desc_item, RTE_DIM(rte_flow_desc_item), }, + { rte_flow_desc_action, RTE_DIM(rte_flow_desc_action), }, + }; + const struct desc_info *const info = &info_rep[!!is_action]; + unsigned int type = (uintptr_t)src; + + if (type >= info->num) + return rte_flow_error_set + (error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "unknown object type to retrieve the name of"); + if (!is_ptr) + return strlcpy(dst, info->data[type].name, size); + if (size >= sizeof(const char **)) + *((const char **)dst) = info->data[type].name; + return sizeof(const char **); +} + +/** Helper function to convert flow API objects. */ +int +rte_flow_conv(enum rte_flow_conv_op op, + void *dst, + size_t size, + const void *src, + struct rte_flow_error *error) +{ + switch (op) { + const struct rte_flow_attr *attr; + + case RTE_FLOW_CONV_OP_NONE: + return 0; + case RTE_FLOW_CONV_OP_ATTR: + attr = src; + if (size > sizeof(*attr)) + size = sizeof(*attr); + rte_memcpy(dst, attr, size); + return sizeof(*attr); + case RTE_FLOW_CONV_OP_ITEM: + return rte_flow_conv_pattern(dst, size, src, 1, error); + case RTE_FLOW_CONV_OP_ACTION: + return rte_flow_conv_actions(dst, size, src, 1, error); + case RTE_FLOW_CONV_OP_PATTERN: + return rte_flow_conv_pattern(dst, size, src, 0, error); + case RTE_FLOW_CONV_OP_ACTIONS: + return rte_flow_conv_actions(dst, size, src, 0, error); + case RTE_FLOW_CONV_OP_RULE: + return rte_flow_conv_rule(dst, size, src, error); + case RTE_FLOW_CONV_OP_ITEM_NAME: + return rte_flow_conv_name(0, 0, dst, size, src, error); + case RTE_FLOW_CONV_OP_ACTION_NAME: + return rte_flow_conv_name(1, 0, dst, size, src, error); + case RTE_FLOW_CONV_OP_ITEM_NAME_PTR: + return rte_flow_conv_name(0, 1, dst, size, src, error); + case RTE_FLOW_CONV_OP_ACTION_NAME_PTR: + return rte_flow_conv_name(1, 1, dst, size, src, error); + } + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "unknown object conversion operation"); } /** Store a full rte_flow description. */ @@ -426,105 +850,49 @@ rte_flow_copy(struct rte_flow_desc *desc, size_t len, const struct rte_flow_item *items, const struct rte_flow_action *actions) { - struct rte_flow_desc *fd = NULL; - size_t tmp; - size_t off1 = 0; - size_t off2 = 0; - size_t size = 0; - -store: - if (items) { - const struct rte_flow_item *item; - - item = items; - if (fd) - fd->items = (void *)&fd->data[off1]; - do { - struct rte_flow_item *dst = NULL; - - if ((size_t)item->type >= - RTE_DIM(rte_flow_desc_item) || - !rte_flow_desc_item[item->type].name) { - rte_errno = ENOTSUP; - return 0; - } - if (fd) - dst = memcpy(fd->data + off1, item, - sizeof(*item)); - off1 += sizeof(*item); - if (item->spec) { - if (fd) - dst->spec = fd->data + off2; - off2 += flow_item_spec_copy - (fd ? fd->data + off2 : NULL, item, - ITEM_SPEC); - } - if (item->last) { - if (fd) - dst->last = fd->data + off2; - off2 += flow_item_spec_copy - (fd ? fd->data + off2 : NULL, item, - ITEM_LAST); - } - if (item->mask) { - if (fd) - dst->mask = fd->data + off2; - off2 += flow_item_spec_copy - (fd ? 
fd->data + off2 : NULL, item, - ITEM_MASK); - } - off2 = RTE_ALIGN_CEIL(off2, sizeof(double)); - } while ((item++)->type != RTE_FLOW_ITEM_TYPE_END); - off1 = RTE_ALIGN_CEIL(off1, sizeof(double)); - } - if (actions) { - const struct rte_flow_action *action; - - action = actions; - if (fd) - fd->actions = (void *)&fd->data[off1]; - do { - struct rte_flow_action *dst = NULL; - - if ((size_t)action->type >= - RTE_DIM(rte_flow_desc_action) || - !rte_flow_desc_action[action->type].name) { - rte_errno = ENOTSUP; - return 0; - } - if (fd) - dst = memcpy(fd->data + off1, action, - sizeof(*action)); - off1 += sizeof(*action); - if (action->conf) { - if (fd) - dst->conf = fd->data + off2; - off2 += flow_action_conf_copy - (fd ? fd->data + off2 : NULL, action); - } - off2 = RTE_ALIGN_CEIL(off2, sizeof(double)); - } while ((action++)->type != RTE_FLOW_ACTION_TYPE_END); + /* + * Overlap struct rte_flow_conv with struct rte_flow_desc in order + * to convert the former to the latter without wasting space. + */ + struct rte_flow_conv_rule *dst = + len ? + (void *)((uintptr_t)desc + + (offsetof(struct rte_flow_desc, actions) - + offsetof(struct rte_flow_conv_rule, actions))) : + NULL; + size_t dst_size = + len > sizeof(*desc) - sizeof(*dst) ? + len - (sizeof(*desc) - sizeof(*dst)) : + 0; + struct rte_flow_conv_rule src = { + .attr_ro = NULL, + .pattern_ro = items, + .actions_ro = actions, + }; + int ret; + + RTE_BUILD_BUG_ON(sizeof(struct rte_flow_desc) < + sizeof(struct rte_flow_conv_rule)); + if (dst_size && + (&dst->pattern != &desc->items || + &dst->actions != &desc->actions || + (uintptr_t)(dst + 1) != (uintptr_t)(desc + 1))) { + rte_errno = EINVAL; + return 0; } - if (fd != NULL) - return size; - off1 = RTE_ALIGN_CEIL(off1, sizeof(double)); - tmp = RTE_ALIGN_CEIL(offsetof(struct rte_flow_desc, data), - sizeof(double)); - size = tmp + off1 + off2; - if (size > len) - return size; - fd = desc; - if (fd != NULL) { - *fd = (const struct rte_flow_desc) { - .size = size, + ret = rte_flow_conv(RTE_FLOW_CONV_OP_RULE, dst, dst_size, &src, NULL); + if (ret < 0) + return 0; + ret += sizeof(*desc) - sizeof(*dst); + rte_memcpy(desc, + (&(struct rte_flow_desc){ + .size = ret, .attr = *attr, - }; - tmp -= offsetof(struct rte_flow_desc, data); - off2 = tmp + off1; - off1 = tmp; - goto store; - } - return 0; + .items = dst_size ? dst->pattern : NULL, + .actions = dst_size ? dst->actions : NULL, + }), + len > sizeof(*desc) ? sizeof(*desc) : len); + return ret; } /** diff --git a/lib/librte_ethdev/rte_flow.h b/lib/librte_ethdev/rte_flow.h index f8ba71cd..c0fe8792 100644 --- a/lib/librte_ethdev/rte_flow.h +++ b/lib/librte_ethdev/rte_flow.h @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -413,6 +414,14 @@ enum rte_flow_item_type { * See struct rte_flow_item_mark. */ RTE_FLOW_ITEM_TYPE_MARK, + + /** + * [META] + * + * Matches a metadata value specified in mbuf metadata field. + * See struct rte_flow_item_meta. + */ + RTE_FLOW_ITEM_TYPE_META, }; /** @@ -1155,6 +1164,22 @@ rte_flow_item_icmp6_nd_opt_tla_eth_mask = { }; #endif +/** + * RTE_FLOW_ITEM_TYPE_META. + * + * Matches a specified metadata value. + */ +struct rte_flow_item_meta { + rte_be32_t data; +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_META. */ +#ifndef __cplusplus +static const struct rte_flow_item_meta rte_flow_item_meta_mask = { + .data = RTE_BE32(UINT32_MAX), +}; +#endif + /** * @warning * @b EXPERIMENTAL: this structure may change without prior notice @@ -1505,6 +1530,127 @@ enum rte_flow_action_type { * error. 
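Aside, not part of the patch: a sketch of a pattern entry using the new META item together with the default mask defined above; the metadata value is an arbitrary example.

	const struct rte_flow_item_meta meta_spec = {
		.data = RTE_BE32(0x2a), /* example metadata value to match */
	};
	const struct rte_flow_item pattern[] = {
		{
			.type = RTE_FLOW_ITEM_TYPE_META,
			.spec = &meta_spec,
			.mask = &rte_flow_item_meta_mask, /* match all 32 bits */
		},
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};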
*/ RTE_FLOW_ACTION_TYPE_NVGRE_DECAP, + + /** + * Add outer header whose template is provided in its data buffer + * + * See struct rte_flow_action_raw_encap. + */ + RTE_FLOW_ACTION_TYPE_RAW_ENCAP, + + /** + * Remove outer header whose template is provided in its data buffer. + * + * See struct rte_flow_action_raw_decap + */ + RTE_FLOW_ACTION_TYPE_RAW_DECAP, + + /** + * Modify IPv4 source address in the outermost IPv4 header. + * + * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_IPV4, + * then the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION error. + * + * See struct rte_flow_action_set_ipv4. + */ + RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC, + + /** + * Modify IPv4 destination address in the outermost IPv4 header. + * + * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_IPV4, + * then the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION error. + * + * See struct rte_flow_action_set_ipv4. + */ + RTE_FLOW_ACTION_TYPE_SET_IPV4_DST, + + /** + * Modify IPv6 source address in the outermost IPv6 header. + * + * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_IPV6, + * then the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION error. + * + * See struct rte_flow_action_set_ipv6. + */ + RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC, + + /** + * Modify IPv6 destination address in the outermost IPv6 header. + * + * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_IPV6, + * then the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION error. + * + * See struct rte_flow_action_set_ipv6. + */ + RTE_FLOW_ACTION_TYPE_SET_IPV6_DST, + + /** + * Modify source port number in the outermost TCP/UDP header. + * + * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_TCP + * or RTE_FLOW_ITEM_TYPE_UDP, then the PMD should return a + * RTE_FLOW_ERROR_TYPE_ACTION error. + * + * See struct rte_flow_action_set_tp. + */ + RTE_FLOW_ACTION_TYPE_SET_TP_SRC, + + /** + * Modify destination port number in the outermost TCP/UDP header. + * + * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_TCP + * or RTE_FLOW_ITEM_TYPE_UDP, then the PMD should return a + * RTE_FLOW_ERROR_TYPE_ACTION error. + * + * See struct rte_flow_action_set_tp. + */ + RTE_FLOW_ACTION_TYPE_SET_TP_DST, + + /** + * Swap the source and destination MAC addresses in the outermost + * Ethernet header. + * + * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_ETH, + * then the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION error. + * + * No associated configuration structure. + */ + RTE_FLOW_ACTION_TYPE_MAC_SWAP, + + /** + * Decrease TTL value directly + * + * No associated configuration structure. + */ + RTE_FLOW_ACTION_TYPE_DEC_TTL, + + /** + * Set TTL value + * + * See struct rte_flow_action_set_ttl + */ + RTE_FLOW_ACTION_TYPE_SET_TTL, + + /** + * Set source MAC address from matched flow. + * + * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_ETH, + * the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION error. + * + * See struct rte_flow_action_set_mac. + */ + RTE_FLOW_ACTION_TYPE_SET_MAC_SRC, + + /** + * Set destination MAC address from matched flow. + * + * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_ETH, + * the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION error. + * + * See struct rte_flow_action_set_mac. 
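Aside, not from the patch: a sketch combining several of the new header-rewrite actions into one action list. The address, port and queue index are placeholders, and per the comments above the rule's pattern must contain matching IPV4 and TCP/UDP items for a PMD to accept it.

	const struct rte_flow_action_set_ipv4 new_src = {
		.ipv4_addr = RTE_BE32(0xc0a80001), /* 192.168.0.1 */
	};
	const struct rte_flow_action_set_tp new_sport = {
		.port = RTE_BE16(4000),
	};
	const struct rte_flow_action_queue queue = { .index = 1 };
	const struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC, .conf = &new_src },
		{ .type = RTE_FLOW_ACTION_TYPE_SET_TP_SRC, .conf = &new_sport },
		{ .type = RTE_FLOW_ACTION_TYPE_DEC_TTL }, /* no configuration */
		{ .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};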
+ */ + RTE_FLOW_ACTION_TYPE_SET_MAC_DST, }; /** @@ -1868,6 +2014,114 @@ struct rte_flow_action_nvgre_encap { struct rte_flow_item *definition; }; +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * RTE_FLOW_ACTION_TYPE_RAW_ENCAP + * + * Raw tunnel end-point encapsulation data definition. + * + * The data holds the headers definitions to be applied on the packet. + * The data must start with ETH header up to the tunnel item header itself. + * When used right after RAW_DECAP (for decapsulating L3 tunnel type for + * example MPLSoGRE) the data will just hold layer 2 header. + * + * The preserve parameter holds which bits in the packet the PMD is not allowed + * to change, this parameter can also be NULL and then the PMD is allowed + * to update any field. + * + * size holds the number of bytes in @p data and @p preserve. + */ +struct rte_flow_action_raw_encap { + uint8_t *data; /**< Encapsulation data. */ + uint8_t *preserve; /**< Bit-mask of @p data to preserve on output. */ + size_t size; /**< Size of @p data and @p preserve. */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * RTE_FLOW_ACTION_TYPE_RAW_DECAP + * + * Raw tunnel end-point decapsulation data definition. + * + * The data holds the headers definitions to be removed from the packet. + * The data must start with ETH header up to the tunnel item header itself. + * When used right before RAW_DECAP (for encapsulating L3 tunnel type for + * example MPLSoGRE) the data will just hold layer 2 header. + * + * size holds the number of bytes in @p data. + */ +struct rte_flow_action_raw_decap { + uint8_t *data; /**< Encapsulation data. */ + size_t size; /**< Size of @p data and @p preserve. */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC + * RTE_FLOW_ACTION_TYPE_SET_IPV4_DST + * + * Allows modification of IPv4 source (RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC) + * and destination address (RTE_FLOW_ACTION_TYPE_SET_IPV4_DST) in the + * specified outermost IPv4 header. + */ +struct rte_flow_action_set_ipv4 { + rte_be32_t ipv4_addr; +}; + +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC + * RTE_FLOW_ACTION_TYPE_SET_IPV6_DST + * + * Allows modification of IPv6 source (RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC) + * and destination address (RTE_FLOW_ACTION_TYPE_SET_IPV6_DST) in the + * specified outermost IPv6 header. + */ +struct rte_flow_action_set_ipv6 { + uint8_t ipv6_addr[16]; +}; + +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * RTE_FLOW_ACTION_TYPE_SET_TP_SRC + * RTE_FLOW_ACTION_TYPE_SET_TP_DST + * + * Allows modification of source (RTE_FLOW_ACTION_TYPE_SET_TP_SRC) + * and destination (RTE_FLOW_ACTION_TYPE_SET_TP_DST) port numbers + * in the specified outermost TCP/UDP header. + */ +struct rte_flow_action_set_tp { + rte_be16_t port; +}; + +/** + * RTE_FLOW_ACTION_TYPE_SET_TTL + * + * Set the TTL value directly for IPv4 or IPv6 + */ +struct rte_flow_action_set_ttl { + uint8_t ttl_value; +}; + +/** + * RTE_FLOW_ACTION_TYPE_SET_MAC + * + * Set MAC address from the matched flow + */ +struct rte_flow_action_set_mac { + uint8_t mac_addr[ETHER_ADDR_LEN]; +}; + /* * Definition of a single action. * @@ -1931,6 +2185,175 @@ struct rte_flow_error { const char *message; /**< Human-readable error message. */ }; +/** + * Complete flow rule description. 
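Aside, not part of the patch: a sketch of the RAW_ENCAP action. The 50-byte buffer stands in for an Ethernet/IPv4/UDP/VXLAN header template the application would build elsewhere; leaving preserve at NULL allows the PMD to update any field of the template.

	static uint8_t encap_hdr[50]; /* header template filled in elsewhere */
	const struct rte_flow_action_raw_encap raw_encap = {
		.data = encap_hdr,
		.preserve = NULL,
		.size = sizeof(encap_hdr),
	};
	const struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_RAW_ENCAP, .conf = &raw_encap },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};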
+ * + * This object type is used when converting a flow rule description. + * + * @see RTE_FLOW_CONV_OP_RULE + * @see rte_flow_conv() + */ +RTE_STD_C11 +struct rte_flow_conv_rule { + union { + const struct rte_flow_attr *attr_ro; /**< RO attributes. */ + struct rte_flow_attr *attr; /**< Attributes. */ + }; + union { + const struct rte_flow_item *pattern_ro; /**< RO pattern. */ + struct rte_flow_item *pattern; /**< Pattern items. */ + }; + union { + const struct rte_flow_action *actions_ro; /**< RO actions. */ + struct rte_flow_action *actions; /**< List of actions. */ + }; +}; + +/** + * Conversion operations for flow API objects. + * + * @see rte_flow_conv() + */ +enum rte_flow_conv_op { + /** + * No operation to perform. + * + * rte_flow_conv() simply returns 0. + */ + RTE_FLOW_CONV_OP_NONE, + + /** + * Convert attributes structure. + * + * This is a basic copy of an attributes structure. + * + * - @p src type: + * @code const struct rte_flow_attr * @endcode + * - @p dst type: + * @code struct rte_flow_attr * @endcode + */ + RTE_FLOW_CONV_OP_ATTR, + + /** + * Convert a single item. + * + * Duplicates @p spec, @p last and @p mask but not outside objects. + * + * - @p src type: + * @code const struct rte_flow_item * @endcode + * - @p dst type: + * @code struct rte_flow_item * @endcode + */ + RTE_FLOW_CONV_OP_ITEM, + + /** + * Convert a single action. + * + * Duplicates @p conf but not outside objects. + * + * - @p src type: + * @code const struct rte_flow_action * @endcode + * - @p dst type: + * @code struct rte_flow_action * @endcode + */ + RTE_FLOW_CONV_OP_ACTION, + + /** + * Convert an entire pattern. + * + * Duplicates all pattern items at once with the same constraints as + * RTE_FLOW_CONV_OP_ITEM. + * + * - @p src type: + * @code const struct rte_flow_item * @endcode + * - @p dst type: + * @code struct rte_flow_item * @endcode + */ + RTE_FLOW_CONV_OP_PATTERN, + + /** + * Convert a list of actions. + * + * Duplicates the entire list of actions at once with the same + * constraints as RTE_FLOW_CONV_OP_ACTION. + * + * - @p src type: + * @code const struct rte_flow_action * @endcode + * - @p dst type: + * @code struct rte_flow_action * @endcode + */ + RTE_FLOW_CONV_OP_ACTIONS, + + /** + * Convert a complete flow rule description. + * + * Comprises attributes, pattern and actions together at once with + * the usual constraints. + * + * - @p src type: + * @code const struct rte_flow_conv_rule * @endcode + * - @p dst type: + * @code struct rte_flow_conv_rule * @endcode + */ + RTE_FLOW_CONV_OP_RULE, + + /** + * Convert item type to its name string. + * + * Writes a NUL-terminated string to @p dst. Like snprintf(), the + * returned value excludes the terminator which is always written + * nonetheless. + * + * - @p src type: + * @code (const void *)enum rte_flow_item_type @endcode + * - @p dst type: + * @code char * @endcode + **/ + RTE_FLOW_CONV_OP_ITEM_NAME, + + /** + * Convert action type to its name string. + * + * Writes a NUL-terminated string to @p dst. Like snprintf(), the + * returned value excludes the terminator which is always written + * nonetheless. + * + * - @p src type: + * @code (const void *)enum rte_flow_action_type @endcode + * - @p dst type: + * @code char * @endcode + **/ + RTE_FLOW_CONV_OP_ACTION_NAME, + + /** + * Convert item type to pointer to item name. + * + * Retrieves item name pointer from its type. The string itself is + * not copied; instead, a unique pointer to an internal static + * constant storage is written to @p dst. 
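Aside, not from the patch: the *_NAME_PTR operations write a pointer rather than copying the string, so the destination must have room for a const char *. A sketch:

	const char *name = NULL;
	struct rte_flow_error err;

	if (rte_flow_conv(RTE_FLOW_CONV_OP_ACTION_NAME_PTR, &name, sizeof(name),
			  (const void *)(uintptr_t)RTE_FLOW_ACTION_TYPE_QUEUE,
			  &err) < 0)
		return;
	/* name now points at the constant string "QUEUE"; nothing was copied. */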
+ * + * - @p src type: + * @code (const void *)enum rte_flow_item_type @endcode + * - @p dst type: + * @code const char ** @endcode + */ + RTE_FLOW_CONV_OP_ITEM_NAME_PTR, + + /** + * Convert action type to pointer to action name. + * + * Retrieves action name pointer from its type. The string itself is + * not copied; instead, a unique pointer to an internal static + * constant storage is written to @p dst. + * + * - @p src type: + * @code (const void *)enum rte_flow_action_type @endcode + * - @p dst type: + * @code const char ** @endcode + */ + RTE_FLOW_CONV_OP_ACTION_NAME_PTR, +}; + /** * Check whether a flow rule can be created on a given port. * @@ -2162,10 +2585,8 @@ rte_flow_error_set(struct rte_flow_error *error, const char *message); /** - * Generic flow representation. - * - * This form is sufficient to describe an rte_flow independently from any - * PMD implementation and allows for replayability and identification. + * @deprecated + * @see rte_flow_copy() */ struct rte_flow_desc { size_t size; /**< Allocated space including data[]. */ @@ -2176,8 +2597,14 @@ struct rte_flow_desc { }; /** + * @deprecated * Copy an rte_flow rule description. * + * This interface is kept for compatibility with older applications but is + * implemented as a wrapper to rte_flow_conv(). It is deprecated due to its + * lack of flexibility and reliance on a type unusable with C++ programs + * (struct rte_flow_desc). + * * @param[in] fd * Flow rule description. * @param[in] len @@ -2195,12 +2622,61 @@ struct rte_flow_desc { * If len is lower than the size of the flow, the number of bytes that would * have been written to desc had it been sufficient. Nothing is written. */ +__rte_deprecated size_t rte_flow_copy(struct rte_flow_desc *fd, size_t len, const struct rte_flow_attr *attr, const struct rte_flow_item *items, const struct rte_flow_action *actions); +/** + * Flow object conversion helper. + * + * This function performs conversion of various flow API objects to a + * pre-allocated destination buffer. See enum rte_flow_conv_op for possible + * operations and details about each of them. + * + * Since destination buffer must be large enough, it works in a manner + * reminiscent of snprintf(): + * + * - If @p size is 0, @p dst may be a NULL pointer, otherwise @p dst must be + * non-NULL. + * - If positive, the returned value represents the number of bytes needed + * to store the conversion of @p src to @p dst according to @p op + * regardless of the @p size parameter. + * - Since no more than @p size bytes can be written to @p dst, output is + * truncated and may be inconsistent when the returned value is larger + * than that. + * - In case of conversion error, a negative error code is returned and + * @p dst contents are unspecified. + * + * @param op + * Operation to perform, related to the object type of @p dst. + * @param[out] dst + * Destination buffer address. Must be suitably aligned by the caller. + * @param size + * Destination buffer size in bytes. + * @param[in] src + * Source object to copy. Depending on @p op, its type may differ from + * that of @p dst. + * @param[out] error + * Perform verbose error reporting if not NULL. Initialized in case of + * error only. + * + * @return + * The number of bytes required to convert @p src to @p dst on success, a + * negative errno value otherwise and rte_errno is set. 
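Aside, not part of the patch: the usual two-pass pattern for RTE_FLOW_CONV_OP_RULE, which is also what the reworked rte_flow_copy() seen earlier builds on. The attr, pattern and actions arguments are assumed to describe an existing rule, and malloc() stands in for whatever allocator the application prefers.

static struct rte_flow_conv_rule *
rule_dup(const struct rte_flow_attr *attr,
	 const struct rte_flow_item *pattern,
	 const struct rte_flow_action *actions)
{
	const struct rte_flow_conv_rule rule = {
		.attr_ro = attr,
		.pattern_ro = pattern,
		.actions_ro = actions,
	};
	struct rte_flow_error err;
	int len = rte_flow_conv(RTE_FLOW_CONV_OP_RULE, NULL, 0, &rule, &err);
	struct rte_flow_conv_rule *copy;

	if (len < 0)
		return NULL;
	copy = malloc(len);
	if (copy == NULL)
		return NULL;
	if (rte_flow_conv(RTE_FLOW_CONV_OP_RULE, copy, len, &rule, &err) < 0) {
		free(copy);
		return NULL;
	}
	/* copy->attr, copy->pattern and copy->actions all point inside copy[]. */
	return copy;
}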
+ * + * @see rte_flow_conv_op + */ +__rte_experimental +int +rte_flow_conv(enum rte_flow_conv_op op, + void *dst, + size_t size, + const void *src, + struct rte_flow_error *error); + #ifdef __cplusplus } #endif diff --git a/lib/librte_ethdev/rte_tm.h b/lib/librte_ethdev/rte_tm.h index 955f02ff..646ef388 100644 --- a/lib/librte_ethdev/rte_tm.h +++ b/lib/librte_ethdev/rte_tm.h @@ -831,10 +831,10 @@ enum rte_tm_cman_mode { */ struct rte_tm_red_params { /** Minimum queue threshold */ - uint32_t min_th; + uint64_t min_th; /** Maximum queue threshold */ - uint32_t max_th; + uint64_t max_th; /** Inverse of packet marking probability maximum value (maxp), i.e. * maxp_inv = 1 / maxp diff --git a/lib/librte_eventdev/Makefile b/lib/librte_eventdev/Makefile index 47f599a6..94961870 100644 --- a/lib/librte_eventdev/Makefile +++ b/lib/librte_eventdev/Makefile @@ -8,7 +8,7 @@ include $(RTE_SDK)/mk/rte.vars.mk LIB = librte_eventdev.a # library version -LIBABIVER := 5 +LIBABIVER := 6 # build flags CFLAGS += -DALLOW_EXPERIMENTAL_API @@ -28,6 +28,7 @@ SRCS-y += rte_event_ring.c SRCS-y += rte_event_eth_rx_adapter.c SRCS-y += rte_event_timer_adapter.c SRCS-y += rte_event_crypto_adapter.c +SRCS-y += rte_event_eth_tx_adapter.c # export include files SYMLINK-y-include += rte_eventdev.h @@ -39,6 +40,7 @@ SYMLINK-y-include += rte_event_eth_rx_adapter.h SYMLINK-y-include += rte_event_timer_adapter.h SYMLINK-y-include += rte_event_timer_adapter_pmd.h SYMLINK-y-include += rte_event_crypto_adapter.h +SYMLINK-y-include += rte_event_eth_tx_adapter.h # versioning export map EXPORT_MAP := rte_eventdev_version.map diff --git a/lib/librte_eventdev/meson.build b/lib/librte_eventdev/meson.build index 3cbaf298..6becfe86 100644 --- a/lib/librte_eventdev/meson.build +++ b/lib/librte_eventdev/meson.build @@ -1,7 +1,7 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2017 Intel Corporation -version = 5 +version = 6 allow_experimental_apis = true if host_machine.system() == 'linux' @@ -14,7 +14,8 @@ sources = files('rte_eventdev.c', 'rte_event_ring.c', 'rte_event_eth_rx_adapter.c', 'rte_event_timer_adapter.c', - 'rte_event_crypto_adapter.c') + 'rte_event_crypto_adapter.c', + 'rte_event_eth_tx_adapter.c') headers = files('rte_eventdev.h', 'rte_eventdev_pmd.h', 'rte_eventdev_pmd_pci.h', @@ -23,5 +24,6 @@ headers = files('rte_eventdev.h', 'rte_event_eth_rx_adapter.h', 'rte_event_timer_adapter.h', 'rte_event_timer_adapter_pmd.h', - 'rte_event_crypto_adapter.h') + 'rte_event_crypto_adapter.h', + 'rte_event_eth_tx_adapter.h') deps += ['ring', 'ethdev', 'hash', 'mempool', 'mbuf', 'timer', 'cryptodev'] diff --git a/lib/librte_eventdev/rte_event_eth_rx_adapter.c b/lib/librte_eventdev/rte_event_eth_rx_adapter.c index f5e5a0b5..71d008cd 100644 --- a/lib/librte_eventdev/rte_event_eth_rx_adapter.c +++ b/lib/librte_eventdev/rte_event_eth_rx_adapter.c @@ -1125,7 +1125,6 @@ rxa_poll(struct rte_event_eth_rx_adapter *rx_adapter) wrr_pos = rx_adapter->wrr_pos; max_nb_rx = rx_adapter->max_nb_rx; buf = &rx_adapter->event_enqueue_buffer; - stats = &rx_adapter->stats; /* Iterate through a WRR sequence */ for (num_queue = 0; num_queue < rx_adapter->wrr_len; num_queue++) { @@ -1998,8 +1997,7 @@ rte_event_eth_rx_adapter_create_ext(uint8_t id, uint8_t dev_id, rx_adapter->id = id; strcpy(rx_adapter->mem_name, mem_name); rx_adapter->eth_devices = rte_zmalloc_socket(rx_adapter->mem_name, - /* FIXME: incompatible with hotplug */ - rte_eth_dev_count_total() * + RTE_MAX_ETHPORTS * sizeof(struct eth_device_info), 0, socket_id); rte_convert_rss_key((const 
uint32_t *)default_rss_key, @@ -2012,7 +2010,7 @@ rte_event_eth_rx_adapter_create_ext(uint8_t id, uint8_t dev_id, return -ENOMEM; } rte_spinlock_init(&rx_adapter->rx_lock); - RTE_ETH_FOREACH_DEV(i) + for (i = 0; i < RTE_MAX_ETHPORTS; i++) rx_adapter->eth_devices[i].dev = &rte_eth_devices[i]; event_eth_rx_adapter[id] = rx_adapter; diff --git a/lib/librte_eventdev/rte_event_eth_rx_adapter.h b/lib/librte_eventdev/rte_event_eth_rx_adapter.h index 332ee216..863b72a1 100644 --- a/lib/librte_eventdev/rte_event_eth_rx_adapter.h +++ b/lib/librte_eventdev/rte_event_eth_rx_adapter.h @@ -76,10 +76,6 @@ * rte_event_eth_rx_adapter_cb_register() function allows the * application to register a callback that selects which packets to enqueue * to the event device. - * - * Note: - * 1) Devices created after an instance of rte_event_eth_rx_adapter_create - * should be added to a new instance of the rx adapter. */ #ifdef __cplusplus diff --git a/lib/librte_eventdev/rte_event_eth_tx_adapter.c b/lib/librte_eventdev/rte_event_eth_tx_adapter.c new file mode 100644 index 00000000..3a21defb --- /dev/null +++ b/lib/librte_eventdev/rte_event_eth_tx_adapter.c @@ -0,0 +1,1138 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation. + */ +#include +#include +#include + +#include "rte_eventdev_pmd.h" +#include "rte_event_eth_tx_adapter.h" + +#define TXA_BATCH_SIZE 32 +#define TXA_SERVICE_NAME_LEN 32 +#define TXA_MEM_NAME_LEN 32 +#define TXA_FLUSH_THRESHOLD 1024 +#define TXA_RETRY_CNT 100 +#define TXA_MAX_NB_TX 128 +#define TXA_INVALID_DEV_ID INT32_C(-1) +#define TXA_INVALID_SERVICE_ID INT64_C(-1) + +#define txa_evdev(id) (&rte_eventdevs[txa_dev_id_array[(id)]]) + +#define txa_dev_caps_get(id) txa_evdev((id))->dev_ops->eth_tx_adapter_caps_get + +#define txa_dev_adapter_create(t) txa_evdev(t)->dev_ops->eth_tx_adapter_create + +#define txa_dev_adapter_create_ext(t) \ + txa_evdev(t)->dev_ops->eth_tx_adapter_create + +#define txa_dev_adapter_free(t) txa_evdev(t)->dev_ops->eth_tx_adapter_free + +#define txa_dev_queue_add(id) txa_evdev(id)->dev_ops->eth_tx_adapter_queue_add + +#define txa_dev_queue_del(t) txa_evdev(t)->dev_ops->eth_tx_adapter_queue_del + +#define txa_dev_start(t) txa_evdev(t)->dev_ops->eth_tx_adapter_start + +#define txa_dev_stop(t) txa_evdev(t)->dev_ops->eth_tx_adapter_stop + +#define txa_dev_stats_reset(t) txa_evdev(t)->dev_ops->eth_tx_adapter_stats_reset + +#define txa_dev_stats_get(t) txa_evdev(t)->dev_ops->eth_tx_adapter_stats_get + +#define RTE_EVENT_ETH_TX_ADAPTER_ID_VALID_OR_ERR_RET(id, retval) \ +do { \ + if (!txa_valid_id(id)) { \ + RTE_EDEV_LOG_ERR("Invalid eth Rx adapter id = %d", id); \ + return retval; \ + } \ +} while (0) + +#define TXA_CHECK_OR_ERR_RET(id) \ +do {\ + int ret; \ + RTE_EVENT_ETH_TX_ADAPTER_ID_VALID_OR_ERR_RET((id), -EINVAL); \ + ret = txa_init(); \ + if (ret != 0) \ + return ret; \ + if (!txa_adapter_exist((id))) \ + return -EINVAL; \ +} while (0) + +/* Tx retry callback structure */ +struct txa_retry { + /* Ethernet port id */ + uint16_t port_id; + /* Tx queue */ + uint16_t tx_queue; + /* Adapter ID */ + uint8_t id; +}; + +/* Per queue structure */ +struct txa_service_queue_info { + /* Queue has been added */ + uint8_t added; + /* Retry callback argument */ + struct txa_retry txa_retry; + /* Tx buffer */ + struct rte_eth_dev_tx_buffer *tx_buf; +}; + +/* PMD private structure */ +struct txa_service_data { + /* Max mbufs processed in any service function invocation */ + uint32_t max_nb_tx; + /* Number of Tx queues in adapter */ + uint32_t nb_queues; + /* 
Synchronization with data path */ + rte_spinlock_t tx_lock; + /* Event port ID */ + uint8_t port_id; + /* Event device identifier */ + uint8_t eventdev_id; + /* Highest port id supported + 1 */ + uint16_t dev_count; + /* Loop count to flush Tx buffers */ + int loop_cnt; + /* Per ethernet device structure */ + struct txa_service_ethdev *txa_ethdev; + /* Statistics */ + struct rte_event_eth_tx_adapter_stats stats; + /* Adapter Identifier */ + uint8_t id; + /* Conf arg must be freed */ + uint8_t conf_free; + /* Configuration callback */ + rte_event_eth_tx_adapter_conf_cb conf_cb; + /* Configuration callback argument */ + void *conf_arg; + /* socket id */ + int socket_id; + /* Per adapter EAL service */ + int64_t service_id; + /* Memory allocation name */ + char mem_name[TXA_MEM_NAME_LEN]; +} __rte_cache_aligned; + +/* Per eth device structure */ +struct txa_service_ethdev { + /* Pointer to ethernet device */ + struct rte_eth_dev *dev; + /* Number of queues added */ + uint16_t nb_queues; + /* PMD specific queue data */ + void *queues; +}; + +/* Array of adapter instances, initialized with event device id + * when adapter is created + */ +static int *txa_dev_id_array; + +/* Array of pointers to service implementation data */ +static struct txa_service_data **txa_service_data_array; + +static int32_t txa_service_func(void *args); +static int txa_service_adapter_create_ext(uint8_t id, + struct rte_eventdev *dev, + rte_event_eth_tx_adapter_conf_cb conf_cb, + void *conf_arg); +static int txa_service_queue_del(uint8_t id, + const struct rte_eth_dev *dev, + int32_t tx_queue_id); + +static int +txa_adapter_exist(uint8_t id) +{ + return txa_dev_id_array[id] != TXA_INVALID_DEV_ID; +} + +static inline int +txa_valid_id(uint8_t id) +{ + return id < RTE_EVENT_ETH_TX_ADAPTER_MAX_INSTANCE; +} + +static void * +txa_memzone_array_get(const char *name, unsigned int elt_size, int nb_elems) +{ + const struct rte_memzone *mz; + unsigned int sz; + + sz = elt_size * nb_elems; + sz = RTE_ALIGN(sz, RTE_CACHE_LINE_SIZE); + + mz = rte_memzone_lookup(name); + if (mz == NULL) { + mz = rte_memzone_reserve_aligned(name, sz, rte_socket_id(), 0, + RTE_CACHE_LINE_SIZE); + if (mz == NULL) { + RTE_EDEV_LOG_ERR("failed to reserve memzone" + " name = %s err = %" + PRId32, name, rte_errno); + return NULL; + } + } + + return mz->addr; +} + +static int +txa_dev_id_array_init(void) +{ + if (txa_dev_id_array == NULL) { + int i; + + txa_dev_id_array = txa_memzone_array_get("txa_adapter_array", + sizeof(int), + RTE_EVENT_ETH_TX_ADAPTER_MAX_INSTANCE); + if (txa_dev_id_array == NULL) + return -ENOMEM; + + for (i = 0; i < RTE_EVENT_ETH_TX_ADAPTER_MAX_INSTANCE; i++) + txa_dev_id_array[i] = TXA_INVALID_DEV_ID; + } + + return 0; +} + +static int +txa_init(void) +{ + return txa_dev_id_array_init(); +} + +static int +txa_service_data_init(void) +{ + if (txa_service_data_array == NULL) { + txa_service_data_array = + txa_memzone_array_get("txa_service_data_array", + sizeof(int), + RTE_EVENT_ETH_TX_ADAPTER_MAX_INSTANCE); + if (txa_service_data_array == NULL) + return -ENOMEM; + } + + return 0; +} + +static inline struct txa_service_data * +txa_service_id_to_data(uint8_t id) +{ + return txa_service_data_array[id]; +} + +static inline struct txa_service_queue_info * +txa_service_queue(struct txa_service_data *txa, uint16_t port_id, + uint16_t tx_queue_id) +{ + struct txa_service_queue_info *tqi; + + if (unlikely(txa->txa_ethdev == NULL || txa->dev_count < port_id + 1)) + return NULL; + + tqi = txa->txa_ethdev[port_id].queues; + + return likely(tqi != 
NULL) ? tqi + tx_queue_id : NULL; +} + +static int +txa_service_conf_cb(uint8_t __rte_unused id, uint8_t dev_id, + struct rte_event_eth_tx_adapter_conf *conf, void *arg) +{ + int ret; + struct rte_eventdev *dev; + struct rte_event_port_conf *pc; + struct rte_event_dev_config dev_conf; + int started; + uint8_t port_id; + + pc = arg; + dev = &rte_eventdevs[dev_id]; + dev_conf = dev->data->dev_conf; + + started = dev->data->dev_started; + if (started) + rte_event_dev_stop(dev_id); + + port_id = dev_conf.nb_event_ports; + dev_conf.nb_event_ports += 1; + + ret = rte_event_dev_configure(dev_id, &dev_conf); + if (ret) { + RTE_EDEV_LOG_ERR("failed to configure event dev %u", + dev_id); + if (started) { + if (rte_event_dev_start(dev_id)) + return -EIO; + } + return ret; + } + + pc->disable_implicit_release = 0; + ret = rte_event_port_setup(dev_id, port_id, pc); + if (ret) { + RTE_EDEV_LOG_ERR("failed to setup event port %u\n", + port_id); + if (started) { + if (rte_event_dev_start(dev_id)) + return -EIO; + } + return ret; + } + + conf->event_port_id = port_id; + conf->max_nb_tx = TXA_MAX_NB_TX; + if (started) + ret = rte_event_dev_start(dev_id); + return ret; +} + +static int +txa_service_ethdev_alloc(struct txa_service_data *txa) +{ + struct txa_service_ethdev *txa_ethdev; + uint16_t i, dev_count; + + dev_count = rte_eth_dev_count_avail(); + if (txa->txa_ethdev && dev_count == txa->dev_count) + return 0; + + txa_ethdev = rte_zmalloc_socket(txa->mem_name, + dev_count * sizeof(*txa_ethdev), + 0, + txa->socket_id); + if (txa_ethdev == NULL) { + RTE_EDEV_LOG_ERR("Failed to alloc txa::txa_ethdev "); + return -ENOMEM; + } + + if (txa->dev_count) + memcpy(txa_ethdev, txa->txa_ethdev, + txa->dev_count * sizeof(*txa_ethdev)); + + RTE_ETH_FOREACH_DEV(i) { + if (i == dev_count) + break; + txa_ethdev[i].dev = &rte_eth_devices[i]; + } + + txa->txa_ethdev = txa_ethdev; + txa->dev_count = dev_count; + return 0; +} + +static int +txa_service_queue_array_alloc(struct txa_service_data *txa, + uint16_t port_id) +{ + struct txa_service_queue_info *tqi; + uint16_t nb_queue; + int ret; + + ret = txa_service_ethdev_alloc(txa); + if (ret != 0) + return ret; + + if (txa->txa_ethdev[port_id].queues) + return 0; + + nb_queue = txa->txa_ethdev[port_id].dev->data->nb_tx_queues; + tqi = rte_zmalloc_socket(txa->mem_name, + nb_queue * + sizeof(struct txa_service_queue_info), 0, + txa->socket_id); + if (tqi == NULL) + return -ENOMEM; + txa->txa_ethdev[port_id].queues = tqi; + return 0; +} + +static void +txa_service_queue_array_free(struct txa_service_data *txa, + uint16_t port_id) +{ + struct txa_service_ethdev *txa_ethdev; + struct txa_service_queue_info *tqi; + + txa_ethdev = &txa->txa_ethdev[port_id]; + if (txa->txa_ethdev == NULL || txa_ethdev->nb_queues != 0) + return; + + tqi = txa_ethdev->queues; + txa_ethdev->queues = NULL; + rte_free(tqi); + + if (txa->nb_queues == 0) { + rte_free(txa->txa_ethdev); + txa->txa_ethdev = NULL; + } +} + +static void +txa_service_unregister(struct txa_service_data *txa) +{ + if (txa->service_id != TXA_INVALID_SERVICE_ID) { + rte_service_component_runstate_set(txa->service_id, 0); + while (rte_service_may_be_active(txa->service_id)) + rte_pause(); + rte_service_component_unregister(txa->service_id); + } + txa->service_id = TXA_INVALID_SERVICE_ID; +} + +static int +txa_service_register(struct txa_service_data *txa) +{ + int ret; + struct rte_service_spec service; + struct rte_event_eth_tx_adapter_conf conf; + + if (txa->service_id != TXA_INVALID_SERVICE_ID) + return 0; + + memset(&service, 0, 
sizeof(service)); + snprintf(service.name, TXA_SERVICE_NAME_LEN, "txa_%d", txa->id); + service.socket_id = txa->socket_id; + service.callback = txa_service_func; + service.callback_userdata = txa; + service.capabilities = RTE_SERVICE_CAP_MT_SAFE; + ret = rte_service_component_register(&service, + (uint32_t *)&txa->service_id); + if (ret) { + RTE_EDEV_LOG_ERR("failed to register service %s err = %" + PRId32, service.name, ret); + return ret; + } + + ret = txa->conf_cb(txa->id, txa->eventdev_id, &conf, txa->conf_arg); + if (ret) { + txa_service_unregister(txa); + return ret; + } + + rte_service_component_runstate_set(txa->service_id, 1); + txa->port_id = conf.event_port_id; + txa->max_nb_tx = conf.max_nb_tx; + return 0; +} + +static struct rte_eth_dev_tx_buffer * +txa_service_tx_buf_alloc(struct txa_service_data *txa, + const struct rte_eth_dev *dev) +{ + struct rte_eth_dev_tx_buffer *tb; + uint16_t port_id; + + port_id = dev->data->port_id; + tb = rte_zmalloc_socket(txa->mem_name, + RTE_ETH_TX_BUFFER_SIZE(TXA_BATCH_SIZE), + 0, + rte_eth_dev_socket_id(port_id)); + if (tb == NULL) + RTE_EDEV_LOG_ERR("Failed to allocate memory for tx buffer"); + return tb; +} + +static int +txa_service_is_queue_added(struct txa_service_data *txa, + const struct rte_eth_dev *dev, + uint16_t tx_queue_id) +{ + struct txa_service_queue_info *tqi; + + tqi = txa_service_queue(txa, dev->data->port_id, tx_queue_id); + return tqi && tqi->added; +} + +static int +txa_service_ctrl(uint8_t id, int start) +{ + int ret; + struct txa_service_data *txa; + + txa = txa_service_id_to_data(id); + if (txa->service_id == TXA_INVALID_SERVICE_ID) + return 0; + + ret = rte_service_runstate_set(txa->service_id, start); + if (ret == 0 && !start) { + while (rte_service_may_be_active(txa->service_id)) + rte_pause(); + } + return ret; +} + +static void +txa_service_buffer_retry(struct rte_mbuf **pkts, uint16_t unsent, + void *userdata) +{ + struct txa_retry *tr; + struct txa_service_data *data; + struct rte_event_eth_tx_adapter_stats *stats; + uint16_t sent = 0; + unsigned int retry = 0; + uint16_t i, n; + + tr = (struct txa_retry *)(uintptr_t)userdata; + data = txa_service_id_to_data(tr->id); + stats = &data->stats; + + do { + n = rte_eth_tx_burst(tr->port_id, tr->tx_queue, + &pkts[sent], unsent - sent); + + sent += n; + } while (sent != unsent && retry++ < TXA_RETRY_CNT); + + for (i = sent; i < unsent; i++) + rte_pktmbuf_free(pkts[i]); + + stats->tx_retry += retry; + stats->tx_packets += sent; + stats->tx_dropped += unsent - sent; +} + +static void +txa_service_tx(struct txa_service_data *txa, struct rte_event *ev, + uint32_t n) +{ + uint32_t i; + uint16_t nb_tx; + struct rte_event_eth_tx_adapter_stats *stats; + + stats = &txa->stats; + + nb_tx = 0; + for (i = 0; i < n; i++) { + struct rte_mbuf *m; + uint16_t port; + uint16_t queue; + struct txa_service_queue_info *tqi; + + m = ev[i].mbuf; + port = m->port; + queue = rte_event_eth_tx_adapter_txq_get(m); + + tqi = txa_service_queue(txa, port, queue); + if (unlikely(tqi == NULL || !tqi->added)) { + rte_pktmbuf_free(m); + continue; + } + + nb_tx += rte_eth_tx_buffer(port, queue, tqi->tx_buf, m); + } + + stats->tx_packets += nb_tx; +} + +static int32_t +txa_service_func(void *args) +{ + struct txa_service_data *txa = args; + uint8_t dev_id; + uint8_t port; + uint16_t n; + uint32_t nb_tx, max_nb_tx; + struct rte_event ev[TXA_BATCH_SIZE]; + + dev_id = txa->eventdev_id; + max_nb_tx = txa->max_nb_tx; + port = txa->port_id; + + if (txa->nb_queues == 0) + return 0; + + if 
(!rte_spinlock_trylock(&txa->tx_lock)) + return 0; + + for (nb_tx = 0; nb_tx < max_nb_tx; nb_tx += n) { + + n = rte_event_dequeue_burst(dev_id, port, ev, RTE_DIM(ev), 0); + if (!n) + break; + txa_service_tx(txa, ev, n); + } + + if ((txa->loop_cnt++ & (TXA_FLUSH_THRESHOLD - 1)) == 0) { + + struct txa_service_ethdev *tdi; + struct txa_service_queue_info *tqi; + struct rte_eth_dev *dev; + uint16_t i; + + tdi = txa->txa_ethdev; + nb_tx = 0; + + RTE_ETH_FOREACH_DEV(i) { + uint16_t q; + + if (i == txa->dev_count) + break; + + dev = tdi[i].dev; + if (tdi[i].nb_queues == 0) + continue; + for (q = 0; q < dev->data->nb_tx_queues; q++) { + + tqi = txa_service_queue(txa, i, q); + if (unlikely(tqi == NULL || !tqi->added)) + continue; + + nb_tx += rte_eth_tx_buffer_flush(i, q, + tqi->tx_buf); + } + } + + txa->stats.tx_packets += nb_tx; + } + rte_spinlock_unlock(&txa->tx_lock); + return 0; +} + +static int +txa_service_adapter_create(uint8_t id, struct rte_eventdev *dev, + struct rte_event_port_conf *port_conf) +{ + struct txa_service_data *txa; + struct rte_event_port_conf *cb_conf; + int ret; + + cb_conf = rte_malloc(NULL, sizeof(*cb_conf), 0); + if (cb_conf == NULL) + return -ENOMEM; + + *cb_conf = *port_conf; + ret = txa_service_adapter_create_ext(id, dev, txa_service_conf_cb, + cb_conf); + if (ret) { + rte_free(cb_conf); + return ret; + } + + txa = txa_service_id_to_data(id); + txa->conf_free = 1; + return ret; +} + +static int +txa_service_adapter_create_ext(uint8_t id, struct rte_eventdev *dev, + rte_event_eth_tx_adapter_conf_cb conf_cb, + void *conf_arg) +{ + struct txa_service_data *txa; + int socket_id; + char mem_name[TXA_SERVICE_NAME_LEN]; + int ret; + + if (conf_cb == NULL) + return -EINVAL; + + socket_id = dev->data->socket_id; + snprintf(mem_name, TXA_MEM_NAME_LEN, + "rte_event_eth_txa_%d", + id); + + ret = txa_service_data_init(); + if (ret != 0) + return ret; + + txa = rte_zmalloc_socket(mem_name, + sizeof(*txa), + RTE_CACHE_LINE_SIZE, socket_id); + if (txa == NULL) { + RTE_EDEV_LOG_ERR("failed to get mem for tx adapter"); + return -ENOMEM; + } + + txa->id = id; + txa->eventdev_id = dev->data->dev_id; + txa->socket_id = socket_id; + strncpy(txa->mem_name, mem_name, TXA_SERVICE_NAME_LEN); + txa->conf_cb = conf_cb; + txa->conf_arg = conf_arg; + txa->service_id = TXA_INVALID_SERVICE_ID; + rte_spinlock_init(&txa->tx_lock); + txa_service_data_array[id] = txa; + + return 0; +} + +static int +txa_service_event_port_get(uint8_t id, uint8_t *port) +{ + struct txa_service_data *txa; + + txa = txa_service_id_to_data(id); + if (txa->service_id == TXA_INVALID_SERVICE_ID) + return -ENODEV; + + *port = txa->port_id; + return 0; +} + +static int +txa_service_adapter_free(uint8_t id) +{ + struct txa_service_data *txa; + + txa = txa_service_id_to_data(id); + if (txa->nb_queues) { + RTE_EDEV_LOG_ERR("%" PRIu16 " Tx queues not deleted", + txa->nb_queues); + return -EBUSY; + } + + if (txa->conf_free) + rte_free(txa->conf_arg); + rte_free(txa); + return 0; +} + +static int +txa_service_queue_add(uint8_t id, + __rte_unused struct rte_eventdev *dev, + const struct rte_eth_dev *eth_dev, + int32_t tx_queue_id) +{ + struct txa_service_data *txa; + struct txa_service_ethdev *tdi; + struct txa_service_queue_info *tqi; + struct rte_eth_dev_tx_buffer *tb; + struct txa_retry *txa_retry; + int ret; + + txa = txa_service_id_to_data(id); + + if (tx_queue_id == -1) { + int nb_queues; + uint16_t i, j; + uint16_t *qdone; + + nb_queues = eth_dev->data->nb_tx_queues; + if (txa->dev_count > eth_dev->data->port_id) { + tdi = 
&txa->txa_ethdev[eth_dev->data->port_id]; + nb_queues -= tdi->nb_queues; + } + + qdone = rte_zmalloc(txa->mem_name, + nb_queues * sizeof(*qdone), 0); + j = 0; + for (i = 0; i < nb_queues; i++) { + if (txa_service_is_queue_added(txa, eth_dev, i)) + continue; + ret = txa_service_queue_add(id, dev, eth_dev, i); + if (ret == 0) + qdone[j++] = i; + else + break; + } + + if (i != nb_queues) { + for (i = 0; i < j; i++) + txa_service_queue_del(id, eth_dev, qdone[i]); + } + rte_free(qdone); + return ret; + } + + ret = txa_service_register(txa); + if (ret) + return ret; + + rte_spinlock_lock(&txa->tx_lock); + + if (txa_service_is_queue_added(txa, eth_dev, tx_queue_id)) { + rte_spinlock_unlock(&txa->tx_lock); + return 0; + } + + ret = txa_service_queue_array_alloc(txa, eth_dev->data->port_id); + if (ret) + goto err_unlock; + + tb = txa_service_tx_buf_alloc(txa, eth_dev); + if (tb == NULL) + goto err_unlock; + + tdi = &txa->txa_ethdev[eth_dev->data->port_id]; + tqi = txa_service_queue(txa, eth_dev->data->port_id, tx_queue_id); + + txa_retry = &tqi->txa_retry; + txa_retry->id = txa->id; + txa_retry->port_id = eth_dev->data->port_id; + txa_retry->tx_queue = tx_queue_id; + + rte_eth_tx_buffer_init(tb, TXA_BATCH_SIZE); + rte_eth_tx_buffer_set_err_callback(tb, + txa_service_buffer_retry, txa_retry); + + tqi->tx_buf = tb; + tqi->added = 1; + tdi->nb_queues++; + txa->nb_queues++; + +err_unlock: + if (txa->nb_queues == 0) { + txa_service_queue_array_free(txa, + eth_dev->data->port_id); + txa_service_unregister(txa); + } + + rte_spinlock_unlock(&txa->tx_lock); + return 0; +} + +static int +txa_service_queue_del(uint8_t id, + const struct rte_eth_dev *dev, + int32_t tx_queue_id) +{ + struct txa_service_data *txa; + struct txa_service_queue_info *tqi; + struct rte_eth_dev_tx_buffer *tb; + uint16_t port_id; + + if (tx_queue_id == -1) { + uint16_t i; + int ret = -1; + + for (i = 0; i < dev->data->nb_tx_queues; i++) { + ret = txa_service_queue_del(id, dev, i); + if (ret != 0) + break; + } + return ret; + } + + txa = txa_service_id_to_data(id); + port_id = dev->data->port_id; + + tqi = txa_service_queue(txa, port_id, tx_queue_id); + if (tqi == NULL || !tqi->added) + return 0; + + tb = tqi->tx_buf; + tqi->added = 0; + tqi->tx_buf = NULL; + rte_free(tb); + txa->nb_queues--; + txa->txa_ethdev[port_id].nb_queues--; + + txa_service_queue_array_free(txa, port_id); + return 0; +} + +static int +txa_service_id_get(uint8_t id, uint32_t *service_id) +{ + struct txa_service_data *txa; + + txa = txa_service_id_to_data(id); + if (txa->service_id == TXA_INVALID_SERVICE_ID) + return -ESRCH; + + if (service_id == NULL) + return -EINVAL; + + *service_id = txa->service_id; + return 0; +} + +static int +txa_service_start(uint8_t id) +{ + return txa_service_ctrl(id, 1); +} + +static int +txa_service_stats_get(uint8_t id, + struct rte_event_eth_tx_adapter_stats *stats) +{ + struct txa_service_data *txa; + + txa = txa_service_id_to_data(id); + *stats = txa->stats; + return 0; +} + +static int +txa_service_stats_reset(uint8_t id) +{ + struct txa_service_data *txa; + + txa = txa_service_id_to_data(id); + memset(&txa->stats, 0, sizeof(txa->stats)); + return 0; +} + +static int +txa_service_stop(uint8_t id) +{ + return txa_service_ctrl(id, 0); +} + + +int __rte_experimental +rte_event_eth_tx_adapter_create(uint8_t id, uint8_t dev_id, + struct rte_event_port_conf *port_conf) +{ + struct rte_eventdev *dev; + int ret; + + if (port_conf == NULL) + return -EINVAL; + + RTE_EVENT_ETH_TX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + 
RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + + dev = &rte_eventdevs[dev_id]; + + ret = txa_init(); + if (ret != 0) + return ret; + + if (txa_adapter_exist(id)) + return -EEXIST; + + txa_dev_id_array[id] = dev_id; + if (txa_dev_adapter_create(id)) + ret = txa_dev_adapter_create(id)(id, dev); + + if (ret != 0) { + txa_dev_id_array[id] = TXA_INVALID_DEV_ID; + return ret; + } + + ret = txa_service_adapter_create(id, dev, port_conf); + if (ret != 0) { + if (txa_dev_adapter_free(id)) + txa_dev_adapter_free(id)(id, dev); + txa_dev_id_array[id] = TXA_INVALID_DEV_ID; + return ret; + } + + txa_dev_id_array[id] = dev_id; + return 0; +} + +int __rte_experimental +rte_event_eth_tx_adapter_create_ext(uint8_t id, uint8_t dev_id, + rte_event_eth_tx_adapter_conf_cb conf_cb, + void *conf_arg) +{ + struct rte_eventdev *dev; + int ret; + + RTE_EVENT_ETH_TX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + + ret = txa_init(); + if (ret != 0) + return ret; + + if (txa_adapter_exist(id)) + return -EINVAL; + + dev = &rte_eventdevs[dev_id]; + + txa_dev_id_array[id] = dev_id; + if (txa_dev_adapter_create_ext(id)) + ret = txa_dev_adapter_create_ext(id)(id, dev); + + if (ret != 0) { + txa_dev_id_array[id] = TXA_INVALID_DEV_ID; + return ret; + } + + ret = txa_service_adapter_create_ext(id, dev, conf_cb, conf_arg); + if (ret != 0) { + if (txa_dev_adapter_free(id)) + txa_dev_adapter_free(id)(id, dev); + txa_dev_id_array[id] = TXA_INVALID_DEV_ID; + return ret; + } + + txa_dev_id_array[id] = dev_id; + return 0; +} + + +int __rte_experimental +rte_event_eth_tx_adapter_event_port_get(uint8_t id, uint8_t *event_port_id) +{ + TXA_CHECK_OR_ERR_RET(id); + + return txa_service_event_port_get(id, event_port_id); +} + +int __rte_experimental +rte_event_eth_tx_adapter_free(uint8_t id) +{ + int ret; + + TXA_CHECK_OR_ERR_RET(id); + + ret = txa_dev_adapter_free(id) ? + txa_dev_adapter_free(id)(id, txa_evdev(id)) : + 0; + + if (ret == 0) + ret = txa_service_adapter_free(id); + txa_dev_id_array[id] = TXA_INVALID_DEV_ID; + + return ret; +} + +int __rte_experimental +rte_event_eth_tx_adapter_queue_add(uint8_t id, + uint16_t eth_dev_id, + int32_t queue) +{ + struct rte_eth_dev *eth_dev; + int ret; + uint32_t caps; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(eth_dev_id, -EINVAL); + TXA_CHECK_OR_ERR_RET(id); + + eth_dev = &rte_eth_devices[eth_dev_id]; + if (queue != -1 && (uint16_t)queue >= eth_dev->data->nb_tx_queues) { + RTE_EDEV_LOG_ERR("Invalid tx queue_id %" PRIu16, + (uint16_t)queue); + return -EINVAL; + } + + caps = 0; + if (txa_dev_caps_get(id)) + txa_dev_caps_get(id)(txa_evdev(id), eth_dev, &caps); + + if (caps & RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT) + ret = txa_dev_queue_add(id) ? 
+ txa_dev_queue_add(id)(id, + txa_evdev(id), + eth_dev, + queue) : 0; + else + ret = txa_service_queue_add(id, txa_evdev(id), eth_dev, queue); + + return ret; +} + +int __rte_experimental +rte_event_eth_tx_adapter_queue_del(uint8_t id, + uint16_t eth_dev_id, + int32_t queue) +{ + struct rte_eth_dev *eth_dev; + int ret; + uint32_t caps; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(eth_dev_id, -EINVAL); + TXA_CHECK_OR_ERR_RET(id); + + eth_dev = &rte_eth_devices[eth_dev_id]; + if (queue != -1 && (uint16_t)queue >= eth_dev->data->nb_tx_queues) { + RTE_EDEV_LOG_ERR("Invalid tx queue_id %" PRIu16, + (uint16_t)queue); + return -EINVAL; + } + + caps = 0; + + if (txa_dev_caps_get(id)) + txa_dev_caps_get(id)(txa_evdev(id), eth_dev, &caps); + + if (caps & RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT) + ret = txa_dev_queue_del(id) ? + txa_dev_queue_del(id)(id, txa_evdev(id), + eth_dev, + queue) : 0; + else + ret = txa_service_queue_del(id, eth_dev, queue); + + return ret; +} + +int __rte_experimental +rte_event_eth_tx_adapter_service_id_get(uint8_t id, uint32_t *service_id) +{ + TXA_CHECK_OR_ERR_RET(id); + + return txa_service_id_get(id, service_id); +} + +int __rte_experimental +rte_event_eth_tx_adapter_start(uint8_t id) +{ + int ret; + + TXA_CHECK_OR_ERR_RET(id); + + ret = txa_dev_start(id) ? txa_dev_start(id)(id, txa_evdev(id)) : 0; + if (ret == 0) + ret = txa_service_start(id); + return ret; +} + +int __rte_experimental +rte_event_eth_tx_adapter_stats_get(uint8_t id, + struct rte_event_eth_tx_adapter_stats *stats) +{ + int ret; + + TXA_CHECK_OR_ERR_RET(id); + + if (stats == NULL) + return -EINVAL; + + *stats = (struct rte_event_eth_tx_adapter_stats){0}; + + ret = txa_dev_stats_get(id) ? + txa_dev_stats_get(id)(id, txa_evdev(id), stats) : 0; + + if (ret == 0 && txa_service_id_get(id, NULL) != ESRCH) { + if (txa_dev_stats_get(id)) { + struct rte_event_eth_tx_adapter_stats service_stats; + + ret = txa_service_stats_get(id, &service_stats); + if (ret == 0) { + stats->tx_retry += service_stats.tx_retry; + stats->tx_packets += service_stats.tx_packets; + stats->tx_dropped += service_stats.tx_dropped; + } + } else + ret = txa_service_stats_get(id, stats); + } + + return ret; +} + +int __rte_experimental +rte_event_eth_tx_adapter_stats_reset(uint8_t id) +{ + int ret; + + TXA_CHECK_OR_ERR_RET(id); + + ret = txa_dev_stats_reset(id) ? + txa_dev_stats_reset(id)(id, txa_evdev(id)) : 0; + if (ret == 0) + ret = txa_service_stats_reset(id); + return ret; +} + +int __rte_experimental +rte_event_eth_tx_adapter_stop(uint8_t id) +{ + int ret; + + TXA_CHECK_OR_ERR_RET(id); + + ret = txa_dev_stop(id) ? txa_dev_stop(id)(id, txa_evdev(id)) : 0; + if (ret == 0) + ret = txa_service_stop(id); + return ret; +} diff --git a/lib/librte_eventdev/rte_event_eth_tx_adapter.h b/lib/librte_eventdev/rte_event_eth_tx_adapter.h new file mode 100644 index 00000000..81456d4a --- /dev/null +++ b/lib/librte_eventdev/rte_event_eth_tx_adapter.h @@ -0,0 +1,462 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation. + */ + +#ifndef _RTE_EVENT_ETH_TX_ADAPTER_ +#define _RTE_EVENT_ETH_TX_ADAPTER_ + +/** + * @file + * + * RTE Event Ethernet Tx Adapter + * + * The event ethernet Tx adapter provides configuration and data path APIs + * for the ethernet transmit stage of an event driven packet processing + * application. These APIs abstract the implementation of the transmit stage + * and allow the application to use eventdev PMD support or a common + * implementation. 
+ * + * In the common implementation, the application enqueues mbufs to the adapter + * which runs as a rte_service function. The service function dequeues events + * from its event port and transmits the mbufs referenced by these events. + * + * The ethernet Tx event adapter APIs are: + * + * - rte_event_eth_tx_adapter_create() + * - rte_event_eth_tx_adapter_create_ext() + * - rte_event_eth_tx_adapter_free() + * - rte_event_eth_tx_adapter_start() + * - rte_event_eth_tx_adapter_stop() + * - rte_event_eth_tx_adapter_queue_add() + * - rte_event_eth_tx_adapter_queue_del() + * - rte_event_eth_tx_adapter_stats_get() + * - rte_event_eth_tx_adapter_stats_reset() + * - rte_event_eth_tx_adapter_enqueue() + * - rte_event_eth_tx_adapter_event_port_get() + * - rte_event_eth_tx_adapter_service_id_get() + * + * The application creates the adapter using + * rte_event_eth_tx_adapter_create() or rte_event_eth_tx_adapter_create_ext(). + * + * The adapter will use the common implementation when the eventdev PMD + * does not have the #RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT capability. + * The common implementation uses an event port that is created using the port + * configuration parameter passed to rte_event_eth_tx_adapter_create(). The + * application can get the port identifier using + * rte_event_eth_tx_adapter_event_port_get() and must link an event queue to + * this port. + * + * If the eventdev PMD has the #RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT + * flag set, Tx adapter events should be enqueued using the + * rte_event_eth_tx_adapter_enqueue() function, else the application should + * use rte_event_enqueue_burst(). + * + * Transmit queues can be added and deleted from the adapter using + * rte_event_eth_tx_adapter_queue_add()/del() APIs respectively. + * + * The application can start and stop the adapter using the + * rte_event_eth_tx_adapter_start/stop() calls. + * + * The common adapter implementation uses an EAL service function as described + * before and its execution is controlled using the rte_service APIs. The + * rte_event_eth_tx_adapter_service_id_get() + * function can be used to retrieve the adapter's service function ID. + * + * The ethernet port and transmit queue index to transmit the mbuf on are + * specified using the mbuf port and the higher 16 bits of + * struct rte_mbuf::hash::sched::hi. The application should use the + * rte_event_eth_tx_adapter_txq_set() and rte_event_eth_tx_adapter_txq_get() + * functions to access the transmit queue index since it is expected that the + * transmit queue will be eventually defined within struct rte_mbuf and using + * these functions will help with minimizing application impact due to + * a change in how the transmit queue index is specified. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> + +#include <rte_mbuf.h> + +#include "rte_eventdev.h" + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Adapter configuration structure + * + * @see rte_event_eth_tx_adapter_create_ext + * @see rte_event_eth_tx_adapter_conf_cb + */ +struct rte_event_eth_tx_adapter_conf { + uint8_t event_port_id; + /**< Event port identifier, the adapter service function dequeues mbuf + * events from this port. + * @see RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT + */ + uint32_t max_nb_tx; + /**< The adapter can return early if it has processed at least + * max_nb_tx mbufs. This isn't treated as a requirement; batching may + * cause the adapter to process more than max_nb_tx mbufs.
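A minimal sketch of the setup and transmit flow described above; the adapter, event device and ethernet port identifiers, the event queue dedicated to the Tx stage, and the choice of Tx queue 0 are illustrative assumptions rather than part of the API:

#include <rte_errno.h>
#include <rte_eventdev.h>
#include <rte_event_eth_tx_adapter.h>

#define TXA_ID		0	/* adapter id, an assumption */
#define EVDEV_ID	0	/* event device id, an assumption */
#define ETH_PORT_ID	0	/* ethernet port id, an assumption */

static int
txa_setup(struct rte_event_port_conf *port_conf, uint8_t txa_ev_queue)
{
	uint32_t caps = 0;
	int ret;

	ret = rte_event_eth_tx_adapter_create(TXA_ID, EVDEV_ID, port_conf);
	if (ret)
		return ret;

	/* A queue value of -1 adds every Tx queue of the ethernet device. */
	ret = rte_event_eth_tx_adapter_queue_add(TXA_ID, ETH_PORT_ID, -1);
	if (ret)
		return ret;

	rte_event_eth_tx_adapter_caps_get(EVDEV_ID, ETH_PORT_ID, &caps);
	if (!(caps & RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT)) {
		/* Common implementation: link the event queue assumed to be
		 * dedicated to the Tx stage to the adapter's event port.
		 */
		uint8_t txa_port;

		ret = rte_event_eth_tx_adapter_event_port_get(TXA_ID,
							      &txa_port);
		if (ret)
			return ret;
		if (rte_event_port_link(EVDEV_ID, txa_port,
					&txa_ev_queue, NULL, 1) != 1)
			return -rte_errno;
	}

	return rte_event_eth_tx_adapter_start(TXA_ID);
}

/* Transmit stage sketch: direct an already filled-in mbuf to Tx queue 0 of
 * the ethernet port and hand it to the adapter from a worker's event port.
 * internal_port reflects the capability check made during setup.
 */
static inline void
txa_send(uint8_t worker_port, struct rte_mbuf *m, uint8_t txa_ev_queue,
	 int internal_port)
{
	struct rte_event ev;

	m->port = ETH_PORT_ID;
	rte_event_eth_tx_adapter_txq_set(m, 0);

	ev.event = 0;
	ev.op = RTE_EVENT_OP_NEW;
	ev.event_type = RTE_EVENT_TYPE_CPU;
	ev.sched_type = RTE_SCHED_TYPE_ATOMIC;
	ev.mbuf = m;

	if (internal_port) {
		rte_event_eth_tx_adapter_enqueue(EVDEV_ID, worker_port,
						 &ev, 1);
	} else {
		ev.queue_id = txa_ev_queue;
		rte_event_enqueue_burst(EVDEV_ID, worker_port, &ev, 1);
	}
}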
+ */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Function type used for adapter configuration callback. The callback is + * used to fill in members of the struct rte_event_eth_tx_adapter_conf, this + * callback is invoked when creating a RTE service function based + * adapter implementation. + * + * @param id + * Adapter identifier. + * @param dev_id + * Event device identifier. + * @param [out] conf + * Structure that needs to be populated by this callback. + * @param arg + * Argument to the callback. This is the same as the conf_arg passed to the + * rte_event_eth_tx_adapter_create_ext(). + * + * @return + * - 0: Success + * - <0: Error code on failure + */ +typedef int (*rte_event_eth_tx_adapter_conf_cb) (uint8_t id, uint8_t dev_id, + struct rte_event_eth_tx_adapter_conf *conf, + void *arg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * A structure used to retrieve statistics for an ethernet Tx adapter instance. + */ +struct rte_event_eth_tx_adapter_stats { + uint64_t tx_retry; + /**< Number of transmit retries */ + uint64_t tx_packets; + /**< Number of packets transmitted */ + uint64_t tx_dropped; + /**< Number of packets dropped */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Create a new ethernet Tx adapter with the specified identifier. + * + * @param id + * The identifier of the ethernet Tx adapter. + * @param dev_id + * The event device identifier. + * @param port_config + * Event port configuration, the adapter uses this configuration to + * create an event port if needed. + * @return + * - 0: Success + * - <0: Error code on failure + */ +int __rte_experimental +rte_event_eth_tx_adapter_create(uint8_t id, uint8_t dev_id, + struct rte_event_port_conf *port_config); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Create a new ethernet Tx adapter with the specified identifier. + * + * @param id + * The identifier of the ethernet Tx adapter. + * @param dev_id + * The event device identifier. + * @param conf_cb + * Callback function that initializes members of the + * struct rte_event_eth_tx_adapter_conf struct passed into + * it. + * @param conf_arg + * Argument that is passed to the conf_cb function. + * @return + * - 0: Success + * - <0: Error code on failure + */ +int __rte_experimental +rte_event_eth_tx_adapter_create_ext(uint8_t id, uint8_t dev_id, + rte_event_eth_tx_adapter_conf_cb conf_cb, + void *conf_arg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Free an ethernet Tx adapter + * + * @param id + * Adapter identifier. + * @return + * - 0: Success + * - <0: Error code on failure, If the adapter still has Tx queues + * added to it, the function returns -EBUSY. + */ +int __rte_experimental +rte_event_eth_tx_adapter_free(uint8_t id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Start ethernet Tx adapter + * + * @param id + * Adapter identifier. + * @return + * - 0: Success, Adapter started correctly. + * - <0: Error code on failure. + */ +int __rte_experimental +rte_event_eth_tx_adapter_start(uint8_t id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Stop ethernet Tx adapter + * + * @param id + * Adapter identifier. + * @return + * - 0: Success. + * - <0: Error code on failure. 
+ */ +int __rte_experimental +rte_event_eth_tx_adapter_stop(uint8_t id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Add a Tx queue to the adapter. + * A queue value of -1 is used to indicate all + * queues within the device. + * + * @param id + * Adapter identifier. + * @param eth_dev_id + * Ethernet Port Identifier. + * @param queue + * Tx queue index. + * @return + * - 0: Success, Queues added successfully. + * - <0: Error code on failure. + */ +int __rte_experimental +rte_event_eth_tx_adapter_queue_add(uint8_t id, + uint16_t eth_dev_id, + int32_t queue); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Delete a Tx queue from the adapter. + * A queue value of -1 is used to indicate all + * queues within the device, that have been added to this + * adapter. + * + * @param id + * Adapter identifier. + * @param eth_dev_id + * Ethernet Port Identifier. + * @param queue + * Tx queue index. + * @return + * - 0: Success, Queues deleted successfully. + * - <0: Error code on failure. + */ +int __rte_experimental +rte_event_eth_tx_adapter_queue_del(uint8_t id, + uint16_t eth_dev_id, + int32_t queue); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Set Tx queue in the mbuf. This queue is used by the adapter + * to transmit the mbuf. + * + * @param pkt + * Pointer to the mbuf. + * @param queue + * Tx queue index. + */ +static __rte_always_inline void __rte_experimental +rte_event_eth_tx_adapter_txq_set(struct rte_mbuf *pkt, uint16_t queue) +{ + uint16_t *p = (uint16_t *)&pkt->hash.sched.hi; + p[1] = queue; +} + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve Tx queue from the mbuf. + * + * @param pkt + * Pointer to the mbuf. + * @return + * Tx queue identifier. + * + * @see rte_event_eth_tx_adapter_txq_set() + */ +static __rte_always_inline uint16_t __rte_experimental +rte_event_eth_tx_adapter_txq_get(struct rte_mbuf *pkt) +{ + uint16_t *p = (uint16_t *)&pkt->hash.sched.hi; + return p[1]; +} + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve the adapter event port. The adapter creates an event port if + * the #RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT is not set in the + * ethernet Tx capabilities of the event device. + * + * @param id + * Adapter Identifier. + * @param[out] event_port_id + * Event port pointer. + * @return + * - 0: Success. + * - <0: Error code on failure. + */ +int __rte_experimental +rte_event_eth_tx_adapter_event_port_get(uint8_t id, uint8_t *event_port_id); + +/** + * Enqueue a burst of events objects or an event object supplied in *rte_event* + * structure on an event device designated by its *dev_id* through the event + * port specified by *port_id*. This function is supported if the eventdev PMD + * has the #RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT capability flag set. + * + * The *nb_events* parameter is the number of event objects to enqueue which are + * supplied in the *ev* array of *rte_event* structure. + * + * The rte_event_eth_tx_adapter_enqueue() function returns the number of + * events objects it actually enqueued. A return value equal to *nb_events* + * means that all event objects have been enqueued. + * + * @param dev_id + * The identifier of the device. + * @param port_id + * The identifier of the event port. 
+ * @param ev + * Points to an array of *nb_events* objects of type *rte_event* structure + * which contain the event object enqueue operations to be processed. + * @param nb_events + * The number of event objects to enqueue, typically number of + * rte_event_port_enqueue_depth() available for this port. + * + * @return + * The number of event objects actually enqueued on the event device. The + * return value can be less than the value of the *nb_events* parameter when + * the event devices queue is full or if invalid parameters are specified in a + * *rte_event*. If the return value is less than *nb_events*, the remaining + * events at the end of ev[] are not consumed and the caller has to take care + * of them, and rte_errno is set accordingly. Possible errno values include: + * - -EINVAL The port ID is invalid, device ID is invalid, an event's queue + * ID is invalid, or an event's sched type doesn't match the + * capabilities of the destination queue. + * - -ENOSPC The event port was backpressured and unable to enqueue + * one or more events. This error code is only applicable to + * closed systems. + */ +static inline uint16_t __rte_experimental +rte_event_eth_tx_adapter_enqueue(uint8_t dev_id, + uint8_t port_id, + struct rte_event ev[], + uint16_t nb_events) +{ + const struct rte_eventdev *dev = &rte_eventdevs[dev_id]; + +#ifdef RTE_LIBRTE_EVENTDEV_DEBUG + if (dev_id >= RTE_EVENT_MAX_DEVS || + !rte_eventdevs[dev_id].attached) { + rte_errno = -EINVAL; + return 0; + } + + if (port_id >= dev->data->nb_ports) { + rte_errno = -EINVAL; + return 0; + } +#endif + return dev->txa_enqueue(dev->data->ports[port_id], ev, nb_events); +} + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve statistics for an adapter + * + * @param id + * Adapter identifier. + * @param [out] stats + * A pointer to structure used to retrieve statistics for an adapter. + * @return + * - 0: Success, statistics retrieved successfully. + * - <0: Error code on failure. + */ +int __rte_experimental +rte_event_eth_tx_adapter_stats_get(uint8_t id, + struct rte_event_eth_tx_adapter_stats *stats); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Reset statistics for an adapter. + * + * @param id + * Adapter identifier. + * @return + * - 0: Success, statistics reset successfully. + * - <0: Error code on failure. + */ +int __rte_experimental +rte_event_eth_tx_adapter_stats_reset(uint8_t id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve the service ID of an adapter. If the adapter doesn't use + * a rte_service function, this function returns -ESRCH. + * + * @param id + * Adapter identifier. + * @param [out] service_id + * A pointer to a uint32_t, to be filled in with the service id. + * @return + * - 0: Success + * - <0: Error code on failure, if the adapter doesn't use a rte_service + * function, this function returns -ESRCH. 
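When the adapter falls back to the common rte_service based implementation, its service must also be mapped to a service lcore and set running before any mbufs are transmitted. A small sketch using the generic rte_service calls, assuming the lcore passed in has already been added and started as a service core:

#include <errno.h>
#include <rte_service.h>
#include <rte_event_eth_tx_adapter.h>

static int
txa_service_setup(uint8_t txa_id, uint32_t service_lcore)
{
	uint32_t service_id;
	int ret;

	ret = rte_event_eth_tx_adapter_service_id_get(txa_id, &service_id);
	if (ret == -ESRCH)
		return 0;	/* adapter is implemented by the eventdev PMD */
	if (ret)
		return ret;

	/* Run the adapter service on the (assumed) service lcore. */
	ret = rte_service_map_lcore_set(service_id, service_lcore, 1);
	if (ret)
		return ret;

	return rte_service_runstate_set(service_id, 1);
}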
+ */ +int __rte_experimental +rte_event_eth_tx_adapter_service_id_get(uint8_t id, uint32_t *service_id); + +#ifdef __cplusplus +} +#endif +#endif /* _RTE_EVENT_ETH_TX_ADAPTER_ */ diff --git a/lib/librte_eventdev/rte_eventdev.c b/lib/librte_eventdev/rte_eventdev.c index 801810ed..ebaf3087 100644 --- a/lib/librte_eventdev/rte_eventdev.c +++ b/lib/librte_eventdev/rte_eventdev.c @@ -35,22 +35,20 @@ #include "rte_eventdev.h" #include "rte_eventdev_pmd.h" -struct rte_eventdev rte_event_devices[RTE_EVENT_MAX_DEVS]; +static struct rte_eventdev rte_event_devices[RTE_EVENT_MAX_DEVS]; -struct rte_eventdev *rte_eventdevs = &rte_event_devices[0]; +struct rte_eventdev *rte_eventdevs = rte_event_devices; static struct rte_eventdev_global eventdev_globals = { .nb_devs = 0 }; -struct rte_eventdev_global *rte_eventdev_globals = &eventdev_globals; - /* Event dev north bound API implementation */ uint8_t rte_event_dev_count(void) { - return rte_eventdev_globals->nb_devs; + return eventdev_globals.nb_devs; } int @@ -62,7 +60,7 @@ rte_event_dev_get_dev_id(const char *name) if (!name) return -EINVAL; - for (i = 0; i < rte_eventdev_globals->nb_devs; i++) { + for (i = 0; i < eventdev_globals.nb_devs; i++) { cmp = (strncmp(rte_event_devices[i].data->name, name, RTE_EVENTDEV_NAME_MAX_LEN) == 0) || (rte_event_devices[i].dev ? (strncmp( @@ -109,7 +107,7 @@ rte_event_dev_info_get(uint8_t dev_id, struct rte_event_dev_info *dev_info) } int -rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint8_t eth_port_id, +rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint16_t eth_port_id, uint32_t *caps) { struct rte_eventdev *dev; @@ -175,6 +173,31 @@ rte_event_crypto_adapter_caps_get(uint8_t dev_id, uint8_t cdev_id, (dev, cdev, caps) : -ENOTSUP; } +int __rte_experimental +rte_event_eth_tx_adapter_caps_get(uint8_t dev_id, uint16_t eth_port_id, + uint32_t *caps) +{ + struct rte_eventdev *dev; + struct rte_eth_dev *eth_dev; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + RTE_ETH_VALID_PORTID_OR_ERR_RET(eth_port_id, -EINVAL); + + dev = &rte_eventdevs[dev_id]; + eth_dev = &rte_eth_devices[eth_port_id]; + + if (caps == NULL) + return -EINVAL; + + *caps = 0; + + return dev->dev_ops->eth_tx_adapter_caps_get ? + (*dev->dev_ops->eth_tx_adapter_caps_get)(dev, + eth_dev, + caps) + : 0; +} + static inline int rte_event_dev_queue_config(struct rte_eventdev *dev, uint8_t nb_queues) { @@ -980,6 +1003,28 @@ rte_event_port_unlink(uint8_t dev_id, uint8_t port_id, return diag; } +int __rte_experimental +rte_event_port_unlinks_in_progress(uint8_t dev_id, uint8_t port_id) +{ + struct rte_eventdev *dev; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + if (!is_valid_port(dev, port_id)) { + RTE_EDEV_LOG_ERR("Invalid port_id=%" PRIu8, port_id); + return -EINVAL; + } + + /* Return 0 if the PMD does not implement unlinks in progress. + * This allows PMDs which handle unlink synchronously to not implement + * this function at all. 
+ */ + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->port_unlinks_in_progress, 0); + + return (*dev->dev_ops->port_unlinks_in_progress)(dev, + dev->data->ports[port_id]); +} + int rte_event_port_links_get(uint8_t dev_id, uint8_t port_id, uint8_t queues[], uint8_t priorities[]) @@ -1275,6 +1320,15 @@ rte_eventdev_find_free_device_index(void) return RTE_EVENT_MAX_DEVS; } +static uint16_t +rte_event_tx_adapter_enqueue(__rte_unused void *port, + __rte_unused struct rte_event ev[], + __rte_unused uint16_t nb_events) +{ + rte_errno = ENOTSUP; + return 0; +} + struct rte_eventdev * rte_event_pmd_allocate(const char *name, int socket_id) { @@ -1295,6 +1349,8 @@ rte_event_pmd_allocate(const char *name, int socket_id) eventdev = &rte_eventdevs[dev_id]; + eventdev->txa_enqueue = rte_event_tx_adapter_enqueue; + if (eventdev->data == NULL) { struct rte_eventdev_data *eventdev_data = NULL; diff --git a/lib/librte_eventdev/rte_eventdev.h b/lib/librte_eventdev/rte_eventdev.h index b6fd6ee7..d7eb69d1 100644 --- a/lib/librte_eventdev/rte_eventdev.h +++ b/lib/librte_eventdev/rte_eventdev.h @@ -1112,7 +1112,7 @@ struct rte_event { * */ int -rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint8_t eth_port_id, +rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint16_t eth_port_id, uint32_t *caps); #define RTE_EVENT_TIMER_ADAPTER_CAP_INTERNAL_PORT (1ULL << 0) @@ -1186,6 +1186,32 @@ int __rte_experimental rte_event_crypto_adapter_caps_get(uint8_t dev_id, uint8_t cdev_id, uint32_t *caps); +/* Ethdev Tx adapter capability bitmap flags */ +#define RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT 0x1 +/**< This flag is sent when the PMD supports a packet transmit callback + */ + +/** + * Retrieve the event device's eth Tx adapter capabilities + * + * @param dev_id + * The identifier of the device. + * + * @param eth_port_id + * The identifier of the ethernet device. + * + * @param[out] caps + * A pointer to memory filled with eth Tx adapter capabilities. + * + * @return + * - 0: Success, driver provides eth Tx adapter capabilities. + * - <0: Error code returned by the driver function. + * + */ +int __rte_experimental +rte_event_eth_tx_adapter_caps_get(uint8_t dev_id, uint16_t eth_port_id, + uint32_t *caps); + struct rte_eventdev_ops; struct rte_eventdev; @@ -1204,6 +1230,10 @@ typedef uint16_t (*event_dequeue_burst_t)(void *port, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks); /**< @internal Dequeue burst of events from port of a device */ +typedef uint16_t (*event_tx_adapter_enqueue)(void *port, + struct rte_event ev[], uint16_t nb_events); +/**< @internal Enqueue burst of events on port of a device */ + #define RTE_EVENTDEV_NAME_MAX_LEN (64) /**< @internal Max length of name of event PMD */ @@ -1266,7 +1296,8 @@ struct rte_eventdev { /**< Pointer to PMD dequeue function. */ event_dequeue_burst_t dequeue_burst; /**< Pointer to PMD dequeue burst function. */ - + event_tx_adapter_enqueue txa_enqueue; + /**< Pointer to PMD eth Tx adapter enqueue function. */ struct rte_eventdev_data *data; /**< Pointer to device data */ struct rte_eventdev_ops *dev_ops; @@ -1656,12 +1687,13 @@ rte_event_port_link(uint8_t dev_id, uint8_t port_id, * event port designated by its *port_id* on the event device designated * by its *dev_id*. * - * The unlink establishment shall disable the event port *port_id* from - * receiving events from the specified event queue *queue_id* - * + * The unlink call issues an async request to disable the event port *port_id* + * from receiving events from the specified event queue *queue_id*. 
* Event queue(s) to event port unlink establishment can be changed at runtime * without re-configuring the device. * + * @see rte_event_port_unlinks_in_progress() to poll for completed unlinks. + * * @param dev_id * The identifier of the device. * @@ -1679,21 +1711,47 @@ rte_event_port_link(uint8_t dev_id, uint8_t port_id, * NULL. * * @return - * The number of unlinks actually established. The return value can be less + * The number of unlinks successfully requested. The return value can be less * than the value of the *nb_unlinks* parameter when the implementation has the * limitation on specific queue to port unlink establishment or * if invalid parameters are specified. * If the return value is less than *nb_unlinks*, the remaining queues at the - * end of queues[] are not established, and the caller has to take care of them. + * end of queues[] are not unlinked, and the caller has to take care of them. * If return value is less than *nb_unlinks* then implementation shall update * the rte_errno accordingly, Possible rte_errno values are * (-EINVAL) Invalid parameter - * */ int rte_event_port_unlink(uint8_t dev_id, uint8_t port_id, uint8_t queues[], uint16_t nb_unlinks); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Returns the number of unlinks in progress. + * + * This function provides the application with a method to detect when an + * unlink has been completed by the implementation. + * + * @see rte_event_port_unlink() to issue unlink requests. + * + * @param dev_id + * The indentifier of the device. + * + * @param port_id + * Event port identifier to select port to check for unlinks in progress. + * + * @return + * The number of unlinks that are in progress. A return of zero indicates that + * there are no outstanding unlink requests. A positive return value indicates + * the number of unlinks that are in progress, but are not yet complete. + * A negative return value indicates an error, -EINVAL indicates an invalid + * parameter passed for *dev_id* or *port_id*. + */ +int __rte_experimental +rte_event_port_unlinks_in_progress(uint8_t dev_id, uint8_t port_id); + /** * Retrieve the list of source event queues and its associated service priority * linked to the destination event port designated by its *port_id* diff --git a/lib/librte_eventdev/rte_eventdev_pmd.h b/lib/librte_eventdev/rte_eventdev_pmd.h index 3fbb4d2b..1a01326b 100644 --- a/lib/librte_eventdev/rte_eventdev_pmd.h +++ b/lib/librte_eventdev/rte_eventdev_pmd.h @@ -87,8 +87,6 @@ struct rte_eventdev_global { uint8_t nb_devs; /**< Number of devices found */ }; -extern struct rte_eventdev_global *rte_eventdev_globals; -/** Pointer to global event devices data structure. */ extern struct rte_eventdev *rte_eventdevs; /** The pool of rte_eventdev structures. */ @@ -332,6 +330,23 @@ typedef int (*eventdev_port_link_t)(struct rte_eventdev *dev, void *port, typedef int (*eventdev_port_unlink_t)(struct rte_eventdev *dev, void *port, uint8_t queues[], uint16_t nb_unlinks); +/** + * Unlinks in progress. Returns number of unlinks that the PMD is currently + * performing, but have not yet been completed. + * + * @param dev + * Event device pointer + * + * @param port + * Event port pointer + * + * @return + * Returns the number of in-progress unlinks. Zero is returned if none are + * in progress. 
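On the application side, the asynchronous unlink described above is typically paired with a completion poll; a brief sketch, with the device, port and queue identifiers left as placeholders:

#include <rte_eventdev.h>

static int
unlink_queue_sync(uint8_t dev_id, uint8_t port_id, uint8_t queue_id)
{
	int ret;

	if (rte_event_port_unlink(dev_id, port_id, &queue_id, 1) != 1)
		return -1;	/* partial or failed unlink, simplified here */

	/* The unlink request completes asynchronously; poll until the PMD
	 * reports no unlinks outstanding on this port.
	 */
	do {
		ret = rte_event_port_unlinks_in_progress(dev_id, port_id);
	} while (ret > 0);

	return ret;	/* 0 when complete, negative on error */
}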
+ */ +typedef int (*eventdev_port_unlinks_in_progress_t)(struct rte_eventdev *dev, + void *port); + /** * Converts nanoseconds to *timeout_ticks* value for rte_event_dequeue() * @@ -450,7 +465,7 @@ typedef int (*eventdev_eth_rx_adapter_caps_get_t) const struct rte_eth_dev *eth_dev, uint32_t *caps); -struct rte_event_eth_rx_adapter_queue_conf *queue_conf; +struct rte_event_eth_rx_adapter_queue_conf; /** * Retrieve the event device's timer adapter capabilities, as well as the ops @@ -575,7 +590,7 @@ typedef int (*eventdev_eth_rx_adapter_stop_t) (const struct rte_eventdev *dev, const struct rte_eth_dev *eth_dev); -struct rte_event_eth_rx_adapter_stats *stats; +struct rte_event_eth_rx_adapter_stats; /** * Retrieve ethernet Rx adapter statistics. @@ -789,6 +804,186 @@ typedef int (*eventdev_crypto_adapter_stats_reset) (const struct rte_eventdev *dev, const struct rte_cryptodev *cdev); +/** + * Retrieve the event device's eth Tx adapter capabilities. + * + * @param dev + * Event device pointer + * + * @param eth_dev + * Ethernet device pointer + * + * @param[out] caps + * A pointer to memory filled with eth Tx adapter capabilities. + * + * @return + * - 0: Success, driver provides eth Tx adapter capabilities + * - <0: Error code returned by the driver function. + * + */ +typedef int (*eventdev_eth_tx_adapter_caps_get_t) + (const struct rte_eventdev *dev, + const struct rte_eth_dev *eth_dev, + uint32_t *caps); + +/** + * Create adapter callback. + * + * @param id + * Adapter identifier + * + * @param dev + * Event device pointer + * + * @return + * - 0: Success. + * - <0: Error code on failure. + */ +typedef int (*eventdev_eth_tx_adapter_create_t)(uint8_t id, + const struct rte_eventdev *dev); + +/** + * Free adapter callback. + * + * @param id + * Adapter identifier + * + * @param dev + * Event device pointer + * + * @return + * - 0: Success. + * - <0: Error code on failure. + */ +typedef int (*eventdev_eth_tx_adapter_free_t)(uint8_t id, + const struct rte_eventdev *dev); + +/** + * Add a Tx queue to the adapter. + * A queue value of -1 is used to indicate all + * queues within the device. + * + * @param id + * Adapter identifier + * + * @param dev + * Event device pointer + * + * @param eth_dev + * Ethernet device pointer + * + * @param tx_queue_id + * Transmt queue index + * + * @return + * - 0: Success. + * - <0: Error code on failure. + */ +typedef int (*eventdev_eth_tx_adapter_queue_add_t)( + uint8_t id, + const struct rte_eventdev *dev, + const struct rte_eth_dev *eth_dev, + int32_t tx_queue_id); + +/** + * Delete a Tx queue from the adapter. + * A queue value of -1 is used to indicate all + * queues within the device, that have been added to this + * adapter. + * + * @param id + * Adapter identifier + * + * @param dev + * Event device pointer + * + * @param eth_dev + * Ethernet device pointer + * + * @param tx_queue_id + * Transmit queue index + * + * @return + * - 0: Success, Queues deleted successfully. + * - <0: Error code on failure. + */ +typedef int (*eventdev_eth_tx_adapter_queue_del_t)( + uint8_t id, + const struct rte_eventdev *dev, + const struct rte_eth_dev *eth_dev, + int32_t tx_queue_id); + +/** + * Start the adapter. + * + * @param id + * Adapter identifier + * + * @param dev + * Event device pointer + * + * @return + * - 0: Success, Adapter started correctly. + * - <0: Error code on failure. + */ +typedef int (*eventdev_eth_tx_adapter_start_t)(uint8_t id, + const struct rte_eventdev *dev); + +/** + * Stop the adapter. 
+ * + * @param id + * Adapter identifier + * + * @param dev + * Event device pointer + * + * @return + * - 0: Success. + * - <0: Error code on failure. + */ +typedef int (*eventdev_eth_tx_adapter_stop_t)(uint8_t id, + const struct rte_eventdev *dev); + +struct rte_event_eth_tx_adapter_stats; + +/** + * Retrieve statistics for an adapter + * + * @param id + * Adapter identifier + * + * @param dev + * Event device pointer + * + * @param [out] stats + * A pointer to structure used to retrieve statistics for an adapter + * + * @return + * - 0: Success, statistics retrieved successfully. + * - <0: Error code on failure. + */ +typedef int (*eventdev_eth_tx_adapter_stats_get_t)( + uint8_t id, + const struct rte_eventdev *dev, + struct rte_event_eth_tx_adapter_stats *stats); + +/** + * Reset statistics for an adapter + * + * @param id + * Adapter identifier + * + * @param dev + * Event device pointer + * + * @return + * - 0: Success, statistics retrieved successfully. + * - <0: Error code on failure. + */ +typedef int (*eventdev_eth_tx_adapter_stats_reset_t)(uint8_t id, + const struct rte_eventdev *dev); + /** Event device operations function pointer table */ struct rte_eventdev_ops { eventdev_info_get_t dev_infos_get; /**< Get device info. */ @@ -815,6 +1010,8 @@ struct rte_eventdev_ops { /**< Link event queues to an event port. */ eventdev_port_unlink_t port_unlink; /**< Unlink event queues from an event port. */ + eventdev_port_unlinks_in_progress_t port_unlinks_in_progress; + /**< Unlinks in progress on an event port. */ eventdev_dequeue_timeout_ticks_t timeout_ticks; /**< Converts ns to *timeout_ticks* value for rte_event_dequeue() */ eventdev_dump_t dump; @@ -862,6 +1059,26 @@ struct rte_eventdev_ops { eventdev_crypto_adapter_stats_reset crypto_adapter_stats_reset; /**< Reset crypto stats */ + eventdev_eth_tx_adapter_caps_get_t eth_tx_adapter_caps_get; + /**< Get ethernet Tx adapter capabilities */ + + eventdev_eth_tx_adapter_create_t eth_tx_adapter_create; + /**< Create adapter callback */ + eventdev_eth_tx_adapter_free_t eth_tx_adapter_free; + /**< Free adapter callback */ + eventdev_eth_tx_adapter_queue_add_t eth_tx_adapter_queue_add; + /**< Add Tx queues to the eth Tx adapter */ + eventdev_eth_tx_adapter_queue_del_t eth_tx_adapter_queue_del; + /**< Delete Tx queues from the eth Tx adapter */ + eventdev_eth_tx_adapter_start_t eth_tx_adapter_start; + /**< Start eth Tx adapter */ + eventdev_eth_tx_adapter_stop_t eth_tx_adapter_stop; + /**< Stop eth Tx adapter */ + eventdev_eth_tx_adapter_stats_get_t eth_tx_adapter_stats_get; + /**< Get eth Tx adapter statistics */ + eventdev_eth_tx_adapter_stats_reset_t eth_tx_adapter_stats_reset; + /**< Reset eth Tx adapter statistics */ + eventdev_selftest dev_selftest; /**< Start eventdev Selftest */ diff --git a/lib/librte_eventdev/rte_eventdev_version.map b/lib/librte_eventdev/rte_eventdev_version.map index 12835e9f..d558d7d5 100644 --- a/lib/librte_eventdev/rte_eventdev_version.map +++ b/lib/librte_eventdev/rte_eventdev_version.map @@ -96,6 +96,19 @@ EXPERIMENTAL { rte_event_crypto_adapter_stats_reset; rte_event_crypto_adapter_stop; rte_event_eth_rx_adapter_cb_register; + rte_event_port_unlinks_in_progress; + rte_event_eth_tx_adapter_caps_get; + rte_event_eth_tx_adapter_create; + rte_event_eth_tx_adapter_create_ext; + rte_event_eth_tx_adapter_event_port_get; + rte_event_eth_tx_adapter_free; + rte_event_eth_tx_adapter_queue_add; + rte_event_eth_tx_adapter_queue_del; + rte_event_eth_tx_adapter_service_id_get; + rte_event_eth_tx_adapter_start; + 
rte_event_eth_tx_adapter_stats_get; + rte_event_eth_tx_adapter_stats_reset; + rte_event_eth_tx_adapter_stop; rte_event_timer_adapter_caps_get; rte_event_timer_adapter_create; rte_event_timer_adapter_create_ext; diff --git a/lib/librte_flow_classify/rte_flow_classify.c b/lib/librte_flow_classify/rte_flow_classify.c index 4c3469da..fb652a2b 100644 --- a/lib/librte_flow_classify/rte_flow_classify.c +++ b/lib/librte_flow_classify/rte_flow_classify.c @@ -247,8 +247,7 @@ rte_flow_classifier_check_params(struct rte_flow_classifier_params *params) } /* socket */ - if ((params->socket_id < 0) || - (params->socket_id >= RTE_MAX_NUMA_NODES)) { + if (params->socket_id < 0) { RTE_FLOW_CLASSIFY_LOG(ERR, "%s: Incorrect value for parameter socket_id\n", __func__); diff --git a/lib/librte_hash/rte_cuckoo_hash.c b/lib/librte_hash/rte_cuckoo_hash.c index f7b86c8c..5ddcccd8 100644 --- a/lib/librte_hash/rte_cuckoo_hash.c +++ b/lib/librte_hash/rte_cuckoo_hash.c @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright(c) 2010-2016 Intel Corporation + * Copyright(c) 2018 Arm Limited */ #include @@ -26,11 +27,14 @@ #include #include #include -#include #include "rte_hash.h" #include "rte_cuckoo_hash.h" +#define FOR_EACH_BUCKET(CURRENT_BKT, START_BUCKET) \ + for (CURRENT_BKT = START_BUCKET; \ + CURRENT_BKT != NULL; \ + CURRENT_BKT = CURRENT_BKT->next) TAILQ_HEAD(rte_hash_list, rte_tailq_entry); @@ -63,6 +67,14 @@ rte_hash_find_existing(const char *name) return h; } +static inline struct rte_hash_bucket * +rte_hash_get_last_bkt(struct rte_hash_bucket *lst_bkt) +{ + while (lst_bkt->next != NULL) + lst_bkt = lst_bkt->next; + return lst_bkt; +} + void rte_hash_set_cmp_func(struct rte_hash *h, rte_hash_cmp_eq_t func) { h->cmp_jump_table_idx = KEY_CUSTOM; @@ -78,6 +90,36 @@ rte_hash_cmp_eq(const void *key1, const void *key2, const struct rte_hash *h) return cmp_jump_table[h->cmp_jump_table_idx](key1, key2, h->key_len); } +/* + * We use higher 16 bits of hash as the signature value stored in table. + * We use the lower bits for the primary bucket + * location. Then we XOR primary bucket location and the signature + * to get the secondary bucket location. This is same as + * proposed in Bin Fan, et al's paper + * "MemC3: Compact and Concurrent MemCache with Dumber Caching and + * Smarter Hashing". The benefit to use + * XOR is that one could derive the alternative bucket location + * by only using the current bucket location and the signature. 
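+ * + * For example (illustrative numbers, not taken from the code): with 1024 + * buckets, bucket_bitmask is 0x3FF; for hash 0xABCD1234 the stored + * signature is 0xABCD and the primary bucket index is 0x234. The + * alternative index is then (0x234 ^ 0xABCD) & 0x3FF = 0x1F9, and XORing + * the alternative index with the same signature recovers the primary + * index: (0x1F9 ^ 0xABCD) & 0x3FF = 0x234.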
+ */ +static inline uint16_t +get_short_sig(const hash_sig_t hash) +{ + return hash >> 16; +} + +static inline uint32_t +get_prim_bucket_index(const struct rte_hash *h, const hash_sig_t hash) +{ + return hash & h->bucket_bitmask; +} + +static inline uint32_t +get_alt_bucket_index(const struct rte_hash *h, + uint32_t cur_bkt_idx, uint16_t sig) +{ + return (cur_bkt_idx ^ sig) & h->bucket_bitmask; +} + struct rte_hash * rte_hash_create(const struct rte_hash_parameters *params) { @@ -85,14 +127,22 @@ rte_hash_create(const struct rte_hash_parameters *params) struct rte_tailq_entry *te = NULL; struct rte_hash_list *hash_list; struct rte_ring *r = NULL; + struct rte_ring *r_ext = NULL; char hash_name[RTE_HASH_NAMESIZE]; void *k = NULL; void *buckets = NULL; + void *buckets_ext = NULL; char ring_name[RTE_RING_NAMESIZE]; + char ext_ring_name[RTE_RING_NAMESIZE]; unsigned num_key_slots; unsigned i; - unsigned int hw_trans_mem_support = 0, multi_writer_support = 0; + unsigned int hw_trans_mem_support = 0, use_local_cache = 0; + unsigned int ext_table_support = 0; unsigned int readwrite_concur_support = 0; + unsigned int writer_takes_lock = 0; + unsigned int no_free_on_del = 0; + uint32_t *tbl_chng_cnt = NULL; + unsigned int readwrite_concur_lf_support = 0; rte_hash_function default_hash_func = (rte_hash_function)rte_jhash; @@ -112,20 +162,52 @@ rte_hash_create(const struct rte_hash_parameters *params) return NULL; } + /* Validate correct usage of extra options */ + if ((params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY) && + (params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF)) { + rte_errno = EINVAL; + RTE_LOG(ERR, HASH, "rte_hash_create: choose rw concurrency or " + "rw concurrency lock free\n"); + return NULL; + } + + if ((params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF) && + (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE)) { + rte_errno = EINVAL; + RTE_LOG(ERR, HASH, "rte_hash_create: extendable bucket " + "feature not supported with rw concurrency " + "lock free\n"); + return NULL; + } + /* Check extra flags field to check extra options. */ if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT) hw_trans_mem_support = 1; - if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD) - multi_writer_support = 1; + if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD) { + use_local_cache = 1; + writer_takes_lock = 1; + } if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY) { readwrite_concur_support = 1; - multi_writer_support = 1; + writer_takes_lock = 1; + } + + if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE) + ext_table_support = 1; + + if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL) + no_free_on_del = 1; + + if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF) { + readwrite_concur_lf_support = 1; + /* Enable not freeing internal memory/index on delete */ + no_free_on_del = 1; } /* Store all keys and leave the first entry as a dummy entry for lookup_bulk */ - if (multi_writer_support) + if (use_local_cache) /* * Increase number of slots by total number of indices * that can be stored in the lcore caches @@ -145,6 +227,24 @@ rte_hash_create(const struct rte_hash_parameters *params) goto err; } + const uint32_t num_buckets = rte_align32pow2(params->entries) / + RTE_HASH_BUCKET_ENTRIES; + + /* Create ring for extendable buckets. 
*/ + if (ext_table_support) { + snprintf(ext_ring_name, sizeof(ext_ring_name), "HT_EXT_%s", + params->name); + r_ext = rte_ring_create(ext_ring_name, + rte_align32pow2(num_buckets + 1), + params->socket_id, 0); + + if (r_ext == NULL) { + RTE_LOG(ERR, HASH, "ext buckets memory allocation " + "failed\n"); + goto err; + } + } + snprintf(hash_name, sizeof(hash_name), "HT_%s", params->name); rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); @@ -177,19 +277,37 @@ rte_hash_create(const struct rte_hash_parameters *params) goto err_unlock; } - const uint32_t num_buckets = rte_align32pow2(params->entries) - / RTE_HASH_BUCKET_ENTRIES; - buckets = rte_zmalloc_socket(NULL, num_buckets * sizeof(struct rte_hash_bucket), RTE_CACHE_LINE_SIZE, params->socket_id); if (buckets == NULL) { - RTE_LOG(ERR, HASH, "memory allocation failed\n"); + RTE_LOG(ERR, HASH, "buckets memory allocation failed\n"); goto err_unlock; } - const uint32_t key_entry_size = sizeof(struct rte_hash_key) + params->key_len; + /* Allocate same number of extendable buckets */ + if (ext_table_support) { + buckets_ext = rte_zmalloc_socket(NULL, + num_buckets * sizeof(struct rte_hash_bucket), + RTE_CACHE_LINE_SIZE, params->socket_id); + if (buckets_ext == NULL) { + RTE_LOG(ERR, HASH, "ext buckets memory allocation " + "failed\n"); + goto err_unlock; + } + /* Populate ext bkt ring. We reserve 0 similar to the + * key-data slot, just in case in future we want to + * use bucket index for the linked list and 0 means NULL + * for next bucket + */ + for (i = 1; i <= num_buckets; i++) + rte_ring_sp_enqueue(r_ext, (void *)((uintptr_t) i)); + } + + const uint32_t key_entry_size = + RTE_ALIGN(sizeof(struct rte_hash_key) + params->key_len, + KEY_ALIGNMENT); const uint64_t key_tbl_size = (uint64_t) key_entry_size * num_key_slots; k = rte_zmalloc_socket(NULL, key_tbl_size, @@ -200,6 +318,14 @@ rte_hash_create(const struct rte_hash_parameters *params) goto err_unlock; } + tbl_chng_cnt = rte_zmalloc_socket(NULL, sizeof(uint32_t), + RTE_CACHE_LINE_SIZE, params->socket_id); + + if (tbl_chng_cnt == NULL) { + RTE_LOG(ERR, HASH, "memory allocation failed\n"); + goto err_unlock; + } + /* * If x86 architecture is used, select appropriate compare function, * which may use x86 intrinsics, otherwise use memcmp @@ -239,7 +365,7 @@ rte_hash_create(const struct rte_hash_parameters *params) h->cmp_jump_table_idx = KEY_OTHER_BYTES; #endif - if (multi_writer_support) { + if (use_local_cache) { h->local_free_slots = rte_zmalloc_socket(NULL, sizeof(struct lcore_cache) * RTE_MAX_LCORE, RTE_CACHE_LINE_SIZE, params->socket_id); @@ -262,27 +388,34 @@ rte_hash_create(const struct rte_hash_parameters *params) h->num_buckets = num_buckets; h->bucket_bitmask = h->num_buckets - 1; h->buckets = buckets; + h->buckets_ext = buckets_ext; + h->free_ext_bkts = r_ext; h->hash_func = (params->hash_func == NULL) ? 
default_hash_func : params->hash_func; h->key_store = k; h->free_slots = r; + h->tbl_chng_cnt = tbl_chng_cnt; + *h->tbl_chng_cnt = 0; h->hw_trans_mem_support = hw_trans_mem_support; - h->multi_writer_support = multi_writer_support; + h->use_local_cache = use_local_cache; h->readwrite_concur_support = readwrite_concur_support; + h->ext_table_support = ext_table_support; + h->writer_takes_lock = writer_takes_lock; + h->no_free_on_del = no_free_on_del; + h->readwrite_concur_lf_support = readwrite_concur_lf_support; #if defined(RTE_ARCH_X86) - if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) - h->sig_cmp_fn = RTE_HASH_COMPARE_AVX2; - else if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE2)) + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE2)) h->sig_cmp_fn = RTE_HASH_COMPARE_SSE; else #endif h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR; - /* Turn on multi-writer only with explicit flag from user and TM - * support. + /* Writer threads need to take the lock when: + * 1) RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY is enabled OR + * 2) RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD is enabled */ - if (h->multi_writer_support) { + if (h->writer_takes_lock) { h->readwrite_lock = rte_malloc(NULL, sizeof(rte_rwlock_t), RTE_CACHE_LINE_SIZE); if (h->readwrite_lock == NULL) @@ -304,10 +437,13 @@ err_unlock: rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); err: rte_ring_free(r); + rte_ring_free(r_ext); rte_free(te); rte_free(h); rte_free(buckets); + rte_free(buckets_ext); rte_free(k); + rte_free(tbl_chng_cnt); return NULL; } @@ -339,13 +475,16 @@ rte_hash_free(struct rte_hash *h) rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); - if (h->multi_writer_support) { + if (h->use_local_cache) rte_free(h->local_free_slots); + if (h->writer_takes_lock) rte_free(h->readwrite_lock); - } rte_ring_free(h->free_slots); + rte_ring_free(h->free_ext_bkts); rte_free(h->key_store); rte_free(h->buckets); + rte_free(h->buckets_ext); + rte_free(h->tbl_chng_cnt); rte_free(h); rte_free(te); } @@ -357,18 +496,6 @@ rte_hash_hash(const struct rte_hash *h, const void *key) return h->hash_func(key, h->key_len, h->hash_func_init_val); } -/* Calc the secondary hash value from the primary hash value of a given key */ -static inline hash_sig_t -rte_hash_secondary_hash(const hash_sig_t primary_hash) -{ - static const unsigned all_bits_shift = 12; - static const unsigned alt_bits_xor = 0x5bd1e995; - - uint32_t tag = primary_hash >> all_bits_shift; - - return primary_hash ^ ((tag + 1) * alt_bits_xor); -} - int32_t rte_hash_count(const struct rte_hash *h) { @@ -378,7 +505,7 @@ rte_hash_count(const struct rte_hash *h) if (h == NULL) return -EINVAL; - if (h->multi_writer_support) { + if (h->use_local_cache) { tot_ring_cnt = h->entries + (RTE_MAX_LCORE - 1) * (LCORE_CACHE_SIZE - 1); for (i = 0; i < RTE_MAX_LCORE; i++) @@ -397,13 +524,12 @@ rte_hash_count(const struct rte_hash *h) static inline void __hash_rw_writer_lock(const struct rte_hash *h) { - if (h->multi_writer_support && h->hw_trans_mem_support) + if (h->writer_takes_lock && h->hw_trans_mem_support) rte_rwlock_write_lock_tm(h->readwrite_lock); - else if (h->multi_writer_support) + else if (h->writer_takes_lock) rte_rwlock_write_lock(h->readwrite_lock); } - static inline void __hash_rw_reader_lock(const struct rte_hash *h) { @@ -416,9 +542,9 @@ __hash_rw_reader_lock(const struct rte_hash *h) static inline void __hash_rw_writer_unlock(const struct rte_hash *h) { - if (h->multi_writer_support && h->hw_trans_mem_support) + if (h->writer_takes_lock && h->hw_trans_mem_support) rte_rwlock_write_unlock_tm(h->readwrite_lock); - 
else if (h->multi_writer_support) + else if (h->writer_takes_lock) rte_rwlock_write_unlock(h->readwrite_lock); } @@ -443,13 +569,22 @@ rte_hash_reset(struct rte_hash *h) __hash_rw_writer_lock(h); memset(h->buckets, 0, h->num_buckets * sizeof(struct rte_hash_bucket)); memset(h->key_store, 0, h->key_entry_size * (h->entries + 1)); + *h->tbl_chng_cnt = 0; /* clear the free ring */ while (rte_ring_dequeue(h->free_slots, &ptr) == 0) - rte_pause(); + continue; + + /* clear free extendable bucket ring and memory */ + if (h->ext_table_support) { + memset(h->buckets_ext, 0, h->num_buckets * + sizeof(struct rte_hash_bucket)); + while (rte_ring_dequeue(h->free_ext_bkts, &ptr) == 0) + continue; + } /* Repopulate the free slots ring. Entry zero is reserved for key misses */ - if (h->multi_writer_support) + if (h->use_local_cache) tot_ring_cnt = h->entries + (RTE_MAX_LCORE - 1) * (LCORE_CACHE_SIZE - 1); else @@ -458,7 +593,14 @@ rte_hash_reset(struct rte_hash *h) for (i = 1; i < tot_ring_cnt + 1; i++) rte_ring_sp_enqueue(h->free_slots, (void *)((uintptr_t) i)); - if (h->multi_writer_support) { + /* Repopulate the free ext bkt ring. */ + if (h->ext_table_support) { + for (i = 1; i <= h->num_buckets; i++) + rte_ring_sp_enqueue(h->free_ext_bkts, + (void *)((uintptr_t) i)); + } + + if (h->use_local_cache) { /* Reset local caches per lcore */ for (i = 0; i < RTE_MAX_LCORE; i++) h->local_free_slots[i].len = 0; @@ -476,29 +618,35 @@ enqueue_slot_back(const struct rte_hash *h, struct lcore_cache *cached_free_slots, void *slot_id) { - if (h->multi_writer_support) { + if (h->use_local_cache) { cached_free_slots->objs[cached_free_slots->len] = slot_id; cached_free_slots->len++; } else rte_ring_sp_enqueue(h->free_slots, slot_id); } -/* Search a key from bucket and update its data */ +/* Search a key from bucket and update its data. + * Writer holds the lock before calling this. + */ static inline int32_t search_and_update(const struct rte_hash *h, void *data, const void *key, - struct rte_hash_bucket *bkt, hash_sig_t sig, hash_sig_t alt_hash) + struct rte_hash_bucket *bkt, uint16_t sig) { int i; struct rte_hash_key *k, *keys = h->key_store; for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - if (bkt->sig_current[i] == sig && - bkt->sig_alt[i] == alt_hash) { + if (bkt->sig_current[i] == sig) { k = (struct rte_hash_key *) ((char *)keys + bkt->key_idx[i] * h->key_entry_size); if (rte_hash_cmp_eq(key, k->key, h) == 0) { - /* Update data */ - k->pdata = data; + /* 'pdata' acts as the synchronization point + * when an existing hash entry is updated. + * Key is not updated in this case. + */ + __atomic_store_n(&k->pdata, + data, + __ATOMIC_RELEASE); /* * Return index where key is stored, * subtracting the first dummy index @@ -520,28 +668,31 @@ rte_hash_cuckoo_insert_mw(const struct rte_hash *h, struct rte_hash_bucket *prim_bkt, struct rte_hash_bucket *sec_bkt, const struct rte_hash_key *key, void *data, - hash_sig_t sig, hash_sig_t alt_hash, uint32_t new_idx, + uint16_t sig, uint32_t new_idx, int32_t *ret_val) { unsigned int i; - struct rte_hash_bucket *cur_bkt = prim_bkt; + struct rte_hash_bucket *cur_bkt; int32_t ret; __hash_rw_writer_lock(h); /* Check if key was inserted after last check but before this * protected region in case of inserting duplicated keys. 
*/ - ret = search_and_update(h, data, key, cur_bkt, sig, alt_hash); + ret = search_and_update(h, data, key, prim_bkt, sig); if (ret != -1) { __hash_rw_writer_unlock(h); *ret_val = ret; return 1; } - ret = search_and_update(h, data, key, sec_bkt, alt_hash, sig); - if (ret != -1) { - __hash_rw_writer_unlock(h); - *ret_val = ret; - return 1; + + FOR_EACH_BUCKET(cur_bkt, sec_bkt) { + ret = search_and_update(h, data, key, cur_bkt, sig); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; + return 1; + } } /* Insert new entry if there is room in the primary @@ -551,8 +702,15 @@ rte_hash_cuckoo_insert_mw(const struct rte_hash *h, /* Check if slot is available */ if (likely(prim_bkt->key_idx[i] == EMPTY_SLOT)) { prim_bkt->sig_current[i] = sig; - prim_bkt->sig_alt[i] = alt_hash; - prim_bkt->key_idx[i] = new_idx; + /* Key can be of arbitrary length, so it is + * not possible to store it atomically. + * Hence the new key element's memory stores + * (key as well as data) should be complete + * before it is referenced. + */ + __atomic_store_n(&prim_bkt->key_idx[i], + new_idx, + __ATOMIC_RELEASE); break; } } @@ -576,11 +734,11 @@ rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h, struct rte_hash_bucket *alt_bkt, const struct rte_hash_key *key, void *data, struct queue_node *leaf, uint32_t leaf_slot, - hash_sig_t sig, hash_sig_t alt_hash, uint32_t new_idx, + uint16_t sig, uint32_t new_idx, int32_t *ret_val) { uint32_t prev_alt_bkt_idx; - struct rte_hash_bucket *cur_bkt = bkt; + struct rte_hash_bucket *cur_bkt; struct queue_node *prev_node, *curr_node = leaf; struct rte_hash_bucket *prev_bkt, *curr_bkt = leaf->bkt; uint32_t prev_slot, curr_slot = leaf_slot; @@ -597,18 +755,20 @@ rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h, /* Check if key was inserted after last check but before this * protected region. */ - ret = search_and_update(h, data, key, cur_bkt, sig, alt_hash); + ret = search_and_update(h, data, key, bkt, sig); if (ret != -1) { __hash_rw_writer_unlock(h); *ret_val = ret; return 1; } - ret = search_and_update(h, data, key, alt_bkt, alt_hash, sig); - if (ret != -1) { - __hash_rw_writer_unlock(h); - *ret_val = ret; - return 1; + FOR_EACH_BUCKET(cur_bkt, alt_bkt) { + ret = search_and_update(h, data, key, cur_bkt, sig); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; + return 1; + } } while (likely(curr_node->prev != NULL)) { @@ -616,36 +776,73 @@ rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h, prev_bkt = prev_node->bkt; prev_slot = curr_node->prev_slot; - prev_alt_bkt_idx = - prev_bkt->sig_alt[prev_slot] & h->bucket_bitmask; + prev_alt_bkt_idx = get_alt_bucket_index(h, + prev_node->cur_bkt_idx, + prev_bkt->sig_current[prev_slot]); if (unlikely(&h->buckets[prev_alt_bkt_idx] != curr_bkt)) { /* revert it to empty, otherwise duplicated keys */ - curr_bkt->key_idx[curr_slot] = EMPTY_SLOT; + __atomic_store_n(&curr_bkt->key_idx[curr_slot], + EMPTY_SLOT, + __ATOMIC_RELEASE); __hash_rw_writer_unlock(h); return -1; } + if (h->readwrite_concur_lf_support) { + /* Inform the previous move. The current move need + * not be informed now as the current bucket entry + * is present in both primary and secondary. + * Since there is one writer, load acquires on + * tbl_chng_cnt are not required. + */ + __atomic_store_n(h->tbl_chng_cnt, + *h->tbl_chng_cnt + 1, + __ATOMIC_RELEASE); + /* The stores to sig_alt and sig_current should not + * move above the store to tbl_chng_cnt. 
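+ * In effect tbl_chng_cnt behaves like a seqlock sequence counter: the + * writer publishes an increment (with release semantics) before it + * relocates an entry, and the lock-free reader samples the counter with + * acquire semantics before and after its search, retrying the lookup + * whenever the two samples differ.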
+ */ + __atomic_thread_fence(__ATOMIC_RELEASE); + } + /* Need to swap current/alt sig to allow later * Cuckoo insert to move elements back to its * primary bucket if available */ - curr_bkt->sig_alt[curr_slot] = - prev_bkt->sig_current[prev_slot]; curr_bkt->sig_current[curr_slot] = - prev_bkt->sig_alt[prev_slot]; - curr_bkt->key_idx[curr_slot] = - prev_bkt->key_idx[prev_slot]; + prev_bkt->sig_current[prev_slot]; + /* Release the updated bucket entry */ + __atomic_store_n(&curr_bkt->key_idx[curr_slot], + prev_bkt->key_idx[prev_slot], + __ATOMIC_RELEASE); curr_slot = prev_slot; curr_node = prev_node; curr_bkt = curr_node->bkt; } + if (h->readwrite_concur_lf_support) { + /* Inform the previous move. The current move need + * not be informed now as the current bucket entry + * is present in both primary and secondary. + * Since there is one writer, load acquires on + * tbl_chng_cnt are not required. + */ + __atomic_store_n(h->tbl_chng_cnt, + *h->tbl_chng_cnt + 1, + __ATOMIC_RELEASE); + /* The stores to sig_alt and sig_current should not + * move above the store to tbl_chng_cnt. + */ + __atomic_thread_fence(__ATOMIC_RELEASE); + } + curr_bkt->sig_current[curr_slot] = sig; - curr_bkt->sig_alt[curr_slot] = alt_hash; - curr_bkt->key_idx[curr_slot] = new_idx; + /* Release the new bucket entry */ + __atomic_store_n(&curr_bkt->key_idx[curr_slot], + new_idx, + __ATOMIC_RELEASE); __hash_rw_writer_unlock(h); @@ -662,39 +859,44 @@ rte_hash_cuckoo_make_space_mw(const struct rte_hash *h, struct rte_hash_bucket *bkt, struct rte_hash_bucket *sec_bkt, const struct rte_hash_key *key, void *data, - hash_sig_t sig, hash_sig_t alt_hash, + uint16_t sig, uint32_t bucket_idx, uint32_t new_idx, int32_t *ret_val) { unsigned int i; struct queue_node queue[RTE_HASH_BFS_QUEUE_MAX_LEN]; struct queue_node *tail, *head; struct rte_hash_bucket *curr_bkt, *alt_bkt; + uint32_t cur_idx, alt_idx; tail = queue; head = queue + 1; tail->bkt = bkt; tail->prev = NULL; tail->prev_slot = -1; + tail->cur_bkt_idx = bucket_idx; /* Cuckoo bfs Search */ while (likely(tail != head && head < queue + RTE_HASH_BFS_QUEUE_MAX_LEN - RTE_HASH_BUCKET_ENTRIES)) { curr_bkt = tail->bkt; + cur_idx = tail->cur_bkt_idx; for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { if (curr_bkt->key_idx[i] == EMPTY_SLOT) { int32_t ret = rte_hash_cuckoo_move_insert_mw(h, bkt, sec_bkt, key, data, - tail, i, sig, alt_hash, + tail, i, sig, new_idx, ret_val); if (likely(ret != -1)) return ret; } /* Enqueue new node and keep prev node info */ - alt_bkt = &(h->buckets[curr_bkt->sig_alt[i] - & h->bucket_bitmask]); + alt_idx = get_alt_bucket_index(h, cur_idx, + curr_bkt->sig_current[i]); + alt_bkt = &(h->buckets[alt_idx]); head->bkt = alt_bkt; + head->cur_bkt_idx = alt_idx; head->prev = tail; head->prev_slot = i; head++; @@ -709,45 +911,50 @@ static inline int32_t __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, hash_sig_t sig, void *data) { - hash_sig_t alt_hash; + uint16_t short_sig; uint32_t prim_bucket_idx, sec_bucket_idx; - struct rte_hash_bucket *prim_bkt, *sec_bkt; + struct rte_hash_bucket *prim_bkt, *sec_bkt, *cur_bkt; struct rte_hash_key *new_k, *keys = h->key_store; void *slot_id = NULL; - uint32_t new_idx; + void *ext_bkt_id = NULL; + uint32_t new_idx, bkt_id; int ret; unsigned n_slots; unsigned lcore_id; + unsigned int i; struct lcore_cache *cached_free_slots = NULL; int32_t ret_val; + struct rte_hash_bucket *last; - prim_bucket_idx = sig & h->bucket_bitmask; + short_sig = get_short_sig(sig); + prim_bucket_idx = get_prim_bucket_index(h, sig); + 
sec_bucket_idx = get_alt_bucket_index(h, prim_bucket_idx, short_sig); prim_bkt = &h->buckets[prim_bucket_idx]; - rte_prefetch0(prim_bkt); - - alt_hash = rte_hash_secondary_hash(sig); - sec_bucket_idx = alt_hash & h->bucket_bitmask; sec_bkt = &h->buckets[sec_bucket_idx]; + rte_prefetch0(prim_bkt); rte_prefetch0(sec_bkt); /* Check if key is already inserted in primary location */ __hash_rw_writer_lock(h); - ret = search_and_update(h, data, key, prim_bkt, sig, alt_hash); + ret = search_and_update(h, data, key, prim_bkt, short_sig); if (ret != -1) { __hash_rw_writer_unlock(h); return ret; } /* Check if key is already inserted in secondary location */ - ret = search_and_update(h, data, key, sec_bkt, alt_hash, sig); - if (ret != -1) { - __hash_rw_writer_unlock(h); - return ret; + FOR_EACH_BUCKET(cur_bkt, sec_bkt) { + ret = search_and_update(h, data, key, cur_bkt, short_sig); + if (ret != -1) { + __hash_rw_writer_unlock(h); + return ret; + } } + __hash_rw_writer_unlock(h); /* Did not find a match, so get a new slot for storing the new key */ - if (h->multi_writer_support) { + if (h->use_local_cache) { lcore_id = rte_lcore_id(); cached_free_slots = &h->local_free_slots[lcore_id]; /* Try to get a free slot from the local cache */ @@ -776,12 +983,19 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, new_idx = (uint32_t)((uintptr_t) slot_id); /* Copy key */ rte_memcpy(new_k->key, key, h->key_len); - new_k->pdata = data; - + /* Key can be of arbitrary length, so it is not possible to store + * it atomically. Hence the new key element's memory stores + * (key as well as data) should be complete before it is referenced. + * 'pdata' acts as the synchronization point when an existing hash + * entry is updated. + */ + __atomic_store_n(&new_k->pdata, + data, + __ATOMIC_RELEASE); /* Find an empty slot and insert */ ret = rte_hash_cuckoo_insert_mw(h, prim_bkt, sec_bkt, key, data, - sig, alt_hash, new_idx, &ret_val); + short_sig, new_idx, &ret_val); if (ret == 0) return new_idx - 1; else if (ret == 1) { @@ -791,7 +1005,7 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, /* Primary bucket full, need to make space for new entry */ ret = rte_hash_cuckoo_make_space_mw(h, prim_bkt, sec_bkt, key, data, - sig, alt_hash, new_idx, &ret_val); + short_sig, prim_bucket_idx, new_idx, &ret_val); if (ret == 0) return new_idx - 1; else if (ret == 1) { @@ -801,17 +1015,75 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, /* Also search secondary bucket to get better occupancy */ ret = rte_hash_cuckoo_make_space_mw(h, sec_bkt, prim_bkt, key, data, - alt_hash, sig, new_idx, &ret_val); + short_sig, sec_bucket_idx, new_idx, &ret_val); if (ret == 0) return new_idx - 1; else if (ret == 1) { enqueue_slot_back(h, cached_free_slots, slot_id); return ret_val; - } else { + } + + /* if ext table not enabled, we failed the insertion */ + if (!h->ext_table_support) { enqueue_slot_back(h, cached_free_slots, slot_id); return ret; } + + /* Now we need to go through the extendable bucket. Protection is needed + * to protect all extendable bucket processes. 
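+ * (Extendable buckets are rejected at creation time when lock-free + * read/write concurrency is requested, so writers on this path can rely + * on the writer lock alone.)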
+ */ + __hash_rw_writer_lock(h); + /* We check for duplicates again since could be inserted before the lock */ + ret = search_and_update(h, data, key, prim_bkt, short_sig); + if (ret != -1) { + enqueue_slot_back(h, cached_free_slots, slot_id); + goto failure; + } + + FOR_EACH_BUCKET(cur_bkt, sec_bkt) { + ret = search_and_update(h, data, key, cur_bkt, short_sig); + if (ret != -1) { + enqueue_slot_back(h, cached_free_slots, slot_id); + goto failure; + } + } + + /* Search sec and ext buckets to find an empty entry to insert. */ + FOR_EACH_BUCKET(cur_bkt, sec_bkt) { + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + /* Check if slot is available */ + if (likely(cur_bkt->key_idx[i] == EMPTY_SLOT)) { + cur_bkt->sig_current[i] = short_sig; + cur_bkt->key_idx[i] = new_idx; + __hash_rw_writer_unlock(h); + return new_idx - 1; + } + } + } + + /* Failed to get an empty entry from extendable buckets. Link a new + * extendable bucket. We first get a free bucket from ring. + */ + if (rte_ring_sc_dequeue(h->free_ext_bkts, &ext_bkt_id) != 0) { + ret = -ENOSPC; + goto failure; + } + + bkt_id = (uint32_t)((uintptr_t)ext_bkt_id) - 1; + /* Use the first location of the new bucket */ + (h->buckets_ext[bkt_id]).sig_current[0] = short_sig; + (h->buckets_ext[bkt_id]).key_idx[0] = new_idx; + /* Link the new bucket to sec bucket linked list */ + last = rte_hash_get_last_bkt(sec_bkt); + last->next = &h->buckets_ext[bkt_id]; + __hash_rw_writer_unlock(h); + return new_idx - 1; + +failure: + __hash_rw_writer_unlock(h); + return ret; + } int32_t @@ -859,25 +1131,31 @@ rte_hash_add_key_data(const struct rte_hash *h, const void *key, void *data) /* Search one bucket to find the match key */ static inline int32_t -search_one_bucket(const struct rte_hash *h, const void *key, hash_sig_t sig, +search_one_bucket(const struct rte_hash *h, const void *key, uint16_t sig, void **data, const struct rte_hash_bucket *bkt) { int i; + uint32_t key_idx; + void *pdata; struct rte_hash_key *k, *keys = h->key_store; for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - if (bkt->sig_current[i] == sig && - bkt->key_idx[i] != EMPTY_SLOT) { + key_idx = __atomic_load_n(&bkt->key_idx[i], + __ATOMIC_ACQUIRE); + if (bkt->sig_current[i] == sig && key_idx != EMPTY_SLOT) { k = (struct rte_hash_key *) ((char *)keys + - bkt->key_idx[i] * h->key_entry_size); + key_idx * h->key_entry_size); + pdata = __atomic_load_n(&k->pdata, + __ATOMIC_ACQUIRE); + if (rte_hash_cmp_eq(key, k->key, h) == 0) { if (data != NULL) - *data = k->pdata; + *data = pdata; /* * Return index where key is stored, * subtracting the first dummy index */ - return bkt->key_idx[i] - 1; + return key_idx - 1; } } } @@ -888,34 +1166,64 @@ static inline int32_t __rte_hash_lookup_with_hash(const struct rte_hash *h, const void *key, hash_sig_t sig, void **data) { - uint32_t bucket_idx; - hash_sig_t alt_hash; - struct rte_hash_bucket *bkt; + uint32_t prim_bucket_idx, sec_bucket_idx; + struct rte_hash_bucket *bkt, *cur_bkt; + uint32_t cnt_b, cnt_a; int ret; + uint16_t short_sig; - bucket_idx = sig & h->bucket_bitmask; - bkt = &h->buckets[bucket_idx]; + short_sig = get_short_sig(sig); + prim_bucket_idx = get_prim_bucket_index(h, sig); + sec_bucket_idx = get_alt_bucket_index(h, prim_bucket_idx, short_sig); __hash_rw_reader_lock(h); - /* Check if key is in primary location */ - ret = search_one_bucket(h, key, sig, data, bkt); - if (ret != -1) { - __hash_rw_reader_unlock(h); - return ret; - } - /* Calculate secondary hash */ - alt_hash = rte_hash_secondary_hash(sig); - bucket_idx = alt_hash & 
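The insertion path above falls back to extendable buckets when both cuckoo buckets and the BFS displacement fail: extra buckets, taken from the free_ext_bkts ring, are chained off the secondary bucket and walked with FOR_EACH_BUCKET()/rte_hash_get_last_bkt(). A standalone sketch of that chain handling, with illustrative toy_* names rather than the patch's types:

#include <stdint.h>
#include <stddef.h>

#define TOY_BUCKET_ENTRIES 8
#define TOY_EMPTY 0

struct toy_bucket {
	uint16_t sig[TOY_BUCKET_ENTRIES];
	uint32_t key_idx[TOY_BUCKET_ENTRIES];
	struct toy_bucket *next;	/* chain of extendable buckets */
};

/* Look for an empty entry in 'bkt' or in any bucket chained behind it. */
static int
toy_find_empty(struct toy_bucket *bkt, struct toy_bucket **where)
{
	struct toy_bucket *cur;
	int i;

	for (cur = bkt; cur != NULL; cur = cur->next) {
		for (i = 0; i < TOY_BUCKET_ENTRIES; i++) {
			if (cur->key_idx[i] == TOY_EMPTY) {
				*where = cur;
				return i;
			}
		}
	}
	return -1;	/* chain is full */
}

/* When the chain is full, append a fresh bucket at the tail (the patch
 * dequeues it from the free_ext_bkts ring first).
 */
static void
toy_link_new(struct toy_bucket *bkt, struct toy_bucket *fresh)
{
	struct toy_bucket *last = bkt;

	while (last->next != NULL)
		last = last->next;
	fresh->next = NULL;
	last->next = fresh;
}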
h->bucket_bitmask; - bkt = &h->buckets[bucket_idx]; + do { + /* Load the table change counter before the lookup + * starts. Acquire semantics will make sure that + * loads in search_one_bucket are not hoisted. + */ + cnt_b = __atomic_load_n(h->tbl_chng_cnt, + __ATOMIC_ACQUIRE); + + /* Check if key is in primary location */ + bkt = &h->buckets[prim_bucket_idx]; + ret = search_one_bucket(h, key, short_sig, data, bkt); + if (ret != -1) { + __hash_rw_reader_unlock(h); + return ret; + } + /* Calculate secondary hash */ + bkt = &h->buckets[sec_bucket_idx]; + + /* Check if key is in secondary location */ + FOR_EACH_BUCKET(cur_bkt, bkt) { + ret = search_one_bucket(h, key, short_sig, + data, cur_bkt); + if (ret != -1) { + __hash_rw_reader_unlock(h); + return ret; + } + } + + /* The loads of sig_current in search_one_bucket + * should not move below the load from tbl_chng_cnt. + */ + __atomic_thread_fence(__ATOMIC_ACQUIRE); + /* Re-read the table change counter to check if the + * table has changed during search. If yes, re-do + * the search. + * This load should not get hoisted. The load + * acquires on cnt_b, key index in primary bucket + * and key index in secondary bucket will make sure + * that it does not get hoisted. + */ + cnt_a = __atomic_load_n(h->tbl_chng_cnt, + __ATOMIC_ACQUIRE); + } while (cnt_b != cnt_a); - /* Check if key is in secondary location */ - ret = search_one_bucket(h, key, alt_hash, data, bkt); - if (ret != -1) { - __hash_rw_reader_unlock(h); - return ret; - } __hash_rw_reader_unlock(h); + return -ENOENT; } @@ -955,9 +1263,7 @@ remove_entry(const struct rte_hash *h, struct rte_hash_bucket *bkt, unsigned i) unsigned lcore_id, n_slots; struct lcore_cache *cached_free_slots; - bkt->sig_current[i] = NULL_SIGNATURE; - bkt->sig_alt[i] = NULL_SIGNATURE; - if (h->multi_writer_support) { + if (h->use_local_cache) { lcore_id = rte_lcore_id(); cached_free_slots = &h->local_free_slots[lcore_id]; /* Cache full, need to free it. */ @@ -978,31 +1284,67 @@ remove_entry(const struct rte_hash *h, struct rte_hash_bucket *bkt, unsigned i) } } -/* Search one bucket and remove the matched key */ +/* Compact the linked list by moving key from last entry in linked list to the + * empty slot. + */ +static inline void +__rte_hash_compact_ll(struct rte_hash_bucket *cur_bkt, int pos) { + int i; + struct rte_hash_bucket *last_bkt; + + if (!cur_bkt->next) + return; + + last_bkt = rte_hash_get_last_bkt(cur_bkt); + + for (i = RTE_HASH_BUCKET_ENTRIES - 1; i >= 0; i--) { + if (last_bkt->key_idx[i] != EMPTY_SLOT) { + cur_bkt->key_idx[pos] = last_bkt->key_idx[i]; + cur_bkt->sig_current[pos] = last_bkt->sig_current[i]; + last_bkt->sig_current[i] = NULL_SIGNATURE; + last_bkt->key_idx[i] = EMPTY_SLOT; + return; + } + } +} + +/* Search one bucket and remove the matched key. + * Writer is expected to hold the lock while calling this + * function. 
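The lookup rewrite above is the reader half of the lock-free scheme: the table-change counter is loaded with acquire semantics before the buckets are searched and re-read afterwards, and a miss is only reported when the counter did not move in between, since a concurrent displacement could have hidden the key. A standalone sketch of the same retry loop (toy_* names are illustrative, and toy_search_buckets is a hypothetical helper standing in for the primary/secondary/chained bucket scans):

#include <stdint.h>

/* Counter published by the single writer (see the earlier writer sketch). */
static uint32_t toy_tbl_chng;

/* Hypothetical helper: returns the key-store index on a hit, -1 on a miss. */
extern int toy_search_buckets(const void *key);

static int
toy_lookup(const void *key)
{
	uint32_t cnt_before, cnt_after;
	int ret;

	do {
		/* Acquire: bucket loads below cannot be hoisted above this. */
		cnt_before = __atomic_load_n(&toy_tbl_chng, __ATOMIC_ACQUIRE);

		ret = toy_search_buckets(key);
		if (ret >= 0)
			return ret;	/* hit */

		/* Keep the bucket loads from sinking below the re-read. */
		__atomic_thread_fence(__ATOMIC_ACQUIRE);
		cnt_after = __atomic_load_n(&toy_tbl_chng, __ATOMIC_ACQUIRE);
	} while (cnt_before != cnt_after);	/* table moved: retry */

	return -1;	/* genuinely absent */
}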
+ */ static inline int32_t search_and_remove(const struct rte_hash *h, const void *key, - struct rte_hash_bucket *bkt, hash_sig_t sig) + struct rte_hash_bucket *bkt, uint16_t sig, int *pos) { struct rte_hash_key *k, *keys = h->key_store; unsigned int i; - int32_t ret; + uint32_t key_idx; - /* Check if key is in primary location */ + /* Check if key is in bucket */ for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - if (bkt->sig_current[i] == sig && - bkt->key_idx[i] != EMPTY_SLOT) { + key_idx = __atomic_load_n(&bkt->key_idx[i], + __ATOMIC_ACQUIRE); + if (bkt->sig_current[i] == sig && key_idx != EMPTY_SLOT) { k = (struct rte_hash_key *) ((char *)keys + - bkt->key_idx[i] * h->key_entry_size); + key_idx * h->key_entry_size); if (rte_hash_cmp_eq(key, k->key, h) == 0) { - remove_entry(h, bkt, i); + bkt->sig_current[i] = NULL_SIGNATURE; + /* Free the key store index if + * no_free_on_del is disabled. + */ + if (!h->no_free_on_del) + remove_entry(h, bkt, i); + + __atomic_store_n(&bkt->key_idx[i], + EMPTY_SLOT, + __ATOMIC_RELEASE); + *pos = i; /* * Return index where key is stored, * subtracting the first dummy index */ - ret = bkt->key_idx[i] - 1; - bkt->key_idx[i] = EMPTY_SLOT; - return ret; + return key_idx - 1; } } } @@ -1013,36 +1355,68 @@ static inline int32_t __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key, hash_sig_t sig) { - uint32_t bucket_idx; - hash_sig_t alt_hash; - struct rte_hash_bucket *bkt; - int32_t ret; - - bucket_idx = sig & h->bucket_bitmask; - bkt = &h->buckets[bucket_idx]; + uint32_t prim_bucket_idx, sec_bucket_idx; + struct rte_hash_bucket *prim_bkt, *sec_bkt, *prev_bkt, *last_bkt; + struct rte_hash_bucket *cur_bkt; + int pos; + int32_t ret, i; + uint16_t short_sig; + + short_sig = get_short_sig(sig); + prim_bucket_idx = get_prim_bucket_index(h, sig); + sec_bucket_idx = get_alt_bucket_index(h, prim_bucket_idx, short_sig); + prim_bkt = &h->buckets[prim_bucket_idx]; __hash_rw_writer_lock(h); /* look for key in primary bucket */ - ret = search_and_remove(h, key, bkt, sig); + ret = search_and_remove(h, key, prim_bkt, short_sig, &pos); if (ret != -1) { - __hash_rw_writer_unlock(h); - return ret; + __rte_hash_compact_ll(prim_bkt, pos); + last_bkt = prim_bkt->next; + prev_bkt = prim_bkt; + goto return_bkt; } /* Calculate secondary hash */ - alt_hash = rte_hash_secondary_hash(sig); - bucket_idx = alt_hash & h->bucket_bitmask; - bkt = &h->buckets[bucket_idx]; + sec_bkt = &h->buckets[sec_bucket_idx]; - /* look for key in secondary bucket */ - ret = search_and_remove(h, key, bkt, alt_hash); - if (ret != -1) { + FOR_EACH_BUCKET(cur_bkt, sec_bkt) { + ret = search_and_remove(h, key, cur_bkt, short_sig, &pos); + if (ret != -1) { + __rte_hash_compact_ll(cur_bkt, pos); + last_bkt = sec_bkt->next; + prev_bkt = sec_bkt; + goto return_bkt; + } + } + + __hash_rw_writer_unlock(h); + return -ENOENT; + +/* Search last bucket to see if empty to be recycled */ +return_bkt: + if (!last_bkt) { __hash_rw_writer_unlock(h); return ret; } + while (last_bkt->next) { + prev_bkt = last_bkt; + last_bkt = last_bkt->next; + } + + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + if (last_bkt->key_idx[i] != EMPTY_SLOT) + break; + } + /* found empty bucket and recycle */ + if (i == RTE_HASH_BUCKET_ENTRIES) { + prev_bkt->next = last_bkt->next = NULL; + uint32_t index = last_bkt - h->buckets_ext + 1; + rte_ring_sp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index); + } __hash_rw_writer_unlock(h); - return -ENOENT; + return ret; } int32_t @@ -1080,59 +1454,76 @@ 
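Deletion now compacts the extendable-bucket chain: after an entry is removed, __rte_hash_compact_ll() pulls the last occupied entry of the chain into the freed slot, and a tail bucket that ends up completely empty is unlinked and returned to the free-bucket ring. A standalone sketch of the compaction step, using an illustrative toy_* layout that mirrors the bucket fields above:

#include <stdint.h>
#include <stddef.h>

#define TOY_BUCKET_ENTRIES 8
#define TOY_EMPTY 0

struct toy_bucket {
	uint16_t sig[TOY_BUCKET_ENTRIES];
	uint32_t key_idx[TOY_BUCKET_ENTRIES];
	struct toy_bucket *next;
};

/* After entry 'pos' of 'bkt' was deleted, refill it from the last occupied
 * entry of the chain so the chain stays dense and later walks can stop early.
 */
static void
toy_compact(struct toy_bucket *bkt, int pos)
{
	struct toy_bucket *last = bkt;
	int i;

	if (bkt->next == NULL)
		return;			/* nothing chained, nothing to pull */

	while (last->next != NULL)	/* find the tail bucket of the chain */
		last = last->next;

	for (i = TOY_BUCKET_ENTRIES - 1; i >= 0; i--) {
		if (last->key_idx[i] != TOY_EMPTY) {
			bkt->key_idx[pos] = last->key_idx[i];
			bkt->sig[pos] = last->sig[i];
			last->key_idx[i] = TOY_EMPTY;
			last->sig[i] = 0;
			return;
		}
	}
	/* Falling through means the tail bucket was already empty; the delete
	 * path above then unlinks it and recycles it through the ring.
	 */
}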
rte_hash_get_key_with_position(const struct rte_hash *h, const int32_t position, return 0; } +int __rte_experimental +rte_hash_free_key_with_position(const struct rte_hash *h, + const int32_t position) +{ + RETURN_IF_TRUE(((h == NULL) || (position == EMPTY_SLOT)), -EINVAL); + + unsigned int lcore_id, n_slots; + struct lcore_cache *cached_free_slots; + const int32_t total_entries = h->num_buckets * RTE_HASH_BUCKET_ENTRIES; + + /* Out of bounds */ + if (position >= total_entries) + return -EINVAL; + + if (h->use_local_cache) { + lcore_id = rte_lcore_id(); + cached_free_slots = &h->local_free_slots[lcore_id]; + /* Cache full, need to free it. */ + if (cached_free_slots->len == LCORE_CACHE_SIZE) { + /* Need to enqueue the free slots in global ring. */ + n_slots = rte_ring_mp_enqueue_burst(h->free_slots, + cached_free_slots->objs, + LCORE_CACHE_SIZE, NULL); + cached_free_slots->len -= n_slots; + } + /* Put index of new free slot in cache. */ + cached_free_slots->objs[cached_free_slots->len] = + (void *)((uintptr_t)position); + cached_free_slots->len++; + } else { + rte_ring_sp_enqueue(h->free_slots, + (void *)((uintptr_t)position)); + } + + return 0; +} + static inline void compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches, const struct rte_hash_bucket *prim_bkt, const struct rte_hash_bucket *sec_bkt, - hash_sig_t prim_hash, hash_sig_t sec_hash, + uint16_t sig, enum rte_hash_sig_compare_function sig_cmp_fn) { unsigned int i; + /* For match mask the first bit of every two bits indicates the match */ switch (sig_cmp_fn) { -#ifdef RTE_MACHINE_CPUFLAG_AVX2 - case RTE_HASH_COMPARE_AVX2: - *prim_hash_matches = _mm256_movemask_ps((__m256)_mm256_cmpeq_epi32( - _mm256_load_si256( - (__m256i const *)prim_bkt->sig_current), - _mm256_set1_epi32(prim_hash))); - *sec_hash_matches = _mm256_movemask_ps((__m256)_mm256_cmpeq_epi32( - _mm256_load_si256( - (__m256i const *)sec_bkt->sig_current), - _mm256_set1_epi32(sec_hash))); - break; -#endif #ifdef RTE_MACHINE_CPUFLAG_SSE2 case RTE_HASH_COMPARE_SSE: - /* Compare the first 4 signatures in the bucket */ - *prim_hash_matches = _mm_movemask_ps((__m128)_mm_cmpeq_epi16( + /* Compare all signatures in the bucket */ + *prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16( _mm_load_si128( (__m128i const *)prim_bkt->sig_current), - _mm_set1_epi32(prim_hash))); - *prim_hash_matches |= (_mm_movemask_ps((__m128)_mm_cmpeq_epi16( - _mm_load_si128( - (__m128i const *)&prim_bkt->sig_current[4]), - _mm_set1_epi32(prim_hash)))) << 4; - /* Compare the first 4 signatures in the bucket */ - *sec_hash_matches = _mm_movemask_ps((__m128)_mm_cmpeq_epi16( + _mm_set1_epi16(sig))); + /* Compare all signatures in the bucket */ + *sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16( _mm_load_si128( (__m128i const *)sec_bkt->sig_current), - _mm_set1_epi32(sec_hash))); - *sec_hash_matches |= (_mm_movemask_ps((__m128)_mm_cmpeq_epi16( - _mm_load_si128( - (__m128i const *)&sec_bkt->sig_current[4]), - _mm_set1_epi32(sec_hash)))) << 4; + _mm_set1_epi16(sig))); break; #endif default: for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { *prim_hash_matches |= - ((prim_hash == prim_bkt->sig_current[i]) << i); + ((sig == prim_bkt->sig_current[i]) << (i << 1)); *sec_hash_matches |= - ((sec_hash == sec_bkt->sig_current[i]) << i); + ((sig == sec_bkt->sig_current[i]) << (i << 1)); } } - } #define PREFETCH_OFFSET 4 @@ -1143,12 +1534,18 @@ __rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys, { uint64_t hits = 0; int32_t i; + int32_t ret; uint32_t 
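With signatures narrowed to 16 bits, compare_signatures() above compares a whole bucket with _mm_cmpeq_epi16() and extracts the result with _mm_movemask_epi8(), which yields two mask bits per entry; the scalar fallback therefore also spaces hits two bits apart. A standalone sketch of that encoding and of how the callers consume it:

#include <stdint.h>

#define TOY_BUCKET_ENTRIES 8

static uint32_t
toy_hitmask(const uint16_t sig_current[TOY_BUCKET_ENTRIES], uint16_t sig)
{
	uint32_t mask = 0;
	unsigned int i;

	for (i = 0; i < TOY_BUCKET_ENTRIES; i++)
		mask |= (uint32_t)(sig_current[i] == sig) << (i << 1);

	return mask;	/* entry i hit <=> bit (2 * i) set */
}

static unsigned int
toy_first_hit(uint32_t mask)	/* mask must be non-zero */
{
	return __builtin_ctzl(mask) >> 1;	/* bit position -> entry */
}

static uint32_t
toy_clear_hit(uint32_t mask, unsigned int entry)
{
	return mask & ~(3UL << (entry << 1));	/* clear both bits of entry */
}

This is why the single and bulk lookups above convert a mask bit position into an entry index with ">> 1" and clear a consumed hit with ~(3ULL << (hit_index << 1)).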
prim_hash[RTE_HASH_LOOKUP_BULK_MAX]; - uint32_t sec_hash[RTE_HASH_LOOKUP_BULK_MAX]; + uint32_t prim_index[RTE_HASH_LOOKUP_BULK_MAX]; + uint32_t sec_index[RTE_HASH_LOOKUP_BULK_MAX]; + uint16_t sig[RTE_HASH_LOOKUP_BULK_MAX]; const struct rte_hash_bucket *primary_bkt[RTE_HASH_LOOKUP_BULK_MAX]; const struct rte_hash_bucket *secondary_bkt[RTE_HASH_LOOKUP_BULK_MAX]; uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0}; uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0}; + struct rte_hash_bucket *cur_bkt, *next_bkt; + void *pdata[RTE_HASH_LOOKUP_BULK_MAX]; + uint32_t cnt_b, cnt_a; /* Prefetch first keys */ for (i = 0; i < PREFETCH_OFFSET && i < num_keys; i++) @@ -1162,10 +1559,13 @@ __rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys, rte_prefetch0(keys[i + PREFETCH_OFFSET]); prim_hash[i] = rte_hash_hash(h, keys[i]); - sec_hash[i] = rte_hash_secondary_hash(prim_hash[i]); - primary_bkt[i] = &h->buckets[prim_hash[i] & h->bucket_bitmask]; - secondary_bkt[i] = &h->buckets[sec_hash[i] & h->bucket_bitmask]; + sig[i] = get_short_sig(prim_hash[i]); + prim_index[i] = get_prim_bucket_index(h, prim_hash[i]); + sec_index[i] = get_alt_bucket_index(h, prim_index[i], sig[i]); + + primary_bkt[i] = &h->buckets[prim_index[i]]; + secondary_bkt[i] = &h->buckets[sec_index[i]]; rte_prefetch0(primary_bkt[i]); rte_prefetch0(secondary_bkt[i]); @@ -1174,96 +1574,178 @@ __rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys, /* Calculate and prefetch rest of the buckets */ for (; i < num_keys; i++) { prim_hash[i] = rte_hash_hash(h, keys[i]); - sec_hash[i] = rte_hash_secondary_hash(prim_hash[i]); - primary_bkt[i] = &h->buckets[prim_hash[i] & h->bucket_bitmask]; - secondary_bkt[i] = &h->buckets[sec_hash[i] & h->bucket_bitmask]; + sig[i] = get_short_sig(prim_hash[i]); + prim_index[i] = get_prim_bucket_index(h, prim_hash[i]); + sec_index[i] = get_alt_bucket_index(h, prim_index[i], sig[i]); + + primary_bkt[i] = &h->buckets[prim_index[i]]; + secondary_bkt[i] = &h->buckets[sec_index[i]]; rte_prefetch0(primary_bkt[i]); rte_prefetch0(secondary_bkt[i]); } __hash_rw_reader_lock(h); - /* Compare signatures and prefetch key slot of first hit */ - for (i = 0; i < num_keys; i++) { - compare_signatures(&prim_hitmask[i], &sec_hitmask[i], + do { + /* Load the table change counter before the lookup + * starts. Acquire semantics will make sure that + * loads in compare_signatures are not hoisted. 
+ */ + cnt_b = __atomic_load_n(h->tbl_chng_cnt, + __ATOMIC_ACQUIRE); + + /* Compare signatures and prefetch key slot of first hit */ + for (i = 0; i < num_keys; i++) { + compare_signatures(&prim_hitmask[i], &sec_hitmask[i], primary_bkt[i], secondary_bkt[i], - prim_hash[i], sec_hash[i], h->sig_cmp_fn); - - if (prim_hitmask[i]) { - uint32_t first_hit = __builtin_ctzl(prim_hitmask[i]); - uint32_t key_idx = primary_bkt[i]->key_idx[first_hit]; - const struct rte_hash_key *key_slot = - (const struct rte_hash_key *)( - (const char *)h->key_store + - key_idx * h->key_entry_size); - rte_prefetch0(key_slot); - continue; - } + sig[i], h->sig_cmp_fn); + + if (prim_hitmask[i]) { + uint32_t first_hit = + __builtin_ctzl(prim_hitmask[i]) + >> 1; + uint32_t key_idx = + primary_bkt[i]->key_idx[first_hit]; + const struct rte_hash_key *key_slot = + (const struct rte_hash_key *)( + (const char *)h->key_store + + key_idx * h->key_entry_size); + rte_prefetch0(key_slot); + continue; + } - if (sec_hitmask[i]) { - uint32_t first_hit = __builtin_ctzl(sec_hitmask[i]); - uint32_t key_idx = secondary_bkt[i]->key_idx[first_hit]; - const struct rte_hash_key *key_slot = - (const struct rte_hash_key *)( - (const char *)h->key_store + - key_idx * h->key_entry_size); - rte_prefetch0(key_slot); + if (sec_hitmask[i]) { + uint32_t first_hit = + __builtin_ctzl(sec_hitmask[i]) + >> 1; + uint32_t key_idx = + secondary_bkt[i]->key_idx[first_hit]; + const struct rte_hash_key *key_slot = + (const struct rte_hash_key *)( + (const char *)h->key_store + + key_idx * h->key_entry_size); + rte_prefetch0(key_slot); + } } - } - /* Compare keys, first hits in primary first */ - for (i = 0; i < num_keys; i++) { - positions[i] = -ENOENT; - while (prim_hitmask[i]) { - uint32_t hit_index = __builtin_ctzl(prim_hitmask[i]); - - uint32_t key_idx = primary_bkt[i]->key_idx[hit_index]; - const struct rte_hash_key *key_slot = - (const struct rte_hash_key *)( - (const char *)h->key_store + - key_idx * h->key_entry_size); - /* - * If key index is 0, do not compare key, - * as it is checking the dummy slot - */ - if (!!key_idx & !rte_hash_cmp_eq(key_slot->key, keys[i], h)) { - if (data != NULL) - data[i] = key_slot->pdata; + /* Compare keys, first hits in primary first */ + for (i = 0; i < num_keys; i++) { + positions[i] = -ENOENT; + while (prim_hitmask[i]) { + uint32_t hit_index = + __builtin_ctzl(prim_hitmask[i]) + >> 1; + uint32_t key_idx = + __atomic_load_n( + &primary_bkt[i]->key_idx[hit_index], + __ATOMIC_ACQUIRE); + const struct rte_hash_key *key_slot = + (const struct rte_hash_key *)( + (const char *)h->key_store + + key_idx * h->key_entry_size); + + if (key_idx != EMPTY_SLOT) + pdata[i] = __atomic_load_n( + &key_slot->pdata, + __ATOMIC_ACQUIRE); + /* + * If key index is 0, do not compare key, + * as it is checking the dummy slot + */ + if (!!key_idx & + !rte_hash_cmp_eq( + key_slot->key, keys[i], h)) { + if (data != NULL) + data[i] = pdata[i]; + + hits |= 1ULL << i; + positions[i] = key_idx - 1; + goto next_key; + } + prim_hitmask[i] &= ~(3ULL << (hit_index << 1)); + } - hits |= 1ULL << i; - positions[i] = key_idx - 1; - goto next_key; + while (sec_hitmask[i]) { + uint32_t hit_index = + __builtin_ctzl(sec_hitmask[i]) + >> 1; + uint32_t key_idx = + __atomic_load_n( + &secondary_bkt[i]->key_idx[hit_index], + __ATOMIC_ACQUIRE); + const struct rte_hash_key *key_slot = + (const struct rte_hash_key *)( + (const char *)h->key_store + + key_idx * h->key_entry_size); + + if (key_idx != EMPTY_SLOT) + pdata[i] = __atomic_load_n( + &key_slot->pdata, + 
__ATOMIC_ACQUIRE); + /* + * If key index is 0, do not compare key, + * as it is checking the dummy slot + */ + + if (!!key_idx & + !rte_hash_cmp_eq( + key_slot->key, keys[i], h)) { + if (data != NULL) + data[i] = pdata[i]; + + hits |= 1ULL << i; + positions[i] = key_idx - 1; + goto next_key; + } + sec_hitmask[i] &= ~(3ULL << (hit_index << 1)); } - prim_hitmask[i] &= ~(1 << (hit_index)); +next_key: + continue; } - while (sec_hitmask[i]) { - uint32_t hit_index = __builtin_ctzl(sec_hitmask[i]); - - uint32_t key_idx = secondary_bkt[i]->key_idx[hit_index]; - const struct rte_hash_key *key_slot = - (const struct rte_hash_key *)( - (const char *)h->key_store + - key_idx * h->key_entry_size); - /* - * If key index is 0, do not compare key, - * as it is checking the dummy slot - */ - - if (!!key_idx & !rte_hash_cmp_eq(key_slot->key, keys[i], h)) { - if (data != NULL) - data[i] = key_slot->pdata; + /* The loads of sig_current in compare_signatures + * should not move below the load from tbl_chng_cnt. + */ + __atomic_thread_fence(__ATOMIC_ACQUIRE); + /* Re-read the table change counter to check if the + * table has changed during search. If yes, re-do + * the search. + * This load should not get hoisted. The load + * acquires on cnt_b, primary key index and secondary + * key index will make sure that it does not get + * hoisted. + */ + cnt_a = __atomic_load_n(h->tbl_chng_cnt, + __ATOMIC_ACQUIRE); + } while (cnt_b != cnt_a); + + /* all found, do not need to go through ext bkt */ + if ((hits == ((1ULL << num_keys) - 1)) || !h->ext_table_support) { + if (hit_mask != NULL) + *hit_mask = hits; + __hash_rw_reader_unlock(h); + return; + } + /* need to check ext buckets for match */ + for (i = 0; i < num_keys; i++) { + if ((hits & (1ULL << i)) != 0) + continue; + next_bkt = secondary_bkt[i]->next; + FOR_EACH_BUCKET(cur_bkt, next_bkt) { + if (data != NULL) + ret = search_one_bucket(h, keys[i], + sig[i], &data[i], cur_bkt); + else + ret = search_one_bucket(h, keys[i], + sig[i], NULL, cur_bkt); + if (ret != -1) { + positions[i] = ret; hits |= 1ULL << i; - positions[i] = key_idx - 1; - goto next_key; + break; } - sec_hitmask[i] &= ~(1 << (hit_index)); } - -next_key: - continue; } __hash_rw_reader_unlock(h); @@ -1308,27 +1790,30 @@ rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32 RETURN_IF_TRUE(((h == NULL) || (next == NULL)), -EINVAL); - const uint32_t total_entries = h->num_buckets * RTE_HASH_BUCKET_ENTRIES; - /* Out of bounds */ - if (*next >= total_entries) - return -ENOENT; + const uint32_t total_entries_main = h->num_buckets * + RTE_HASH_BUCKET_ENTRIES; + const uint32_t total_entries = total_entries_main << 1; + + /* Out of bounds of all buckets (both main table and ext table) */ + if (*next >= total_entries_main) + goto extend_table; /* Calculate bucket and index of current iterator */ bucket_idx = *next / RTE_HASH_BUCKET_ENTRIES; idx = *next % RTE_HASH_BUCKET_ENTRIES; /* If current position is empty, go to the next one */ - while (h->buckets[bucket_idx].key_idx[idx] == EMPTY_SLOT) { + while ((position = __atomic_load_n(&h->buckets[bucket_idx].key_idx[idx], + __ATOMIC_ACQUIRE)) == EMPTY_SLOT) { (*next)++; /* End of table */ - if (*next == total_entries) - return -ENOENT; + if (*next == total_entries_main) + goto extend_table; bucket_idx = *next / RTE_HASH_BUCKET_ENTRIES; idx = *next % RTE_HASH_BUCKET_ENTRIES; } + __hash_rw_reader_lock(h); - /* Get position of entry in key table */ - position = h->buckets[bucket_idx].key_idx[idx]; next_key = (struct rte_hash_key *) ((char 
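A detail that recurs in all the lookup paths above: key-store index 0 is a dummy slot, so a stored index of 0 means the entry is unused and every position handed back to the caller is key_idx - 1. The "!!key_idx & !rte_hash_cmp_eq(...)" expression folds both checks into one branch-free test; a small standalone illustration, with memcmp standing in for the key comparison:

#include <stdint.h>
#include <string.h>

/* Both operands are 0 or 1, so the bitwise AND behaves like && without a
 * conditional branch: the entry matches only if it is not the dummy slot
 * (key_idx != 0) and the keys compare equal (memcmp returns 0).
 */
static int
toy_entry_matches(uint32_t key_idx, const void *stored_key,
		  const void *lookup_key, size_t key_len)
{
	return !!key_idx & !memcmp(stored_key, lookup_key, key_len);
}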
*)h->key_store + position * h->key_entry_size); /* Return key and data */ @@ -1341,4 +1826,34 @@ rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32 (*next)++; return position - 1; + +/* Begin to iterate extendable buckets */ +extend_table: + /* Out of total bound or if ext bucket feature is not enabled */ + if (*next >= total_entries || !h->ext_table_support) + return -ENOENT; + + bucket_idx = (*next - total_entries_main) / RTE_HASH_BUCKET_ENTRIES; + idx = (*next - total_entries_main) % RTE_HASH_BUCKET_ENTRIES; + + while ((position = h->buckets_ext[bucket_idx].key_idx[idx]) == EMPTY_SLOT) { + (*next)++; + if (*next == total_entries) + return -ENOENT; + bucket_idx = (*next - total_entries_main) / + RTE_HASH_BUCKET_ENTRIES; + idx = (*next - total_entries_main) % RTE_HASH_BUCKET_ENTRIES; + } + __hash_rw_reader_lock(h); + next_key = (struct rte_hash_key *) ((char *)h->key_store + + position * h->key_entry_size); + /* Return key and data */ + *key = next_key->key; + *data = next_key->pdata; + + __hash_rw_reader_unlock(h); + + /* Increment iterator */ + (*next)++; + return position - 1; } diff --git a/lib/librte_hash/rte_cuckoo_hash.h b/lib/librte_hash/rte_cuckoo_hash.h index b43f467d..5dfbbc48 100644 --- a/lib/librte_hash/rte_cuckoo_hash.h +++ b/lib/librte_hash/rte_cuckoo_hash.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright(c) 2016 Intel Corporation + * Copyright(c) 2018 Arm Limited */ /* rte_cuckoo_hash.h @@ -104,8 +105,6 @@ const rte_hash_cmp_eq_t cmp_jump_table[NUM_KEY_CMP_CASES] = { #define LCORE_CACHE_SIZE 64 -#define RTE_HASH_MAX_PUSHES 100 - #define RTE_HASH_BFS_QUEUE_MAX_LEN 1000 #define RTE_XABORT_CUCKOO_PATH_INVALIDED 0x4 @@ -125,25 +124,24 @@ struct rte_hash_key { }; /* Variable key size */ char key[0]; -} __attribute__((aligned(KEY_ALIGNMENT))); +}; /* All different signature compare functions */ enum rte_hash_sig_compare_function { RTE_HASH_COMPARE_SCALAR = 0, RTE_HASH_COMPARE_SSE, - RTE_HASH_COMPARE_AVX2, RTE_HASH_COMPARE_NUM }; /** Bucket structure */ struct rte_hash_bucket { - hash_sig_t sig_current[RTE_HASH_BUCKET_ENTRIES]; + uint16_t sig_current[RTE_HASH_BUCKET_ENTRIES]; uint32_t key_idx[RTE_HASH_BUCKET_ENTRIES]; - hash_sig_t sig_alt[RTE_HASH_BUCKET_ENTRIES]; - uint8_t flag[RTE_HASH_BUCKET_ENTRIES]; + + void *next; } __rte_cache_aligned; /** A hash table structure. */ @@ -164,10 +162,23 @@ struct rte_hash { /**< Length of hash key. */ uint8_t hw_trans_mem_support; /**< If hardware transactional memory is used. */ - uint8_t multi_writer_support; - /**< If multi-writer support is enabled. */ + uint8_t use_local_cache; + /**< If multi-writer support is enabled, use local cache + * to allocate key-store slots. + */ uint8_t readwrite_concur_support; /**< If read-write concurrency support is enabled */ + uint8_t ext_table_support; /**< Enable extendable bucket table */ + uint8_t no_free_on_del; + /**< If key index should be freed on calling rte_hash_del_xxx APIs. + * If this is set, rte_hash_free_key_with_position must be called to + * free the key index associated with the deleted entry. + * This flag is enabled by default. + */ + uint8_t readwrite_concur_lf_support; + /**< If read-write concurrency lock free support is enabled */ + uint8_t writer_takes_lock; + /**< Indicates if the writer threads need to take lock */ rte_hash_function hash_func; /**< Function used to calculate hash. */ uint32_t hash_func_init_val; /**< Init value used by hash_func. 
*/ rte_hash_cmp_eq_t rte_hash_custom_cmp_eq; @@ -186,10 +197,15 @@ struct rte_hash { * to the key table. */ rte_rwlock_t *readwrite_lock; /**< Read-write lock thread-safety. */ + struct rte_hash_bucket *buckets_ext; /**< Extra buckets array */ + struct rte_ring *free_ext_bkts; /**< Ring of indexes of free buckets */ + uint32_t *tbl_chng_cnt; + /**< Indicates if the hash table changed from last read. */ } __rte_cache_aligned; struct queue_node { struct rte_hash_bucket *bkt; /* Current bucket on the bfs search */ + uint32_t cur_bkt_idx; struct queue_node *prev; /* Parent(bucket) in search path */ int prev_slot; /* Parent(slot) in search path */ diff --git a/lib/librte_hash/rte_hash.h b/lib/librte_hash/rte_hash.h index 9e7d9315..c93d1a13 100644 --- a/lib/librte_hash/rte_hash.h +++ b/lib/librte_hash/rte_hash.h @@ -14,6 +14,8 @@ #include #include +#include + #ifdef __cplusplus extern "C" { #endif @@ -37,7 +39,27 @@ extern "C" { /** Flag to support reader writer concurrency */ #define RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY 0x04 -/** Signature of key that is stored internally. */ +/** Flag to indicate the extendabe bucket table feature should be used */ +#define RTE_HASH_EXTRA_FLAGS_EXT_TABLE 0x08 + +/** Flag to disable freeing of key index on hash delete. + * Refer to rte_hash_del_xxx APIs for more details. + * This is enabled by default when RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF + * is enabled. + */ +#define RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL 0x10 + +/** Flag to support lock free reader writer concurrency. Both single writer + * and multi writer use cases are supported. + * Currently, extendable bucket table feature is not supported with + * this feature. + */ +#define RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF 0x20 + +/** + * The type of hash value of a key. + * It should be a value of at least 32bit with fully random pattern. + */ typedef uint32_t hash_sig_t; /** Type of function that can be used for calculating the hash value. */ @@ -119,7 +141,12 @@ void rte_hash_free(struct rte_hash *h); /** - * Reset all hash structure, by zeroing all entries + * Reset all hash structure, by zeroing all entries. + * When RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF is enabled, + * it is application's responsibility to make sure that + * none of the readers are referencing the hash table + * while calling this API. + * * @param h * Hash table to reset */ @@ -143,6 +170,11 @@ rte_hash_count(const struct rte_hash *h); * and should only be called from one thread by default. * Thread safety can be enabled by setting flag during * table creation. + * If the key exists already in the table, this API updates its value + * with 'data' passed in this API. It is the responsibility of + * the application to manage any memory associated with the old value. + * The readers might still be using the old value even after this API + * has returned. * * @param h * Hash table to add the key to. @@ -165,6 +197,11 @@ rte_hash_add_key_data(const struct rte_hash *h, const void *key, void *data); * and should only be called from one thread by default. * Thread safety can be enabled by setting flag during * table creation. + * If the key exists already in the table, this API updates its value + * with 'data' passed in this API. It is the responsibility of + * the application to manage any memory associated with the old value. + * The readers might still be using the old value even after this API + * has returned. * * @param h * Hash table to add the key to. 
@@ -230,6 +267,14 @@ rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, hash_sig_t * and should only be called from one thread by default. * Thread safety can be enabled by setting flag during * table creation. + * If RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL or + * RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF is enabled, + * the key index returned by rte_hash_add_key_xxx APIs will not be + * freed by this API. rte_hash_free_key_with_position API must be called + * additionally to free the index associated with the key. + * rte_hash_free_key_with_position API should be called after all + * the readers have stopped referencing the entry corresponding to + * this key. RCU mechanisms could be used to determine such a state. * * @param h * Hash table to remove the key from. @@ -251,6 +296,14 @@ rte_hash_del_key(const struct rte_hash *h, const void *key); * and should only be called from one thread by default. * Thread safety can be enabled by setting flag during * table creation. + * If RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL or + * RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF is enabled, + * the key index returned by rte_hash_add_key_xxx APIs will not be + * freed by this API. rte_hash_free_key_with_position API must be called + * additionally to free the index associated with the key. + * rte_hash_free_key_with_position API should be called after all + * the readers have stopped referencing the entry corresponding to + * this key. RCU mechanisms could be used to determine such a state. * * @param h * Hash table to remove the key from. @@ -289,6 +342,34 @@ int rte_hash_get_key_with_position(const struct rte_hash *h, const int32_t position, void **key); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Free a hash key in the hash table given the position + * of the key. This operation is not multi-thread safe and should + * only be called from one thread by default. Thread safety + * can be enabled by setting flag during table creation. + * If RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL or + * RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF is enabled, + * the key index returned by rte_hash_del_key_xxx APIs must be freed + * using this API. This API should be called after all the readers + * have stopped referencing the entry corresponding to this key. + * RCU mechanisms could be used to determine such a state. + * This API does not validate if the key is already freed. + * + * @param h + * Hash table to free the key from. + * @param position + * Position returned when the key was deleted. + * @return + * - 0 if freed successfully + * - -EINVAL if the parameters are invalid. + */ +int __rte_experimental +rte_hash_free_key_with_position(const struct rte_hash *h, + const int32_t position); + /** * Find a key-value pair in the hash table. * This operation is multi-thread safe with regarding to other lookup threads. 
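The documentation added above splits deletion from key-store reclamation whenever RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL or RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF is set. A hedged sketch of the intended call sequence; toy_wait_for_readers() is a placeholder for whatever RCU-style quiescence mechanism the application provides:

#include <stdint.h>
#include <rte_hash.h>

/* Hypothetical helper: returns once no reader can still hold a reference
 * to entries deleted before the call (e.g. after an RCU grace period).
 */
extern void toy_wait_for_readers(void);

static void
toy_del_and_free(struct rte_hash *h, const void *key)
{
	int32_t pos = rte_hash_del_key(h, key);

	if (pos < 0)
		return;			/* key not found or bad parameters */

	/* The key-store slot is still reachable by in-flight lookups here. */
	toy_wait_for_readers();

	/* Only now is it safe to recycle the index. */
	rte_hash_free_key_with_position(h, pos);
}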
diff --git a/lib/librte_hash/rte_hash_version.map b/lib/librte_hash/rte_hash_version.map index e216ac8e..734ae28b 100644 --- a/lib/librte_hash/rte_hash_version.map +++ b/lib/librte_hash/rte_hash_version.map @@ -53,3 +53,10 @@ DPDK_18.08 { rte_hash_count; } DPDK_16.07; + +EXPERIMENTAL { + global: + + rte_hash_free_key_with_position; + +}; diff --git a/lib/librte_ip_frag/ip_frag_common.h b/lib/librte_ip_frag/ip_frag_common.h index 197acf8d..0f62e2e1 100644 --- a/lib/librte_ip_frag/ip_frag_common.h +++ b/lib/librte_ip_frag/ip_frag_common.h @@ -25,6 +25,12 @@ #define IPv6_KEY_BYTES_FMT \ "%08" PRIx64 "%08" PRIx64 "%08" PRIx64 "%08" PRIx64 +#ifdef RTE_LIBRTE_IP_FRAG_TBL_STAT +#define IP_FRAG_TBL_STAT_UPDATE(s, f, v) ((s)->f += (v)) +#else +#define IP_FRAG_TBL_STAT_UPDATE(s, f, v) do {} while (0) +#endif /* IP_FRAG_TBL_STAT */ + /* internal functions declarations */ struct rte_mbuf * ip_frag_process(struct ip_frag_pkt *fp, struct rte_ip_frag_death_row *dr, struct rte_mbuf *mb, @@ -69,10 +75,11 @@ ip_frag_key_invalidate(struct ip_frag_key * key) } /* compare two keys */ -static inline int +static inline uint64_t ip_frag_key_cmp(const struct ip_frag_key * k1, const struct ip_frag_key * k2) { - uint32_t i, val; + uint32_t i; + uint64_t val; val = k1->id ^ k2->id; for (i = 0; i < k1->key_len; i++) val |= k1->src_dst[i] ^ k2->src_dst[i]; @@ -149,4 +156,16 @@ ip_frag_reset(struct ip_frag_pkt *fp, uint64_t tms) fp->frags[IP_FIRST_FRAG_IDX] = zero_frag; } +/* local frag table helper functions */ +static inline void +ip_frag_tbl_del(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr, + struct ip_frag_pkt *fp) +{ + ip_frag_free(fp, dr); + ip_frag_key_invalidate(&fp->key); + TAILQ_REMOVE(&tbl->lru, fp, lru); + tbl->use_entries--; + IP_FRAG_TBL_STAT_UPDATE(&tbl->stat, del_num, 1); +} + #endif /* _IP_FRAG_COMMON_H_ */ diff --git a/lib/librte_ip_frag/ip_frag_internal.c b/lib/librte_ip_frag/ip_frag_internal.c index 2560c771..97470a87 100644 --- a/lib/librte_ip_frag/ip_frag_internal.c +++ b/lib/librte_ip_frag/ip_frag_internal.c @@ -14,24 +14,6 @@ #define IP_FRAG_TBL_POS(tbl, sig) \ ((tbl)->pkt + ((sig) & (tbl)->entry_mask)) -#ifdef RTE_LIBRTE_IP_FRAG_TBL_STAT -#define IP_FRAG_TBL_STAT_UPDATE(s, f, v) ((s)->f += (v)) -#else -#define IP_FRAG_TBL_STAT_UPDATE(s, f, v) do {} while (0) -#endif /* IP_FRAG_TBL_STAT */ - -/* local frag table helper functions */ -static inline void -ip_frag_tbl_del(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr, - struct ip_frag_pkt *fp) -{ - ip_frag_free(fp, dr); - ip_frag_key_invalidate(&fp->key); - TAILQ_REMOVE(&tbl->lru, fp, lru); - tbl->use_entries--; - IP_FRAG_TBL_STAT_UPDATE(&tbl->stat, del_num, 1); -} - static inline void ip_frag_tbl_add(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt *fp, const struct ip_frag_key *key, uint64_t tms) diff --git a/lib/librte_ip_frag/rte_ip_frag.h b/lib/librte_ip_frag/rte_ip_frag.h index b3f3f78d..7f425f61 100644 --- a/lib/librte_ip_frag/rte_ip_frag.h +++ b/lib/librte_ip_frag/rte_ip_frag.h @@ -65,10 +65,13 @@ struct ip_frag_pkt { #define IP_FRAG_DEATH_ROW_LEN 32 /**< death row size (in packets) */ +/* death row size in mbufs */ +#define IP_FRAG_DEATH_ROW_MBUF_LEN (IP_FRAG_DEATH_ROW_LEN * (IP_MAX_FRAG_NUM + 1)) + /** mbuf death row (packets to be freed) */ struct rte_ip_frag_death_row { uint32_t cnt; /**< number of mbufs currently on death row */ - struct rte_mbuf *row[IP_FRAG_DEATH_ROW_LEN * (IP_MAX_FRAG_NUM + 1)]; + struct rte_mbuf *row[IP_FRAG_DEATH_ROW_MBUF_LEN]; /**< mbufs to be freed */ }; @@ -325,6 +328,20 @@ void 
rte_ip_frag_free_death_row(struct rte_ip_frag_death_row *dr, void rte_ip_frag_table_statistics_dump(FILE * f, const struct rte_ip_frag_tbl *tbl); +/** + * Delete expired fragments + * + * @param tbl + * Table to delete expired fragments from + * @param dr + * Death row to free buffers to + * @param tms + * Current timestamp + */ +void __rte_experimental +rte_frag_table_del_expired_entries(struct rte_ip_frag_tbl *tbl, + struct rte_ip_frag_death_row *dr, uint64_t tms); + #ifdef __cplusplus } #endif diff --git a/lib/librte_ip_frag/rte_ip_frag_common.c b/lib/librte_ip_frag/rte_ip_frag_common.c index 659a1795..a23f6f24 100644 --- a/lib/librte_ip_frag/rte_ip_frag_common.c +++ b/lib/librte_ip_frag/rte_ip_frag_common.c @@ -121,3 +121,24 @@ rte_ip_frag_table_statistics_dump(FILE *f, const struct rte_ip_frag_tbl *tbl) fail_nospace, fail_total - fail_nospace); } + +/* Delete expired fragments */ +void __rte_experimental +rte_frag_table_del_expired_entries(struct rte_ip_frag_tbl *tbl, + struct rte_ip_frag_death_row *dr, uint64_t tms) +{ + uint64_t max_cycles; + struct ip_frag_pkt *fp; + + max_cycles = tbl->max_cycles; + + TAILQ_FOREACH(fp, &tbl->lru, lru) + if (max_cycles + fp->start < tms) { + /* check that death row has enough space */ + if (IP_FRAG_DEATH_ROW_MBUF_LEN - dr->cnt >= fp->last_idx) + ip_frag_tbl_del(tbl, dr, fp); + else + return; + } else + return; +} diff --git a/lib/librte_ip_frag/rte_ip_frag_version.map b/lib/librte_ip_frag/rte_ip_frag_version.map index d1acf07c..d40d5515 100644 --- a/lib/librte_ip_frag/rte_ip_frag_version.map +++ b/lib/librte_ip_frag/rte_ip_frag_version.map @@ -18,3 +18,9 @@ DPDK_17.08 { rte_ip_frag_table_destroy; } DPDK_2.0; + +EXPERIMENTAL { + global: + + rte_frag_table_del_expired_entries; +}; diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c index 65f6a2b0..c9726d4f 100644 --- a/lib/librte_kni/rte_kni.c +++ b/lib/librte_kni/rte_kni.c @@ -18,6 +18,9 @@ #include #include #include +#include +#include +#include #include #include "rte_kni_fifo.h" @@ -30,7 +33,23 @@ #define KNI_REQUEST_MBUF_NUM_MAX 32 -#define KNI_MEM_CHECK(cond) do { if (cond) goto kni_fail; } while (0) +#define KNI_MEM_CHECK(cond, fail) do { if (cond) goto fail; } while (0) + +#define KNI_MZ_NAME_FMT "kni_info_%s" +#define KNI_TX_Q_MZ_NAME_FMT "kni_tx_%s" +#define KNI_RX_Q_MZ_NAME_FMT "kni_rx_%s" +#define KNI_ALLOC_Q_MZ_NAME_FMT "kni_alloc_%s" +#define KNI_FREE_Q_MZ_NAME_FMT "kni_free_%s" +#define KNI_REQ_Q_MZ_NAME_FMT "kni_req_%s" +#define KNI_RESP_Q_MZ_NAME_FMT "kni_resp_%s" +#define KNI_SYNC_ADDR_MZ_NAME_FMT "kni_sync_%s" + +TAILQ_HEAD(rte_kni_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_kni_tailq = { + .name = "RTE_KNI", +}; +EAL_REGISTER_TAILQ(rte_kni_tailq) /** * KNI context @@ -42,18 +61,26 @@ struct rte_kni { struct rte_mempool *pktmbuf_pool; /**< pkt mbuf mempool */ unsigned mbuf_size; /**< mbuf size */ + const struct rte_memzone *m_tx_q; /**< TX queue memzone */ + const struct rte_memzone *m_rx_q; /**< RX queue memzone */ + const struct rte_memzone *m_alloc_q;/**< Alloc queue memzone */ + const struct rte_memzone *m_free_q; /**< Free queue memzone */ + struct rte_kni_fifo *tx_q; /**< TX queue */ struct rte_kni_fifo *rx_q; /**< RX queue */ struct rte_kni_fifo *alloc_q; /**< Allocated mbufs queue */ struct rte_kni_fifo *free_q; /**< To be freed mbufs queue */ + const struct rte_memzone *m_req_q; /**< Request queue memzone */ + const struct rte_memzone *m_resp_q; /**< Response queue memzone */ + const struct rte_memzone *m_sync_addr;/**< Sync addr memzone */ + /* 
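A sketch of how the fragment-table expiry helper added in the librte_ip_frag hunks above might be driven from a periodic slow path; the call site and the decision to free the death row immediately afterwards are illustrative, not mandated by the API:

#include <rte_cycles.h>
#include <rte_ip_frag.h>

static void
toy_expire_fragments(struct rte_ip_frag_tbl *tbl,
		     struct rte_ip_frag_death_row *dr)
{
	/* Move reassembly contexts older than tbl->max_cycles onto the death
	 * row, bounded by the space still available there.
	 */
	rte_frag_table_del_expired_entries(tbl, dr, rte_rdtsc());

	/* Free the mbufs that were parked on the death row. */
	rte_ip_frag_free_death_row(dr, 0);
}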
For request & response */ struct rte_kni_fifo *req_q; /**< Request queue */ struct rte_kni_fifo *resp_q; /**< Response queue */ void * sync_addr; /**< Req/Resp Mem address */ struct rte_kni_ops ops; /**< operations for request */ - uint8_t in_use : 1; /**< kni in use */ }; enum kni_ops_status { @@ -61,231 +88,111 @@ enum kni_ops_status { KNI_REQ_REGISTERED, }; -/** - * KNI memzone pool slot - */ -struct rte_kni_memzone_slot { - uint32_t id; - uint8_t in_use : 1; /**< slot in use */ - - /* Memzones */ - const struct rte_memzone *m_ctx; /**< KNI ctx */ - const struct rte_memzone *m_tx_q; /**< TX queue */ - const struct rte_memzone *m_rx_q; /**< RX queue */ - const struct rte_memzone *m_alloc_q; /**< Allocated mbufs queue */ - const struct rte_memzone *m_free_q; /**< To be freed mbufs queue */ - const struct rte_memzone *m_req_q; /**< Request queue */ - const struct rte_memzone *m_resp_q; /**< Response queue */ - const struct rte_memzone *m_sync_addr; - - /* Free linked list */ - struct rte_kni_memzone_slot *next; /**< Next slot link.list */ -}; - -/** - * KNI memzone pool - */ -struct rte_kni_memzone_pool { - uint8_t initialized : 1; /**< Global KNI pool init flag */ - - uint32_t max_ifaces; /**< Max. num of KNI ifaces */ - struct rte_kni_memzone_slot *slots; /**< Pool slots */ - rte_spinlock_t mutex; /**< alloc/release mutex */ - - /* Free memzone slots linked-list */ - struct rte_kni_memzone_slot *free; /**< First empty slot */ - struct rte_kni_memzone_slot *free_tail; /**< Last empty slot */ -}; - - static void kni_free_mbufs(struct rte_kni *kni); static void kni_allocate_mbufs(struct rte_kni *kni); static volatile int kni_fd = -1; -static struct rte_kni_memzone_pool kni_memzone_pool = { - .initialized = 0, -}; -static const struct rte_memzone * -kni_memzone_reserve(const char *name, size_t len, int socket_id, - unsigned flags) +/* Shall be called before any allocation happens */ +int +rte_kni_init(unsigned int max_kni_ifaces __rte_unused) { - const struct rte_memzone *mz = rte_memzone_lookup(name); - - if (mz == NULL) - mz = rte_memzone_reserve(name, len, socket_id, flags); + /* Check FD and open */ + if (kni_fd < 0) { + kni_fd = open("/dev/" KNI_DEVICE, O_RDWR); + if (kni_fd < 0) { + RTE_LOG(ERR, KNI, + "Can not open /dev/%s\n", KNI_DEVICE); + return -1; + } + } - return mz; + return 0; } -/* Pool mgmt */ -static struct rte_kni_memzone_slot* -kni_memzone_pool_alloc(void) +static struct rte_kni * +__rte_kni_get(const char *name) { - struct rte_kni_memzone_slot *slot; + struct rte_kni *kni; + struct rte_tailq_entry *te; + struct rte_kni_list *kni_list; - rte_spinlock_lock(&kni_memzone_pool.mutex); + kni_list = RTE_TAILQ_CAST(rte_kni_tailq.head, rte_kni_list); - if (!kni_memzone_pool.free) { - rte_spinlock_unlock(&kni_memzone_pool.mutex); - return NULL; + TAILQ_FOREACH(te, kni_list, next) { + kni = te->data; + if (strncmp(name, kni->name, RTE_KNI_NAMESIZE) == 0) + break; } - slot = kni_memzone_pool.free; - kni_memzone_pool.free = slot->next; - slot->in_use = 1; + if (te == NULL) + kni = NULL; - if (!kni_memzone_pool.free) - kni_memzone_pool.free_tail = NULL; - - rte_spinlock_unlock(&kni_memzone_pool.mutex); - - return slot; + return kni; } -static void -kni_memzone_pool_release(struct rte_kni_memzone_slot *slot) +static int +kni_reserve_mz(struct rte_kni *kni) { - rte_spinlock_lock(&kni_memzone_pool.mutex); + char mz_name[RTE_MEMZONE_NAMESIZE]; - if (kni_memzone_pool.free) - kni_memzone_pool.free_tail->next = slot; - else - kni_memzone_pool.free = slot; + snprintf(mz_name, 
RTE_MEMZONE_NAMESIZE, KNI_TX_Q_MZ_NAME_FMT, kni->name); + kni->m_tx_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); + KNI_MEM_CHECK(kni->m_tx_q == NULL, tx_q_fail); - kni_memzone_pool.free_tail = slot; - slot->next = NULL; - slot->in_use = 0; + snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_RX_Q_MZ_NAME_FMT, kni->name); + kni->m_rx_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); + KNI_MEM_CHECK(kni->m_rx_q == NULL, rx_q_fail); - rte_spinlock_unlock(&kni_memzone_pool.mutex); -} + snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_ALLOC_Q_MZ_NAME_FMT, kni->name); + kni->m_alloc_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); + KNI_MEM_CHECK(kni->m_alloc_q == NULL, alloc_q_fail); + snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_FREE_Q_MZ_NAME_FMT, kni->name); + kni->m_free_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); + KNI_MEM_CHECK(kni->m_free_q == NULL, free_q_fail); -/* Shall be called before any allocation happens */ -void -rte_kni_init(unsigned int max_kni_ifaces) -{ - uint32_t i; - struct rte_kni_memzone_slot *it; - const struct rte_memzone *mz; -#define OBJNAMSIZ 32 - char obj_name[OBJNAMSIZ]; - char mz_name[RTE_MEMZONE_NAMESIZE]; - - /* Immediately return if KNI is already initialized */ - if (kni_memzone_pool.initialized) { - RTE_LOG(WARNING, KNI, "Double call to rte_kni_init()"); - return; - } + snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_REQ_Q_MZ_NAME_FMT, kni->name); + kni->m_req_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); + KNI_MEM_CHECK(kni->m_req_q == NULL, req_q_fail); - if (max_kni_ifaces == 0) { - RTE_LOG(ERR, KNI, "Invalid number of max_kni_ifaces %d\n", - max_kni_ifaces); - RTE_LOG(ERR, KNI, "Unable to initialize KNI\n"); - return; - } + snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_RESP_Q_MZ_NAME_FMT, kni->name); + kni->m_resp_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); + KNI_MEM_CHECK(kni->m_resp_q == NULL, resp_q_fail); - /* Check FD and open */ - if (kni_fd < 0) { - kni_fd = open("/dev/" KNI_DEVICE, O_RDWR); - if (kni_fd < 0) { - RTE_LOG(ERR, KNI, - "Can not open /dev/%s\n", KNI_DEVICE); - return; - } - } + snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_SYNC_ADDR_MZ_NAME_FMT, kni->name); + kni->m_sync_addr = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); + KNI_MEM_CHECK(kni->m_sync_addr == NULL, sync_addr_fail); - /* Allocate slot objects */ - kni_memzone_pool.slots = (struct rte_kni_memzone_slot *) - rte_malloc(NULL, - sizeof(struct rte_kni_memzone_slot) * - max_kni_ifaces, - 0); - KNI_MEM_CHECK(kni_memzone_pool.slots == NULL); - - /* Initialize general pool variables */ - kni_memzone_pool.initialized = 1; - kni_memzone_pool.max_ifaces = max_kni_ifaces; - kni_memzone_pool.free = &kni_memzone_pool.slots[0]; - rte_spinlock_init(&kni_memzone_pool.mutex); - - /* Pre-allocate all memzones of all the slots; panic on error */ - for (i = 0; i < max_kni_ifaces; i++) { - - /* Recover current slot */ - it = &kni_memzone_pool.slots[i]; - it->id = i; - - /* Allocate KNI context */ - snprintf(mz_name, RTE_MEMZONE_NAMESIZE, "KNI_INFO_%d", i); - mz = kni_memzone_reserve(mz_name, sizeof(struct rte_kni), - SOCKET_ID_ANY, 0); - KNI_MEM_CHECK(mz == NULL); - it->m_ctx = mz; - - /* TX RING */ - snprintf(obj_name, OBJNAMSIZ, "kni_tx_%d", i); - mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, - SOCKET_ID_ANY, 0); - KNI_MEM_CHECK(mz == NULL); - it->m_tx_q = mz; - - /* RX RING */ - snprintf(obj_name, OBJNAMSIZ, "kni_rx_%d", i); - mz = kni_memzone_reserve(obj_name, 
KNI_FIFO_SIZE, - SOCKET_ID_ANY, 0); - KNI_MEM_CHECK(mz == NULL); - it->m_rx_q = mz; - - /* ALLOC RING */ - snprintf(obj_name, OBJNAMSIZ, "kni_alloc_%d", i); - mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, - SOCKET_ID_ANY, 0); - KNI_MEM_CHECK(mz == NULL); - it->m_alloc_q = mz; - - /* FREE RING */ - snprintf(obj_name, OBJNAMSIZ, "kni_free_%d", i); - mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, - SOCKET_ID_ANY, 0); - KNI_MEM_CHECK(mz == NULL); - it->m_free_q = mz; - - /* Request RING */ - snprintf(obj_name, OBJNAMSIZ, "kni_req_%d", i); - mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, - SOCKET_ID_ANY, 0); - KNI_MEM_CHECK(mz == NULL); - it->m_req_q = mz; - - /* Response RING */ - snprintf(obj_name, OBJNAMSIZ, "kni_resp_%d", i); - mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, - SOCKET_ID_ANY, 0); - KNI_MEM_CHECK(mz == NULL); - it->m_resp_q = mz; - - /* Req/Resp sync mem area */ - snprintf(obj_name, OBJNAMSIZ, "kni_sync_%d", i); - mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, - SOCKET_ID_ANY, 0); - KNI_MEM_CHECK(mz == NULL); - it->m_sync_addr = mz; - - if ((i+1) == max_kni_ifaces) { - it->next = NULL; - kni_memzone_pool.free_tail = it; - } else - it->next = &kni_memzone_pool.slots[i+1]; - } - - return; + return 0; -kni_fail: - RTE_LOG(ERR, KNI, "Unable to allocate memory for max_kni_ifaces:%d." - "Increase the amount of hugepages memory\n", max_kni_ifaces); +sync_addr_fail: + rte_memzone_free(kni->m_resp_q); +resp_q_fail: + rte_memzone_free(kni->m_req_q); +req_q_fail: + rte_memzone_free(kni->m_free_q); +free_q_fail: + rte_memzone_free(kni->m_alloc_q); +alloc_q_fail: + rte_memzone_free(kni->m_rx_q); +rx_q_fail: + rte_memzone_free(kni->m_tx_q); +tx_q_fail: + return -1; } +static void +kni_release_mz(struct rte_kni *kni) +{ + rte_memzone_free(kni->m_tx_q); + rte_memzone_free(kni->m_rx_q); + rte_memzone_free(kni->m_alloc_q); + rte_memzone_free(kni->m_free_q); + rte_memzone_free(kni->m_req_q); + rte_memzone_free(kni->m_resp_q); + rte_memzone_free(kni->m_sync_addr); +} struct rte_kni * rte_kni_alloc(struct rte_mempool *pktmbuf_pool, @@ -294,41 +201,45 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool, { int ret; struct rte_kni_device_info dev_info; - struct rte_kni *ctx; - char intf_name[RTE_KNI_NAMESIZE]; - const struct rte_memzone *mz; - struct rte_kni_memzone_slot *slot = NULL; + struct rte_kni *kni; + struct rte_tailq_entry *te; + struct rte_kni_list *kni_list; if (!pktmbuf_pool || !conf || !conf->name[0]) return NULL; /* Check if KNI subsystem has been initialized */ - if (kni_memzone_pool.initialized != 1) { + if (kni_fd < 0) { RTE_LOG(ERR, KNI, "KNI subsystem has not been initialized. 
Invoke rte_kni_init() first\n"); return NULL; } - /* Get an available slot from the pool */ - slot = kni_memzone_pool_alloc(); - if (!slot) { - RTE_LOG(ERR, KNI, "Cannot allocate more KNI interfaces; increase the number of max_kni_ifaces(current %d) or release unused ones.\n", - kni_memzone_pool.max_ifaces); - return NULL; + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + kni = __rte_kni_get(conf->name); + if (kni != NULL) { + RTE_LOG(ERR, KNI, "KNI already exists\n"); + goto unlock; } - /* Recover ctx */ - ctx = slot->m_ctx->addr; - snprintf(intf_name, RTE_KNI_NAMESIZE, "%s", conf->name); + te = rte_zmalloc("KNI_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, KNI, "Failed to allocate tailq entry\n"); + goto unlock; + } - if (ctx->in_use) { - RTE_LOG(ERR, KNI, "KNI %s is in use\n", ctx->name); - return NULL; + kni = rte_zmalloc("KNI", sizeof(struct rte_kni), RTE_CACHE_LINE_SIZE); + if (kni == NULL) { + RTE_LOG(ERR, KNI, "KNI memory allocation failed\n"); + goto kni_fail; } - memset(ctx, 0, sizeof(struct rte_kni)); + + snprintf(kni->name, RTE_KNI_NAMESIZE, "%s", conf->name); + if (ops) - memcpy(&ctx->ops, ops, sizeof(struct rte_kni_ops)); + memcpy(&kni->ops, ops, sizeof(struct rte_kni_ops)); else - ctx->ops.port_id = UINT16_MAX; + kni->ops.port_id = UINT16_MAX; memset(&dev_info, 0, sizeof(dev_info)); dev_info.bus = conf->addr.bus; @@ -344,72 +255,79 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool, memcpy(dev_info.mac_addr, conf->mac_addr, ETHER_ADDR_LEN); - snprintf(ctx->name, RTE_KNI_NAMESIZE, "%s", intf_name); - snprintf(dev_info.name, RTE_KNI_NAMESIZE, "%s", intf_name); + snprintf(dev_info.name, RTE_KNI_NAMESIZE, "%s", conf->name); RTE_LOG(INFO, KNI, "pci: %02x:%02x:%02x \t %02x:%02x\n", dev_info.bus, dev_info.devid, dev_info.function, dev_info.vendor_id, dev_info.device_id); + + ret = kni_reserve_mz(kni); + if (ret < 0) + goto mz_fail; + /* TX RING */ - mz = slot->m_tx_q; - ctx->tx_q = mz->addr; - kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX); - dev_info.tx_phys = mz->phys_addr; + kni->tx_q = kni->m_tx_q->addr; + kni_fifo_init(kni->tx_q, KNI_FIFO_COUNT_MAX); + dev_info.tx_phys = kni->m_tx_q->phys_addr; /* RX RING */ - mz = slot->m_rx_q; - ctx->rx_q = mz->addr; - kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX); - dev_info.rx_phys = mz->phys_addr; + kni->rx_q = kni->m_rx_q->addr; + kni_fifo_init(kni->rx_q, KNI_FIFO_COUNT_MAX); + dev_info.rx_phys = kni->m_rx_q->phys_addr; /* ALLOC RING */ - mz = slot->m_alloc_q; - ctx->alloc_q = mz->addr; - kni_fifo_init(ctx->alloc_q, KNI_FIFO_COUNT_MAX); - dev_info.alloc_phys = mz->phys_addr; + kni->alloc_q = kni->m_alloc_q->addr; + kni_fifo_init(kni->alloc_q, KNI_FIFO_COUNT_MAX); + dev_info.alloc_phys = kni->m_alloc_q->phys_addr; /* FREE RING */ - mz = slot->m_free_q; - ctx->free_q = mz->addr; - kni_fifo_init(ctx->free_q, KNI_FIFO_COUNT_MAX); - dev_info.free_phys = mz->phys_addr; + kni->free_q = kni->m_free_q->addr; + kni_fifo_init(kni->free_q, KNI_FIFO_COUNT_MAX); + dev_info.free_phys = kni->m_free_q->phys_addr; /* Request RING */ - mz = slot->m_req_q; - ctx->req_q = mz->addr; - kni_fifo_init(ctx->req_q, KNI_FIFO_COUNT_MAX); - dev_info.req_phys = mz->phys_addr; + kni->req_q = kni->m_req_q->addr; + kni_fifo_init(kni->req_q, KNI_FIFO_COUNT_MAX); + dev_info.req_phys = kni->m_req_q->phys_addr; /* Response RING */ - mz = slot->m_resp_q; - ctx->resp_q = mz->addr; - kni_fifo_init(ctx->resp_q, KNI_FIFO_COUNT_MAX); - dev_info.resp_phys = mz->phys_addr; + kni->resp_q = kni->m_resp_q->addr; + kni_fifo_init(kni->resp_q, KNI_FIFO_COUNT_MAX); + 
dev_info.resp_phys = kni->m_resp_q->phys_addr; /* Req/Resp sync mem area */ - mz = slot->m_sync_addr; - ctx->sync_addr = mz->addr; - dev_info.sync_va = mz->addr; - dev_info.sync_phys = mz->phys_addr; + kni->sync_addr = kni->m_sync_addr->addr; + dev_info.sync_va = kni->m_sync_addr->addr; + dev_info.sync_phys = kni->m_sync_addr->phys_addr; - ctx->pktmbuf_pool = pktmbuf_pool; - ctx->group_id = conf->group_id; - ctx->slot_id = slot->id; - ctx->mbuf_size = conf->mbuf_size; + kni->pktmbuf_pool = pktmbuf_pool; + kni->group_id = conf->group_id; + kni->mbuf_size = conf->mbuf_size; ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info); - KNI_MEM_CHECK(ret < 0); + if (ret < 0) + goto ioctl_fail; + + te->data = kni; + + kni_list = RTE_TAILQ_CAST(rte_kni_tailq.head, rte_kni_list); + TAILQ_INSERT_TAIL(kni_list, te, next); - ctx->in_use = 1; + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); /* Allocate mbufs and then put them into alloc_q */ - kni_allocate_mbufs(ctx); + kni_allocate_mbufs(kni); - return ctx; + return kni; +ioctl_fail: + kni_release_mz(kni); +mz_fail: + rte_free(kni); kni_fail: - if (slot) - kni_memzone_pool_release(&kni_memzone_pool.slots[slot->id]); + rte_free(te); +unlock: + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); return NULL; } @@ -462,19 +380,36 @@ kni_free_fifo_phy(struct rte_mempool *mp, struct rte_kni_fifo *fifo) int rte_kni_release(struct rte_kni *kni) { + struct rte_tailq_entry *te; + struct rte_kni_list *kni_list; struct rte_kni_device_info dev_info; - uint32_t slot_id; uint32_t retry = 5; - if (!kni || !kni->in_use) + if (!kni) return -1; + kni_list = RTE_TAILQ_CAST(rte_kni_tailq.head, rte_kni_list); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + TAILQ_FOREACH(te, kni_list, next) { + if (te->data == kni) + break; + } + + if (te == NULL) + goto unlock; + snprintf(dev_info.name, sizeof(dev_info.name), "%s", kni->name); if (ioctl(kni_fd, RTE_KNI_IOCTL_RELEASE, &dev_info) < 0) { RTE_LOG(ERR, KNI, "Fail to release kni device\n"); - return -1; + goto unlock; } + TAILQ_REMOVE(kni_list, te, next); + + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + /* mbufs in all fifo should be released, except request/response */ /* wait until all rxq packets processed by kernel */ @@ -488,20 +423,18 @@ rte_kni_release(struct rte_kni *kni) kni_free_fifo(kni->tx_q); kni_free_fifo(kni->free_q); - slot_id = kni->slot_id; + kni_release_mz(kni); - /* Memset the KNI struct */ - memset(kni, 0, sizeof(struct rte_kni)); + rte_free(kni); - /* Release memzone */ - if (slot_id > kni_memzone_pool.max_ifaces) { - RTE_LOG(ERR, KNI, "KNI pool: corrupted slot ID: %d, max: %d\n", - slot_id, kni_memzone_pool.max_ifaces); - return -1; - } - kni_memzone_pool_release(&kni_memzone_pool.slots[slot_id]); + rte_free(te); return 0; + +unlock: + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + return -1; } /* default callback for request of configuring device mac address */ @@ -711,24 +644,18 @@ kni_allocate_mbufs(struct rte_kni *kni) struct rte_kni * rte_kni_get(const char *name) { - uint32_t i; - struct rte_kni_memzone_slot *it; struct rte_kni *kni; if (name == NULL || name[0] == '\0') return NULL; - /* Note: could be improved perf-wise if necessary */ - for (i = 0; i < kni_memzone_pool.max_ifaces; i++) { - it = &kni_memzone_pool.slots[i]; - if (it->in_use == 0) - continue; - kni = it->m_ctx->addr; - if (strncmp(kni->name, name, RTE_KNI_NAMESIZE) == 0) - return kni; - } + rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK); - return NULL; + kni = __rte_kni_get(name); + + rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK); + + 
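With the rework above, rte_kni_init() only opens /dev/kni and reports failure through its return value (its max_kni_ifaces argument is now unused), and interfaces are tracked on an EAL tailq instead of a fixed memzone pool, so rte_kni_alloc() can reject duplicate names. A usage sketch; the interface name and mbuf size are illustrative:

#include <stdio.h>
#include <string.h>
#include <rte_kni.h>
#include <rte_mempool.h>

static struct rte_kni *
toy_kni_create(struct rte_mempool *pktmbuf_pool)
{
	struct rte_kni_conf conf;

	if (rte_kni_init(1) < 0)	/* now returns an error code */
		return NULL;

	memset(&conf, 0, sizeof(conf));
	snprintf(conf.name, sizeof(conf.name), "%s", "vEth0");
	conf.mbuf_size = 2048;

	/* Returns NULL if an interface with this name already exists. */
	return rte_kni_alloc(pktmbuf_pool, &conf, NULL);
}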
return kni; } const char * @@ -790,6 +717,47 @@ rte_kni_unregister_handlers(struct rte_kni *kni) return 0; } + +int __rte_experimental +rte_kni_update_link(struct rte_kni *kni, unsigned int linkup) +{ + char path[64]; + char old_carrier[2]; + const char *new_carrier; + int old_linkup; + int fd, ret; + + if (kni == NULL) + return -1; + + snprintf(path, sizeof(path), "/sys/devices/virtual/net/%s/carrier", + kni->name); + + fd = open(path, O_RDWR); + if (fd == -1) { + RTE_LOG(ERR, KNI, "Failed to open file: %s.\n", path); + return -1; + } + + ret = read(fd, old_carrier, 2); + if (ret < 1) { + close(fd); + return -1; + } + old_linkup = (old_carrier[0] == '1'); + + new_carrier = linkup ? "1" : "0"; + ret = write(fd, new_carrier, 1); + if (ret < 1) { + RTE_LOG(ERR, KNI, "Failed to write file: %s.\n", path); + close(fd); + return -1; + } + + close(fd); + return old_linkup; +} + void rte_kni_close(void) { diff --git a/lib/librte_kni/rte_kni.h b/lib/librte_kni/rte_kni.h index 99055e2c..02ca43b4 100644 --- a/lib/librte_kni/rte_kni.h +++ b/lib/librte_kni/rte_kni.h @@ -81,8 +81,12 @@ struct rte_kni_conf { * * @param max_kni_ifaces * The maximum number of KNI interfaces that can coexist concurrently + * + * @return + * - 0 indicates success. + * - negative value indicates failure. */ -void rte_kni_init(unsigned int max_kni_ifaces); +int rte_kni_init(unsigned int max_kni_ifaces); /** @@ -228,6 +232,26 @@ int rte_kni_register_handlers(struct rte_kni *kni, struct rte_kni_ops *ops); */ int rte_kni_unregister_handlers(struct rte_kni *kni); +/** + * Update link carrier state for KNI port. + * + * Update the linkup/linkdown state of a KNI interface in the kernel. + * + * @param kni + * pointer to struct rte_kni. + * @param linkup + * New link state: + * 0 for linkdown. + * > 0 for linkup. + * + * @return + * On failure: -1 + * Previous link state == linkdown: 0 + * Previous link state == linkup: 1 + */ +int __rte_experimental +rte_kni_update_link(struct rte_kni *kni, unsigned int linkup); + /** * Close KNI device. */ diff --git a/lib/librte_kni/rte_kni_fifo.h b/lib/librte_kni/rte_kni_fifo.h index ac26a8c0..287d7deb 100644 --- a/lib/librte_kni/rte_kni_fifo.h +++ b/lib/librte_kni/rte_kni_fifo.h @@ -4,6 +4,36 @@ +/** + * @internal when c11 memory model enabled use c11 atomic memory barrier. + * when under non c11 memory model use rte_smp_* memory barrier. + * + * @param src + * Pointer to the source data. + * @param dst + * Pointer to the destination data. + * @param value + * Data value. 
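A sketch of the intended use of rte_kni_update_link() added above: mirror a physical port's link status into the kernel interface. The pairing of port and KNI handle is the application's choice and purely illustrative here:

#include <string.h>
#include <rte_ethdev.h>
#include <rte_kni.h>
#include <rte_log.h>

static void
toy_sync_link(uint16_t port_id, struct rte_kni *kni)
{
	struct rte_eth_link link;

	memset(&link, 0, sizeof(link));
	rte_eth_link_get_nowait(port_id, &link);

	/* Returns the previous carrier state (0/1) or -1 on error. */
	if (rte_kni_update_link(kni, link.link_status) < 0)
		RTE_LOG(ERR, KNI, "cannot update carrier for %s\n",
			rte_kni_get_name(kni));
}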
+ */ +#ifdef RTE_USE_C11_MEM_MODEL +#define __KNI_LOAD_ACQUIRE(src) ({ \ + __atomic_load_n((src), __ATOMIC_ACQUIRE); \ + }) +#define __KNI_STORE_RELEASE(dst, value) do { \ + __atomic_store_n((dst), value, __ATOMIC_RELEASE); \ + } while(0) +#else +#define __KNI_LOAD_ACQUIRE(src) ({ \ + typeof (*(src)) val = *(src); \ + rte_smp_rmb(); \ + val; \ + }) +#define __KNI_STORE_RELEASE(dst, value) do { \ + *(dst) = value; \ + rte_smp_wmb(); \ + } while(0) +#endif + /** * Initializes the kni fifo structure */ @@ -28,8 +58,8 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num) { unsigned i = 0; unsigned fifo_write = fifo->write; - unsigned fifo_read = fifo->read; unsigned new_write = fifo_write; + unsigned fifo_read = __KNI_LOAD_ACQUIRE(&fifo->read); for (i = 0; i < num; i++) { new_write = (new_write + 1) & (fifo->len - 1); @@ -39,7 +69,7 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num) fifo->buffer[fifo_write] = data[i]; fifo_write = new_write; } - fifo->write = fifo_write; + __KNI_STORE_RELEASE(&fifo->write, fifo_write); return i; } @@ -51,7 +81,8 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num) { unsigned i = 0; unsigned new_read = fifo->read; - unsigned fifo_write = fifo->write; + unsigned fifo_write = __KNI_LOAD_ACQUIRE(&fifo->write); + for (i = 0; i < num; i++) { if (new_read == fifo_write) break; @@ -59,7 +90,7 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num) data[i] = fifo->buffer[new_read]; new_read = (new_read + 1) & (fifo->len - 1); } - fifo->read = new_read; + __KNI_STORE_RELEASE(&fifo->read, new_read); return i; } @@ -69,5 +100,7 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num) static inline uint32_t kni_fifo_count(struct rte_kni_fifo *fifo) { - return (fifo->len + fifo->write - fifo->read) & (fifo->len - 1); + unsigned fifo_write = __KNI_LOAD_ACQUIRE(&fifo->write); + unsigned fifo_read = __KNI_LOAD_ACQUIRE(&fifo->read); + return (fifo->len + fifo_write - fifo_read) & (fifo->len - 1); } diff --git a/lib/librte_kni/rte_kni_version.map b/lib/librte_kni/rte_kni_version.map index acd515eb..c877dc6a 100644 --- a/lib/librte_kni/rte_kni_version.map +++ b/lib/librte_kni/rte_kni_version.map @@ -15,3 +15,9 @@ DPDK_2.0 { local: *; }; + +EXPERIMENTAL { + global: + + rte_kni_update_link; +}; diff --git a/lib/librte_kvargs/rte_kvargs.c b/lib/librte_kvargs/rte_kvargs.c index a28f7694..f7030c63 100644 --- a/lib/librte_kvargs/rte_kvargs.c +++ b/lib/librte_kvargs/rte_kvargs.c @@ -44,6 +44,20 @@ rte_kvargs_tokenize(struct rte_kvargs *kvlist, const char *params) kvlist->pairs[i].value == NULL) return -1; + /* Detect list [a,b] to skip comma delimiter in list. */ + str = kvlist->pairs[i].value; + if (str[0] == '[') { + /* Find the end of the list. */ + while (str[strlen(str) - 1] != ']') { + /* Restore the comma erased by strtok_r(). */ + str[strlen(str)] = ','; + /* Parse until next comma. 
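/*
 * Illustrative sketch, not part of the patch: the single-producer /
 * single-consumer ordering contract behind the macros above. The producer
 * reads the peer's index with acquire semantics, fills the slot, and only
 * then releases the new write index, so the consumer can never observe a
 * write index that runs ahead of the published data. The helper name is
 * hypothetical; it mirrors kni_fifo_put() for a single element.
 */
static inline int
kni_fifo_put_one(struct rte_kni_fifo *fifo, void *obj)
{
	unsigned int fifo_write = fifo->write;	/* only the producer updates this */
	unsigned int fifo_read = __KNI_LOAD_ACQUIRE(&fifo->read);
	unsigned int new_write = (fifo_write + 1) & (fifo->len - 1);

	if (new_write == fifo_read)
		return 0;				/* ring is full */
	fifo->buffer[fifo_write] = obj;			/* 1: fill the slot */
	__KNI_STORE_RELEASE(&fifo->write, new_write);	/* 2: publish the index */
	return 1;
}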
*/ + str = strtok_r(NULL, RTE_KVARGS_PAIRS_DELIM, &ctx1); + if (str == NULL) + return -1; /* no closing bracket */ + } + } + kvlist->count++; str = NULL; } @@ -120,6 +134,9 @@ rte_kvargs_process(const struct rte_kvargs *kvlist, const struct rte_kvargs_pair *pair; unsigned i; + if (kvlist == NULL) + return 0; + for (i = 0; i < kvlist->count; i++) { pair = &kvlist->pairs[i]; if (key_match == NULL || strcmp(pair->key, key_match) == 0) { diff --git a/lib/librte_kvargs/rte_kvargs.h b/lib/librte_kvargs/rte_kvargs.h index fc041956..1946195d 100644 --- a/lib/librte_kvargs/rte_kvargs.h +++ b/lib/librte_kvargs/rte_kvargs.h @@ -110,7 +110,7 @@ struct rte_kvargs *rte_kvargs_parse_delim(const char *args, * rte_kvargs_parse(). * * @param kvlist - * The rte_kvargs structure + * The rte_kvargs structure. No error if NULL. */ void rte_kvargs_free(struct rte_kvargs *kvlist); @@ -119,11 +119,10 @@ void rte_kvargs_free(struct rte_kvargs *kvlist); * * For each key/value association that matches the given key, calls the * handler function with the for a given arg_name passing the value on the - * dictionary for that key and a given extra argument. If *kvlist* is NULL - * function does nothing. + * dictionary for that key and a given extra argument. * * @param kvlist - * The rte_kvargs structure + * The rte_kvargs structure. No error if NULL. * @param key_match * The key on which the handler should be called, or NULL to process handler * on all associations diff --git a/lib/librte_latencystats/rte_latencystats.c b/lib/librte_latencystats/rte_latencystats.c index 1fdec68e..5715549e 100644 --- a/lib/librte_latencystats/rte_latencystats.c +++ b/lib/librte_latencystats/rte_latencystats.c @@ -125,8 +125,11 @@ add_time_stamps(uint16_t pid __rte_unused, for (i = 0; i < nb_pkts; i++) { diff_tsc = now - prev_tsc; timer_tsc += diff_tsc; - if (timer_tsc >= samp_intvl) { + + if ((pkts[i]->ol_flags & PKT_RX_TIMESTAMP) == 0 + && (timer_tsc >= samp_intvl)) { pkts[i]->timestamp = now; + pkts[i]->ol_flags |= PKT_RX_TIMESTAMP; timer_tsc = 0; } prev_tsc = now; @@ -156,7 +159,7 @@ calc_latency(uint16_t pid __rte_unused, now = rte_rdtsc(); for (i = 0; i < nb_pkts; i++) { - if (pkts[i]->timestamp) + if (pkts[i]->ol_flags & PKT_RX_TIMESTAMP) latency[cnt++] = now - pkts[i]->timestamp; } diff --git a/lib/librte_lpm/Makefile b/lib/librte_lpm/Makefile index 482bd72e..a7946a1c 100644 --- a/lib/librte_lpm/Makefile +++ b/lib/librte_lpm/Makefile @@ -8,7 +8,7 @@ LIB = librte_lpm.a CFLAGS += -O3 CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -LDLIBS += -lrte_eal +LDLIBS += -lrte_eal -lrte_hash EXPORT_MAP := rte_lpm_version.map diff --git a/lib/librte_lpm/meson.build b/lib/librte_lpm/meson.build index 06784942..a5176d8a 100644 --- a/lib/librte_lpm/meson.build +++ b/lib/librte_lpm/meson.build @@ -7,3 +7,4 @@ headers = files('rte_lpm.h', 'rte_lpm6.h') # since header files have different names, we can install all vector headers # without worrying about which architecture we actually need headers += files('rte_lpm_altivec.h', 'rte_lpm_neon.h', 'rte_lpm_sse.h') +deps += ['hash'] diff --git a/lib/librte_lpm/rte_lpm6.c b/lib/librte_lpm/rte_lpm6.c index 149677eb..6212003f 100644 --- a/lib/librte_lpm/rte_lpm6.c +++ b/lib/librte_lpm/rte_lpm6.c @@ -21,6 +21,9 @@ #include #include #include +#include +#include +#include #include "rte_lpm6.h" @@ -37,6 +40,9 @@ #define BYTE_SIZE 8 #define BYTES2_SIZE 16 +#define RULE_HASH_TABLE_EXTRA_SPACE 64 +#define TBL24_IND UINT32_MAX + #define lpm6_tbl8_gindex next_hop /** Flags for setting an entry as valid/invalid. 
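/*
 * Illustrative sketch, not part of the patch: with the tokenizer change
 * above, a bracketed value such as "[0,1]" is kept as one token, and
 * rte_kvargs_process()/rte_kvargs_free() now tolerate a NULL kvlist. The
 * key names, the devargs string and the handler below are hypothetical.
 */
#include <stdio.h>
#include <rte_kvargs.h>

static int
dump_kv(const char *key, const char *value, void *opaque)
{
	(void)opaque;
	printf("%s = %s\n", key, value);	/* e.g. "rxq = [0,1]" */
	return 0;
}

static void
kvargs_example(void)
{
	static const char * const keys[] = { "rxq", "txq", NULL };
	struct rte_kvargs *kv = rte_kvargs_parse("rxq=[0,1],txq=2", keys);

	/* Both calls are harmless no-ops if parsing failed and kv is NULL. */
	rte_kvargs_process(kv, "rxq", dump_kv, NULL);
	rte_kvargs_free(kv);
}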
*/ @@ -70,6 +76,23 @@ struct rte_lpm6_rule { uint8_t depth; /**< Rule depth. */ }; +/** Rules tbl entry key. */ +struct rte_lpm6_rule_key { + uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE]; /**< Rule IP address. */ + uint8_t depth; /**< Rule depth. */ +}; + +/* Header of tbl8 */ +struct rte_lpm_tbl8_hdr { + uint32_t owner_tbl_ind; /**< owner table: TBL24_IND if owner is tbl24, + * otherwise index of tbl8 + */ + uint32_t owner_entry_ind; /**< index of the owner table entry where + * pointer to the tbl8 is stored + */ + uint32_t ref_cnt; /**< table reference counter */ +}; + /** LPM6 structure. */ struct rte_lpm6 { /* LPM metadata. */ @@ -77,12 +100,17 @@ struct rte_lpm6 { uint32_t max_rules; /**< Max number of rules. */ uint32_t used_rules; /**< Used rules so far. */ uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ - uint32_t next_tbl8; /**< Next tbl8 to be used. */ /* LPM Tables. */ - struct rte_lpm6_rule *rules_tbl; /**< LPM rules. */ + struct rte_hash *rules_tbl; /**< LPM rules. */ struct rte_lpm6_tbl_entry tbl24[RTE_LPM6_TBL24_NUM_ENTRIES] __rte_cache_aligned; /**< LPM tbl24 table. */ + + uint32_t *tbl8_pool; /**< pool of indexes of free tbl8s */ + uint32_t tbl8_pool_pos; /**< current position in the tbl8 pool */ + + struct rte_lpm_tbl8_hdr *tbl8_hdrs; /* array of tbl8 headers */ + struct rte_lpm6_tbl_entry tbl8[0] __rte_cache_aligned; /**< LPM tbl8 table. */ }; @@ -93,22 +121,122 @@ struct rte_lpm6 { * and set the rest to 0. */ static inline void -mask_ip(uint8_t *ip, uint8_t depth) +ip6_mask_addr(uint8_t *ip, uint8_t depth) { - int16_t part_depth, mask; - int i; + int16_t part_depth, mask; + int i; - part_depth = depth; + part_depth = depth; - for (i = 0; i < RTE_LPM6_IPV6_ADDR_SIZE; i++) { - if (part_depth < BYTE_SIZE && part_depth >= 0) { - mask = (uint16_t)(~(UINT8_MAX >> part_depth)); - ip[i] = (uint8_t)(ip[i] & mask); - } else if (part_depth < 0) { - ip[i] = 0; - } - part_depth -= BYTE_SIZE; - } + for (i = 0; i < RTE_LPM6_IPV6_ADDR_SIZE; i++) { + if (part_depth < BYTE_SIZE && part_depth >= 0) { + mask = (uint16_t)(~(UINT8_MAX >> part_depth)); + ip[i] = (uint8_t)(ip[i] & mask); + } else if (part_depth < 0) + ip[i] = 0; + + part_depth -= BYTE_SIZE; + } +} + +/* copy ipv6 address */ +static inline void +ip6_copy_addr(uint8_t *dst, const uint8_t *src) +{ + rte_memcpy(dst, src, RTE_LPM6_IPV6_ADDR_SIZE); +} + +/* + * LPM6 rule hash function + * + * It's used as a hash function for the rte_hash + * containing rules + */ +static inline uint32_t +rule_hash(const void *data, __rte_unused uint32_t data_len, + uint32_t init_val) +{ + return rte_jhash(data, sizeof(struct rte_lpm6_rule_key), init_val); +} + +/* + * Init pool of free tbl8 indexes + */ +static void +tbl8_pool_init(struct rte_lpm6 *lpm) +{ + uint32_t i; + + /* put entire range of indexes to the tbl8 pool */ + for (i = 0; i < lpm->number_tbl8s; i++) + lpm->tbl8_pool[i] = i; + + lpm->tbl8_pool_pos = 0; +} + +/* + * Get an index of a free tbl8 from the pool + */ +static inline uint32_t +tbl8_get(struct rte_lpm6 *lpm, uint32_t *tbl8_ind) +{ + if (lpm->tbl8_pool_pos == lpm->number_tbl8s) + /* no more free tbl8 */ + return -ENOSPC; + + /* next index */ + *tbl8_ind = lpm->tbl8_pool[lpm->tbl8_pool_pos++]; + return 0; +} + +/* + * Put an index of a free tbl8 back to the pool + */ +static inline uint32_t +tbl8_put(struct rte_lpm6 *lpm, uint32_t tbl8_ind) +{ + if (lpm->tbl8_pool_pos == 0) + /* pool is full */ + return -ENOSPC; + + lpm->tbl8_pool[--lpm->tbl8_pool_pos] = tbl8_ind; + return 0; +} + +/* + * Returns number of tbl8s available in 
the pool + */ +static inline uint32_t +tbl8_available(struct rte_lpm6 *lpm) +{ + return lpm->number_tbl8s - lpm->tbl8_pool_pos; +} + +/* + * Init a rule key. + * note that ip must be already masked + */ +static inline void +rule_key_init(struct rte_lpm6_rule_key *key, uint8_t *ip, uint8_t depth) +{ + ip6_copy_addr(key->ip, ip); + key->depth = depth; +} + +/* + * Rebuild the entire LPM tree by reinserting all rules + */ +static void +rebuild_lpm(struct rte_lpm6 *lpm) +{ + uint64_t next_hop; + struct rte_lpm6_rule_key *rule_key; + uint32_t iter = 0; + + while (rte_hash_iterate(lpm->rules_tbl, (void *) &rule_key, + (void **) &next_hop, &iter) >= 0) + rte_lpm6_add(lpm, rule_key->ip, rule_key->depth, + (uint32_t) next_hop); } /* @@ -121,8 +249,11 @@ rte_lpm6_create(const char *name, int socket_id, char mem_name[RTE_LPM6_NAMESIZE]; struct rte_lpm6 *lpm = NULL; struct rte_tailq_entry *te; - uint64_t mem_size, rules_size; + uint64_t mem_size; struct rte_lpm6_list *lpm_list; + struct rte_hash *rules_tbl = NULL; + uint32_t *tbl8_pool = NULL; + struct rte_lpm_tbl8_hdr *tbl8_hdrs = NULL; lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list); @@ -136,12 +267,54 @@ rte_lpm6_create(const char *name, int socket_id, return NULL; } + /* create rules hash table */ + snprintf(mem_name, sizeof(mem_name), "LRH_%s", name); + struct rte_hash_parameters rule_hash_tbl_params = { + .entries = config->max_rules * 1.2 + + RULE_HASH_TABLE_EXTRA_SPACE, + .key_len = sizeof(struct rte_lpm6_rule_key), + .hash_func = rule_hash, + .hash_func_init_val = 0, + .name = mem_name, + .reserved = 0, + .socket_id = socket_id, + .extra_flag = 0 + }; + + rules_tbl = rte_hash_create(&rule_hash_tbl_params); + if (rules_tbl == NULL) { + RTE_LOG(ERR, LPM, "LPM rules hash table allocation failed: %s (%d)", + rte_strerror(rte_errno), rte_errno); + goto fail_wo_unlock; + } + + /* allocate tbl8 indexes pool */ + tbl8_pool = rte_malloc(NULL, + sizeof(uint32_t) * config->number_tbl8s, + RTE_CACHE_LINE_SIZE); + if (tbl8_pool == NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 pool allocation failed: %s (%d)", + rte_strerror(rte_errno), rte_errno); + rte_errno = ENOMEM; + goto fail_wo_unlock; + } + + /* allocate tbl8 headers */ + tbl8_hdrs = rte_malloc(NULL, + sizeof(struct rte_lpm_tbl8_hdr) * config->number_tbl8s, + RTE_CACHE_LINE_SIZE); + if (tbl8_hdrs == NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 headers allocation failed: %s (%d)", + rte_strerror(rte_errno), rte_errno); + rte_errno = ENOMEM; + goto fail_wo_unlock; + } + snprintf(mem_name, sizeof(mem_name), "LPM_%s", name); /* Determine the amount of memory to allocate. */ mem_size = sizeof(*lpm) + (sizeof(lpm->tbl8[0]) * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * config->number_tbl8s); - rules_size = sizeof(struct rte_lpm6_rule) * config->max_rules; rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); @@ -154,7 +327,7 @@ rte_lpm6_create(const char *name, int socket_id, lpm = NULL; if (te != NULL) { rte_errno = EEXIST; - goto exit; + goto fail; } /* allocate tailq entry */ @@ -162,7 +335,7 @@ rte_lpm6_create(const char *name, int socket_id, if (te == NULL) { RTE_LOG(ERR, LPM, "Failed to allocate tailq entry!\n"); rte_errno = ENOMEM; - goto exit; + goto fail; } /* Allocate memory to store the LPM data structures. 
*/ @@ -173,34 +346,35 @@ rte_lpm6_create(const char *name, int socket_id, RTE_LOG(ERR, LPM, "LPM memory allocation failed\n"); rte_free(te); rte_errno = ENOMEM; - goto exit; - } - - lpm->rules_tbl = rte_zmalloc_socket(NULL, - (size_t)rules_size, RTE_CACHE_LINE_SIZE, socket_id); - - if (lpm->rules_tbl == NULL) { - RTE_LOG(ERR, LPM, "LPM rules_tbl allocation failed\n"); - rte_free(lpm); - lpm = NULL; - rte_free(te); - rte_errno = ENOMEM; - goto exit; + goto fail; } /* Save user arguments. */ lpm->max_rules = config->max_rules; lpm->number_tbl8s = config->number_tbl8s; snprintf(lpm->name, sizeof(lpm->name), "%s", name); + lpm->rules_tbl = rules_tbl; + lpm->tbl8_pool = tbl8_pool; + lpm->tbl8_hdrs = tbl8_hdrs; + + /* init the stack */ + tbl8_pool_init(lpm); te->data = (void *) lpm; TAILQ_INSERT_TAIL(lpm_list, te, next); + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + return lpm; -exit: +fail: rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); - return lpm; +fail_wo_unlock: + rte_free(tbl8_hdrs); + rte_free(tbl8_pool); + rte_hash_free(rules_tbl); + + return NULL; } /* @@ -259,50 +433,88 @@ rte_lpm6_free(struct rte_lpm6 *lpm) rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); - rte_free(lpm->rules_tbl); + rte_free(lpm->tbl8_hdrs); + rte_free(lpm->tbl8_pool); + rte_hash_free(lpm->rules_tbl); rte_free(lpm); rte_free(te); } +/* Find a rule */ +static inline int +rule_find_with_key(struct rte_lpm6 *lpm, + const struct rte_lpm6_rule_key *rule_key, + uint32_t *next_hop) +{ + uint64_t hash_val; + int ret; + + /* lookup for a rule */ + ret = rte_hash_lookup_data(lpm->rules_tbl, (const void *) rule_key, + (void **) &hash_val); + if (ret >= 0) { + *next_hop = (uint32_t) hash_val; + return 1; + } + + return 0; +} + +/* Find a rule */ +static int +rule_find(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + uint32_t *next_hop) +{ + struct rte_lpm6_rule_key rule_key; + + /* init a rule key */ + rule_key_init(&rule_key, ip, depth); + + return rule_find_with_key(lpm, &rule_key, next_hop); +} + /* * Checks if a rule already exists in the rules table and updates * the nexthop if so. Otherwise it adds a new rule if enough space is available. + * + * Returns: + * 0 - next hop of existed rule is updated + * 1 - new rule successfully added + * <0 - error */ -static inline int32_t -rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint32_t next_hop, uint8_t depth) +static inline int +rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, uint32_t next_hop) { - uint32_t rule_index; - - /* Scan through rule list to see if rule already exists. */ - for (rule_index = 0; rule_index < lpm->used_rules; rule_index++) { + int ret, rule_exist; + struct rte_lpm6_rule_key rule_key; + uint32_t unused; - /* If rule already exists update its next_hop and return. */ - if ((memcmp (lpm->rules_tbl[rule_index].ip, ip, - RTE_LPM6_IPV6_ADDR_SIZE) == 0) && - lpm->rules_tbl[rule_index].depth == depth) { - lpm->rules_tbl[rule_index].next_hop = next_hop; + /* init a rule key */ + rule_key_init(&rule_key, ip, depth); - return rule_index; - } - } + /* Scan through rule list to see if rule already exists. */ + rule_exist = rule_find_with_key(lpm, &rule_key, &unused); /* * If rule does not exist check if there is space to add a new rule to * this rule group. If there is no space return error. */ - if (lpm->used_rules == lpm->max_rules) { + if (!rule_exist && lpm->used_rules == lpm->max_rules) return -ENOSPC; - } - /* If there is space for the new rule add it. 
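/*
 * Illustrative application-level sketch, not part of the patch: the
 * reworked LPM6 keeps the public API unchanged, so callers still create
 * the table, add prefixes and look them up as before; only the internal
 * rule storage (an rte_hash) and the tbl8 recycling changed. The name,
 * sizes and prefix below are arbitrary examples.
 */
#include <stdio.h>
#include <rte_lpm6.h>

static void
lpm6_example(void)
{
	struct rte_lpm6_config cfg = {
		.max_rules = 1024,
		.number_tbl8s = 1 << 16,
		.flags = 0,
	};
	uint8_t prefix[RTE_LPM6_IPV6_ADDR_SIZE] = { 0x20, 0x01, 0x0d, 0xb8 };
	uint32_t next_hop = 0;
	struct rte_lpm6 *lpm = rte_lpm6_create("example", SOCKET_ID_ANY, &cfg);

	if (lpm == NULL)
		return;

	/* -ENOSPC is now reported up front; no partially-added routes */
	if (rte_lpm6_add(lpm, prefix, 32, 5) == 0 &&
	    rte_lpm6_lookup(lpm, prefix, &next_hop) == 0)
		printf("next hop %u\n", next_hop);	/* prints 5 */

	rte_lpm6_free(lpm);
}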
*/ - rte_memcpy(lpm->rules_tbl[rule_index].ip, ip, RTE_LPM6_IPV6_ADDR_SIZE); - lpm->rules_tbl[rule_index].next_hop = next_hop; - lpm->rules_tbl[rule_index].depth = depth; + /* add the rule or update rules next hop */ + ret = rte_hash_add_key_data(lpm->rules_tbl, &rule_key, + (void *)(uintptr_t) next_hop); + if (ret < 0) + return ret; /* Increment the used rules counter for this rule group. */ - lpm->used_rules++; + if (!rule_exist) { + lpm->used_rules++; + return 1; + } - return rule_index; + return 0; } /* @@ -311,24 +523,24 @@ rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint32_t next_hop, uint8_t depth) * in the IP address returns a match. */ static void -expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t depth, - uint32_t next_hop) +expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t old_depth, + uint8_t new_depth, uint32_t next_hop, uint8_t valid) { uint32_t tbl8_group_end, tbl8_gindex_next, j; tbl8_group_end = tbl8_gindex + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; struct rte_lpm6_tbl_entry new_tbl8_entry = { - .valid = VALID, - .valid_group = VALID, - .depth = depth, + .valid = valid, + .valid_group = valid, + .depth = new_depth, .next_hop = next_hop, .ext_entry = 0, }; for (j = tbl8_gindex; j < tbl8_group_end; j++) { if (!lpm->tbl8[j].valid || (lpm->tbl8[j].ext_entry == 0 - && lpm->tbl8[j].depth <= depth)) { + && lpm->tbl8[j].depth <= old_depth)) { lpm->tbl8[j] = new_tbl8_entry; @@ -336,11 +548,101 @@ expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t depth, tbl8_gindex_next = lpm->tbl8[j].lpm6_tbl8_gindex * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; - expand_rule(lpm, tbl8_gindex_next, depth, next_hop); + expand_rule(lpm, tbl8_gindex_next, old_depth, new_depth, + next_hop, valid); } } } +/* + * Init a tbl8 header + */ +static inline void +init_tbl8_header(struct rte_lpm6 *lpm, uint32_t tbl_ind, + uint32_t owner_tbl_ind, uint32_t owner_entry_ind) +{ + struct rte_lpm_tbl8_hdr *tbl_hdr = &lpm->tbl8_hdrs[tbl_ind]; + tbl_hdr->owner_tbl_ind = owner_tbl_ind; + tbl_hdr->owner_entry_ind = owner_entry_ind; + tbl_hdr->ref_cnt = 0; +} + +/* + * Calculate index to the table based on the number and position + * of the bytes being inspected in this step. + */ +static uint32_t +get_bitshift(const uint8_t *ip, uint8_t first_byte, uint8_t bytes) +{ + uint32_t entry_ind, i; + int8_t bitshift; + + entry_ind = 0; + for (i = first_byte; i < (uint32_t)(first_byte + bytes); i++) { + bitshift = (int8_t)((bytes - i)*BYTE_SIZE); + + if (bitshift < 0) + bitshift = 0; + entry_ind = entry_ind | ip[i-1] << bitshift; + } + + return entry_ind; +} + +/* + * Simulate adding a new route to the LPM counting number + * of new tables that will be needed + * + * It returns 0 on success, or 1 if + * the process needs to be continued by calling the function again. + */ +static inline int +simulate_add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, + struct rte_lpm6_tbl_entry **next_tbl, const uint8_t *ip, + uint8_t bytes, uint8_t first_byte, uint8_t depth, + uint32_t *need_tbl_nb) +{ + uint32_t entry_ind; + uint8_t bits_covered; + uint32_t next_tbl_ind; + + /* + * Calculate index to the table based on the number and position + * of the bytes being inspected in this step. 
+ */ + entry_ind = get_bitshift(ip, first_byte, bytes); + + /* Number of bits covered in this step */ + bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE); + + if (depth <= bits_covered) { + *need_tbl_nb = 0; + return 0; + } + + if (tbl[entry_ind].valid == 0 || tbl[entry_ind].ext_entry == 0) { + /* from this point on a new table is needed on each level + * that is not covered yet + */ + depth -= bits_covered; + uint32_t cnt = depth >> 3; /* depth / BYTE_SIZE */ + if (depth & 7) /* 0b00000111 */ + /* if depth % 8 > 0 then one more table is needed + * for those last bits + */ + cnt++; + + *need_tbl_nb = cnt; + return 0; + } + + next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex; + *next_tbl = &(lpm->tbl8[next_tbl_ind * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]); + *need_tbl_nb = 0; + return 1; +} + /* * Partially adds a new route to the data structure (tbl24+tbl8s). * It returns 0 on success, a negative number on failure, or 1 if @@ -348,25 +650,21 @@ expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t depth, */ static inline int add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, - struct rte_lpm6_tbl_entry **tbl_next, uint8_t *ip, uint8_t bytes, - uint8_t first_byte, uint8_t depth, uint32_t next_hop) + uint32_t tbl_ind, struct rte_lpm6_tbl_entry **next_tbl, + uint32_t *next_tbl_ind, uint8_t *ip, uint8_t bytes, + uint8_t first_byte, uint8_t depth, uint32_t next_hop, + uint8_t is_new_rule) { - uint32_t tbl_index, tbl_range, tbl8_group_start, tbl8_group_end, i; - int32_t tbl8_gindex; - int8_t bitshift; + uint32_t entry_ind, tbl_range, tbl8_group_start, tbl8_group_end, i; + uint32_t tbl8_gindex; uint8_t bits_covered; + int ret; /* * Calculate index to the table based on the number and position * of the bytes being inspected in this step. 
*/ - tbl_index = 0; - for (i = first_byte; i < (uint32_t)(first_byte + bytes); i++) { - bitshift = (int8_t)((bytes - i)*BYTE_SIZE); - - if (bitshift < 0) bitshift = 0; - tbl_index = tbl_index | ip[i-1] << bitshift; - } + entry_ind = get_bitshift(ip, first_byte, bytes); /* Number of bits covered in this step */ bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE); @@ -378,7 +676,7 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, if (depth <= bits_covered) { tbl_range = 1 << (bits_covered - depth); - for (i = tbl_index; i < (tbl_index + tbl_range); i++) { + for (i = entry_ind; i < (entry_ind + tbl_range); i++) { if (!tbl[i].valid || (tbl[i].ext_entry == 0 && tbl[i].depth <= depth)) { @@ -400,10 +698,15 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, */ tbl8_gindex = tbl[i].lpm6_tbl8_gindex * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; - expand_rule(lpm, tbl8_gindex, depth, next_hop); + expand_rule(lpm, tbl8_gindex, depth, depth, + next_hop, VALID); } } + /* update tbl8 rule reference counter */ + if (tbl_ind != TBL24_IND && is_new_rule) + lpm->tbl8_hdrs[tbl_ind].ref_cnt++; + return 0; } /* @@ -412,12 +715,24 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, */ else { /* If it's invalid a new tbl8 is needed */ - if (!tbl[tbl_index].valid) { - if (lpm->next_tbl8 < lpm->number_tbl8s) - tbl8_gindex = (lpm->next_tbl8)++; - else + if (!tbl[entry_ind].valid) { + /* get a new table */ + ret = tbl8_get(lpm, &tbl8_gindex); + if (ret != 0) return -ENOSPC; + /* invalidate all new tbl8 entries */ + tbl8_group_start = tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + memset(&lpm->tbl8[tbl8_group_start], 0, + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES); + + /* init the new table's header: + * save the reference to the owner table + */ + init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind); + + /* reference to a new tbl8 */ struct rte_lpm6_tbl_entry new_tbl_entry = { .lpm6_tbl8_gindex = tbl8_gindex, .depth = 0, @@ -426,17 +741,20 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, .ext_entry = 1, }; - tbl[tbl_index] = new_tbl_entry; + tbl[entry_ind] = new_tbl_entry; + + /* update the current table's reference counter */ + if (tbl_ind != TBL24_IND) + lpm->tbl8_hdrs[tbl_ind].ref_cnt++; } /* - * If it's valid but not extended the rule that was stored * + * If it's valid but not extended the rule that was stored * here needs to be moved to the next table. */ - else if (tbl[tbl_index].ext_entry == 0) { - /* Search for free tbl8 group. */ - if (lpm->next_tbl8 < lpm->number_tbl8s) - tbl8_gindex = (lpm->next_tbl8)++; - else + else if (tbl[entry_ind].ext_entry == 0) { + /* get a new tbl8 index */ + ret = tbl8_get(lpm, &tbl8_gindex); + if (ret != 0) return -ENOSPC; tbl8_group_start = tbl8_gindex * @@ -444,13 +762,22 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, tbl8_group_end = tbl8_group_start + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + struct rte_lpm6_tbl_entry tbl_entry = { + .next_hop = tbl[entry_ind].next_hop, + .depth = tbl[entry_ind].depth, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 0 + }; + /* Populate new tbl8 with tbl value. 
*/ - for (i = tbl8_group_start; i < tbl8_group_end; i++) { - lpm->tbl8[i].valid = VALID; - lpm->tbl8[i].depth = tbl[tbl_index].depth; - lpm->tbl8[i].next_hop = tbl[tbl_index].next_hop; - lpm->tbl8[i].ext_entry = 0; - } + for (i = tbl8_group_start; i < tbl8_group_end; i++) + lpm->tbl8[i] = tbl_entry; + + /* init the new table's header: + * save the reference to the owner table + */ + init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind); /* * Update tbl entry to point to new tbl8 entry. Note: The @@ -465,11 +792,16 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, .ext_entry = 1, }; - tbl[tbl_index] = new_tbl_entry; + tbl[entry_ind] = new_tbl_entry; + + /* update the current table's reference counter */ + if (tbl_ind != TBL24_IND) + lpm->tbl8_hdrs[tbl_ind].ref_cnt++; } - *tbl_next = &(lpm->tbl8[tbl[tbl_index].lpm6_tbl8_gindex * - RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]); + *next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex; + *next_tbl = &(lpm->tbl8[*next_tbl_ind * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]); } return 1; @@ -486,13 +818,56 @@ rte_lpm6_add_v20(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, } VERSION_SYMBOL(rte_lpm6_add, _v20, 2.0); + +/* + * Simulate adding a route to LPM + * + * Returns: + * 0 on success + * -ENOSPC not enought tbl8 left + */ +static int +simulate_add(struct rte_lpm6 *lpm, const uint8_t *masked_ip, uint8_t depth) +{ + struct rte_lpm6_tbl_entry *tbl; + struct rte_lpm6_tbl_entry *tbl_next = NULL; + int ret, i; + + /* number of new tables needed for a step */ + uint32_t need_tbl_nb; + /* total number of new tables needed */ + uint32_t total_need_tbl_nb; + + /* Inspect the first three bytes through tbl24 on the first step. */ + ret = simulate_add_step(lpm, lpm->tbl24, &tbl_next, masked_ip, + ADD_FIRST_BYTE, 1, depth, &need_tbl_nb); + total_need_tbl_nb = need_tbl_nb; + /* + * Inspect one by one the rest of the bytes until + * the process is completed. + */ + for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && ret == 1; i++) { + tbl = tbl_next; + ret = simulate_add_step(lpm, tbl, &tbl_next, masked_ip, 1, + (uint8_t)(i+1), depth, &need_tbl_nb); + total_need_tbl_nb += need_tbl_nb; + } + + if (tbl8_available(lpm) < total_need_tbl_nb) + /* not enought tbl8 to add a rule */ + return -ENOSPC; + + return 0; +} + int rte_lpm6_add_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, uint32_t next_hop) { struct rte_lpm6_tbl_entry *tbl; struct rte_lpm6_tbl_entry *tbl_next = NULL; - int32_t rule_index; + /* init to avoid compiler warning */ + uint32_t tbl_next_num = 123456; int status; uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; int i; @@ -502,26 +877,26 @@ rte_lpm6_add_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, return -EINVAL; /* Copy the IP and mask it to avoid modifying user's input data. */ - memcpy(masked_ip, ip, RTE_LPM6_IPV6_ADDR_SIZE); - mask_ip(masked_ip, depth); + ip6_copy_addr(masked_ip, ip); + ip6_mask_addr(masked_ip, depth); - /* Add the rule to the rule table. */ - rule_index = rule_add(lpm, masked_ip, next_hop, depth); + /* Simulate adding a new route */ + int ret = simulate_add(lpm, masked_ip, depth); + if (ret < 0) + return ret; + /* Add the rule to the rule table. */ + int is_new_rule = rule_add(lpm, masked_ip, depth, next_hop); /* If there is no space available for new rule return error. */ - if (rule_index < 0) { - return rule_index; - } + if (is_new_rule < 0) + return is_new_rule; /* Inspect the first three bytes through tbl24 on the first step. 
*/ tbl = lpm->tbl24; - status = add_step (lpm, tbl, &tbl_next, masked_ip, ADD_FIRST_BYTE, 1, - depth, next_hop); - if (status < 0) { - rte_lpm6_delete(lpm, masked_ip, depth); - - return status; - } + status = add_step(lpm, tbl, TBL24_IND, &tbl_next, &tbl_next_num, + masked_ip, ADD_FIRST_BYTE, 1, depth, next_hop, + is_new_rule); + assert(status >= 0); /* * Inspect one by one the rest of the bytes until @@ -529,13 +904,10 @@ rte_lpm6_add_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, */ for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && status == 1; i++) { tbl = tbl_next; - status = add_step (lpm, tbl, &tbl_next, masked_ip, 1, (uint8_t)(i+1), - depth, next_hop); - if (status < 0) { - rte_lpm6_delete(lpm, masked_ip, depth); - - return status; - } + status = add_step(lpm, tbl, tbl_next_num, &tbl_next, + &tbl_next_num, masked_ip, 1, (uint8_t)(i+1), + depth, next_hop, is_new_rule); + assert(status >= 0); } return status; @@ -610,9 +982,8 @@ rte_lpm6_lookup_v1705(const struct rte_lpm6 *lpm, uint8_t *ip, uint32_t tbl24_index; /* DEBUG: Check user input arguments. */ - if ((lpm == NULL) || (ip == NULL) || (next_hop == NULL)) { + if ((lpm == NULL) || (ip == NULL) || (next_hop == NULL)) return -EINVAL; - } first_byte = LOOKUP_FIRST_BYTE; tbl24_index = (ip[0] << BYTES2_SIZE) | (ip[1] << BYTE_SIZE) | ip[2]; @@ -648,9 +1019,8 @@ rte_lpm6_lookup_bulk_func_v20(const struct rte_lpm6 *lpm, int status; /* DEBUG: Check user input arguments. */ - if ((lpm == NULL) || (ips == NULL) || (next_hops == NULL)) { + if ((lpm == NULL) || (ips == NULL) || (next_hops == NULL)) return -EINVAL; - } for (i = 0; i < n; i++) { first_byte = LOOKUP_FIRST_BYTE; @@ -724,30 +1094,6 @@ MAP_STATIC_SYMBOL(int rte_lpm6_lookup_bulk_func(const struct rte_lpm6 *lpm, int32_t *next_hops, unsigned int n), rte_lpm6_lookup_bulk_func_v1705); -/* - * Finds a rule in rule table. - * NOTE: Valid range for depth parameter is 1 .. 128 inclusive. - */ -static inline int32_t -rule_find(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth) -{ - uint32_t rule_index; - - /* Scan used rules at given depth to find rule. */ - for (rule_index = 0; rule_index < lpm->used_rules; rule_index++) { - /* If rule is found return the rule index. */ - if ((memcmp (lpm->rules_tbl[rule_index].ip, ip, - RTE_LPM6_IPV6_ADDR_SIZE) == 0) && - lpm->rules_tbl[rule_index].depth == depth) { - - return rule_index; - } - } - - /* If rule is not found return -ENOENT. */ - return -ENOENT; -} - /* * Look for a rule in the high-level rules table */ @@ -775,8 +1121,7 @@ int rte_lpm6_is_rule_present_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, uint32_t *next_hop) { - uint8_t ip_masked[RTE_LPM6_IPV6_ADDR_SIZE]; - int32_t rule_index; + uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; /* Check user arguments. */ if ((lpm == NULL) || next_hop == NULL || ip == NULL || @@ -784,19 +1129,10 @@ rte_lpm6_is_rule_present_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, return -EINVAL; /* Copy the IP and mask it to avoid modifying user's input data. */ - memcpy(ip_masked, ip, RTE_LPM6_IPV6_ADDR_SIZE); - mask_ip(ip_masked, depth); - - /* Look for the rule using rule_find. */ - rule_index = rule_find(lpm, ip_masked, depth); - - if (rule_index >= 0) { - *next_hop = lpm->rules_tbl[rule_index].next_hop; - return 1; - } + ip6_copy_addr(masked_ip, ip); + ip6_mask_addr(masked_ip, depth); - /* If rule is not found return 0. 
*/ - return 0; + return rule_find(lpm, masked_ip, depth, next_hop); } BIND_DEFAULT_SYMBOL(rte_lpm6_is_rule_present, _v1705, 17.05); MAP_STATIC_SYMBOL(int rte_lpm6_is_rule_present(struct rte_lpm6 *lpm, @@ -806,133 +1142,66 @@ MAP_STATIC_SYMBOL(int rte_lpm6_is_rule_present(struct rte_lpm6 *lpm, /* * Delete a rule from the rule table. * NOTE: Valid range for depth parameter is 1 .. 128 inclusive. + * return + * 0 on success + * <0 on failure */ -static inline void -rule_delete(struct rte_lpm6 *lpm, int32_t rule_index) -{ - /* - * Overwrite redundant rule with last rule in group and decrement rule - * counter. - */ - lpm->rules_tbl[rule_index] = lpm->rules_tbl[lpm->used_rules-1]; - lpm->used_rules--; -} - -/* - * Deletes a rule - */ -int -rte_lpm6_delete(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth) +static inline int +rule_delete(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth) { - int32_t rule_to_delete_index; - uint8_t ip_masked[RTE_LPM6_IPV6_ADDR_SIZE]; - unsigned i; - - /* - * Check input arguments. - */ - if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH)) { - return -EINVAL; - } - - /* Copy the IP and mask it to avoid modifying user's input data. */ - memcpy(ip_masked, ip, RTE_LPM6_IPV6_ADDR_SIZE); - mask_ip(ip_masked, depth); - - /* - * Find the index of the input rule, that needs to be deleted, in the - * rule table. - */ - rule_to_delete_index = rule_find(lpm, ip_masked, depth); - - /* - * Check if rule_to_delete_index was found. If no rule was found the - * function rule_find returns -ENOENT. - */ - if (rule_to_delete_index < 0) - return rule_to_delete_index; - - /* Delete the rule from the rule table. */ - rule_delete(lpm, rule_to_delete_index); + int ret; + struct rte_lpm6_rule_key rule_key; - /* - * Set all the table entries to 0 (ie delete every rule - * from the data structure. - */ - lpm->next_tbl8 = 0; - memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); - memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) - * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s); + /* init rule key */ + rule_key_init(&rule_key, ip, depth); - /* - * Add every rule again (except for the one that was removed from - * the rules table). - */ - for (i = 0; i < lpm->used_rules; i++) { - rte_lpm6_add(lpm, lpm->rules_tbl[i].ip, lpm->rules_tbl[i].depth, - lpm->rules_tbl[i].next_hop); - } + /* delete the rule */ + ret = rte_hash_del_key(lpm->rules_tbl, (void *) &rule_key); + if (ret >= 0) + lpm->used_rules--; - return 0; + return ret; } /* * Deletes a group of rules + * + * Note that the function rebuilds the lpm table, + * rather than doing incremental updates like + * the regular delete function */ int rte_lpm6_delete_bulk_func(struct rte_lpm6 *lpm, - uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths, unsigned n) + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths, + unsigned n) { - int32_t rule_to_delete_index; - uint8_t ip_masked[RTE_LPM6_IPV6_ADDR_SIZE]; + uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; unsigned i; - /* - * Check input arguments. - */ - if ((lpm == NULL) || (ips == NULL) || (depths == NULL)) { + /* Check input arguments. */ + if ((lpm == NULL) || (ips == NULL) || (depths == NULL)) return -EINVAL; - } for (i = 0; i < n; i++) { - /* Copy the IP and mask it to avoid modifying user's input data. */ - memcpy(ip_masked, ips[i], RTE_LPM6_IPV6_ADDR_SIZE); - mask_ip(ip_masked, depths[i]); - - /* - * Find the index of the input rule, that needs to be deleted, in the - * rule table. 
- */ - rule_to_delete_index = rule_find(lpm, ip_masked, depths[i]); - - /* - * Check if rule_to_delete_index was found. If no rule was found the - * function rule_find returns -ENOENT. - */ - if (rule_to_delete_index < 0) - continue; - - /* Delete the rule from the rule table. */ - rule_delete(lpm, rule_to_delete_index); + ip6_copy_addr(masked_ip, ips[i]); + ip6_mask_addr(masked_ip, depths[i]); + rule_delete(lpm, masked_ip, depths[i]); } /* * Set all the table entries to 0 (ie delete every rule * from the data structure. */ - lpm->next_tbl8 = 0; memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s); + tbl8_pool_init(lpm); /* * Add every rule again (except for the ones that were removed from * the rules table). */ - for (i = 0; i < lpm->used_rules; i++) { - rte_lpm6_add(lpm, lpm->rules_tbl[i].ip, lpm->rules_tbl[i].depth, - lpm->rules_tbl[i].next_hop); - } + rebuild_lpm(lpm); return 0; } @@ -946,9 +1215,6 @@ rte_lpm6_delete_all(struct rte_lpm6 *lpm) /* Zero used rules counter. */ lpm->used_rules = 0; - /* Zero next tbl8 index. */ - lpm->next_tbl8 = 0; - /* Zero tbl24. */ memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); @@ -956,6 +1222,268 @@ rte_lpm6_delete_all(struct rte_lpm6 *lpm) memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s); + /* init pool of free tbl8 indexes */ + tbl8_pool_init(lpm); + /* Delete all rules form the rules table. */ - memset(lpm->rules_tbl, 0, sizeof(struct rte_lpm6_rule) * lpm->max_rules); + rte_hash_reset(lpm->rules_tbl); +} + +/* + * Convert a depth to a one byte long mask + * Example: 4 will be converted to 0xF0 + */ +static uint8_t __attribute__((pure)) +depth_to_mask_1b(uint8_t depth) +{ + /* To calculate a mask start with a 1 on the left hand side and right + * shift while populating the left hand side with 1's + */ + return (signed char)0x80 >> (depth - 1); +} + +/* + * Find a less specific rule + */ +static int +rule_find_less_specific(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + struct rte_lpm6_rule *rule) +{ + int ret; + uint32_t next_hop; + uint8_t mask; + struct rte_lpm6_rule_key rule_key; + + if (depth == 1) + return 0; + + rule_key_init(&rule_key, ip, depth); + + while (depth > 1) { + depth--; + + /* each iteration zero one more bit of the key */ + mask = depth & 7; /* depth % BYTE_SIZE */ + if (mask > 0) + mask = depth_to_mask_1b(mask); + + rule_key.depth = depth; + rule_key.ip[depth >> 3] &= mask; + + ret = rule_find_with_key(lpm, &rule_key, &next_hop); + if (ret) { + rule->depth = depth; + ip6_copy_addr(rule->ip, rule_key.ip); + rule->next_hop = next_hop; + return 1; + } + } + + return 0; +} + +/* + * Find range of tbl8 cells occupied by a rule + */ +static void +rule_find_range(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + struct rte_lpm6_tbl_entry **from, + struct rte_lpm6_tbl_entry **to, + uint32_t *out_tbl_ind) +{ + uint32_t ind; + uint32_t first_3bytes = (uint32_t)ip[0] << 16 | ip[1] << 8 | ip[2]; + + if (depth <= 24) { + /* rule is within the top level */ + ind = first_3bytes; + *from = &lpm->tbl24[ind]; + ind += (1 << (24 - depth)) - 1; + *to = &lpm->tbl24[ind]; + *out_tbl_ind = TBL24_IND; + } else { + /* top level entry */ + struct rte_lpm6_tbl_entry *tbl = &lpm->tbl24[first_3bytes]; + assert(tbl->ext_entry == 1); + /* first tbl8 */ + uint32_t tbl_ind = tbl->lpm6_tbl8_gindex; + tbl = &lpm->tbl8[tbl_ind * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]; + /* current ip byte, the top level is already behind */ + 
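/*
 * Illustrative sketch, not part of the patch: how the loop above truncates
 * the key on its way to less specific prefixes. For a /26 rule the
 * candidate depths are 25, 24, 23, ...: at depth 25 only the top bit of
 * byte 3 is kept (depth_to_mask_1b(25 % 8) == 0x80), at depth 24 byte 3 is
 * cleared completely (mask stays 0), at depth 23 the low bit of byte 2 is
 * cleared (mask 0xFE), and so on. Hypothetical helper, same arithmetic as
 * rule_find_less_specific(); bytes past the boundary byte are assumed to
 * be already zero, as they are in that loop.
 */
static void
truncate_key_to_depth(uint8_t *ip, uint8_t depth)
{
	uint8_t mask = depth & 7;		/* depth % BYTE_SIZE */

	if (mask > 0)
		mask = depth_to_mask_1b(mask);	/* e.g. 1 -> 0x80, 4 -> 0xF0 */
	/* for byte-aligned depths mask stays 0: that whole byte lies past
	 * the prefix and is zeroed
	 */
	ip[depth >> 3] &= mask;			/* clear bits beyond the depth */
}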
uint8_t byte = 3; + /* minus top level */ + depth -= 24; + + /* interate through levels (tbl8s) + * until we reach the last one + */ + while (depth > 8) { + tbl += ip[byte]; + assert(tbl->ext_entry == 1); + /* go to the next level/tbl8 */ + tbl_ind = tbl->lpm6_tbl8_gindex; + tbl = &lpm->tbl8[tbl_ind * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]; + byte += 1; + depth -= 8; + } + + /* last level/tbl8 */ + ind = ip[byte] & depth_to_mask_1b(depth); + *from = &tbl[ind]; + ind += (1 << (8 - depth)) - 1; + *to = &tbl[ind]; + *out_tbl_ind = tbl_ind; + } +} + +/* + * Remove a table from the LPM tree + */ +static void +remove_tbl(struct rte_lpm6 *lpm, struct rte_lpm_tbl8_hdr *tbl_hdr, + uint32_t tbl_ind, struct rte_lpm6_rule *lsp_rule) +{ + struct rte_lpm6_tbl_entry *owner_entry; + + if (tbl_hdr->owner_tbl_ind == TBL24_IND) + owner_entry = &lpm->tbl24[tbl_hdr->owner_entry_ind]; + else { + uint32_t owner_tbl_ind = tbl_hdr->owner_tbl_ind; + owner_entry = &lpm->tbl8[ + owner_tbl_ind * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES + + tbl_hdr->owner_entry_ind]; + + struct rte_lpm_tbl8_hdr *owner_tbl_hdr = + &lpm->tbl8_hdrs[owner_tbl_ind]; + if (--owner_tbl_hdr->ref_cnt == 0) + remove_tbl(lpm, owner_tbl_hdr, owner_tbl_ind, lsp_rule); + } + + assert(owner_entry->ext_entry == 1); + + /* unlink the table */ + if (lsp_rule != NULL) { + struct rte_lpm6_tbl_entry new_tbl_entry = { + .next_hop = lsp_rule->next_hop, + .depth = lsp_rule->depth, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 0 + }; + + *owner_entry = new_tbl_entry; + } else { + struct rte_lpm6_tbl_entry new_tbl_entry = { + .next_hop = 0, + .depth = 0, + .valid = INVALID, + .valid_group = INVALID, + .ext_entry = 0 + }; + + *owner_entry = new_tbl_entry; + } + + /* return the table to the pool */ + tbl8_put(lpm, tbl_ind); +} + +/* + * Deletes a rule + */ +int +rte_lpm6_delete(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth) +{ + uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; + struct rte_lpm6_rule lsp_rule_obj; + struct rte_lpm6_rule *lsp_rule; + int ret; + uint32_t tbl_ind; + struct rte_lpm6_tbl_entry *from, *to; + + /* Check input arguments. */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH)) + return -EINVAL; + + /* Copy the IP and mask it to avoid modifying user's input data. */ + ip6_copy_addr(masked_ip, ip); + ip6_mask_addr(masked_ip, depth); + + /* Delete the rule from the rule table. */ + ret = rule_delete(lpm, masked_ip, depth); + if (ret < 0) + return -ENOENT; + + /* find rule cells */ + rule_find_range(lpm, masked_ip, depth, &from, &to, &tbl_ind); + + /* find a less specific rule (a rule with smaller depth) + * note: masked_ip will be modified, don't use it anymore + */ + ret = rule_find_less_specific(lpm, masked_ip, depth, + &lsp_rule_obj); + lsp_rule = ret ? &lsp_rule_obj : NULL; + + /* decrement the table rule counter, + * note that tbl24 doesn't have a header + */ + if (tbl_ind != TBL24_IND) { + struct rte_lpm_tbl8_hdr *tbl_hdr = &lpm->tbl8_hdrs[tbl_ind]; + if (--tbl_hdr->ref_cnt == 0) { + /* remove the table */ + remove_tbl(lpm, tbl_hdr, tbl_ind, lsp_rule); + return 0; + } + } + + /* iterate rule cells */ + for (; from <= to; from++) + if (from->ext_entry == 1) { + /* reference to a more specific space + * of the prefix/rule. 
Entries in a more + * specific space that are not used by + * a more specific prefix must be occupied + * by the prefix + */ + if (lsp_rule != NULL) + expand_rule(lpm, + from->lpm6_tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES, + depth, lsp_rule->depth, + lsp_rule->next_hop, VALID); + else + /* since the prefix has no less specific prefix, + * its more specific space must be invalidated + */ + expand_rule(lpm, + from->lpm6_tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES, + depth, 0, 0, INVALID); + } else if (from->depth == depth) { + /* entry is not a reference and belongs to the prefix */ + if (lsp_rule != NULL) { + struct rte_lpm6_tbl_entry new_tbl_entry = { + .next_hop = lsp_rule->next_hop, + .depth = lsp_rule->depth, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 0 + }; + + *from = new_tbl_entry; + } else { + struct rte_lpm6_tbl_entry new_tbl_entry = { + .next_hop = 0, + .depth = 0, + .valid = INVALID, + .valid_group = INVALID, + .ext_entry = 0 + }; + + *from = new_tbl_entry; + } + } + + return 0; } diff --git a/lib/librte_mbuf/meson.build b/lib/librte_mbuf/meson.build index 45ffb0db..94d9c4c9 100644 --- a/lib/librte_mbuf/meson.build +++ b/lib/librte_mbuf/meson.build @@ -1,7 +1,7 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2017 Intel Corporation -version = 3 +version = 4 sources = files('rte_mbuf.c', 'rte_mbuf_ptype.c', 'rte_mbuf_pool_ops.c') headers = files('rte_mbuf.h', 'rte_mbuf_ptype.h', 'rte_mbuf_pool_ops.h') deps += ['mempool'] diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c index e714c5a5..9790b4fb 100644 --- a/lib/librte_mbuf/rte_mbuf.c +++ b/lib/librte_mbuf/rte_mbuf.c @@ -296,11 +296,19 @@ const char *rte_get_rx_ol_flag_name(uint64_t mask) case PKT_RX_VLAN_STRIPPED: return "PKT_RX_VLAN_STRIPPED"; case PKT_RX_IEEE1588_PTP: return "PKT_RX_IEEE1588_PTP"; case PKT_RX_IEEE1588_TMST: return "PKT_RX_IEEE1588_TMST"; + case PKT_RX_FDIR_ID: return "PKT_RX_FDIR_ID"; + case PKT_RX_FDIR_FLX: return "PKT_RX_FDIR_FLX"; case PKT_RX_QINQ_STRIPPED: return "PKT_RX_QINQ_STRIPPED"; + case PKT_RX_QINQ: return "PKT_RX_QINQ"; case PKT_RX_LRO: return "PKT_RX_LRO"; case PKT_RX_TIMESTAMP: return "PKT_RX_TIMESTAMP"; case PKT_RX_SEC_OFFLOAD: return "PKT_RX_SEC_OFFLOAD"; case PKT_RX_SEC_OFFLOAD_FAILED: return "PKT_RX_SEC_OFFLOAD_FAILED"; + case PKT_RX_OUTER_L4_CKSUM_BAD: return "PKT_RX_OUTER_L4_CKSUM_BAD"; + case PKT_RX_OUTER_L4_CKSUM_GOOD: return "PKT_RX_OUTER_L4_CKSUM_GOOD"; + case PKT_RX_OUTER_L4_CKSUM_INVALID: + return "PKT_RX_OUTER_L4_CKSUM_INVALID"; + default: return NULL; } } @@ -333,12 +341,21 @@ rte_get_rx_ol_flag_list(uint64_t mask, char *buf, size_t buflen) { PKT_RX_VLAN_STRIPPED, PKT_RX_VLAN_STRIPPED, NULL }, { PKT_RX_IEEE1588_PTP, PKT_RX_IEEE1588_PTP, NULL }, { PKT_RX_IEEE1588_TMST, PKT_RX_IEEE1588_TMST, NULL }, + { PKT_RX_FDIR_ID, PKT_RX_FDIR_ID, NULL }, + { PKT_RX_FDIR_FLX, PKT_RX_FDIR_FLX, NULL }, { PKT_RX_QINQ_STRIPPED, PKT_RX_QINQ_STRIPPED, NULL }, { PKT_RX_LRO, PKT_RX_LRO, NULL }, { PKT_RX_TIMESTAMP, PKT_RX_TIMESTAMP, NULL }, { PKT_RX_SEC_OFFLOAD, PKT_RX_SEC_OFFLOAD, NULL }, { PKT_RX_SEC_OFFLOAD_FAILED, PKT_RX_SEC_OFFLOAD_FAILED, NULL }, { PKT_RX_QINQ, PKT_RX_QINQ, NULL }, + { PKT_RX_OUTER_L4_CKSUM_BAD, PKT_RX_OUTER_L4_CKSUM_MASK, NULL }, + { PKT_RX_OUTER_L4_CKSUM_GOOD, PKT_RX_OUTER_L4_CKSUM_MASK, + NULL }, + { PKT_RX_OUTER_L4_CKSUM_INVALID, PKT_RX_OUTER_L4_CKSUM_MASK, + NULL }, + { PKT_RX_OUTER_L4_CKSUM_UNKNOWN, PKT_RX_OUTER_L4_CKSUM_MASK, + "PKT_RX_OUTER_L4_CKSUM_UNKNOWN" }, }; const char *name; unsigned int i; @@ -373,7 +390,7 @@ 
rte_get_rx_ol_flag_list(uint64_t mask, char *buf, size_t buflen) const char *rte_get_tx_ol_flag_name(uint64_t mask) { switch (mask) { - case PKT_TX_VLAN_PKT: return "PKT_TX_VLAN_PKT"; + case PKT_TX_VLAN: return "PKT_TX_VLAN"; case PKT_TX_IP_CKSUM: return "PKT_TX_IP_CKSUM"; case PKT_TX_TCP_CKSUM: return "PKT_TX_TCP_CKSUM"; case PKT_TX_SCTP_CKSUM: return "PKT_TX_SCTP_CKSUM"; @@ -393,8 +410,12 @@ const char *rte_get_tx_ol_flag_name(uint64_t mask) case PKT_TX_TUNNEL_VXLAN_GPE: return "PKT_TX_TUNNEL_VXLAN_GPE"; case PKT_TX_TUNNEL_IP: return "PKT_TX_TUNNEL_IP"; case PKT_TX_TUNNEL_UDP: return "PKT_TX_TUNNEL_UDP"; + case PKT_TX_QINQ: return "PKT_TX_QINQ"; case PKT_TX_MACSEC: return "PKT_TX_MACSEC"; case PKT_TX_SEC_OFFLOAD: return "PKT_TX_SEC_OFFLOAD"; + case PKT_TX_UDP_SEG: return "PKT_TX_UDP_SEG"; + case PKT_TX_OUTER_UDP_CKSUM: return "PKT_TX_OUTER_UDP_CKSUM"; + case PKT_TX_METADATA: return "PKT_TX_METADATA"; default: return NULL; } } @@ -404,7 +425,7 @@ int rte_get_tx_ol_flag_list(uint64_t mask, char *buf, size_t buflen) { const struct flag_mask tx_flags[] = { - { PKT_TX_VLAN_PKT, PKT_TX_VLAN_PKT, NULL }, + { PKT_TX_VLAN, PKT_TX_VLAN, NULL }, { PKT_TX_IP_CKSUM, PKT_TX_IP_CKSUM, NULL }, { PKT_TX_TCP_CKSUM, PKT_TX_L4_MASK, NULL }, { PKT_TX_SCTP_CKSUM, PKT_TX_L4_MASK, NULL }, @@ -417,24 +438,20 @@ rte_get_tx_ol_flag_list(uint64_t mask, char *buf, size_t buflen) { PKT_TX_OUTER_IP_CKSUM, PKT_TX_OUTER_IP_CKSUM, NULL }, { PKT_TX_OUTER_IPV4, PKT_TX_OUTER_IPV4, NULL }, { PKT_TX_OUTER_IPV6, PKT_TX_OUTER_IPV6, NULL }, - { PKT_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_MASK, - "PKT_TX_TUNNEL_NONE" }, - { PKT_TX_TUNNEL_GRE, PKT_TX_TUNNEL_MASK, - "PKT_TX_TUNNEL_NONE" }, - { PKT_TX_TUNNEL_IPIP, PKT_TX_TUNNEL_MASK, - "PKT_TX_TUNNEL_NONE" }, - { PKT_TX_TUNNEL_GENEVE, PKT_TX_TUNNEL_MASK, - "PKT_TX_TUNNEL_NONE" }, - { PKT_TX_TUNNEL_MPLSINUDP, PKT_TX_TUNNEL_MASK, - "PKT_TX_TUNNEL_NONE" }, - { PKT_TX_TUNNEL_VXLAN_GPE, PKT_TX_TUNNEL_MASK, - "PKT_TX_TUNNEL_NONE" }, - { PKT_TX_TUNNEL_IP, PKT_TX_TUNNEL_MASK, - "PKT_TX_TUNNEL_NONE" }, - { PKT_TX_TUNNEL_UDP, PKT_TX_TUNNEL_MASK, - "PKT_TX_TUNNEL_NONE" }, + { PKT_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_MASK, NULL }, + { PKT_TX_TUNNEL_GRE, PKT_TX_TUNNEL_MASK, NULL }, + { PKT_TX_TUNNEL_IPIP, PKT_TX_TUNNEL_MASK, NULL }, + { PKT_TX_TUNNEL_GENEVE, PKT_TX_TUNNEL_MASK, NULL }, + { PKT_TX_TUNNEL_MPLSINUDP, PKT_TX_TUNNEL_MASK, NULL }, + { PKT_TX_TUNNEL_VXLAN_GPE, PKT_TX_TUNNEL_MASK, NULL }, + { PKT_TX_TUNNEL_IP, PKT_TX_TUNNEL_MASK, NULL }, + { PKT_TX_TUNNEL_UDP, PKT_TX_TUNNEL_MASK, NULL }, + { PKT_TX_QINQ, PKT_TX_QINQ, NULL }, { PKT_TX_MACSEC, PKT_TX_MACSEC, NULL }, { PKT_TX_SEC_OFFLOAD, PKT_TX_SEC_OFFLOAD, NULL }, + { PKT_TX_UDP_SEG, PKT_TX_UDP_SEG, NULL }, + { PKT_TX_OUTER_UDP_CKSUM, PKT_TX_OUTER_UDP_CKSUM, NULL }, + { PKT_TX_METADATA, PKT_TX_METADATA, NULL }, }; const char *name; unsigned int i; diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h index 9ce5d76d..3dbc6695 100644 --- a/lib/librte_mbuf/rte_mbuf.h +++ b/lib/librte_mbuf/rte_mbuf.h @@ -140,7 +140,7 @@ extern "C" { * The 2 vlans have been stripped by the hardware and their tci are * saved in mbuf->vlan_tci (inner) and mbuf->vlan_tci_outer (outer). * This can only happen if vlan stripping is enabled in the RX - * configuration of the PMD. If this flag is set, + * configuration of the PMD. * When PKT_RX_QINQ_STRIPPED is set, the flags (PKT_RX_VLAN | * PKT_RX_VLAN_STRIPPED | PKT_RX_QINQ) must also be set. 
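/*
 * Illustrative sketch, not part of the patch: printing a TX offload mask
 * with the helper updated above. With this change PKT_TX_QINQ,
 * PKT_TX_UDP_SEG, PKT_TX_OUTER_UDP_CKSUM and PKT_TX_METADATA are reported
 * by name. The buffer size is an arbitrary example.
 */
#include <stdio.h>
#include <rte_mbuf.h>

static void
dump_tx_flags(const struct rte_mbuf *m)
{
	char buf[256];

	if (rte_get_tx_ol_flag_list(m->ol_flags, buf, sizeof(buf)) == 0)
		printf("tx offload flags: %s\n", buf);
}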
*/ @@ -170,17 +170,53 @@ extern "C" { /** * The RX packet is a double VLAN, and the outer tci has been - * saved in in mbuf->vlan_tci_outer. + * saved in in mbuf->vlan_tci_outer. If PKT_RX_QINQ set, PKT_RX_VLAN + * also should be set and inner tci should be saved to mbuf->vlan_tci. * If the flag PKT_RX_QINQ_STRIPPED is also present, both VLANs * headers have been stripped from mbuf data, else they are still * present. */ #define PKT_RX_QINQ (1ULL << 20) +/** + * Mask of bits used to determine the status of outer RX L4 checksum. + * - PKT_RX_OUTER_L4_CKSUM_UNKNOWN: no info about the outer RX L4 checksum + * - PKT_RX_OUTER_L4_CKSUM_BAD: the outer L4 checksum in the packet is wrong + * - PKT_RX_OUTER_L4_CKSUM_GOOD: the outer L4 checksum in the packet is valid + * - PKT_RX_OUTER_L4_CKSUM_INVALID: invalid outer L4 checksum state. + * + * The detection of PKT_RX_OUTER_L4_CKSUM_GOOD shall be based on the given + * HW capability, At minimum, the PMD should support + * PKT_RX_OUTER_L4_CKSUM_UNKNOWN and PKT_RX_OUTER_L4_CKSUM_BAD states + * if the DEV_RX_OFFLOAD_OUTER_UDP_CKSUM offload is available. + */ +#define PKT_RX_OUTER_L4_CKSUM_MASK ((1ULL << 21) | (1ULL << 22)) + +#define PKT_RX_OUTER_L4_CKSUM_UNKNOWN 0 +#define PKT_RX_OUTER_L4_CKSUM_BAD (1ULL << 21) +#define PKT_RX_OUTER_L4_CKSUM_GOOD (1ULL << 22) +#define PKT_RX_OUTER_L4_CKSUM_INVALID ((1ULL << 21) | (1ULL << 22)) + /* add new RX flags here */ /* add new TX flags here */ +/** + * Indicate that the metadata field in the mbuf is in use. + */ +#define PKT_TX_METADATA (1ULL << 40) + +/** + * Outer UDP checksum offload flag. This flag is used for enabling + * outer UDP checksum in PMD. To use outer UDP checksum, the user needs to + * 1) Enable the following in mbuff, + * a) Fill outer_l2_len and outer_l3_len in mbuf. + * b) Set the PKT_TX_OUTER_UDP_CKSUM flag. + * c) Set the PKT_TX_OUTER_IPV4 or PKT_TX_OUTER_IPV6 flag. + * 2) Configure DEV_TX_OFFLOAD_OUTER_UDP_CKSUM offload flag. + */ +#define PKT_TX_OUTER_UDP_CKSUM (1ULL << 41) + /** * UDP Fragmentation Offload flag. This flag is used for enabling UDP * fragmentation in SW or in HW. When use UFO, mbuf->tso_segsz is used @@ -334,16 +370,23 @@ extern "C" { * which can be set for packet. */ #define PKT_TX_OFFLOAD_MASK ( \ + PKT_TX_OUTER_IPV6 | \ + PKT_TX_OUTER_IPV4 | \ + PKT_TX_OUTER_IP_CKSUM | \ + PKT_TX_VLAN_PKT | \ + PKT_TX_IPV6 | \ + PKT_TX_IPV4 | \ PKT_TX_IP_CKSUM | \ PKT_TX_L4_MASK | \ - PKT_TX_OUTER_IP_CKSUM | \ - PKT_TX_TCP_SEG | \ PKT_TX_IEEE1588_TMST | \ + PKT_TX_TCP_SEG | \ PKT_TX_QINQ_PKT | \ - PKT_TX_VLAN_PKT | \ PKT_TX_TUNNEL_MASK | \ PKT_TX_MACSEC | \ - PKT_TX_SEC_OFFLOAD) + PKT_TX_SEC_OFFLOAD | \ + PKT_TX_UDP_SEG | \ + PKT_TX_OUTER_UDP_CKSUM | \ + PKT_TX_METADATA) /** * Mbuf having an external buffer attached. shinfo in mbuf must be filled. @@ -464,7 +507,9 @@ struct rte_mbuf { }; uint16_t nb_segs; /**< Number of segments. */ - /** Input port (16 bits to support more than 256 virtual ports). */ + /** Input port (16 bits to support more than 256 virtual ports). + * The event eth Tx adapter uses this field to specify the output port. + */ uint16_t port; uint64_t ol_flags; /**< Offload features. */ @@ -511,28 +556,47 @@ struct rte_mbuf { /** VLAN TCI (CPU order), valid if PKT_RX_VLAN is set. 
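/*
 * Illustrative sketch, not part of the patch: consuming the new outer L4
 * checksum status on RX and requesting outer UDP checksum offload on TX,
 * following the steps listed in the PKT_TX_OUTER_UDP_CKSUM documentation
 * above. It assumes the port was configured with the corresponding
 * DEV_RX_OFFLOAD_OUTER_UDP_CKSUM / DEV_TX_OFFLOAD_OUTER_UDP_CKSUM
 * offloads; the header lengths are examples for plain Ethernet over IPv4.
 */
#include <rte_mbuf.h>
#include <rte_ether.h>
#include <rte_ip.h>

static int
outer_udp_cksum_example(struct rte_mbuf *m)
{
	/* RX: classify the outer L4 checksum result */
	if ((m->ol_flags & PKT_RX_OUTER_L4_CKSUM_MASK) ==
			PKT_RX_OUTER_L4_CKSUM_BAD)
		return -1;			/* e.g. drop the packet */

	/* TX: ask the PMD to compute the outer UDP checksum */
	m->outer_l2_len = sizeof(struct ether_hdr);
	m->outer_l3_len = sizeof(struct ipv4_hdr);
	m->ol_flags |= PKT_TX_OUTER_UDP_CKSUM | PKT_TX_OUTER_IPV4;
	return 0;
}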
*/ uint16_t vlan_tci; + RTE_STD_C11 union { - uint32_t rss; /**< RSS hash result if RSS enabled */ - struct { - RTE_STD_C11 - union { - struct { - uint16_t hash; - uint16_t id; + union { + uint32_t rss; /**< RSS hash result if RSS enabled */ + struct { + union { + struct { + uint16_t hash; + uint16_t id; + }; + uint32_t lo; + /**< Second 4 flexible bytes */ }; + uint32_t hi; + /**< First 4 flexible bytes or FD ID, dependent + * on PKT_RX_FDIR_* flag in ol_flags. + */ + } fdir; /**< Filter identifier if FDIR enabled */ + struct { uint32_t lo; - /**< Second 4 flexible bytes */ - }; - uint32_t hi; - /**< First 4 flexible bytes or FD ID, dependent on - PKT_RX_FDIR_* flag in ol_flags. */ - } fdir; /**< Filter identifier if FDIR enabled */ + uint32_t hi; + /**< The event eth Tx adapter uses this field + * to store Tx queue id. + * @see rte_event_eth_tx_adapter_txq_set() + */ + } sched; /**< Hierarchical scheduler */ + /**< User defined tags. See rte_distributor_process() */ + uint32_t usr; + } hash; /**< hash information */ struct { - uint32_t lo; - uint32_t hi; - } sched; /**< Hierarchical scheduler */ - uint32_t usr; /**< User defined tags. See rte_distributor_process() */ - } hash; /**< hash information */ + /** + * Application specific metadata value + * for egress flow rule match. + * Valid if PKT_TX_METADATA is set. + * Located here to allow conjunct use + * with hash.sched.hi. + */ + uint32_t tx_metadata; + uint32_t reserved; + }; + }; /** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ is set. */ uint16_t vlan_tci_outer; @@ -1038,14 +1102,6 @@ rte_mbuf_raw_free(struct rte_mbuf *m) rte_mempool_put(m->pool, m); } -/* compat with older versions */ -__rte_deprecated -static inline void -__rte_mbuf_raw_free(struct rte_mbuf *m) -{ - rte_mbuf_raw_free(m); -} - /** * The packet mbuf constructor. * @@ -1658,14 +1714,6 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m) return NULL; } -/* deprecated, replaced by rte_pktmbuf_prefree_seg() */ -__rte_deprecated -static inline struct rte_mbuf * -__rte_pktmbuf_prefree_seg(struct rte_mbuf *m) -{ - return rte_pktmbuf_prefree_seg(m); -} - /** * Free a segment of a packet mbuf into its original mempool. * diff --git a/lib/librte_mbuf/rte_mbuf_ptype.c b/lib/librte_mbuf/rte_mbuf_ptype.c index d7835e28..d6f906b0 100644 --- a/lib/librte_mbuf/rte_mbuf_ptype.c +++ b/lib/librte_mbuf/rte_mbuf_ptype.c @@ -19,6 +19,8 @@ const char *rte_get_ptype_l2_name(uint32_t ptype) case RTE_PTYPE_L2_ETHER_VLAN: return "L2_ETHER_VLAN"; case RTE_PTYPE_L2_ETHER_QINQ: return "L2_ETHER_QINQ"; case RTE_PTYPE_L2_ETHER_PPPOE: return "L2_ETHER_PPPOE"; + case RTE_PTYPE_L2_ETHER_FCOE: return "L2_ETHER_FCOE"; + case RTE_PTYPE_L2_ETHER_MPLS: return "L2_ETHER_MPLS"; default: return "L2_UNKNOWN"; } } @@ -47,6 +49,7 @@ const char *rte_get_ptype_l4_name(uint32_t ptype) case RTE_PTYPE_L4_SCTP: return "L4_SCTP"; case RTE_PTYPE_L4_ICMP: return "L4_ICMP"; case RTE_PTYPE_L4_NONFRAG: return "L4_NONFRAG"; + case RTE_PTYPE_L4_IGMP: return "L4_IGMP"; default: return "L4_UNKNOWN"; } } diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h b/lib/librte_mbuf/rte_mbuf_ptype.h index 01acc66e..23bc635f 100644 --- a/lib/librte_mbuf/rte_mbuf_ptype.h +++ b/lib/librte_mbuf/rte_mbuf_ptype.h @@ -130,6 +130,20 @@ extern "C" { * <'ether type'=[0x8863|0x8864]> */ #define RTE_PTYPE_L2_ETHER_PPPOE 0x00000008 +/** + * FCoE packet type. + * + * Packet format: + * <'ether type'=[0x8906]> + */ +#define RTE_PTYPE_L2_ETHER_FCOE 0x00000009 +/** + * MPLS packet type. 
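/*
 * Illustrative sketch, not part of the patch: tagging an egress packet
 * with application metadata via the anonymous-union field added above.
 * The field shares storage with the first word of the hash union and is
 * placed so it can still be used together with hash.sched.hi (the event
 * eth Tx adapter queue id); the value is arbitrary and only meaningful to
 * a matching egress rte_flow rule.
 */
#include <rte_mbuf.h>

static void
set_tx_metadata(struct rte_mbuf *m, uint32_t md)
{
	m->tx_metadata = md;
	m->ol_flags |= PKT_TX_METADATA;	/* marks the field as valid */
}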
+ * + * Packet format: + * <'ether type'=[0x8847|0x8848]> + */ +#define RTE_PTYPE_L2_ETHER_MPLS 0x0000000a /** * Mask of layer 2 packet types. * It is used for outer packet for tunneling cases. @@ -286,6 +300,14 @@ extern "C" { * | 'version'=6, 'next header'!=[6|17|44|132|1]> */ #define RTE_PTYPE_L4_NONFRAG 0x00000600 +/** + * IGMP (Internet Group Management Protocol) packet type. + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=2, 'MF'=0, 'frag_offset'=0> + */ +#define RTE_PTYPE_L4_IGMP 0x00000700 /** * Mask of layer 4 packet types. * It is used for outer packet for tunneling cases. diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c index 03e6b5f7..683b216f 100644 --- a/lib/librte_mempool/rte_mempool.c +++ b/lib/librte_mempool/rte_mempool.c @@ -99,25 +99,44 @@ static unsigned optimize_object_size(unsigned obj_size) return new_obj_size * RTE_MEMPOOL_ALIGN; } +struct pagesz_walk_arg { + int socket_id; + size_t min; +}; + static int find_min_pagesz(const struct rte_memseg_list *msl, void *arg) { - size_t *min = arg; + struct pagesz_walk_arg *wa = arg; + bool valid; + + /* + * we need to only look at page sizes available for a particular socket + * ID. so, we either need an exact match on socket ID (can match both + * native and external memory), or, if SOCKET_ID_ANY was specified as a + * socket ID argument, we must only look at native memory and ignore any + * page sizes associated with external memory. + */ + valid = msl->socket_id == wa->socket_id; + valid |= wa->socket_id == SOCKET_ID_ANY && msl->external == 0; - if (msl->page_sz < *min) - *min = msl->page_sz; + if (valid && msl->page_sz < wa->min) + wa->min = msl->page_sz; return 0; } static size_t -get_min_page_size(void) +get_min_page_size(int socket_id) { - size_t min_pagesz = SIZE_MAX; + struct pagesz_walk_arg wa; - rte_memseg_list_walk(find_min_pagesz, &min_pagesz); + wa.min = SIZE_MAX; + wa.socket_id = socket_id; - return min_pagesz == SIZE_MAX ? (size_t) getpagesize() : min_pagesz; + rte_memseg_list_walk(find_min_pagesz, &wa); + + return wa.min == SIZE_MAX ? (size_t) getpagesize() : wa.min; } @@ -409,12 +428,18 @@ rte_mempool_populate_default(struct rte_mempool *mp) rte_iova_t iova; unsigned mz_id, n; int ret; - bool no_contig, try_contig, no_pageshift; + bool no_contig, try_contig, no_pageshift, external; ret = mempool_ops_alloc_once(mp); if (ret != 0) return ret; + /* check if we can retrieve a valid socket ID */ + ret = rte_malloc_heap_socket_is_external(mp->socket_id); + if (ret < 0) + return -EINVAL; + external = ret; + /* mempool must not be populated */ if (mp->nb_mem_chunks != 0) return -EEXIST; @@ -462,15 +487,25 @@ rte_mempool_populate_default(struct rte_mempool *mp) * in one contiguous chunk as well (otherwise we might end up wasting a * 1G page on a 10MB memzone). If we fail to get enough contiguous * memory, then we'll go and reserve space page-by-page. + * + * We also have to take into account the fact that memory that we're + * going to allocate from can belong to an externally allocated memory + * area, in which case the assumption of IOVA as VA mode being + * synonymous with IOVA contiguousness will not hold. We should also try + * to go for contiguous memory even if we're in no-huge mode, because + * external memory may in fact be IOVA-contiguous. 
*/ - no_pageshift = no_contig || rte_eal_iova_mode() == RTE_IOVA_VA; - try_contig = !no_contig && !no_pageshift && rte_eal_has_hugepages(); + external = rte_malloc_heap_socket_is_external(mp->socket_id) == 1; + no_pageshift = no_contig || + (!external && rte_eal_iova_mode() == RTE_IOVA_VA); + try_contig = !no_contig && !no_pageshift && + (rte_eal_has_hugepages() || external); if (no_pageshift) { pg_sz = 0; pg_shift = 0; } else if (try_contig) { - pg_sz = get_min_page_size(); + pg_sz = get_min_page_size(mp->socket_id); pg_shift = rte_bsf32(pg_sz); } else { pg_sz = getpagesize(); diff --git a/lib/librte_net/Makefile b/lib/librte_net/Makefile index 85e403f4..c3082069 100644 --- a/lib/librte_net/Makefile +++ b/lib/librte_net/Makefile @@ -20,6 +20,6 @@ SRCS-$(CONFIG_RTE_LIBRTE_NET) += rte_arp.c SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include := rte_ip.h rte_tcp.h rte_udp.h rte_esp.h SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include += rte_sctp.h rte_icmp.h rte_arp.h SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include += rte_ether.h rte_gre.h rte_net.h -SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include += rte_net_crc.h +SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include += rte_net_crc.h rte_mpls.h include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_net/meson.build b/lib/librte_net/meson.build index d3ea1feb..7d66f693 100644 --- a/lib/librte_net/meson.build +++ b/lib/librte_net/meson.build @@ -13,7 +13,8 @@ headers = files('rte_ip.h', 'rte_ether.h', 'rte_gre.h', 'rte_net.h', - 'rte_net_crc.h') + 'rte_net_crc.h', + 'rte_mpls.h') sources = files('rte_arp.c', 'rte_net.c', 'rte_net_crc.c') deps += ['mbuf'] diff --git a/lib/librte_net/net_crc_sse.h b/lib/librte_net/net_crc_sse.h index da815243..1c7b7a54 100644 --- a/lib/librte_net/net_crc_sse.h +++ b/lib/librte_net/net_crc_sse.h @@ -21,8 +21,8 @@ struct crc_pclmulqdq_ctx { __m128i rk7_rk8; }; -struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16); -struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16); +static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16); +static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16); /** * @brief Performs one folding round * diff --git a/lib/librte_net/rte_ether.h b/lib/librte_net/rte_ether.h index bee2b34f..c2c5e249 100644 --- a/lib/librte_net/rte_ether.h +++ b/lib/librte_net/rte_ether.h @@ -306,6 +306,8 @@ struct vxlan_hdr { #define ETHER_TYPE_SLOW 0x8809 /**< Slow protocols (LACP and Marker). */ #define ETHER_TYPE_TEB 0x6558 /**< Transparent Ethernet Bridging. */ #define ETHER_TYPE_LLDP 0x88CC /**< LLDP Protocol. */ +#define ETHER_TYPE_MPLS 0x8847 /**< MPLS ethertype. */ +#define ETHER_TYPE_MPLSM 0x8848 /**< MPLS multicast ethertype. */ #define ETHER_VXLAN_HLEN (sizeof(struct udp_hdr) + sizeof(struct vxlan_hdr)) /**< VXLAN tunnel header length. */ diff --git a/lib/librte_net/rte_mpls.h b/lib/librte_net/rte_mpls.h new file mode 100644 index 00000000..11d26ba3 --- /dev/null +++ b/lib/librte_net/rte_mpls.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 6WIND S.A. + */ + +#ifndef _RTE_MPLS_H_ +#define _RTE_MPLS_H_ + +/** + * @file + * + * MPLS-related defines + */ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * MPLS header. + */ +struct mpls_hdr { + uint16_t tag_msb; /**< Label(msb). */ +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN + uint8_t tag_lsb:4; /**< Label(lsb). */ + uint8_t tc:3; /**< Traffic class. */ + uint8_t bs:1; /**< Bottom of stack. */ +#else + uint8_t bs:1; /**< Bottom of stack. */ + uint8_t tc:3; /**< Traffic class. 
*/ + uint8_t tag_lsb:4; /**< label(lsb) */ +#endif + uint8_t ttl; /**< Time to live. */ +} __attribute__((__packed__)); + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_MPLS_H_ */ diff --git a/lib/librte_net/rte_net.c b/lib/librte_net/rte_net.c index 9eb7c743..378a4126 100644 --- a/lib/librte_net/rte_net.c +++ b/lib/librte_net/rte_net.c @@ -13,6 +13,7 @@ #include #include #include +#include #include /* get l3 packet type from ip6 next protocol */ @@ -274,9 +275,27 @@ uint32_t rte_net_get_ptype(const struct rte_mbuf *m, off += 2 * sizeof(*vh); hdr_lens->l2_len += 2 * sizeof(*vh); proto = vh->eth_proto; + } else if ((proto == rte_cpu_to_be_16(ETHER_TYPE_MPLS)) || + (proto == rte_cpu_to_be_16(ETHER_TYPE_MPLSM))) { + unsigned int i; + const struct mpls_hdr *mh; + struct mpls_hdr mh_copy; + +#define MAX_MPLS_HDR 5 + for (i = 0; i < MAX_MPLS_HDR; i++) { + mh = rte_pktmbuf_read(m, off + (i * sizeof(*mh)), + sizeof(*mh), &mh_copy); + if (unlikely(mh == NULL)) + return pkt_type; + } + if (i == MAX_MPLS_HDR) + return pkt_type; + pkt_type = RTE_PTYPE_L2_ETHER_MPLS; + hdr_lens->l2_len += (sizeof(*mh) * i); + return pkt_type; } - l3: +l3: if ((layers & RTE_PTYPE_L3_MASK) == 0) return pkt_type; diff --git a/lib/librte_net/rte_net.h b/lib/librte_net/rte_net.h index b6ab6e1d..e59760a0 100644 --- a/lib/librte_net/rte_net.h +++ b/lib/librte_net/rte_net.h @@ -122,14 +122,16 @@ rte_net_intel_cksum_flags_prepare(struct rte_mbuf *m, uint64_t ol_flags) (ol_flags & PKT_TX_OUTER_IPV6)) inner_l3_offset += m->outer_l2_len + m->outer_l3_len; - if ((ol_flags & PKT_TX_UDP_CKSUM) == PKT_TX_UDP_CKSUM) { - if (ol_flags & PKT_TX_IPV4) { - ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, - inner_l3_offset); + if (ol_flags & PKT_TX_IPV4) { + ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, + inner_l3_offset); - if (ol_flags & PKT_TX_IP_CKSUM) - ipv4_hdr->hdr_checksum = 0; + if (ol_flags & PKT_TX_IP_CKSUM) + ipv4_hdr->hdr_checksum = 0; + } + if ((ol_flags & PKT_TX_UDP_CKSUM) == PKT_TX_UDP_CKSUM) { + if (ol_flags & PKT_TX_IPV4) { udp_hdr = (struct udp_hdr *)((char *)ipv4_hdr + m->l3_len); udp_hdr->dgram_cksum = rte_ipv4_phdr_cksum(ipv4_hdr, @@ -146,12 +148,6 @@ rte_net_intel_cksum_flags_prepare(struct rte_mbuf *m, uint64_t ol_flags) } else if ((ol_flags & PKT_TX_TCP_CKSUM) || (ol_flags & PKT_TX_TCP_SEG)) { if (ol_flags & PKT_TX_IPV4) { - ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, - inner_l3_offset); - - if (ol_flags & PKT_TX_IP_CKSUM) - ipv4_hdr->hdr_checksum = 0; - /* non-TSO tcp or TSO */ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + m->l3_len); diff --git a/lib/librte_pdump/Makefile b/lib/librte_pdump/Makefile index 0ee0fa1a..b241151d 100644 --- a/lib/librte_pdump/Makefile +++ b/lib/librte_pdump/Makefile @@ -8,8 +8,6 @@ LIB = librte_pdump.a CFLAGS += -DALLOW_EXPERIMENTAL_API CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -CFLAGS += -D_GNU_SOURCE -LDLIBS += -lpthread LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev EXPORT_MAP := rte_pdump_version.map diff --git a/lib/librte_pipeline/Makefile b/lib/librte_pipeline/Makefile index 84afe98c..cf265503 100644 --- a/lib/librte_pipeline/Makefile +++ b/lib/librte_pipeline/Makefile @@ -12,7 +12,7 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API CFLAGS += -O3 CFLAGS += $(WERROR_FLAGS) LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_table -LDLIBS += -lrte_port -lrte_meter -lrte_sched +LDLIBS += -lrte_port -lrte_meter -lrte_sched -lrte_cryptodev EXPORT_MAP := rte_pipeline_version.map diff --git a/lib/librte_pipeline/meson.build 
b/lib/librte_pipeline/meson.build index dc16ab42..04e5f517 100644 --- a/lib/librte_pipeline/meson.build +++ b/lib/librte_pipeline/meson.build @@ -5,4 +5,4 @@ version = 3 allow_experimental_apis = true sources = files('rte_pipeline.c', 'rte_port_in_action.c', 'rte_table_action.c') headers = files('rte_pipeline.h', 'rte_port_in_action.h', 'rte_table_action.h') -deps += ['port', 'table', 'meter', 'sched'] +deps += ['port', 'table', 'meter', 'sched', 'cryptodev'] diff --git a/lib/librte_pipeline/rte_pipeline.c b/lib/librte_pipeline/rte_pipeline.c index 0cb8b804..2c047a8a 100644 --- a/lib/librte_pipeline/rte_pipeline.c +++ b/lib/librte_pipeline/rte_pipeline.c @@ -178,8 +178,7 @@ rte_pipeline_check_params(struct rte_pipeline_params *params) } /* socket */ - if ((params->socket_id < 0) || - (params->socket_id >= RTE_MAX_NUMA_NODES)) { + if (params->socket_id < 0) { RTE_LOG(ERR, PIPELINE, "%s: Incorrect value for parameter socket_id\n", __func__); diff --git a/lib/librte_pipeline/rte_pipeline_version.map b/lib/librte_pipeline/rte_pipeline_version.map index d820b22f..420f065d 100644 --- a/lib/librte_pipeline/rte_pipeline_version.map +++ b/lib/librte_pipeline/rte_pipeline_version.map @@ -72,4 +72,5 @@ EXPERIMENTAL { rte_table_action_stats_read; rte_table_action_time_read; rte_table_action_ttl_read; + rte_table_action_crypto_sym_session_get; }; diff --git a/lib/librte_pipeline/rte_table_action.c b/lib/librte_pipeline/rte_table_action.c index 83ffa5de..537e6593 100644 --- a/lib/librte_pipeline/rte_table_action.c +++ b/lib/librte_pipeline/rte_table_action.c @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright(c) 2010-2018 Intel Corporation */ - #include #include @@ -15,6 +14,8 @@ #include #include #include +#include +#include #include "rte_table_action.h" @@ -430,6 +431,7 @@ encap_valid(enum rte_table_action_encap_type encap) case RTE_TABLE_ACTION_ENCAP_QINQ: case RTE_TABLE_ACTION_ENCAP_MPLS: case RTE_TABLE_ACTION_ENCAP_PPPOE: + case RTE_TABLE_ACTION_ENCAP_VXLAN: return 1; default: return 0; @@ -498,6 +500,38 @@ struct encap_pppoe_data { struct pppoe_ppp_hdr pppoe_ppp; } __attribute__((__packed__)); +#define IP_PROTO_UDP 17 + +struct encap_vxlan_ipv4_data { + struct ether_hdr ether; + struct ipv4_hdr ipv4; + struct udp_hdr udp; + struct vxlan_hdr vxlan; +} __attribute__((__packed__)); + +struct encap_vxlan_ipv4_vlan_data { + struct ether_hdr ether; + struct vlan_hdr vlan; + struct ipv4_hdr ipv4; + struct udp_hdr udp; + struct vxlan_hdr vxlan; +} __attribute__((__packed__)); + +struct encap_vxlan_ipv6_data { + struct ether_hdr ether; + struct ipv6_hdr ipv6; + struct udp_hdr udp; + struct vxlan_hdr vxlan; +} __attribute__((__packed__)); + +struct encap_vxlan_ipv6_vlan_data { + struct ether_hdr ether; + struct vlan_hdr vlan; + struct ipv6_hdr ipv6; + struct udp_hdr udp; + struct vxlan_hdr vxlan; +} __attribute__((__packed__)); + static size_t encap_data_size(struct rte_table_action_encap_config *encap) { @@ -517,6 +551,18 @@ encap_data_size(struct rte_table_action_encap_config *encap) case 1LLU << RTE_TABLE_ACTION_ENCAP_PPPOE: return sizeof(struct encap_pppoe_data); + case 1LLU << RTE_TABLE_ACTION_ENCAP_VXLAN: + if (encap->vxlan.ip_version) + if (encap->vxlan.vlan) + return sizeof(struct encap_vxlan_ipv4_vlan_data); + else + return sizeof(struct encap_vxlan_ipv4_data); + else + if (encap->vxlan.vlan) + return sizeof(struct encap_vxlan_ipv6_vlan_data); + else + return sizeof(struct encap_vxlan_ipv6_data); + default: return 0; } @@ -550,6 +596,9 @@ encap_apply_check(struct 
rte_table_action_encap_params *p, case RTE_TABLE_ACTION_ENCAP_PPPOE: return 0; + case RTE_TABLE_ACTION_ENCAP_VXLAN: + return 0; + default: return -EINVAL; } @@ -678,6 +727,168 @@ encap_pppoe_apply(void *data, return 0; } +static int +encap_vxlan_apply(void *data, + struct rte_table_action_encap_params *p, + struct rte_table_action_encap_config *cfg) +{ + if ((p->vxlan.vxlan.vni > 0xFFFFFF) || + (cfg->vxlan.ip_version && (p->vxlan.ipv4.dscp > 0x3F)) || + (!cfg->vxlan.ip_version && (p->vxlan.ipv6.flow_label > 0xFFFFF)) || + (!cfg->vxlan.ip_version && (p->vxlan.ipv6.dscp > 0x3F)) || + (cfg->vxlan.vlan && (p->vxlan.vlan.vid > 0xFFF))) + return -1; + + if (cfg->vxlan.ip_version) + if (cfg->vxlan.vlan) { + struct encap_vxlan_ipv4_vlan_data *d = data; + + /* Ethernet */ + ether_addr_copy(&p->vxlan.ether.da, &d->ether.d_addr); + ether_addr_copy(&p->vxlan.ether.sa, &d->ether.s_addr); + d->ether.ether_type = rte_htons(ETHER_TYPE_VLAN); + + /* VLAN */ + d->vlan.vlan_tci = rte_htons(VLAN(p->vxlan.vlan.pcp, + p->vxlan.vlan.dei, + p->vxlan.vlan.vid)); + d->vlan.eth_proto = rte_htons(ETHER_TYPE_IPv4); + + /* IPv4*/ + d->ipv4.version_ihl = 0x45; + d->ipv4.type_of_service = p->vxlan.ipv4.dscp << 2; + d->ipv4.total_length = 0; /* not pre-computed */ + d->ipv4.packet_id = 0; + d->ipv4.fragment_offset = 0; + d->ipv4.time_to_live = p->vxlan.ipv4.ttl; + d->ipv4.next_proto_id = IP_PROTO_UDP; + d->ipv4.hdr_checksum = 0; + d->ipv4.src_addr = rte_htonl(p->vxlan.ipv4.sa); + d->ipv4.dst_addr = rte_htonl(p->vxlan.ipv4.da); + + d->ipv4.hdr_checksum = rte_ipv4_cksum(&d->ipv4); + + /* UDP */ + d->udp.src_port = rte_htons(p->vxlan.udp.sp); + d->udp.dst_port = rte_htons(p->vxlan.udp.dp); + d->udp.dgram_len = 0; /* not pre-computed */ + d->udp.dgram_cksum = 0; + + /* VXLAN */ + d->vxlan.vx_flags = rte_htonl(0x08000000); + d->vxlan.vx_vni = rte_htonl(p->vxlan.vxlan.vni << 8); + + return 0; + } else { + struct encap_vxlan_ipv4_data *d = data; + + /* Ethernet */ + ether_addr_copy(&p->vxlan.ether.da, &d->ether.d_addr); + ether_addr_copy(&p->vxlan.ether.sa, &d->ether.s_addr); + d->ether.ether_type = rte_htons(ETHER_TYPE_IPv4); + + /* IPv4*/ + d->ipv4.version_ihl = 0x45; + d->ipv4.type_of_service = p->vxlan.ipv4.dscp << 2; + d->ipv4.total_length = 0; /* not pre-computed */ + d->ipv4.packet_id = 0; + d->ipv4.fragment_offset = 0; + d->ipv4.time_to_live = p->vxlan.ipv4.ttl; + d->ipv4.next_proto_id = IP_PROTO_UDP; + d->ipv4.hdr_checksum = 0; + d->ipv4.src_addr = rte_htonl(p->vxlan.ipv4.sa); + d->ipv4.dst_addr = rte_htonl(p->vxlan.ipv4.da); + + d->ipv4.hdr_checksum = rte_ipv4_cksum(&d->ipv4); + + /* UDP */ + d->udp.src_port = rte_htons(p->vxlan.udp.sp); + d->udp.dst_port = rte_htons(p->vxlan.udp.dp); + d->udp.dgram_len = 0; /* not pre-computed */ + d->udp.dgram_cksum = 0; + + /* VXLAN */ + d->vxlan.vx_flags = rte_htonl(0x08000000); + d->vxlan.vx_vni = rte_htonl(p->vxlan.vxlan.vni << 8); + + return 0; + } + else + if (cfg->vxlan.vlan) { + struct encap_vxlan_ipv6_vlan_data *d = data; + + /* Ethernet */ + ether_addr_copy(&p->vxlan.ether.da, &d->ether.d_addr); + ether_addr_copy(&p->vxlan.ether.sa, &d->ether.s_addr); + d->ether.ether_type = rte_htons(ETHER_TYPE_VLAN); + + /* VLAN */ + d->vlan.vlan_tci = rte_htons(VLAN(p->vxlan.vlan.pcp, + p->vxlan.vlan.dei, + p->vxlan.vlan.vid)); + d->vlan.eth_proto = rte_htons(ETHER_TYPE_IPv6); + + /* IPv6*/ + d->ipv6.vtc_flow = rte_htonl((6 << 28) | + (p->vxlan.ipv6.dscp << 22) | + p->vxlan.ipv6.flow_label); + d->ipv6.payload_len = 0; /* not pre-computed */ + d->ipv6.proto = IP_PROTO_UDP; + 
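The IPv4 variants above (and the IPv6 ones that follow) pre-compute the VXLAN header identically: vx_flags carries the I bit (0x08000000 before byte-order conversion) and the 24-bit VNI is shifted into the upper three bytes of vx_vni. A small sketch of the reverse operation (hypothetical helper, not part of the patch) that a consumer of such headers might use:

#include <stdint.h>
#include <rte_byteorder.h>
#include <rte_ether.h>  /* struct vxlan_hdr */

/* Hypothetical: recover the VNI from a VXLAN header built as above;
 * returns a negative value when the I flag is not set. */
static inline int32_t
example_vxlan_get_vni(const struct vxlan_hdr *vxh)
{
        if ((rte_be_to_cpu_32(vxh->vx_flags) & 0x08000000) == 0)
                return -1;      /* VNI field not valid */
        return (int32_t)(rte_be_to_cpu_32(vxh->vx_vni) >> 8);
}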
d->ipv6.hop_limits = p->vxlan.ipv6.hop_limit; + memcpy(d->ipv6.src_addr, + p->vxlan.ipv6.sa, + sizeof(p->vxlan.ipv6.sa)); + memcpy(d->ipv6.dst_addr, + p->vxlan.ipv6.da, + sizeof(p->vxlan.ipv6.da)); + + /* UDP */ + d->udp.src_port = rte_htons(p->vxlan.udp.sp); + d->udp.dst_port = rte_htons(p->vxlan.udp.dp); + d->udp.dgram_len = 0; /* not pre-computed */ + d->udp.dgram_cksum = 0; + + /* VXLAN */ + d->vxlan.vx_flags = rte_htonl(0x08000000); + d->vxlan.vx_vni = rte_htonl(p->vxlan.vxlan.vni << 8); + + return 0; + } else { + struct encap_vxlan_ipv6_data *d = data; + + /* Ethernet */ + ether_addr_copy(&p->vxlan.ether.da, &d->ether.d_addr); + ether_addr_copy(&p->vxlan.ether.sa, &d->ether.s_addr); + d->ether.ether_type = rte_htons(ETHER_TYPE_IPv6); + + /* IPv6*/ + d->ipv6.vtc_flow = rte_htonl((6 << 28) | + (p->vxlan.ipv6.dscp << 22) | + p->vxlan.ipv6.flow_label); + d->ipv6.payload_len = 0; /* not pre-computed */ + d->ipv6.proto = IP_PROTO_UDP; + d->ipv6.hop_limits = p->vxlan.ipv6.hop_limit; + memcpy(d->ipv6.src_addr, + p->vxlan.ipv6.sa, + sizeof(p->vxlan.ipv6.sa)); + memcpy(d->ipv6.dst_addr, + p->vxlan.ipv6.da, + sizeof(p->vxlan.ipv6.da)); + + /* UDP */ + d->udp.src_port = rte_htons(p->vxlan.udp.sp); + d->udp.dst_port = rte_htons(p->vxlan.udp.dp); + d->udp.dgram_len = 0; /* not pre-computed */ + d->udp.dgram_cksum = 0; + + /* VXLAN */ + d->vxlan.vx_flags = rte_htonl(0x08000000); + d->vxlan.vx_vni = rte_htonl(p->vxlan.vxlan.vni << 8); + + return 0; + } +} + static int encap_apply(void *data, struct rte_table_action_encap_params *p, @@ -707,11 +918,31 @@ encap_apply(void *data, case RTE_TABLE_ACTION_ENCAP_PPPOE: return encap_pppoe_apply(data, p); + case RTE_TABLE_ACTION_ENCAP_VXLAN: + return encap_vxlan_apply(data, p, cfg); + default: return -EINVAL; } } +static __rte_always_inline uint16_t +encap_vxlan_ipv4_checksum_update(uint16_t cksum0, + uint16_t total_length) +{ + int32_t cksum1; + + cksum1 = cksum0; + cksum1 = ~cksum1 & 0xFFFF; + + /* Add total length (one's complement logic) */ + cksum1 += total_length; + cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16); + cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16); + + return (uint16_t)(~cksum1); +} + static __rte_always_inline void * encap(void *dst, const void *src, size_t n) { @@ -719,6 +950,118 @@ encap(void *dst, const void *src, size_t n) return rte_memcpy(dst, src, n); } +static __rte_always_inline void +pkt_work_encap_vxlan_ipv4(struct rte_mbuf *mbuf, + struct encap_vxlan_ipv4_data *vxlan_tbl, + struct rte_table_action_encap_config *cfg) +{ + uint32_t ether_offset = cfg->vxlan.data_offset; + void *ether = RTE_MBUF_METADATA_UINT32_PTR(mbuf, ether_offset); + struct encap_vxlan_ipv4_data *vxlan_pkt; + uint16_t ether_length, ipv4_total_length, ipv4_hdr_cksum, udp_length; + + ether_length = (uint16_t)mbuf->pkt_len; + ipv4_total_length = ether_length + + (sizeof(struct vxlan_hdr) + + sizeof(struct udp_hdr) + + sizeof(struct ipv4_hdr)); + ipv4_hdr_cksum = encap_vxlan_ipv4_checksum_update(vxlan_tbl->ipv4.hdr_checksum, + rte_htons(ipv4_total_length)); + udp_length = ether_length + + (sizeof(struct vxlan_hdr) + + sizeof(struct udp_hdr)); + + vxlan_pkt = encap(ether, vxlan_tbl, sizeof(*vxlan_tbl)); + vxlan_pkt->ipv4.total_length = rte_htons(ipv4_total_length); + vxlan_pkt->ipv4.hdr_checksum = ipv4_hdr_cksum; + vxlan_pkt->udp.dgram_len = rte_htons(udp_length); + + mbuf->data_off = ether_offset - (sizeof(struct rte_mbuf) + sizeof(*vxlan_pkt)); + mbuf->pkt_len = mbuf->data_len = ether_length + sizeof(*vxlan_pkt); +} + +static __rte_always_inline void 
+pkt_work_encap_vxlan_ipv4_vlan(struct rte_mbuf *mbuf, + struct encap_vxlan_ipv4_vlan_data *vxlan_tbl, + struct rte_table_action_encap_config *cfg) +{ + uint32_t ether_offset = cfg->vxlan.data_offset; + void *ether = RTE_MBUF_METADATA_UINT32_PTR(mbuf, ether_offset); + struct encap_vxlan_ipv4_vlan_data *vxlan_pkt; + uint16_t ether_length, ipv4_total_length, ipv4_hdr_cksum, udp_length; + + ether_length = (uint16_t)mbuf->pkt_len; + ipv4_total_length = ether_length + + (sizeof(struct vxlan_hdr) + + sizeof(struct udp_hdr) + + sizeof(struct ipv4_hdr)); + ipv4_hdr_cksum = encap_vxlan_ipv4_checksum_update(vxlan_tbl->ipv4.hdr_checksum, + rte_htons(ipv4_total_length)); + udp_length = ether_length + + (sizeof(struct vxlan_hdr) + + sizeof(struct udp_hdr)); + + vxlan_pkt = encap(ether, vxlan_tbl, sizeof(*vxlan_tbl)); + vxlan_pkt->ipv4.total_length = rte_htons(ipv4_total_length); + vxlan_pkt->ipv4.hdr_checksum = ipv4_hdr_cksum; + vxlan_pkt->udp.dgram_len = rte_htons(udp_length); + + mbuf->data_off = ether_offset - (sizeof(struct rte_mbuf) + sizeof(*vxlan_pkt)); + mbuf->pkt_len = mbuf->data_len = ether_length + sizeof(*vxlan_pkt); +} + +static __rte_always_inline void +pkt_work_encap_vxlan_ipv6(struct rte_mbuf *mbuf, + struct encap_vxlan_ipv6_data *vxlan_tbl, + struct rte_table_action_encap_config *cfg) +{ + uint32_t ether_offset = cfg->vxlan.data_offset; + void *ether = RTE_MBUF_METADATA_UINT32_PTR(mbuf, ether_offset); + struct encap_vxlan_ipv6_data *vxlan_pkt; + uint16_t ether_length, ipv6_payload_length, udp_length; + + ether_length = (uint16_t)mbuf->pkt_len; + ipv6_payload_length = ether_length + + (sizeof(struct vxlan_hdr) + + sizeof(struct udp_hdr)); + udp_length = ether_length + + (sizeof(struct vxlan_hdr) + + sizeof(struct udp_hdr)); + + vxlan_pkt = encap(ether, vxlan_tbl, sizeof(*vxlan_tbl)); + vxlan_pkt->ipv6.payload_len = rte_htons(ipv6_payload_length); + vxlan_pkt->udp.dgram_len = rte_htons(udp_length); + + mbuf->data_off = ether_offset - (sizeof(struct rte_mbuf) + sizeof(*vxlan_pkt)); + mbuf->pkt_len = mbuf->data_len = ether_length + sizeof(*vxlan_pkt); +} + +static __rte_always_inline void +pkt_work_encap_vxlan_ipv6_vlan(struct rte_mbuf *mbuf, + struct encap_vxlan_ipv6_vlan_data *vxlan_tbl, + struct rte_table_action_encap_config *cfg) +{ + uint32_t ether_offset = cfg->vxlan.data_offset; + void *ether = RTE_MBUF_METADATA_UINT32_PTR(mbuf, ether_offset); + struct encap_vxlan_ipv6_vlan_data *vxlan_pkt; + uint16_t ether_length, ipv6_payload_length, udp_length; + + ether_length = (uint16_t)mbuf->pkt_len; + ipv6_payload_length = ether_length + + (sizeof(struct vxlan_hdr) + + sizeof(struct udp_hdr)); + udp_length = ether_length + + (sizeof(struct vxlan_hdr) + + sizeof(struct udp_hdr)); + + vxlan_pkt = encap(ether, vxlan_tbl, sizeof(*vxlan_tbl)); + vxlan_pkt->ipv6.payload_len = rte_htons(ipv6_payload_length); + vxlan_pkt->udp.dgram_len = rte_htons(udp_length); + + mbuf->data_off = ether_offset - (sizeof(struct rte_mbuf) + sizeof(*vxlan_pkt)); + mbuf->pkt_len = mbuf->data_len = ether_length + sizeof(*vxlan_pkt); +} + static __rte_always_inline void pkt_work_encap(struct rte_mbuf *mbuf, void *data, @@ -776,6 +1119,20 @@ pkt_work_encap(struct rte_mbuf *mbuf, break; } + case 1LLU << RTE_TABLE_ACTION_ENCAP_VXLAN: + { + if (cfg->vxlan.ip_version) + if (cfg->vxlan.vlan) + pkt_work_encap_vxlan_ipv4_vlan(mbuf, data, cfg); + else + pkt_work_encap_vxlan_ipv4(mbuf, data, cfg); + else + if (cfg->vxlan.vlan) + pkt_work_encap_vxlan_ipv6_vlan(mbuf, data, cfg); + else + pkt_work_encap_vxlan_ipv6(mbuf, data, cfg); + 
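Tying encap_vxlan_apply() and the dispatch above to the public API, the following is a sketch of the per-rule parameters an application could pass (e.g. via rte_table_action_apply()) for VXLAN encapsulation, assuming a profile configured with cfg->vxlan.ip_version = 1 and cfg->vxlan.vlan = 0; all MAC/IP addresses, ports and the VNI are made-up example values, and the parameter structure is the one added to rte_table_action.h later in this patch:

#include <rte_ip.h>             /* IPv4() address helper */
#include <rte_table_action.h>

static const struct rte_table_action_encap_params vxlan_encap_example = {
        .type = RTE_TABLE_ACTION_ENCAP_VXLAN,
        .vxlan = {
                .ether = {
                        .da = { .addr_bytes = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 } },
                        .sa = { .addr_bytes = { 0x00, 0xaa, 0xbb, 0xcc, 0xdd, 0xee } },
                },
                .ipv4 = {
                        .sa = IPv4(10, 0, 0, 1), /* host order, converted by the action */
                        .da = IPv4(10, 0, 0, 2),
                        .dscp = 0,
                        .ttl = 64,
                },
                .udp = { .sp = 4789, .dp = 4789 }, /* IANA-assigned VXLAN port */
                .vxlan = { .vni = 100 },
        },
};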
} + default: break; } @@ -1219,6 +1576,562 @@ pkt_work_time(struct time_data *data, data->time = time; } + +/** + * RTE_TABLE_ACTION_CRYPTO + */ + +#define CRYPTO_OP_MASK_CIPHER 0x1 +#define CRYPTO_OP_MASK_AUTH 0x2 +#define CRYPTO_OP_MASK_AEAD 0x4 + +struct crypto_op_sym_iv_aad { + struct rte_crypto_op op; + struct rte_crypto_sym_op sym_op; + union { + struct { + uint8_t cipher_iv[ + RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX]; + uint8_t auth_iv[ + RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX]; + } cipher_auth; + + struct { + uint8_t iv[RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX]; + uint8_t aad[RTE_TABLE_ACTION_SYM_CRYPTO_AAD_SIZE_MAX]; + } aead_iv_aad; + + } iv_aad; +}; + +struct sym_crypto_data { + + union { + struct { + + /** Length of cipher iv. */ + uint16_t cipher_iv_len; + + /** Offset from start of IP header to the cipher iv. */ + uint16_t cipher_iv_data_offset; + + /** Length of cipher iv to be updated in the mbuf. */ + uint16_t cipher_iv_update_len; + + /** Offset from start of IP header to the auth iv. */ + uint16_t auth_iv_data_offset; + + /** Length of auth iv in the mbuf. */ + uint16_t auth_iv_len; + + /** Length of auth iv to be updated in the mbuf. */ + uint16_t auth_iv_update_len; + + } cipher_auth; + struct { + + /** Length of iv. */ + uint16_t iv_len; + + /** Offset from start of IP header to the aead iv. */ + uint16_t iv_data_offset; + + /** Length of iv to be updated in the mbuf. */ + uint16_t iv_update_len; + + /** Length of aad */ + uint16_t aad_len; + + /** Offset from start of IP header to the aad. */ + uint16_t aad_data_offset; + + /** Length of aad to updated in the mbuf. */ + uint16_t aad_update_len; + + } aead; + }; + + /** Offset from start of IP header to the data. */ + uint16_t data_offset; + + /** Digest length. */ + uint16_t digest_len; + + /** block size */ + uint16_t block_size; + + /** Mask of crypto operation */ + uint16_t op_mask; + + /** Session pointer. */ + struct rte_cryptodev_sym_session *session; + + /** Direction of crypto, encrypt or decrypt */ + uint16_t direction; + + /** Private data size to store cipher iv / aad. 
*/ + uint8_t iv_aad_data[32]; + +} __attribute__((__packed__)); + +static int +sym_crypto_cfg_check(struct rte_table_action_sym_crypto_config *cfg) +{ + if (!rte_cryptodev_pmd_is_valid_dev(cfg->cryptodev_id)) + return -EINVAL; + if (cfg->mp_create == NULL || cfg->mp_init == NULL) + return -EINVAL; + + return 0; +} + +static int +get_block_size(const struct rte_crypto_sym_xform *xform, uint8_t cdev_id) +{ + struct rte_cryptodev_info dev_info; + const struct rte_cryptodev_capabilities *cap; + uint32_t i; + + rte_cryptodev_info_get(cdev_id, &dev_info); + + for (i = 0;; i++) { + cap = &dev_info.capabilities[i]; + if (!cap) + break; + + if (cap->sym.xform_type != xform->type) + continue; + + if ((xform->type == RTE_CRYPTO_SYM_XFORM_CIPHER) && + (cap->sym.cipher.algo == xform->cipher.algo)) + return cap->sym.cipher.block_size; + + if ((xform->type == RTE_CRYPTO_SYM_XFORM_AEAD) && + (cap->sym.aead.algo == xform->aead.algo)) + return cap->sym.aead.block_size; + + if (xform->type == RTE_CRYPTO_SYM_XFORM_NOT_SPECIFIED) + break; + } + + return -1; +} + +static int +sym_crypto_apply(struct sym_crypto_data *data, + struct rte_table_action_sym_crypto_config *cfg, + struct rte_table_action_sym_crypto_params *p) +{ + const struct rte_crypto_cipher_xform *cipher_xform = NULL; + const struct rte_crypto_auth_xform *auth_xform = NULL; + const struct rte_crypto_aead_xform *aead_xform = NULL; + struct rte_crypto_sym_xform *xform = p->xform; + struct rte_cryptodev_sym_session *session; + int ret; + + memset(data, 0, sizeof(*data)); + + while (xform) { + if (xform->type == RTE_CRYPTO_SYM_XFORM_CIPHER) { + cipher_xform = &xform->cipher; + + if (cipher_xform->iv.length > + RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX) + return -ENOMEM; + if (cipher_xform->iv.offset != + RTE_TABLE_ACTION_SYM_CRYPTO_IV_OFFSET) + return -EINVAL; + + ret = get_block_size(xform, cfg->cryptodev_id); + if (ret < 0) + return -1; + data->block_size = (uint16_t)ret; + data->op_mask |= CRYPTO_OP_MASK_CIPHER; + + data->cipher_auth.cipher_iv_len = + cipher_xform->iv.length; + data->cipher_auth.cipher_iv_data_offset = (uint16_t) + p->cipher_auth.cipher_iv_update.offset; + data->cipher_auth.cipher_iv_update_len = (uint16_t) + p->cipher_auth.cipher_iv_update.length; + + rte_memcpy(data->iv_aad_data, + p->cipher_auth.cipher_iv.val, + p->cipher_auth.cipher_iv.length); + + data->direction = cipher_xform->op; + + } else if (xform->type == RTE_CRYPTO_SYM_XFORM_AUTH) { + auth_xform = &xform->auth; + if (auth_xform->iv.length > + RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX) + return -ENOMEM; + data->op_mask |= CRYPTO_OP_MASK_AUTH; + + data->cipher_auth.auth_iv_len = auth_xform->iv.length; + data->cipher_auth.auth_iv_data_offset = (uint16_t) + p->cipher_auth.auth_iv_update.offset; + data->cipher_auth.auth_iv_update_len = (uint16_t) + p->cipher_auth.auth_iv_update.length; + data->digest_len = auth_xform->digest_length; + + data->direction = (auth_xform->op == + RTE_CRYPTO_AUTH_OP_GENERATE) ? 
+ RTE_CRYPTO_CIPHER_OP_ENCRYPT : + RTE_CRYPTO_CIPHER_OP_DECRYPT; + + } else if (xform->type == RTE_CRYPTO_SYM_XFORM_AEAD) { + aead_xform = &xform->aead; + + if ((aead_xform->iv.length > + RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX) || ( + aead_xform->aad_length > + RTE_TABLE_ACTION_SYM_CRYPTO_AAD_SIZE_MAX)) + return -EINVAL; + if (aead_xform->iv.offset != + RTE_TABLE_ACTION_SYM_CRYPTO_IV_OFFSET) + return -EINVAL; + + ret = get_block_size(xform, cfg->cryptodev_id); + if (ret < 0) + return -1; + data->block_size = (uint16_t)ret; + data->op_mask |= CRYPTO_OP_MASK_AEAD; + + data->digest_len = aead_xform->digest_length; + data->aead.iv_len = aead_xform->iv.length; + data->aead.aad_len = aead_xform->aad_length; + + data->aead.iv_data_offset = (uint16_t) + p->aead.iv_update.offset; + data->aead.iv_update_len = (uint16_t) + p->aead.iv_update.length; + data->aead.aad_data_offset = (uint16_t) + p->aead.aad_update.offset; + data->aead.aad_update_len = (uint16_t) + p->aead.aad_update.length; + + rte_memcpy(data->iv_aad_data, + p->aead.iv.val, + p->aead.iv.length); + + rte_memcpy(data->iv_aad_data + p->aead.iv.length, + p->aead.aad.val, + p->aead.aad.length); + + data->direction = (aead_xform->op == + RTE_CRYPTO_AEAD_OP_ENCRYPT) ? + RTE_CRYPTO_CIPHER_OP_ENCRYPT : + RTE_CRYPTO_CIPHER_OP_DECRYPT; + } else + return -EINVAL; + + xform = xform->next; + } + + if (auth_xform && auth_xform->iv.length) { + if (cipher_xform) { + if (auth_xform->iv.offset != + RTE_TABLE_ACTION_SYM_CRYPTO_IV_OFFSET + + cipher_xform->iv.length) + return -EINVAL; + + rte_memcpy(data->iv_aad_data + cipher_xform->iv.length, + p->cipher_auth.auth_iv.val, + p->cipher_auth.auth_iv.length); + } else { + rte_memcpy(data->iv_aad_data, + p->cipher_auth.auth_iv.val, + p->cipher_auth.auth_iv.length); + } + } + + session = rte_cryptodev_sym_session_create(cfg->mp_create); + if (!session) + return -ENOMEM; + + ret = rte_cryptodev_sym_session_init(cfg->cryptodev_id, session, + p->xform, cfg->mp_init); + if (ret < 0) { + rte_cryptodev_sym_session_free(session); + return ret; + } + + data->data_offset = (uint16_t)p->data_offset; + data->session = session; + + return 0; +} + +static __rte_always_inline uint64_t +pkt_work_sym_crypto(struct rte_mbuf *mbuf, struct sym_crypto_data *data, + struct rte_table_action_sym_crypto_config *cfg, + uint16_t ip_offset) +{ + struct crypto_op_sym_iv_aad *crypto_op = (struct crypto_op_sym_iv_aad *) + RTE_MBUF_METADATA_UINT8_PTR(mbuf, cfg->op_offset); + struct rte_crypto_op *op = &crypto_op->op; + struct rte_crypto_sym_op *sym = op->sym; + uint32_t pkt_offset = sizeof(*mbuf) + mbuf->data_off; + uint32_t payload_len = pkt_offset + mbuf->data_len - data->data_offset; + + op->type = RTE_CRYPTO_OP_TYPE_SYMMETRIC; + op->sess_type = RTE_CRYPTO_OP_WITH_SESSION; + op->phys_addr = mbuf->buf_iova + cfg->op_offset - sizeof(*mbuf); + op->status = RTE_CRYPTO_OP_STATUS_NOT_PROCESSED; + sym->m_src = mbuf; + sym->m_dst = NULL; + sym->session = data->session; + + /** pad the packet */ + if (data->direction == RTE_CRYPTO_CIPHER_OP_ENCRYPT) { + uint32_t append_len = RTE_ALIGN_CEIL(payload_len, + data->block_size) - payload_len; + + if (unlikely(rte_pktmbuf_append(mbuf, append_len + + data->digest_len) == NULL)) + return 1; + + payload_len += append_len; + } else + payload_len -= data->digest_len; + + if (data->op_mask & CRYPTO_OP_MASK_CIPHER) { + /** prepare cipher op */ + uint8_t *iv = crypto_op->iv_aad.cipher_auth.cipher_iv; + + sym->cipher.data.length = payload_len; + sym->cipher.data.offset = data->data_offset - pkt_offset; + + if 
(data->cipher_auth.cipher_iv_update_len) { + uint8_t *pkt_iv = RTE_MBUF_METADATA_UINT8_PTR(mbuf, + data->cipher_auth.cipher_iv_data_offset + + ip_offset); + + /** For encryption, update the pkt iv field, otherwise + * update the iv_aad_field + **/ + if (data->direction == RTE_CRYPTO_CIPHER_OP_ENCRYPT) + rte_memcpy(pkt_iv, data->iv_aad_data, + data->cipher_auth.cipher_iv_update_len); + else + rte_memcpy(data->iv_aad_data, pkt_iv, + data->cipher_auth.cipher_iv_update_len); + } + + /** write iv */ + rte_memcpy(iv, data->iv_aad_data, + data->cipher_auth.cipher_iv_len); + } + + if (data->op_mask & CRYPTO_OP_MASK_AUTH) { + /** authentication always start from IP header. */ + sym->auth.data.offset = ip_offset - pkt_offset; + sym->auth.data.length = mbuf->data_len - sym->auth.data.offset - + data->digest_len; + sym->auth.digest.data = rte_pktmbuf_mtod_offset(mbuf, + uint8_t *, rte_pktmbuf_pkt_len(mbuf) - + data->digest_len); + sym->auth.digest.phys_addr = rte_pktmbuf_iova_offset(mbuf, + rte_pktmbuf_pkt_len(mbuf) - data->digest_len); + + if (data->cipher_auth.auth_iv_update_len) { + uint8_t *pkt_iv = RTE_MBUF_METADATA_UINT8_PTR(mbuf, + data->cipher_auth.auth_iv_data_offset + + ip_offset); + uint8_t *data_iv = data->iv_aad_data + + data->cipher_auth.cipher_iv_len; + + if (data->direction == RTE_CRYPTO_CIPHER_OP_ENCRYPT) + rte_memcpy(pkt_iv, data_iv, + data->cipher_auth.auth_iv_update_len); + else + rte_memcpy(data_iv, pkt_iv, + data->cipher_auth.auth_iv_update_len); + } + + if (data->cipher_auth.auth_iv_len) { + /** prepare cipher op */ + uint8_t *iv = crypto_op->iv_aad.cipher_auth.auth_iv; + + rte_memcpy(iv, data->iv_aad_data + + data->cipher_auth.cipher_iv_len, + data->cipher_auth.auth_iv_len); + } + } + + if (data->op_mask & CRYPTO_OP_MASK_AEAD) { + uint8_t *iv = crypto_op->iv_aad.aead_iv_aad.iv; + uint8_t *aad = crypto_op->iv_aad.aead_iv_aad.aad; + + sym->aead.aad.data = aad; + sym->aead.aad.phys_addr = rte_pktmbuf_iova_offset(mbuf, + aad - rte_pktmbuf_mtod(mbuf, uint8_t *)); + sym->aead.digest.data = rte_pktmbuf_mtod_offset(mbuf, + uint8_t *, rte_pktmbuf_pkt_len(mbuf) - + data->digest_len); + sym->aead.digest.phys_addr = rte_pktmbuf_iova_offset(mbuf, + rte_pktmbuf_pkt_len(mbuf) - data->digest_len); + sym->aead.data.offset = data->data_offset - pkt_offset; + sym->aead.data.length = payload_len; + + if (data->aead.iv_update_len) { + uint8_t *pkt_iv = RTE_MBUF_METADATA_UINT8_PTR(mbuf, + data->aead.iv_data_offset + ip_offset); + uint8_t *data_iv = data->iv_aad_data; + + if (data->direction == RTE_CRYPTO_CIPHER_OP_ENCRYPT) + rte_memcpy(pkt_iv, data_iv, + data->aead.iv_update_len); + else + rte_memcpy(data_iv, pkt_iv, + data->aead.iv_update_len); + } + + rte_memcpy(iv, data->iv_aad_data, data->aead.iv_len); + + if (data->aead.aad_update_len) { + uint8_t *pkt_aad = RTE_MBUF_METADATA_UINT8_PTR(mbuf, + data->aead.aad_data_offset + ip_offset); + uint8_t *data_aad = data->iv_aad_data + + data->aead.iv_len; + + if (data->direction == RTE_CRYPTO_CIPHER_OP_ENCRYPT) + rte_memcpy(pkt_aad, data_aad, + data->aead.iv_update_len); + else + rte_memcpy(data_aad, pkt_aad, + data->aead.iv_update_len); + } + + rte_memcpy(aad, data->iv_aad_data + data->aead.iv_len, + data->aead.aad_len); + } + + return 0; +} + +/** + * RTE_TABLE_ACTION_TAG + */ +struct tag_data { + uint32_t tag; +} __attribute__((__packed__)); + +static int +tag_apply(struct tag_data *data, + struct rte_table_action_tag_params *p) +{ + data->tag = p->tag; + return 0; +} + +static __rte_always_inline void +pkt_work_tag(struct rte_mbuf *mbuf, + struct 
tag_data *data) +{ + mbuf->hash.fdir.hi = data->tag; + mbuf->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID; +} + +static __rte_always_inline void +pkt4_work_tag(struct rte_mbuf *mbuf0, + struct rte_mbuf *mbuf1, + struct rte_mbuf *mbuf2, + struct rte_mbuf *mbuf3, + struct tag_data *data0, + struct tag_data *data1, + struct tag_data *data2, + struct tag_data *data3) +{ + mbuf0->hash.fdir.hi = data0->tag; + mbuf1->hash.fdir.hi = data1->tag; + mbuf2->hash.fdir.hi = data2->tag; + mbuf3->hash.fdir.hi = data3->tag; + + mbuf0->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID; + mbuf1->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID; + mbuf2->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID; + mbuf3->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID; +} + +/** + * RTE_TABLE_ACTION_DECAP + */ +struct decap_data { + uint16_t n; +} __attribute__((__packed__)); + +static int +decap_apply(struct decap_data *data, + struct rte_table_action_decap_params *p) +{ + data->n = p->n; + return 0; +} + +static __rte_always_inline void +pkt_work_decap(struct rte_mbuf *mbuf, + struct decap_data *data) +{ + uint16_t data_off = mbuf->data_off; + uint16_t data_len = mbuf->data_len; + uint32_t pkt_len = mbuf->pkt_len; + uint16_t n = data->n; + + mbuf->data_off = data_off + n; + mbuf->data_len = data_len - n; + mbuf->pkt_len = pkt_len - n; +} + +static __rte_always_inline void +pkt4_work_decap(struct rte_mbuf *mbuf0, + struct rte_mbuf *mbuf1, + struct rte_mbuf *mbuf2, + struct rte_mbuf *mbuf3, + struct decap_data *data0, + struct decap_data *data1, + struct decap_data *data2, + struct decap_data *data3) +{ + uint16_t data_off0 = mbuf0->data_off; + uint16_t data_len0 = mbuf0->data_len; + uint32_t pkt_len0 = mbuf0->pkt_len; + + uint16_t data_off1 = mbuf1->data_off; + uint16_t data_len1 = mbuf1->data_len; + uint32_t pkt_len1 = mbuf1->pkt_len; + + uint16_t data_off2 = mbuf2->data_off; + uint16_t data_len2 = mbuf2->data_len; + uint32_t pkt_len2 = mbuf2->pkt_len; + + uint16_t data_off3 = mbuf3->data_off; + uint16_t data_len3 = mbuf3->data_len; + uint32_t pkt_len3 = mbuf3->pkt_len; + + uint16_t n0 = data0->n; + uint16_t n1 = data1->n; + uint16_t n2 = data2->n; + uint16_t n3 = data3->n; + + mbuf0->data_off = data_off0 + n0; + mbuf0->data_len = data_len0 - n0; + mbuf0->pkt_len = pkt_len0 - n0; + + mbuf1->data_off = data_off1 + n1; + mbuf1->data_len = data_len1 - n1; + mbuf1->pkt_len = pkt_len1 - n1; + + mbuf2->data_off = data_off2 + n2; + mbuf2->data_len = data_len2 - n2; + mbuf2->pkt_len = pkt_len2 - n2; + + mbuf3->data_off = data_off3 + n3; + mbuf3->data_len = data_len3 - n3; + mbuf3->pkt_len = pkt_len3 - n3; +} + /** * Action profile */ @@ -1235,6 +2148,9 @@ action_valid(enum rte_table_action_type action) case RTE_TABLE_ACTION_TTL: case RTE_TABLE_ACTION_STATS: case RTE_TABLE_ACTION_TIME: + case RTE_TABLE_ACTION_SYM_CRYPTO: + case RTE_TABLE_ACTION_TAG: + case RTE_TABLE_ACTION_DECAP: return 1; default: return 0; @@ -1254,6 +2170,7 @@ struct ap_config { struct rte_table_action_nat_config nat; struct rte_table_action_ttl_config ttl; struct rte_table_action_stats_config stats; + struct rte_table_action_sym_crypto_config sym_crypto; }; static size_t @@ -1274,6 +2191,8 @@ action_cfg_size(enum rte_table_action_type action) return sizeof(struct rte_table_action_ttl_config); case RTE_TABLE_ACTION_STATS: return sizeof(struct rte_table_action_stats_config); + case RTE_TABLE_ACTION_SYM_CRYPTO: + return sizeof(struct rte_table_action_sym_crypto_config); default: return 0; } @@ -1305,6 +2224,8 @@ action_cfg_get(struct ap_config *ap_config, case RTE_TABLE_ACTION_STATS: return 
&ap_config->stats; + case RTE_TABLE_ACTION_SYM_CRYPTO: + return &ap_config->sym_crypto; default: return NULL; } @@ -1361,6 +2282,15 @@ action_data_size(enum rte_table_action_type action, case RTE_TABLE_ACTION_TIME: return sizeof(struct time_data); + case RTE_TABLE_ACTION_SYM_CRYPTO: + return (sizeof(struct sym_crypto_data)); + + case RTE_TABLE_ACTION_TAG: + return sizeof(struct tag_data); + + case RTE_TABLE_ACTION_DECAP: + return sizeof(struct decap_data); + default: return 0; } @@ -1460,6 +2390,10 @@ rte_table_action_profile_action_register(struct rte_table_action_profile *profil status = stats_cfg_check(action_config); break; + case RTE_TABLE_ACTION_SYM_CRYPTO: + status = sym_crypto_cfg_check(action_config); + break; + default: status = 0; break; @@ -1609,6 +2543,19 @@ rte_table_action_apply(struct rte_table_action *action, return time_apply(action_data, action_params); + case RTE_TABLE_ACTION_SYM_CRYPTO: + return sym_crypto_apply(action_data, + &action->cfg.sym_crypto, + action_params); + + case RTE_TABLE_ACTION_TAG: + return tag_apply(action_data, + action_params); + + case RTE_TABLE_ACTION_DECAP: + return decap_apply(action_data, + action_params); + default: return -EINVAL; } @@ -1861,6 +2808,25 @@ rte_table_action_time_read(struct rte_table_action *action, return 0; } +struct rte_cryptodev_sym_session * +rte_table_action_crypto_sym_session_get(struct rte_table_action *action, + void *data) +{ + struct sym_crypto_data *sym_crypto_data; + + /* Check input arguments */ + if ((action == NULL) || + ((action->cfg.action_mask & + (1LLU << RTE_TABLE_ACTION_SYM_CRYPTO)) == 0) || + (data == NULL)) + return NULL; + + sym_crypto_data = action_data_get(data, action, + RTE_TABLE_ACTION_SYM_CRYPTO); + + return sym_crypto_data->session; +} + static __rte_always_inline uint64_t pkt_work(struct rte_mbuf *mbuf, struct rte_pipeline_table_entry *table_entry, @@ -1920,6 +2886,14 @@ pkt_work(struct rte_mbuf *mbuf, dscp); } + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_DECAP)) { + void *data = action_data_get(table_entry, + action, + RTE_TABLE_ACTION_DECAP); + + pkt_work_decap(mbuf, data); + } + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_ENCAP)) { void *data = action_data_get(table_entry, action, RTE_TABLE_ACTION_ENCAP); @@ -1966,6 +2940,22 @@ pkt_work(struct rte_mbuf *mbuf, pkt_work_time(data, time); } + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_SYM_CRYPTO)) { + void *data = action_data_get(table_entry, action, + RTE_TABLE_ACTION_SYM_CRYPTO); + + drop_mask |= pkt_work_sym_crypto(mbuf, data, &cfg->sym_crypto, + ip_offset); + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_TAG)) { + void *data = action_data_get(table_entry, + action, + RTE_TABLE_ACTION_TAG); + + pkt_work_tag(mbuf, data); + } + return drop_mask; } @@ -2137,6 +3127,24 @@ pkt4_work(struct rte_mbuf **mbufs, dscp3); } + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_DECAP)) { + void *data0 = action_data_get(table_entry0, + action, + RTE_TABLE_ACTION_DECAP); + void *data1 = action_data_get(table_entry1, + action, + RTE_TABLE_ACTION_DECAP); + void *data2 = action_data_get(table_entry2, + action, + RTE_TABLE_ACTION_DECAP); + void *data3 = action_data_get(table_entry3, + action, + RTE_TABLE_ACTION_DECAP); + + pkt4_work_decap(mbuf0, mbuf1, mbuf2, mbuf3, + data0, data1, data2, data3); + } + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_ENCAP)) { void *data0 = action_data_get(table_entry0, action, RTE_TABLE_ACTION_ENCAP); @@ -2254,6 +3262,44 @@ pkt4_work(struct rte_mbuf **mbufs, pkt_work_time(data3, time); } + if 
(cfg->action_mask & (1LLU << RTE_TABLE_ACTION_SYM_CRYPTO)) { + void *data0 = action_data_get(table_entry0, action, + RTE_TABLE_ACTION_SYM_CRYPTO); + void *data1 = action_data_get(table_entry1, action, + RTE_TABLE_ACTION_SYM_CRYPTO); + void *data2 = action_data_get(table_entry2, action, + RTE_TABLE_ACTION_SYM_CRYPTO); + void *data3 = action_data_get(table_entry3, action, + RTE_TABLE_ACTION_SYM_CRYPTO); + + drop_mask0 |= pkt_work_sym_crypto(mbuf0, data0, &cfg->sym_crypto, + ip_offset); + drop_mask1 |= pkt_work_sym_crypto(mbuf1, data1, &cfg->sym_crypto, + ip_offset); + drop_mask2 |= pkt_work_sym_crypto(mbuf2, data2, &cfg->sym_crypto, + ip_offset); + drop_mask3 |= pkt_work_sym_crypto(mbuf3, data3, &cfg->sym_crypto, + ip_offset); + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_TAG)) { + void *data0 = action_data_get(table_entry0, + action, + RTE_TABLE_ACTION_TAG); + void *data1 = action_data_get(table_entry1, + action, + RTE_TABLE_ACTION_TAG); + void *data2 = action_data_get(table_entry2, + action, + RTE_TABLE_ACTION_TAG); + void *data3 = action_data_get(table_entry3, + action, + RTE_TABLE_ACTION_TAG); + + pkt4_work_tag(mbuf0, mbuf1, mbuf2, mbuf3, + data0, data1, data2, data3); + } + return drop_mask0 | (drop_mask1 << 1) | (drop_mask2 << 2) | diff --git a/lib/librte_pipeline/rte_table_action.h b/lib/librte_pipeline/rte_table_action.h index c7f751aa..c9606129 100644 --- a/lib/librte_pipeline/rte_table_action.h +++ b/lib/librte_pipeline/rte_table_action.h @@ -93,6 +93,15 @@ enum rte_table_action_type { /** Timestamp. */ RTE_TABLE_ACTION_TIME, + + /** Crypto. */ + RTE_TABLE_ACTION_SYM_CRYPTO, + + /** Tag. */ + RTE_TABLE_ACTION_TAG, + + /** Packet decapsulations. */ + RTE_TABLE_ACTION_DECAP, }; /** Common action configuration (per table action profile). */ @@ -366,6 +375,11 @@ enum rte_table_action_encap_type { /** IP -> { Ether | PPPoE | PPP | IP } */ RTE_TABLE_ACTION_ENCAP_PPPOE, + + /** Ether -> { Ether | IP | UDP | VXLAN | Ether } + * Ether -> { Ether | VLAN | IP | UDP | VXLAN | Ether } + */ + RTE_TABLE_ACTION_ENCAP_VXLAN, }; /** Pre-computed Ethernet header fields for encapsulation action. */ @@ -393,6 +407,34 @@ struct rte_table_action_pppoe_hdr { uint16_t session_id; /**< Session ID. */ }; +/** Pre-computed IPv4 header fields for encapsulation action. */ +struct rte_table_action_ipv4_header { + uint32_t sa; /**< Source address. */ + uint32_t da; /**< Destination address. */ + uint8_t dscp; /**< DiffServ Code Point (DSCP). */ + uint8_t ttl; /**< Time To Live (TTL). */ +}; + +/** Pre-computed IPv6 header fields for encapsulation action. */ +struct rte_table_action_ipv6_header { + uint8_t sa[16]; /**< Source address. */ + uint8_t da[16]; /**< Destination address. */ + uint32_t flow_label; /**< Flow label. */ + uint8_t dscp; /**< DiffServ Code Point (DSCP). */ + uint8_t hop_limit; /**< Hop Limit (HL). */ +}; + +/** Pre-computed UDP header fields for encapsulation action. */ +struct rte_table_action_udp_header { + uint16_t sp; /**< Source port. */ + uint16_t dp; /**< Destination port. */ +}; + +/** Pre-computed VXLAN header fields for encapsulation action. */ +struct rte_table_action_vxlan_hdr { + uint32_t vni; /**< VXLAN Network Identifier (VNI). */ +}; + /** Ether encap parameters. */ struct rte_table_action_encap_ether_params { struct rte_table_action_ether_hdr ether; /**< Ethernet header. */ @@ -437,6 +479,21 @@ struct rte_table_action_encap_pppoe_params { struct rte_table_action_pppoe_hdr pppoe; /**< PPPoE/PPP headers. */ }; +/** VXLAN encap parameters. 
*/ +struct rte_table_action_encap_vxlan_params { + struct rte_table_action_ether_hdr ether; /**< Ethernet header. */ + struct rte_table_action_vlan_hdr vlan; /**< VLAN header. */ + + RTE_STD_C11 + union { + struct rte_table_action_ipv4_header ipv4; /**< IPv4 header. */ + struct rte_table_action_ipv6_header ipv6; /**< IPv6 header. */ + }; + + struct rte_table_action_udp_header udp; /**< UDP header. */ + struct rte_table_action_vxlan_hdr vxlan; /**< VXLAN header. */ +}; + /** Encap action configuration (per table action profile). */ struct rte_table_action_encap_config { /** Bit mask defining the set of packet encapsulations enabled for the @@ -446,6 +503,30 @@ struct rte_table_action_encap_config { * @see enum rte_table_action_encap_type */ uint64_t encap_mask; + + /** Encapsulation type specific configuration. */ + RTE_STD_C11 + union { + struct { + /** Input packet to be encapsulated: offset within the + * input packet buffer to the start of the Ethernet + * frame to be encapsulated. Offset 0 points to the + * first byte of the MBUF structure. + */ + uint32_t data_offset; + + /** Encapsulation header: non-zero when encapsulation + * header includes a VLAN tag, zero otherwise. + */ + int vlan; + + /** Encapsulation header: IP version of the IP header + * within the encapsulation header. Non-zero for IPv4, + * zero for IPv6. + */ + int ip_version; + } vxlan; /**< VXLAN specific configuration. */ + }; }; /** Encap action parameters (per table rule). */ @@ -469,6 +550,9 @@ struct rte_table_action_encap_params { /** Only valid when *type* is set to PPPoE. */ struct rte_table_action_encap_pppoe_params pppoe; + + /** Only valid when *type* is set to VXLAN. */ + struct rte_table_action_encap_vxlan_params vxlan; }; }; @@ -605,6 +689,111 @@ struct rte_table_action_time_params { uint64_t time; }; +/** + * RTE_TABLE_ACTION_CRYPTO + */ +#ifndef RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX +#define RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX (16) +#endif + +#ifndef RTE_TABLE_ACTION_SYM_CRYPTO_AAD_SIZE_MAX +#define RTE_TABLE_ACTION_SYM_CRYPTO_AAD_SIZE_MAX (16) +#endif + +#ifndef RTE_TABLE_ACTION_SYM_CRYPTO_IV_OFFSET +#define RTE_TABLE_ACTION_SYM_CRYPTO_IV_OFFSET \ + (sizeof(struct rte_crypto_op) + sizeof(struct rte_crypto_sym_op)) +#endif + +/** Common action structure to store the data's value, length, and offset */ +struct rte_table_action_vlo { + uint8_t *val; + uint32_t length; + uint32_t offset; +}; + +/** Symmetric crypto action configuration (per table action profile). */ +struct rte_table_action_sym_crypto_config { + /** Target Cryptodev ID. */ + uint8_t cryptodev_id; + + /** + * Offset to rte_crypto_op structure within the input packet buffer. + * Offset 0 points to the first byte of the MBUF structure. + */ + uint32_t op_offset; + + /** The mempool for creating cryptodev sessions. */ + struct rte_mempool *mp_create; + + /** The mempool for initializing cryptodev sessions. */ + struct rte_mempool *mp_init; +}; + +/** Symmetric Crypto action parameters (per table rule). */ +struct rte_table_action_sym_crypto_params { + + /** Xform pointer contains all relevant information */ + struct rte_crypto_sym_xform *xform; + + /** + * Offset within the input packet buffer to the first byte of data + * to be processed by the crypto unit. Offset 0 points to the first + * byte of the MBUF structure. + */ + uint32_t data_offset; + + union { + struct { + /** Cipher iv data. */ + struct rte_table_action_vlo cipher_iv; + + /** Cipher iv data. */ + struct rte_table_action_vlo cipher_iv_update; + + /** Auth iv data. 
*/ + struct rte_table_action_vlo auth_iv; + + /** Auth iv data. */ + struct rte_table_action_vlo auth_iv_update; + + } cipher_auth; + + struct { + /** AEAD AAD data. */ + struct rte_table_action_vlo aad; + + /** AEAD iv data. */ + struct rte_table_action_vlo iv; + + /** AEAD AAD data. */ + struct rte_table_action_vlo aad_update; + + /** AEAD iv data. */ + struct rte_table_action_vlo iv_update; + + } aead; + }; +}; + +/** + * RTE_TABLE_ACTION_TAG + */ +/** Tag action parameters (per table rule). */ +struct rte_table_action_tag_params { + /** Tag to be attached to the input packet. */ + uint32_t tag; +}; + +/** + * RTE_TABLE_ACTION_DECAP + */ +/** Decap action parameters (per table rule). */ +struct rte_table_action_decap_params { + /** Number of bytes to be removed from the start of the packet. */ + uint16_t n; +}; + /** * Table action profile. */ @@ -898,6 +1087,20 @@ rte_table_action_time_read(struct rte_table_action *action, void *data, uint64_t *timestamp); +/** + * Table action cryptodev symmetric session get. + * + * @param[in] action + * Handle to table action object (needs to be valid). + * @param[in] data + * Data byte array (typically table rule data) with sym crypto action. + * @return + * The pointer to the session on success, NULL otherwise. + */ +struct rte_cryptodev_sym_session *__rte_experimental +rte_table_action_crypto_sym_session_get(struct rte_table_action *action, + void *data); + #ifdef __cplusplus } #endif diff --git a/lib/librte_port/Makefile b/lib/librte_port/Makefile index 8df4864e..1b83f6f2 100644 --- a/lib/librte_port/Makefile +++ b/lib/librte_port/Makefile @@ -11,7 +11,7 @@ ifeq ($(CONFIG_RTE_PORT_PCAP),y) LDLIBS += -lpcap endif LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev -LDLIBS += -lrte_ip_frag -lrte_sched +LDLIBS += -lrte_ip_frag -lrte_sched -lrte_cryptodev ifeq ($(CONFIG_RTE_LIBRTE_KNI),y) LDLIBS += -lrte_kni endif @@ -38,6 +38,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_KNI),y) SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_kni.c endif SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_source_sink.c +SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_sym_crypto.c # install includes SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port.h @@ -53,5 +54,6 @@ ifeq ($(CONFIG_RTE_LIBRTE_KNI),y) SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_kni.h endif SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_source_sink.h +SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_sym_crypto.h include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_port/meson.build b/lib/librte_port/meson.build index f3d8b443..0d11456f 100644 --- a/lib/librte_port/meson.build +++ b/lib/librte_port/meson.build @@ -9,7 +9,8 @@ sources = files( 'rte_port_ras.c', 'rte_port_ring.c', 'rte_port_sched.c', - 'rte_port_source_sink.c') + 'rte_port_source_sink.c', + 'rte_port_sym_crypto.c') headers = files( 'rte_port_ethdev.h', 'rte_port_fd.h', @@ -18,8 +19,9 @@ headers = files( 'rte_port.h', 'rte_port_ring.h', 'rte_port_sched.h', - 'rte_port_source_sink.h') -deps += ['ethdev', 'sched', 'ip_frag'] + 'rte_port_source_sink.h', + 'rte_port_sym_crypto.h') +deps += ['ethdev', 'sched', 'ip_frag', 'cryptodev'] if dpdk_conf.has('RTE_LIBRTE_KNI') sources += files('rte_port_kni.c') diff --git a/lib/librte_port/rte_port_sym_crypto.c b/lib/librte_port/rte_port_sym_crypto.c new file mode 100644 index 00000000..295984d0 --- /dev/null +++ b/lib/librte_port/rte_port_sym_crypto.c @@ -0,0 +1,552 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ +#include + +#include +#include + +#include 
"rte_port_sym_crypto.h" + +/* + * Port Crypto Reader + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_SYM_CRYPTO_READER_STATS_PKTS_IN_ADD(port, val) \ + (port)->stats.n_pkts_in += (val) +#define RTE_PORT_SYM_CRYPTO_READER_STATS_PKTS_DROP_ADD(port, val) \ + (port)->stats.n_pkts_drop += (val) + +#else + +#define RTE_PORT_SYM_CRYPTO_READER_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_SYM_CRYPTO_READER_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_sym_crypto_reader { + struct rte_port_in_stats stats; + + uint8_t cryptodev_id; + uint16_t queue_id; + struct rte_crypto_op *ops[RTE_PORT_IN_BURST_SIZE_MAX]; + rte_port_sym_crypto_reader_callback_fn f_callback; + void *arg_callback; +}; + +static void * +rte_port_sym_crypto_reader_create(void *params, int socket_id) +{ + struct rte_port_sym_crypto_reader_params *conf = + params; + struct rte_port_sym_crypto_reader *port; + + /* Check input parameters */ + if (conf == NULL) { + RTE_LOG(ERR, PORT, "%s: params is NULL\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->cryptodev_id = conf->cryptodev_id; + port->queue_id = conf->queue_id; + port->f_callback = conf->f_callback; + port->arg_callback = conf->arg_callback; + + return port; +} + +static int +rte_port_sym_crypto_reader_rx(void *port, struct rte_mbuf **pkts, uint32_t n_pkts) +{ + struct rte_port_sym_crypto_reader *p = + port; + uint16_t rx_ops_cnt, i, n = 0; + + rx_ops_cnt = rte_cryptodev_dequeue_burst(p->cryptodev_id, p->queue_id, + p->ops, n_pkts); + + for (i = 0; i < rx_ops_cnt; i++) { + struct rte_crypto_op *op = p->ops[i]; + + /** Drop failed pkts */ + if (unlikely(op->status != RTE_CRYPTO_OP_STATUS_SUCCESS)) { + rte_pktmbuf_free(op->sym->m_src); + continue; + } + + pkts[n++] = op->sym->m_src; + } + + if (p->f_callback) + (*p->f_callback)(pkts, n, p->arg_callback); + + RTE_PORT_SYM_CRYPTO_READER_STATS_PKTS_IN_ADD(p, n); + RTE_PORT_SYM_CRYPTO_READER_STATS_PKTS_DROP_ADD(p, rx_ops_cnt - n); + + return n; +} + +static int +rte_port_sym_crypto_reader_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: port is NULL\n", __func__); + return -EINVAL; + } + + rte_free(port); + + return 0; +} + +static int rte_port_sym_crypto_reader_stats_read(void *port, + struct rte_port_in_stats *stats, int clear) +{ + struct rte_port_sym_crypto_reader *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Port crypto Writer + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_IN_ADD(port, val) \ + (port)->stats.n_pkts_in += (val) +#define RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_DROP_ADD(port, val) \ + (port)->stats.n_pkts_drop += (val) + +#else + +#define RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_sym_crypto_writer { + struct rte_port_out_stats stats; + + struct rte_crypto_op *tx_buf[2 * RTE_PORT_IN_BURST_SIZE_MAX]; + + uint32_t tx_burst_sz; + uint32_t tx_buf_count; + uint64_t bsz_mask; + + uint8_t cryptodev_id; + uint16_t queue_id; + uint16_t crypto_op_offset; +}; + +static void * +rte_port_sym_crypto_writer_create(void *params, int socket_id) +{ + struct rte_port_sym_crypto_writer_params *conf = + 
params; + struct rte_port_sym_crypto_writer *port; + + /* Check input parameters */ + if ((conf == NULL) || + (conf->tx_burst_sz == 0) || + (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX) || + (!rte_is_power_of_2(conf->tx_burst_sz))) { + RTE_LOG(ERR, PORT, "%s: Invalid input parameters\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->tx_burst_sz = conf->tx_burst_sz; + port->tx_buf_count = 0; + port->bsz_mask = 1LLU << (conf->tx_burst_sz - 1); + + port->cryptodev_id = conf->cryptodev_id; + port->queue_id = conf->queue_id; + port->crypto_op_offset = conf->crypto_op_offset; + + return port; +} + +static inline void +send_burst(struct rte_port_sym_crypto_writer *p) +{ + uint32_t nb_tx; + + nb_tx = rte_cryptodev_enqueue_burst(p->cryptodev_id, p->queue_id, + p->tx_buf, p->tx_buf_count); + + RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - + nb_tx); + for (; nb_tx < p->tx_buf_count; nb_tx++) + rte_pktmbuf_free(p->tx_buf[nb_tx]->sym->m_src); + + p->tx_buf_count = 0; +} + +static int +rte_port_sym_crypto_writer_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_sym_crypto_writer *p = + port; + + p->tx_buf[p->tx_buf_count++] = (struct rte_crypto_op *) + RTE_MBUF_METADATA_UINT8_PTR(pkt, p->crypto_op_offset); + RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_IN_ADD(p, 1); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst(p); + + return 0; +} + +static int +rte_port_sym_crypto_writer_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + struct rte_port_sym_crypto_writer *p = + port; + uint64_t bsz_mask = p->bsz_mask; + uint32_t tx_buf_count = p->tx_buf_count; + uint64_t expr = (pkts_mask & (pkts_mask + 1)) | + ((pkts_mask & bsz_mask) ^ bsz_mask); + + if (expr == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t i; + + RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_IN_ADD(p, n_pkts); + + for (i = 0; i < n_pkts; i++) + p->tx_buf[p->tx_buf_count++] = (struct rte_crypto_op *) + RTE_MBUF_METADATA_UINT8_PTR(pkts[i], + p->crypto_op_offset); + + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst(p); + } else { + for (; pkts_mask;) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + + p->tx_buf[tx_buf_count++] = (struct rte_crypto_op *) + RTE_MBUF_METADATA_UINT8_PTR(pkt, + p->crypto_op_offset); + + RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_IN_ADD(p, 1); + pkts_mask &= ~pkt_mask; + } + + p->tx_buf_count = tx_buf_count; + if (tx_buf_count >= p->tx_burst_sz) + send_burst(p); + } + + return 0; +} + +static int +rte_port_sym_crypto_writer_flush(void *port) +{ + struct rte_port_sym_crypto_writer *p = + port; + + if (p->tx_buf_count > 0) + send_burst(p); + + return 0; +} + +static int +rte_port_sym_crypto_writer_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Port is NULL\n", __func__); + return -EINVAL; + } + + rte_port_sym_crypto_writer_flush(port); + rte_free(port); + + return 0; +} + +static int rte_port_sym_crypto_writer_stats_read(void *port, + struct rte_port_out_stats *stats, int clear) +{ + struct rte_port_sym_crypto_writer *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Port crypto Writer Nodrop + */ +#ifdef 
RTE_PORT_STATS_COLLECT + +#define RTE_PORT_SYM_CRYPTO_WRITER_NODROP_STATS_PKTS_IN_ADD(port, val) \ + (port)->stats.n_pkts_in += (val) +#define RTE_PORT_SYM_CRYPTO_WRITER_NODROP_STATS_PKTS_DROP_ADD(port, val) \ + (port)->stats.n_pkts_drop += (val) + +#else + +#define RTE_PORT_SYM_CRYPTO_WRITER_NODROP_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_SYM_CRYPTO_WRITER_NODROP_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_sym_crypto_writer_nodrop { + struct rte_port_out_stats stats; + + struct rte_crypto_op *tx_buf[2 * RTE_PORT_IN_BURST_SIZE_MAX]; + uint32_t tx_burst_sz; + uint32_t tx_buf_count; + uint64_t bsz_mask; + uint64_t n_retries; + + uint8_t cryptodev_id; + uint16_t queue_id; + uint16_t crypto_op_offset; +}; + +static void * +rte_port_sym_crypto_writer_nodrop_create(void *params, int socket_id) +{ + struct rte_port_sym_crypto_writer_nodrop_params *conf = + params; + struct rte_port_sym_crypto_writer_nodrop *port; + + /* Check input parameters */ + if ((conf == NULL) || + (conf->tx_burst_sz == 0) || + (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX) || + (!rte_is_power_of_2(conf->tx_burst_sz))) { + RTE_LOG(ERR, PORT, "%s: Invalid input parameters\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->cryptodev_id = conf->cryptodev_id; + port->queue_id = conf->queue_id; + port->crypto_op_offset = conf->crypto_op_offset; + port->tx_burst_sz = conf->tx_burst_sz; + port->tx_buf_count = 0; + port->bsz_mask = 1LLU << (conf->tx_burst_sz - 1); + + /* + * When n_retries is 0 it means that we should wait for every packet to + * send no matter how many retries should it take. To limit number of + * branches in fast path, we use UINT64_MAX instead of branching. + */ + port->n_retries = (conf->n_retries == 0) ? 
UINT64_MAX : conf->n_retries; + + return port; +} + +static inline void +send_burst_nodrop(struct rte_port_sym_crypto_writer_nodrop *p) +{ + uint32_t nb_tx = 0, i; + + nb_tx = rte_cryptodev_enqueue_burst(p->cryptodev_id, p->queue_id, + p->tx_buf, p->tx_buf_count); + + /* We sent all the packets in a first try */ + if (nb_tx >= p->tx_buf_count) { + p->tx_buf_count = 0; + return; + } + + for (i = 0; i < p->n_retries; i++) { + nb_tx += rte_cryptodev_enqueue_burst(p->cryptodev_id, + p->queue_id, p->tx_buf + nb_tx, + p->tx_buf_count - nb_tx); + + /* We sent all the packets in more than one try */ + if (nb_tx >= p->tx_buf_count) { + p->tx_buf_count = 0; + return; + } + } + + /* We didn't send the packets in maximum allowed attempts */ + RTE_PORT_SYM_CRYPTO_WRITER_NODROP_STATS_PKTS_DROP_ADD(p, + p->tx_buf_count - nb_tx); + for ( ; nb_tx < p->tx_buf_count; nb_tx++) + rte_pktmbuf_free(p->tx_buf[nb_tx]->sym->m_src); + + p->tx_buf_count = 0; +} + +static int +rte_port_sym_crypto_writer_nodrop_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_sym_crypto_writer_nodrop *p = + port; + + p->tx_buf[p->tx_buf_count++] = (struct rte_crypto_op *) + RTE_MBUF_METADATA_UINT8_PTR(pkt, p->crypto_op_offset); + RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_IN_ADD(p, 1); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst_nodrop(p); + + return 0; +} + +static int +rte_port_sym_crypto_writer_nodrop_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + struct rte_port_sym_crypto_writer_nodrop *p = + port; + + uint64_t bsz_mask = p->bsz_mask; + uint32_t tx_buf_count = p->tx_buf_count; + uint64_t expr = (pkts_mask & (pkts_mask + 1)) | + ((pkts_mask & bsz_mask) ^ bsz_mask); + + if (expr == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t i; + + RTE_PORT_SYM_CRYPTO_WRITER_NODROP_STATS_PKTS_IN_ADD(p, n_pkts); + + for (i = 0; i < n_pkts; i++) + p->tx_buf[p->tx_buf_count++] = (struct rte_crypto_op *) + RTE_MBUF_METADATA_UINT8_PTR(pkts[i], + p->crypto_op_offset); + + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst_nodrop(p); + } else { + for ( ; pkts_mask; ) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + + p->tx_buf[tx_buf_count++] = (struct rte_crypto_op *) + RTE_MBUF_METADATA_UINT8_PTR(pkt, + p->crypto_op_offset); + RTE_PORT_SYM_CRYPTO_WRITER_NODROP_STATS_PKTS_IN_ADD(p, + 1); + pkts_mask &= ~pkt_mask; + } + + p->tx_buf_count = tx_buf_count; + if (tx_buf_count >= p->tx_burst_sz) + send_burst_nodrop(p); + } + + return 0; +} + +static int +rte_port_sym_crypto_writer_nodrop_flush(void *port) +{ + struct rte_port_sym_crypto_writer_nodrop *p = + port; + + if (p->tx_buf_count > 0) + send_burst_nodrop(p); + + return 0; +} + +static int +rte_port_sym_crypto_writer_nodrop_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Port is NULL\n", __func__); + return -EINVAL; + } + + rte_port_sym_crypto_writer_nodrop_flush(port); + rte_free(port); + + return 0; +} + +static int rte_port_sym_crypto_writer_nodrop_stats_read(void *port, + struct rte_port_out_stats *stats, int clear) +{ + struct rte_port_sym_crypto_writer_nodrop *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + + +/* + * Summary of port operations + */ +struct rte_port_in_ops rte_port_sym_crypto_reader_ops = { + .f_create = rte_port_sym_crypto_reader_create, + .f_free = rte_port_sym_crypto_reader_free, + .f_rx = 
rte_port_sym_crypto_reader_rx, + .f_stats = rte_port_sym_crypto_reader_stats_read, +}; + +struct rte_port_out_ops rte_port_sym_crypto_writer_ops = { + .f_create = rte_port_sym_crypto_writer_create, + .f_free = rte_port_sym_crypto_writer_free, + .f_tx = rte_port_sym_crypto_writer_tx, + .f_tx_bulk = rte_port_sym_crypto_writer_tx_bulk, + .f_flush = rte_port_sym_crypto_writer_flush, + .f_stats = rte_port_sym_crypto_writer_stats_read, +}; + +struct rte_port_out_ops rte_port_sym_crypto_writer_nodrop_ops = { + .f_create = rte_port_sym_crypto_writer_nodrop_create, + .f_free = rte_port_sym_crypto_writer_nodrop_free, + .f_tx = rte_port_sym_crypto_writer_nodrop_tx, + .f_tx_bulk = rte_port_sym_crypto_writer_nodrop_tx_bulk, + .f_flush = rte_port_sym_crypto_writer_nodrop_flush, + .f_stats = rte_port_sym_crypto_writer_nodrop_stats_read, +}; diff --git a/lib/librte_port/rte_port_sym_crypto.h b/lib/librte_port/rte_port_sym_crypto.h new file mode 100644 index 00000000..181f6ce0 --- /dev/null +++ b/lib/librte_port/rte_port_sym_crypto.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_PORT_SYM_CRYPTO_H__ +#define __INCLUDE_RTE_PORT_SYM_CRYPTO_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Port sym crypto Interface + * + * crypto_reader: input port built on top of pre-initialized crypto interface + * crypto_writer: output port built on top of pre-initialized crypto interface + * + **/ + +#include + +#include + +#include "rte_port.h" + +/** Function prototype for reader post action. */ +typedef void (*rte_port_sym_crypto_reader_callback_fn)(struct rte_mbuf **pkts, + uint16_t n_pkts, void *arg); + +/** Crypto_reader port parameters */ +struct rte_port_sym_crypto_reader_params { + /** Target cryptodev ID. */ + uint8_t cryptodev_id; + + /** Target cryptodev Queue Pair ID. */ + uint16_t queue_id; + + /** Crypto reader post callback function. */ + rte_port_sym_crypto_reader_callback_fn f_callback; + + /** Crypto reader post callback function arguments. */ + void *arg_callback; +}; + +/** Crypto_reader port operations. */ +extern struct rte_port_in_ops rte_port_sym_crypto_reader_ops; + + +/** Crypto_writer port parameters. */ +struct rte_port_sym_crypto_writer_params { + /** Target cryptodev ID. */ + uint8_t cryptodev_id; + + /** Target cryptodev Queue Pair ID. */ + uint16_t queue_id; + + /** offset to rte_crypto_op in the mbufs. */ + uint16_t crypto_op_offset; + + /** Burst size to crypto interface. */ + uint32_t tx_burst_sz; +}; + +/** Crypto_writer port operations. */ +extern struct rte_port_out_ops rte_port_sym_crypto_writer_ops; + +/** Crypto_writer_nodrop port parameters. */ +struct rte_port_sym_crypto_writer_nodrop_params { + /** Target cryptodev ID. */ + uint8_t cryptodev_id; + + /** Target cryptodev queue pair id. */ + uint16_t queue_id; + + /** Offset to rte_crypto_op in the mbufs. */ + uint16_t crypto_op_offset; + + /** Burst size to crypto interface. */ + uint32_t tx_burst_sz; + + /** Maximum number of retries, 0 for no limit. */ + uint32_t n_retries; +}; + +/** Crypto_writer_nodrop port operations. 
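For readers who have not used the pipeline port API, the sketch below shows how the new symmetric-crypto writer port could be driven directly through its out-port ops. It is only an illustration: the cryptodev (id 0, queue 0) is assumed to be configured and started elsewhere, and CRYPTO_OP_OFFSET is a hypothetical metadata layout in which a ready rte_crypto_op has been built immediately after the mbuf structure.

    #include <rte_mbuf.h>
    #include <rte_port_sym_crypto.h>

    /* Hypothetical layout: crypto op stored right after struct rte_mbuf. */
    #define CRYPTO_OP_OFFSET  sizeof(struct rte_mbuf)

    static int
    crypto_writer_example(int socket_id)
    {
        struct rte_port_sym_crypto_writer_params params = {
            .cryptodev_id = 0,
            .queue_id = 0,
            .crypto_op_offset = CRYPTO_OP_OFFSET,
            .tx_burst_sz = 32,      /* must be a power of two, <= 64 */
        };
        void *port;

        /* f_create() returns NULL on invalid parameters or OOM. */
        port = rte_port_sym_crypto_writer_ops.f_create(&params, socket_id);
        if (port == NULL)
            return -1;

        /* ... f_tx()/f_tx_bulk() would be called here per packet ... */

        /* Push out any partially filled burst, then release the port
         * (f_free() also flushes, as the code above shows). */
        rte_port_sym_crypto_writer_ops.f_flush(port);
        rte_port_sym_crypto_writer_ops.f_free(port);
        return 0;
    }

The nodrop variant takes the same parameters plus n_retries; as its create function above notes, 0 selects effectively unlimited retries (stored internally as UINT64_MAX).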
*/ +extern struct rte_port_out_ops rte_port_sym_crypto_writer_nodrop_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/librte_port/rte_port_version.map b/lib/librte_port/rte_port_version.map index 6470629b..609bcec3 100644 --- a/lib/librte_port/rte_port_version.map +++ b/lib/librte_port/rte_port_version.map @@ -51,3 +51,12 @@ DPDK_16.11 { rte_port_fd_writer_nodrop_ops; } DPDK_16.07; + +DPDK_18.11 { + global: + + rte_port_sym_crypto_reader_ops; + rte_port_sym_crypto_writer_ops; + rte_port_sym_crypto_writer_nodrop_ops; + +} DPDK_16.11; diff --git a/lib/librte_power/Makefile b/lib/librte_power/Makefile index 6f85e885..9bec668d 100644 --- a/lib/librte_power/Makefile +++ b/lib/librte_power/Makefile @@ -7,7 +7,7 @@ include $(RTE_SDK)/mk/rte.vars.mk LIB = librte_power.a CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing -LDLIBS += -lrte_eal +LDLIBS += -lrte_eal -lrte_timer EXPORT_MAP := rte_power_version.map @@ -16,8 +16,9 @@ LIBABIVER := 1 # all source are stored in SRCS-y SRCS-$(CONFIG_RTE_LIBRTE_POWER) := rte_power.c power_acpi_cpufreq.c SRCS-$(CONFIG_RTE_LIBRTE_POWER) += power_kvm_vm.c guest_channel.c +SRCS-$(CONFIG_RTE_LIBRTE_POWER) += rte_power_empty_poll.c # install this header file -SYMLINK-$(CONFIG_RTE_LIBRTE_POWER)-include := rte_power.h +SYMLINK-$(CONFIG_RTE_LIBRTE_POWER)-include := rte_power.h rte_power_empty_poll.h include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_power/channel_commands.h b/lib/librte_power/channel_commands.h index ee638eef..e7b93a79 100644 --- a/lib/librte_power/channel_commands.h +++ b/lib/librte_power/channel_commands.h @@ -19,6 +19,7 @@ extern "C" { #define CPU_POWER 1 #define CPU_POWER_CONNECT 2 #define PKT_POLICY 3 +#define PKT_POLICY_REMOVE 4 /* CPU Power Command Scaling */ #define CPU_POWER_SCALE_UP 1 @@ -58,6 +59,9 @@ struct traffic { uint32_t max_max_packet_thresh; }; +#define CORE_TYPE_VIRTUAL 0 +#define CORE_TYPE_PHYSICAL 1 + struct channel_packet { uint64_t resource_id; /**< core_num, device */ uint32_t unit; /**< scale down/up/min/max */ @@ -70,6 +74,7 @@ struct channel_packet { uint8_t vcpu_to_control[MAX_VCPU_PER_VM]; uint8_t num_vcpu; struct timer_profile timer_policy; + bool core_type; enum workload workload; enum policy_to_use policy_to_use; struct t_boost_status t_boost_status; diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build index 253173f2..9ed8b56d 100644 --- a/lib/librte_power/meson.build +++ b/lib/librte_power/meson.build @@ -5,5 +5,7 @@ if host_machine.system() != 'linux' build = false endif sources = files('rte_power.c', 'power_acpi_cpufreq.c', - 'power_kvm_vm.c', 'guest_channel.c') -headers = files('rte_power.h') + 'power_kvm_vm.c', 'guest_channel.c', + 'rte_power_empty_poll.c') +headers = files('rte_power.h','rte_power_empty_poll.h') +deps += ['timer'] diff --git a/lib/librte_power/rte_power_empty_poll.c b/lib/librte_power/rte_power_empty_poll.c new file mode 100644 index 00000000..e6145462 --- /dev/null +++ b/lib/librte_power/rte_power_empty_poll.c @@ -0,0 +1,545 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include + +#include +#include +#include +#include +#include + +#include "rte_power.h" +#include "rte_power_empty_poll.h" + +#define INTERVALS_PER_SECOND 100 /* (10ms) */ +#define SECONDS_TO_TRAIN_FOR 2 +#define DEFAULT_MED_TO_HIGH_PERCENT_THRESHOLD 70 +#define DEFAULT_HIGH_TO_MED_PERCENT_THRESHOLD 30 +#define DEFAULT_CYCLES_PER_PACKET 800 + +static struct ep_params *ep_params; +static uint32_t med_to_high_threshold = 
DEFAULT_MED_TO_HIGH_PERCENT_THRESHOLD; +static uint32_t high_to_med_threshold = DEFAULT_HIGH_TO_MED_PERCENT_THRESHOLD; + +static uint32_t avail_freqs[RTE_MAX_LCORE][NUM_FREQS]; + +static uint32_t total_avail_freqs[RTE_MAX_LCORE]; + +static uint32_t freq_index[NUM_FREQ]; + +static uint32_t +get_freq_index(enum freq_val index) +{ + return freq_index[index]; +} + + +static int +set_power_freq(int lcore_id, enum freq_val freq, bool specific_freq) +{ + int err = 0; + uint32_t power_freq_index; + if (!specific_freq) + power_freq_index = get_freq_index(freq); + else + power_freq_index = freq; + + err = rte_power_set_freq(lcore_id, power_freq_index); + + return err; +} + + +static inline void __attribute__((always_inline)) +exit_training_state(struct priority_worker *poll_stats) +{ + RTE_SET_USED(poll_stats); +} + +static inline void __attribute__((always_inline)) +enter_training_state(struct priority_worker *poll_stats) +{ + poll_stats->iter_counter = 0; + poll_stats->cur_freq = LOW; + poll_stats->queue_state = TRAINING; +} + +static inline void __attribute__((always_inline)) +enter_normal_state(struct priority_worker *poll_stats) +{ + /* Clear the averages arrays and strs */ + memset(poll_stats->edpi_av, 0, sizeof(poll_stats->edpi_av)); + poll_stats->ec = 0; + memset(poll_stats->ppi_av, 0, sizeof(poll_stats->ppi_av)); + poll_stats->pc = 0; + + poll_stats->cur_freq = MED; + poll_stats->iter_counter = 0; + poll_stats->threshold_ctr = 0; + poll_stats->queue_state = MED_NORMAL; + RTE_LOG(INFO, POWER, "Set the power freq to MED\n"); + set_power_freq(poll_stats->lcore_id, MED, false); + + poll_stats->thresh[MED].threshold_percent = med_to_high_threshold; + poll_stats->thresh[HGH].threshold_percent = high_to_med_threshold; +} + +static inline void __attribute__((always_inline)) +enter_busy_state(struct priority_worker *poll_stats) +{ + memset(poll_stats->edpi_av, 0, sizeof(poll_stats->edpi_av)); + poll_stats->ec = 0; + memset(poll_stats->ppi_av, 0, sizeof(poll_stats->ppi_av)); + poll_stats->pc = 0; + + poll_stats->cur_freq = HGH; + poll_stats->iter_counter = 0; + poll_stats->threshold_ctr = 0; + poll_stats->queue_state = HGH_BUSY; + set_power_freq(poll_stats->lcore_id, HGH, false); +} + +static inline void __attribute__((always_inline)) +enter_purge_state(struct priority_worker *poll_stats) +{ + poll_stats->iter_counter = 0; + poll_stats->queue_state = LOW_PURGE; +} + +static inline void __attribute__((always_inline)) +set_state(struct priority_worker *poll_stats, + enum queue_state new_state) +{ + enum queue_state old_state = poll_stats->queue_state; + if (old_state != new_state) { + + /* Call any old state exit functions */ + if (old_state == TRAINING) + exit_training_state(poll_stats); + + /* Call any new state entry functions */ + if (new_state == TRAINING) + enter_training_state(poll_stats); + if (new_state == MED_NORMAL) + enter_normal_state(poll_stats); + if (new_state == HGH_BUSY) + enter_busy_state(poll_stats); + if (new_state == LOW_PURGE) + enter_purge_state(poll_stats); + } +} + +static inline void __attribute__((always_inline)) +set_policy(struct priority_worker *poll_stats, + struct ep_policy *policy) +{ + set_state(poll_stats, policy->state); + + if (policy->state == TRAINING) + return; + + poll_stats->thresh[MED_NORMAL].base_edpi = policy->med_base_edpi; + poll_stats->thresh[HGH_BUSY].base_edpi = policy->hgh_base_edpi; + + poll_stats->thresh[MED_NORMAL].trained = true; + poll_stats->thresh[HGH_BUSY].trained = true; + +} + +static void +update_training_stats(struct priority_worker 
*poll_stats, + uint32_t freq, + bool specific_freq, + uint32_t max_train_iter) +{ + RTE_SET_USED(specific_freq); + + char pfi_str[32]; + uint64_t p0_empty_deq; + + sprintf(pfi_str, "%02d", freq); + + if (poll_stats->cur_freq == freq && + poll_stats->thresh[freq].trained == false) { + if (poll_stats->thresh[freq].cur_train_iter == 0) { + + set_power_freq(poll_stats->lcore_id, + freq, specific_freq); + + poll_stats->empty_dequeues_prev = + poll_stats->empty_dequeues; + + poll_stats->thresh[freq].cur_train_iter++; + + return; + } else if (poll_stats->thresh[freq].cur_train_iter + <= max_train_iter) { + + p0_empty_deq = poll_stats->empty_dequeues - + poll_stats->empty_dequeues_prev; + + poll_stats->empty_dequeues_prev = + poll_stats->empty_dequeues; + + poll_stats->thresh[freq].base_edpi += p0_empty_deq; + poll_stats->thresh[freq].cur_train_iter++; + + } else { + if (poll_stats->thresh[freq].trained == false) { + poll_stats->thresh[freq].base_edpi = + poll_stats->thresh[freq].base_edpi / + max_train_iter; + + /* Add on a factor of 0.05% + * this should remove any + * false negatives when the system is 0% busy + */ + poll_stats->thresh[freq].base_edpi += + poll_stats->thresh[freq].base_edpi / 2000; + + poll_stats->thresh[freq].trained = true; + poll_stats->cur_freq++; + + } + } + } +} + +static inline uint32_t __attribute__((always_inline)) +update_stats(struct priority_worker *poll_stats) +{ + uint64_t tot_edpi = 0, tot_ppi = 0; + uint32_t j, percent; + + struct priority_worker *s = poll_stats; + + uint64_t cur_edpi = s->empty_dequeues - s->empty_dequeues_prev; + + s->empty_dequeues_prev = s->empty_dequeues; + + uint64_t ppi = s->num_dequeue_pkts - s->num_dequeue_pkts_prev; + + s->num_dequeue_pkts_prev = s->num_dequeue_pkts; + + if (s->thresh[s->cur_freq].base_edpi < cur_edpi) { + + /* edpi mean empty poll counter difference per interval */ + RTE_LOG(DEBUG, POWER, "cur_edpi is too large " + "cur edpi %"PRId64" " + "base edpi %"PRId64"\n", + cur_edpi, + s->thresh[s->cur_freq].base_edpi); + /* Value to make us fail need debug log*/ + return 1000UL; + } + + s->edpi_av[s->ec++ % BINS_AV] = cur_edpi; + s->ppi_av[s->pc++ % BINS_AV] = ppi; + + for (j = 0; j < BINS_AV; j++) { + tot_edpi += s->edpi_av[j]; + tot_ppi += s->ppi_av[j]; + } + + tot_edpi = tot_edpi / BINS_AV; + + percent = 100 - (uint32_t)(((float)tot_edpi / + (float)s->thresh[s->cur_freq].base_edpi) * 100); + + return (uint32_t)percent; +} + + +static inline void __attribute__((always_inline)) +update_stats_normal(struct priority_worker *poll_stats) +{ + uint32_t percent; + + if (poll_stats->thresh[poll_stats->cur_freq].base_edpi == 0) { + + enum freq_val cur_freq = poll_stats->cur_freq; + + /* edpi mean empty poll counter difference per interval */ + RTE_LOG(DEBUG, POWER, "cure freq is %d, edpi is %"PRIu64"\n", + cur_freq, + poll_stats->thresh[cur_freq].base_edpi); + return; + } + + percent = update_stats(poll_stats); + + if (percent > 100) { + /* edpi mean empty poll counter difference per interval */ + RTE_LOG(DEBUG, POWER, "Edpi is bigger than threshold\n"); + return; + } + + if (poll_stats->cur_freq == LOW) + RTE_LOG(INFO, POWER, "Purge Mode is not currently supported\n"); + else if (poll_stats->cur_freq == MED) { + + if (percent > + poll_stats->thresh[MED].threshold_percent) { + + if (poll_stats->threshold_ctr < INTERVALS_PER_SECOND) + poll_stats->threshold_ctr++; + else { + set_state(poll_stats, HGH_BUSY); + RTE_LOG(INFO, POWER, "MOVE to HGH\n"); + } + + } else { + /* reset */ + poll_stats->threshold_ctr = 0; + } + + } else if 
(poll_stats->cur_freq == HGH) { + + if (percent < + poll_stats->thresh[HGH].threshold_percent) { + + if (poll_stats->threshold_ctr < INTERVALS_PER_SECOND) + poll_stats->threshold_ctr++; + else { + set_state(poll_stats, MED_NORMAL); + RTE_LOG(INFO, POWER, "MOVE to MED\n"); + } + } else { + /* reset */ + poll_stats->threshold_ctr = 0; + } + + } +} + +static int +empty_poll_training(struct priority_worker *poll_stats, + uint32_t max_train_iter) +{ + + if (poll_stats->iter_counter < INTERVALS_PER_SECOND) { + poll_stats->iter_counter++; + return 0; + } + + + update_training_stats(poll_stats, + LOW, + false, + max_train_iter); + + update_training_stats(poll_stats, + MED, + false, + max_train_iter); + + update_training_stats(poll_stats, + HGH, + false, + max_train_iter); + + + if (poll_stats->thresh[LOW].trained == true + && poll_stats->thresh[MED].trained == true + && poll_stats->thresh[HGH].trained == true) { + + set_state(poll_stats, MED_NORMAL); + + RTE_LOG(INFO, POWER, "LOW threshold is %"PRIu64"\n", + poll_stats->thresh[LOW].base_edpi); + + RTE_LOG(INFO, POWER, "MED threshold is %"PRIu64"\n", + poll_stats->thresh[MED].base_edpi); + + + RTE_LOG(INFO, POWER, "HIGH threshold is %"PRIu64"\n", + poll_stats->thresh[HGH].base_edpi); + + RTE_LOG(INFO, POWER, "Training is Complete for %d\n", + poll_stats->lcore_id); + } + + return 0; +} + +void __rte_experimental +rte_empty_poll_detection(struct rte_timer *tim, void *arg) +{ + + uint32_t i; + + struct priority_worker *poll_stats; + + RTE_SET_USED(tim); + + RTE_SET_USED(arg); + + for (i = 0; i < NUM_NODES; i++) { + + poll_stats = &(ep_params->wrk_data.wrk_stats[i]); + + if (rte_lcore_is_enabled(poll_stats->lcore_id) == 0) + continue; + + switch (poll_stats->queue_state) { + case(TRAINING): + empty_poll_training(poll_stats, + ep_params->max_train_iter); + break; + + case(HGH_BUSY): + case(MED_NORMAL): + update_stats_normal(poll_stats); + break; + + case(LOW_PURGE): + break; + default: + break; + + } + + } + +} + +int __rte_experimental +rte_power_empty_poll_stat_init(struct ep_params **eptr, uint8_t *freq_tlb, + struct ep_policy *policy) +{ + uint32_t i; + /* Allocate the ep_params structure */ + ep_params = rte_zmalloc_socket(NULL, + sizeof(struct ep_params), + 0, + rte_socket_id()); + + if (!ep_params) + return -1; + + if (freq_tlb == NULL) { + freq_index[LOW] = 14; + freq_index[MED] = 9; + freq_index[HGH] = 1; + } else { + freq_index[LOW] = freq_tlb[LOW]; + freq_index[MED] = freq_tlb[MED]; + freq_index[HGH] = freq_tlb[HGH]; + } + + RTE_LOG(INFO, POWER, "Initialize the Empty Poll\n"); + + /* Train for pre-defined period */ + ep_params->max_train_iter = INTERVALS_PER_SECOND * SECONDS_TO_TRAIN_FOR; + + struct stats_data *w = &ep_params->wrk_data; + + *eptr = ep_params; + + /* initialize all wrk_stats state */ + for (i = 0; i < NUM_NODES; i++) { + + if (rte_lcore_is_enabled(i) == 0) + continue; + /*init the freqs table */ + total_avail_freqs[i] = rte_power_freqs(i, + avail_freqs[i], + NUM_FREQS); + + RTE_LOG(INFO, POWER, "total avail freq is %d , lcoreid %d\n", + total_avail_freqs[i], + i); + + if (get_freq_index(LOW) > total_avail_freqs[i]) + return -1; + + if (rte_get_master_lcore() != i) { + w->wrk_stats[i].lcore_id = i; + set_policy(&w->wrk_stats[i], policy); + } + } + + return 0; +} + +void __rte_experimental +rte_power_empty_poll_stat_free(void) +{ + + RTE_LOG(INFO, POWER, "Close the Empty Poll\n"); + + if (ep_params != NULL) + rte_free(ep_params); +} + +int __rte_experimental +rte_power_empty_poll_stat_update(unsigned int lcore_id) +{ + struct 
priority_worker *poll_stats; + + if (lcore_id >= NUM_NODES) + return -1; + + poll_stats = &(ep_params->wrk_data.wrk_stats[lcore_id]); + + if (poll_stats->lcore_id == 0) + poll_stats->lcore_id = lcore_id; + + poll_stats->empty_dequeues++; + + return 0; +} + +int __rte_experimental +rte_power_poll_stat_update(unsigned int lcore_id, uint8_t nb_pkt) +{ + + struct priority_worker *poll_stats; + + if (lcore_id >= NUM_NODES) + return -1; + + poll_stats = &(ep_params->wrk_data.wrk_stats[lcore_id]); + + if (poll_stats->lcore_id == 0) + poll_stats->lcore_id = lcore_id; + + poll_stats->num_dequeue_pkts += nb_pkt; + + return 0; +} + + +uint64_t __rte_experimental +rte_power_empty_poll_stat_fetch(unsigned int lcore_id) +{ + struct priority_worker *poll_stats; + + if (lcore_id >= NUM_NODES) + return -1; + + poll_stats = &(ep_params->wrk_data.wrk_stats[lcore_id]); + + if (poll_stats->lcore_id == 0) + poll_stats->lcore_id = lcore_id; + + return poll_stats->empty_dequeues; +} + +uint64_t __rte_experimental +rte_power_poll_stat_fetch(unsigned int lcore_id) +{ + struct priority_worker *poll_stats; + + if (lcore_id >= NUM_NODES) + return -1; + + poll_stats = &(ep_params->wrk_data.wrk_stats[lcore_id]); + + if (poll_stats->lcore_id == 0) + poll_stats->lcore_id = lcore_id; + + return poll_stats->num_dequeue_pkts; +} diff --git a/lib/librte_power/rte_power_empty_poll.h b/lib/librte_power/rte_power_empty_poll.h new file mode 100644 index 00000000..c1ad5c24 --- /dev/null +++ b/lib/librte_power/rte_power_empty_poll.h @@ -0,0 +1,223 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#ifndef _RTE_EMPTY_POLL_H +#define _RTE_EMPTY_POLL_H + +/** + * @file + * RTE Power Management + */ +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define NUM_FREQS RTE_MAX_LCORE_FREQS + +#define BINS_AV 4 /* Has to be ^2 */ + +#define DROP (NUM_DIRECTIONS * NUM_DEVICES) + +#define NUM_PRIORITIES 2 + +#define NUM_NODES 256 /* Max core number*/ + +/* Processor Power State */ +enum freq_val { + LOW, + MED, + HGH, + NUM_FREQ = NUM_FREQS +}; + + +/* Queue Polling State */ +enum queue_state { + TRAINING, /* NO TRAFFIC */ + MED_NORMAL, /* MED */ + HGH_BUSY, /* HIGH */ + LOW_PURGE, /* LOW */ +}; + +/* Queue Stats */ +struct freq_threshold { + + uint64_t base_edpi; + bool trained; + uint32_t threshold_percent; + uint32_t cur_train_iter; +}; + +/* Each Worder Thread Empty Poll Stats */ +struct priority_worker { + + /* Current dequeue and throughput counts */ + /* These 2 are written to by the worker threads */ + /* So keep them on their own cache line */ + uint64_t empty_dequeues; + uint64_t num_dequeue_pkts; + + enum queue_state queue_state; + + uint64_t empty_dequeues_prev; + uint64_t num_dequeue_pkts_prev; + + /* Used for training only */ + struct freq_threshold thresh[NUM_FREQ]; + enum freq_val cur_freq; + + /* bucket arrays to calculate the averages */ + /* edpi mean empty poll counter difference per interval */ + uint64_t edpi_av[BINS_AV]; + /* empty poll counter */ + uint32_t ec; + /* ppi mean valid poll counter per interval */ + uint64_t ppi_av[BINS_AV]; + /* valid poll counter */ + uint32_t pc; + + uint32_t lcore_id; + uint32_t iter_counter; + uint32_t threshold_ctr; + uint32_t display_ctr; + uint8_t dev_id; + +} __rte_cache_aligned; + + +struct stats_data { + + struct priority_worker wrk_stats[NUM_NODES]; + + /* flag to stop rx threads processing packets until training over */ + bool start_rx; + +}; + +/* Empty 
Poll Parameters */ +struct ep_params { + + /* Timer related stuff */ + uint64_t interval_ticks; + uint32_t max_train_iter; + + struct rte_timer timer0; + struct stats_data wrk_data; +}; + + +/* Sample App Init information */ +struct ep_policy { + + uint64_t med_base_edpi; + uint64_t hgh_base_edpi; + + enum queue_state state; +}; + + + +/** + * Initialize the power management system. + * + * @param eptr + * the structure of empty poll configuration + * @param freq_tlb + * the power state/frequency mapping table + * @param policy + * the initialization policy from sample app + * + * @return + * - 0 on success. + * - Negative on error. + */ +int __rte_experimental +rte_power_empty_poll_stat_init(struct ep_params **eptr, uint8_t *freq_tlb, + struct ep_policy *policy); + +/** + * Free the resource hold by power management system. + */ +void __rte_experimental +rte_power_empty_poll_stat_free(void); + +/** + * Update specific core empty poll counter + * It's not thread safe. + * + * @param lcore_id + * lcore id + * + * @return + * - 0 on success. + * - Negative on error. + */ +int __rte_experimental +rte_power_empty_poll_stat_update(unsigned int lcore_id); + +/** + * Update specific core valid poll counter, not thread safe. + * + * @param lcore_id + * lcore id. + * @param nb_pkt + * The packet number of one valid poll. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int __rte_experimental +rte_power_poll_stat_update(unsigned int lcore_id, uint8_t nb_pkt); + +/** + * Fetch specific core empty poll counter. + * + * @param lcore_id + * lcore id + * + * @return + * Current lcore empty poll counter value. + */ +uint64_t __rte_experimental +rte_power_empty_poll_stat_fetch(unsigned int lcore_id); + +/** + * Fetch specific core valid poll counter. + * + * @param lcore_id + * lcore id + * + * @return + * Current lcore valid poll counter value. 
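To make the intended call flow of this experimental API concrete, the following sketch wires the pieces together: one-off initialisation with a training policy, the detection callback driven from an rte_timer roughly every 10 ms, and per-poll counter updates in a worker's receive loop. Only the rte_power_empty_poll_*/rte_empty_poll_detection calls come from this library; the port/queue ids, burst size and timer period are illustrative, and rte_power_init() plus rte_timer_subsystem_init() are assumed to have been called already.

    #include <rte_cycles.h>
    #include <rte_ethdev.h>
    #include <rte_lcore.h>
    #include <rte_mbuf.h>
    #include <rte_timer.h>
    #include <rte_power_empty_poll.h>

    static struct ep_params *ep_ptr;
    static struct rte_timer ep_timer;

    /* One-off setup: train from scratch, default frequency indexes. */
    static int
    empty_poll_setup(void)
    {
        struct ep_policy policy = { .state = TRAINING };

        if (rte_power_empty_poll_stat_init(&ep_ptr, NULL, &policy) < 0)
            return -1;

        /* Run the detection callback roughly every 10 ms (illustrative). */
        rte_timer_init(&ep_timer);
        return rte_timer_reset(&ep_timer, rte_get_timer_hz() / 100,
                PERIODICAL, rte_lcore_id(),
                rte_empty_poll_detection, NULL);
    }

    /* Per-iteration accounting in a worker's receive loop (port 0, queue 0). */
    static void
    worker_poll_once(void)
    {
        struct rte_mbuf *pkts[32];
        uint16_t nb_rx = rte_eth_rx_burst(0, 0, pkts, 32);

        if (nb_rx == 0)
            rte_power_empty_poll_stat_update(rte_lcore_id());
        else
            rte_power_poll_stat_update(rte_lcore_id(), nb_rx);
        /* ... process and free pkts here ... */
    }

As with any librte_timer user, the callback only fires if the application also calls rte_timer_manage() periodically on the lcore that armed the timer. Once training completes, the library moves between the MED and HGH operating points using the 70 %/30 % busyness thresholds defined above.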
+ */ +uint64_t __rte_experimental +rte_power_poll_stat_fetch(unsigned int lcore_id); + +/** + * Empty poll state change detection function + * + * @param tim + * The timer structure + * @param arg + * The customized parameter + */ +void __rte_experimental +rte_empty_poll_detection(struct rte_timer *tim, void *arg); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/librte_power/rte_power_version.map b/lib/librte_power/rte_power_version.map index dd587dfb..17a083b2 100644 --- a/lib/librte_power/rte_power_version.map +++ b/lib/librte_power/rte_power_version.map @@ -33,3 +33,16 @@ DPDK_18.08 { rte_power_get_capabilities; } DPDK_17.11; + +EXPERIMENTAL { + global: + + rte_empty_poll_detection; + rte_power_empty_poll_stat_fetch; + rte_power_empty_poll_stat_free; + rte_power_empty_poll_stat_init; + rte_power_empty_poll_stat_update; + rte_power_poll_stat_fetch; + rte_power_poll_stat_update; + +}; diff --git a/lib/librte_rawdev/rte_rawdev.c b/lib/librte_rawdev/rte_rawdev.c index 62b6b97e..9f1e3592 100644 --- a/lib/librte_rawdev/rte_rawdev.c +++ b/lib/librte_rawdev/rte_rawdev.c @@ -35,21 +35,19 @@ /* dynamic log identifier */ int librawdev_logtype; -struct rte_rawdev rte_rawdevices[RTE_RAWDEV_MAX_DEVS]; +static struct rte_rawdev rte_rawdevices[RTE_RAWDEV_MAX_DEVS]; -struct rte_rawdev *rte_rawdevs = &rte_rawdevices[0]; +struct rte_rawdev *rte_rawdevs = rte_rawdevices; static struct rte_rawdev_global rawdev_globals = { .nb_devs = 0 }; -struct rte_rawdev_global *rte_rawdev_globals = &rawdev_globals; - /* Raw device, northbound API implementation */ uint8_t rte_rawdev_count(void) { - return rte_rawdev_globals->nb_devs; + return rawdev_globals.nb_devs; } uint16_t @@ -60,7 +58,7 @@ rte_rawdev_get_dev_id(const char *name) if (!name) return -EINVAL; - for (i = 0; i < rte_rawdev_globals->nb_devs; i++) + for (i = 0; i < rawdev_globals.nb_devs; i++) if ((strcmp(rte_rawdevices[i].name, name) == 0) && (rte_rawdevices[i].attached == diff --git a/lib/librte_rawdev/rte_rawdev_pmd.h b/lib/librte_rawdev/rte_rawdev_pmd.h index bb9bbc35..811e51d0 100644 --- a/lib/librte_rawdev/rte_rawdev_pmd.h +++ b/lib/librte_rawdev/rte_rawdev_pmd.h @@ -73,8 +73,6 @@ struct rte_rawdev_global { uint16_t nb_devs; }; -extern struct rte_rawdev_global *rte_rawdev_globals; -/** Pointer to global raw devices data structure. */ extern struct rte_rawdev *rte_rawdevs; /** The pool of rte_rawdev structures. */ diff --git a/lib/librte_ring/meson.build b/lib/librte_ring/meson.build index ca8a435e..ab8b0b46 100644 --- a/lib/librte_ring/meson.build +++ b/lib/librte_ring/meson.build @@ -1,6 +1,7 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2017 Intel Corporation +version = 2 sources = files('rte_ring.c') headers = files('rte_ring.h', 'rte_ring_c11_mem.h', diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h index 7a731d07..af5444a9 100644 --- a/lib/librte_ring/rte_ring.h +++ b/lib/librte_ring/rte_ring.h @@ -303,11 +303,11 @@ void rte_ring_dump(FILE *f, const struct rte_ring *r); * There are 2 choices for the users * 1.use rmb() memory barrier * 2.use one-direcion load_acquire/store_release barrier,defined by - * CONFIG_RTE_RING_USE_C11_MEM_MODEL=y + * CONFIG_RTE_USE_C11_MEM_MODEL=y * It depends on performance test results. 
* By default, move common functions to rte_ring_generic.h */ -#ifdef RTE_RING_USE_C11_MEM_MODEL +#ifdef RTE_USE_C11_MEM_MODEL #include "rte_ring_c11_mem.h" #else #include "rte_ring_generic.h" diff --git a/lib/librte_sched/Makefile b/lib/librte_sched/Makefile index 55d9c698..46c53ed7 100644 --- a/lib/librte_sched/Makefile +++ b/lib/librte_sched/Makefile @@ -11,8 +11,6 @@ LIB = librte_sched.a CFLAGS += -O3 CFLAGS += $(WERROR_FLAGS) -CFLAGS_rte_red.o := -D_GNU_SOURCE - LDLIBS += -lm LDLIBS += -lrt LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_net diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c index 9269e5c7..587d5e60 100644 --- a/lib/librte_sched/rte_sched.c +++ b/lib/librte_sched/rte_sched.c @@ -329,7 +329,7 @@ rte_sched_port_check_params(struct rte_sched_port_params *params) return -1; /* socket */ - if ((params->socket < 0) || (params->socket >= RTE_MAX_NUMA_NODES)) + if (params->socket < 0) return -3; /* rate */ @@ -633,7 +633,8 @@ rte_sched_port_config(struct rte_sched_port_params *params) return NULL; /* Allocate memory to store the data structures */ - port = rte_zmalloc("qos_params", mem_size, RTE_CACHE_LINE_SIZE); + port = rte_zmalloc_socket("qos_params", mem_size, RTE_CACHE_LINE_SIZE, + params->socket); if (port == NULL) return NULL; diff --git a/lib/librte_security/rte_security.c b/lib/librte_security/rte_security.c index 1954960a..c6355de9 100644 --- a/lib/librte_security/rte_security.c +++ b/lib/librte_security/rte_security.c @@ -131,6 +131,10 @@ rte_security_capability_get(struct rte_security_ctx *instance, capability->ipsec.direction == idx->ipsec.direction) return capability; + } else if (idx->protocol == RTE_SECURITY_PROTOCOL_PDCP) { + if (capability->pdcp.domain == + idx->pdcp.domain) + return capability; } } } diff --git a/lib/librte_security/rte_security.h b/lib/librte_security/rte_security.h index b0d1b97e..1431b4df 100644 --- a/lib/librte_security/rte_security.h +++ b/lib/librte_security/rte_security.h @@ -206,6 +206,64 @@ struct rte_security_macsec_xform { int dummy; }; +/** + * PDCP Mode of session + */ +enum rte_security_pdcp_domain { + RTE_SECURITY_PDCP_MODE_CONTROL, /**< PDCP control plane */ + RTE_SECURITY_PDCP_MODE_DATA, /**< PDCP data plane */ +}; + +/** PDCP Frame direction */ +enum rte_security_pdcp_direction { + RTE_SECURITY_PDCP_UPLINK, /**< Uplink */ + RTE_SECURITY_PDCP_DOWNLINK, /**< Downlink */ +}; + +/** PDCP Sequence Number Size selectors */ +enum rte_security_pdcp_sn_size { + /** PDCP_SN_SIZE_5: 5bit sequence number */ + RTE_SECURITY_PDCP_SN_SIZE_5 = 5, + /** PDCP_SN_SIZE_7: 7bit sequence number */ + RTE_SECURITY_PDCP_SN_SIZE_7 = 7, + /** PDCP_SN_SIZE_12: 12bit sequence number */ + RTE_SECURITY_PDCP_SN_SIZE_12 = 12, + /** PDCP_SN_SIZE_15: 15bit sequence number */ + RTE_SECURITY_PDCP_SN_SIZE_15 = 15, + /** PDCP_SN_SIZE_18: 18bit sequence number */ + RTE_SECURITY_PDCP_SN_SIZE_18 = 18 +}; + +/** + * PDCP security association configuration data. + * + * This structure contains data required to create a PDCP security session. + */ +struct rte_security_pdcp_xform { + int8_t bearer; /**< PDCP bearer ID */ + /** Enable in order delivery, this field shall be set only if + * driver/HW is capable. See RTE_SECURITY_PDCP_ORDERING_CAP. + */ + uint8_t en_ordering; + /** Notify driver/HW to detect and remove duplicate packets. + * This field should be set only when driver/hw is capable. + * See RTE_SECURITY_PDCP_DUP_DETECT_CAP. 
+ */ + uint8_t remove_duplicates; + /** PDCP mode of operation: Control or data */ + enum rte_security_pdcp_domain domain; + /** PDCP Frame Direction 0:UL 1:DL */ + enum rte_security_pdcp_direction pkt_dir; + /** Sequence number size, 5/7/12/15/18 */ + enum rte_security_pdcp_sn_size sn_size; + /** Starting Hyper Frame Number to be used together with the SN + * from the PDCP frames + */ + uint32_t hfn; + /** HFN Threshold for key renegotiation */ + uint32_t hfn_threshold; +}; + /** * Security session action type. */ @@ -232,6 +290,8 @@ enum rte_security_session_protocol { /**< IPsec Protocol */ RTE_SECURITY_PROTOCOL_MACSEC, /**< MACSec Protocol */ + RTE_SECURITY_PROTOCOL_PDCP, + /**< PDCP Protocol */ }; /** @@ -246,6 +306,7 @@ struct rte_security_session_conf { union { struct rte_security_ipsec_xform ipsec; struct rte_security_macsec_xform macsec; + struct rte_security_pdcp_xform pdcp; }; /**< Configuration parameters for security session */ struct rte_crypto_sym_xform *crypto_xform; @@ -413,6 +474,10 @@ struct rte_security_ipsec_stats { }; +struct rte_security_pdcp_stats { + uint64_t reserved; +}; + struct rte_security_stats { enum rte_security_session_protocol protocol; /**< Security protocol to be configured */ @@ -421,6 +486,7 @@ struct rte_security_stats { union { struct rte_security_macsec_stats macsec; struct rte_security_ipsec_stats ipsec; + struct rte_security_pdcp_stats pdcp; }; }; @@ -465,6 +531,13 @@ struct rte_security_capability { int dummy; } macsec; /**< MACsec capability */ + struct { + enum rte_security_pdcp_domain domain; + /**< PDCP mode of operation: Control or data */ + uint32_t capa_flags; + /**< Capabilitity flags, see RTE_SECURITY_PDCP_* */ + } pdcp; + /**< PDCP capability */ }; const struct rte_cryptodev_capabilities *crypto_capabilities; @@ -474,6 +547,19 @@ struct rte_security_capability { /**< Device offload flags */ }; +/** Underlying Hardware/driver which support PDCP may or may not support + * packet ordering. Set RTE_SECURITY_PDCP_ORDERING_CAP if it support. + * If it is not set, driver/HW assumes packets received are in order + * and it will be application's responsibility to maintain ordering. + */ +#define RTE_SECURITY_PDCP_ORDERING_CAP 0x00000001 + +/** Underlying Hardware/driver which support PDCP may or may not detect + * duplicate packet. Set RTE_SECURITY_PDCP_DUP_DETECT_CAP if it support. + * If it is not set, driver/HW assumes there is no duplicate packet received. + */ +#define RTE_SECURITY_PDCP_DUP_DETECT_CAP 0x00000002 + #define RTE_SECURITY_TX_OLOAD_NEED_MDATA 0x00000001 /**< HW needs metadata update, see rte_security_set_pkt_metadata(). 
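To show how an application would consume the new PDCP plumbing, here is a hedged sketch of creating a lookaside-protocol PDCP control-plane session. The security context, the crypto transform chain and the session mempool are assumed to exist already, and the bearer/HFN numbers are arbitrary example values rather than recommendations.

    #include <rte_crypto.h>
    #include <rte_mempool.h>
    #include <rte_security.h>

    /* Sketch only: ctx, cipher_auth_xform and sess_pool are assumed to be
     * created elsewhere (device security context, sym xform chain, session
     * mempool). */
    static struct rte_security_session *
    create_pdcp_cplane_session(struct rte_security_ctx *ctx,
        struct rte_crypto_sym_xform *cipher_auth_xform,
        struct rte_mempool *sess_pool)
    {
        struct rte_security_session_conf conf = {
            .action_type = RTE_SECURITY_ACTION_TYPE_LOOKASIDE_PROTOCOL,
            .protocol = RTE_SECURITY_PROTOCOL_PDCP,
            .pdcp = {
                .bearer = 0x3,                      /* example bearer id */
                .domain = RTE_SECURITY_PDCP_MODE_CONTROL,
                .pkt_dir = RTE_SECURITY_PDCP_UPLINK,
                .sn_size = RTE_SECURITY_PDCP_SN_SIZE_12,
                .hfn = 0x1,
                .hfn_threshold = 0xfffff,
            },
            .crypto_xform = cipher_auth_xform,
        };

        return rte_security_session_create(ctx, &conf, sess_pool);
    }

A PMD advertises support through a capability entry with protocol RTE_SECURITY_PROTOCOL_PDCP, which rte_security_capability_get() can now match on pdcp.domain, as the lookup added earlier in this patch shows.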
*/ @@ -506,6 +592,10 @@ struct rte_security_capability_idx { enum rte_security_ipsec_sa_mode mode; enum rte_security_ipsec_sa_direction direction; } ipsec; + struct { + enum rte_security_pdcp_domain domain; + uint32_t capa_flags; + } pdcp; }; }; diff --git a/lib/librte_table/Makefile b/lib/librte_table/Makefile index 276d476a..f935678a 100644 --- a/lib/librte_table/Makefile +++ b/lib/librte_table/Makefile @@ -46,6 +46,8 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_acl.h endif SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_hash.h SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_hash_cuckoo.h +SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_hash_func.h +SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_hash_func_arm64.h SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_lru.h ifeq ($(CONFIG_RTE_ARCH_X86),y) SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_lru_x86.h diff --git a/lib/librte_table/meson.build b/lib/librte_table/meson.build index 8b2f8413..6ae3cd6c 100644 --- a/lib/librte_table/meson.build +++ b/lib/librte_table/meson.build @@ -19,6 +19,8 @@ headers = files('rte_table.h', 'rte_table_lpm_ipv6.h', 'rte_table_hash.h', 'rte_table_hash_cuckoo.h', + 'rte_table_hash_func.h', + 'rte_table_hash_func_arm64.h', 'rte_lru.h', 'rte_table_array.h', 'rte_table_stub.h') diff --git a/lib/librte_table/rte_table_hash_func.h b/lib/librte_table/rte_table_hash_func.h new file mode 100644 index 00000000..02296eab --- /dev/null +++ b/lib/librte_table/rte_table_hash_func.h @@ -0,0 +1,245 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_TABLE_HASH_FUNC_H__ +#define __INCLUDE_RTE_TABLE_HASH_FUNC_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include + +static inline uint64_t __rte_experimental +rte_crc32_u64_generic(uint64_t crc, uint64_t value) +{ + int i; + + crc = (crc & 0xFFFFFFFFLLU) ^ value; + for (i = 63; i >= 0; i--) { + uint64_t mask; + + mask = -(crc & 1LLU); + crc = (crc >> 1LLU) ^ (0x82F63B78LLU & mask); + } + + return crc; +} + +#if defined(RTE_ARCH_X86_64) + +#include + +static inline uint64_t +rte_crc32_u64(uint64_t crc, uint64_t v) +{ + return _mm_crc32_u64(crc, v); +} + +#elif defined(RTE_ARCH_ARM64) +#include "rte_table_hash_func_arm64.h" +#else + +static inline uint64_t +rte_crc32_u64(uint64_t crc, uint64_t v) +{ + return rte_crc32_u64_generic(crc, v); +} + +#endif + +static inline uint64_t __rte_experimental +rte_table_hash_crc_key8(void *key, void *mask, __rte_unused uint32_t key_size, + uint64_t seed) +{ + uint64_t *k = key; + uint64_t *m = mask; + uint64_t crc0; + + crc0 = rte_crc32_u64(seed, k[0] & m[0]); + + return crc0; +} + +static inline uint64_t __rte_experimental +rte_table_hash_crc_key16(void *key, void *mask, __rte_unused uint32_t key_size, + uint64_t seed) +{ + uint64_t *k = key; + uint64_t *m = mask; + uint64_t k0, crc0, crc1; + + k0 = k[0] & m[0]; + + crc0 = rte_crc32_u64(k0, seed); + crc1 = rte_crc32_u64(k0 >> 32, k[1] & m[1]); + + crc0 ^= crc1; + + return crc0; +} + +static inline uint64_t __rte_experimental +rte_table_hash_crc_key24(void *key, void *mask, __rte_unused uint32_t key_size, + uint64_t seed) +{ + uint64_t *k = key; + uint64_t *m = mask; + uint64_t k0, k2, crc0, crc1; + + k0 = k[0] & m[0]; + k2 = k[2] & m[2]; + + crc0 = rte_crc32_u64(k0, seed); + crc1 = rte_crc32_u64(k0 >> 32, k[1] & m[1]); + + crc0 = rte_crc32_u64(crc0, k2); + + crc0 ^= crc1; + + return crc0; +} + +static inline uint64_t __rte_experimental 
+rte_table_hash_crc_key32(void *key, void *mask, __rte_unused uint32_t key_size, + uint64_t seed) +{ + uint64_t *k = key; + uint64_t *m = mask; + uint64_t k0, k2, crc0, crc1, crc2, crc3; + + k0 = k[0] & m[0]; + k2 = k[2] & m[2]; + + crc0 = rte_crc32_u64(k0, seed); + crc1 = rte_crc32_u64(k0 >> 32, k[1] & m[1]); + + crc2 = rte_crc32_u64(k2, k[3] & m[3]); + crc3 = k2 >> 32; + + crc0 = rte_crc32_u64(crc0, crc1); + crc1 = rte_crc32_u64(crc2, crc3); + + crc0 ^= crc1; + + return crc0; +} + +static inline uint64_t __rte_experimental +rte_table_hash_crc_key40(void *key, void *mask, __rte_unused uint32_t key_size, + uint64_t seed) +{ + uint64_t *k = key; + uint64_t *m = mask; + uint64_t k0, k2, crc0, crc1, crc2, crc3; + + k0 = k[0] & m[0]; + k2 = k[2] & m[2]; + + crc0 = rte_crc32_u64(k0, seed); + crc1 = rte_crc32_u64(k0 >> 32, k[1] & m[1]); + + crc2 = rte_crc32_u64(k2, k[3] & m[3]); + crc3 = rte_crc32_u64(k2 >> 32, k[4] & m[4]); + + crc0 = rte_crc32_u64(crc0, crc1); + crc1 = rte_crc32_u64(crc2, crc3); + + crc0 ^= crc1; + + return crc0; +} + +static inline uint64_t __rte_experimental +rte_table_hash_crc_key48(void *key, void *mask, __rte_unused uint32_t key_size, + uint64_t seed) +{ + uint64_t *k = key; + uint64_t *m = mask; + uint64_t k0, k2, k5, crc0, crc1, crc2, crc3; + + k0 = k[0] & m[0]; + k2 = k[2] & m[2]; + k5 = k[5] & m[5]; + + crc0 = rte_crc32_u64(k0, seed); + crc1 = rte_crc32_u64(k0 >> 32, k[1] & m[1]); + + crc2 = rte_crc32_u64(k2, k[3] & m[3]); + crc3 = rte_crc32_u64(k2 >> 32, k[4] & m[4]); + + crc0 = rte_crc32_u64(crc0, (crc1 << 32) ^ crc2); + crc1 = rte_crc32_u64(crc3, k5); + + crc0 ^= crc1; + + return crc0; +} + +static inline uint64_t __rte_experimental +rte_table_hash_crc_key56(void *key, void *mask, __rte_unused uint32_t key_size, + uint64_t seed) +{ + uint64_t *k = key; + uint64_t *m = mask; + uint64_t k0, k2, k5, crc0, crc1, crc2, crc3, crc4, crc5; + + k0 = k[0] & m[0]; + k2 = k[2] & m[2]; + k5 = k[5] & m[5]; + + crc0 = rte_crc32_u64(k0, seed); + crc1 = rte_crc32_u64(k0 >> 32, k[1] & m[1]); + + crc2 = rte_crc32_u64(k2, k[3] & m[3]); + crc3 = rte_crc32_u64(k2 >> 32, k[4] & m[4]); + + crc4 = rte_crc32_u64(k5, k[6] & m[6]); + crc5 = k5 >> 32; + + crc0 = rte_crc32_u64(crc0, (crc1 << 32) ^ crc2); + crc1 = rte_crc32_u64(crc3, (crc4 << 32) ^ crc5); + + crc0 ^= crc1; + + return crc0; +} + +static inline uint64_t __rte_experimental +rte_table_hash_crc_key64(void *key, void *mask, __rte_unused uint32_t key_size, + uint64_t seed) +{ + uint64_t *k = key; + uint64_t *m = mask; + uint64_t k0, k2, k5, crc0, crc1, crc2, crc3, crc4, crc5; + + k0 = k[0] & m[0]; + k2 = k[2] & m[2]; + k5 = k[5] & m[5]; + + crc0 = rte_crc32_u64(k0, seed); + crc1 = rte_crc32_u64(k0 >> 32, k[1] & m[1]); + + crc2 = rte_crc32_u64(k2, k[3] & m[3]); + crc3 = rte_crc32_u64(k2 >> 32, k[4] & m[4]); + + crc4 = rte_crc32_u64(k5, k[6] & m[6]); + crc5 = rte_crc32_u64(k5 >> 32, k[7] & m[7]); + + crc0 = rte_crc32_u64(crc0, (crc1 << 32) ^ crc2); + crc1 = rte_crc32_u64(crc3, (crc4 << 32) ^ crc5); + + crc0 ^= crc1; + + return crc0; +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/librte_table/rte_table_hash_func_arm64.h b/lib/librte_table/rte_table_hash_func_arm64.h new file mode 100644 index 00000000..eb04c1ff --- /dev/null +++ b/lib/librte_table/rte_table_hash_func_arm64.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Linaro Limited + */ + +#ifndef __INCLUDE_RTE_TABLE_HASH_FUNC_ARM64_H__ +#define __INCLUDE_RTE_TABLE_HASH_FUNC_ARM64_H__ + +#define _CRC32CX(crc, val) \ + 
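These mask-aware CRC helpers are intended to be plugged into the pipeline hash tables as the f_hash callback. The fragment below is one plausible hookup for a 16-byte key; the sizes and offsets are placeholders for illustration only, not tuning guidance.

    #include <rte_table_hash.h>
    #include <rte_table_hash_func.h>

    /* Illustrative parameters for a 16-byte-key exact-match table. */
    static struct rte_table_hash_params example_hash_params = {
        .name = "example_flow_table",
        .key_size = 16,
        .key_offset = 0,     /* key starts at offset 0 of the mbuf metadata */
        .key_mask = NULL,    /* NULL: all key bytes significant */
        .n_keys = 1 << 16,
        .n_buckets = 1 << 14,
        .f_hash = rte_table_hash_crc_key16,   /* new CRC-based hash */
        .seed = 0,
    };

Such parameters would then typically be handed to one of the hash table ops (for example rte_table_hash_ext_ops or rte_table_hash_cuckoo_ops) when the pipeline table is created.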
__asm__("crc32cx %w[c], %w[c], %x[v]":[c] "+r" (crc):[v] "r" (val)) + +static inline uint64_t +rte_crc32_u64(uint64_t crc, uint64_t v) +{ + uint32_t crc32 = crc; + + _CRC32CX(crc32, v); + + return crc32; +} + +#endif diff --git a/lib/librte_telemetry/Makefile b/lib/librte_telemetry/Makefile new file mode 100644 index 00000000..1a050691 --- /dev/null +++ b/lib/librte_telemetry/Makefile @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_telemetry.a + +CFLAGS += -O3 +CFLAGS += -I$(SRCDIR) +CFLAGS += -DALLOW_EXPERIMENTAL_API + +LDLIBS += -lrte_eal -lrte_ethdev +LDLIBS += -lrte_metrics +LDLIBS += -lpthread +LDLIBS += -ljansson + +EXPORT_MAP := rte_telemetry_version.map + +LIBABIVER := 1 + +# library source files +SRCS-$(CONFIG_RTE_LIBRTE_TELEMETRY) := rte_telemetry.c +SRCS-$(CONFIG_RTE_LIBRTE_TELEMETRY) += rte_telemetry_parser.c +SRCS-$(CONFIG_RTE_LIBRTE_TELEMETRY) += rte_telemetry_parser_test.c + +# export include files +SYMLINK-$(CONFIG_RTE_LIBRTE_TELEMETRY)-include := rte_telemetry.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_telemetry/meson.build b/lib/librte_telemetry/meson.build new file mode 100644 index 00000000..9492f544 --- /dev/null +++ b/lib/librte_telemetry/meson.build @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Intel Corporation + +sources = files('rte_telemetry.c', 'rte_telemetry_parser.c', 'rte_telemetry_parser_test.c') +headers = files('rte_telemetry.h', 'rte_telemetry_internal.h', 'rte_telemetry_parser.h', 'rte_telemetry_parser_test.h') +deps += ['metrics', 'ethdev'] +cflags += '-DALLOW_EXPERIMENTAL_API' + +jansson = cc.find_library('jansson', required: false) +if jansson.found() + ext_deps += jansson + dpdk_app_link_libraries += ['telemetry'] +else + build = false +endif diff --git a/lib/librte_telemetry/rte_telemetry.c b/lib/librte_telemetry/rte_telemetry.c new file mode 100644 index 00000000..016431f1 --- /dev/null +++ b/lib/librte_telemetry/rte_telemetry.c @@ -0,0 +1,1813 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "rte_telemetry.h" +#include "rte_telemetry_internal.h" +#include "rte_telemetry_parser.h" +#include "rte_telemetry_parser_test.h" +#include "rte_telemetry_socket_tests.h" + +#define BUF_SIZE 1024 +#define ACTION_POST 1 +#define SLEEP_TIME 10 + +#define SELFTEST_VALID_CLIENT "/var/run/dpdk/valid_client" +#define SELFTEST_INVALID_CLIENT "/var/run/dpdk/invalid_client" +#define SOCKET_TEST_CLIENT_PATH "/var/run/dpdk/client" + +static telemetry_impl *static_telemetry; + +struct telemetry_message_test { + char *test_name; + int (*test_func_ptr)(struct telemetry_impl *telemetry, int fd); +}; + +struct json_data { + char *status_code; + char *data; + int port; + char *stat_name; + int stat_value; +}; + +static void +rte_telemetry_get_runtime_dir(char *socket_path, size_t size) +{ + snprintf(socket_path, size, "%s/telemetry", rte_eal_get_runtime_dir()); +} + +int32_t +rte_telemetry_is_port_active(int port_id) +{ + int ret; + + ret = rte_eth_find_next(port_id); + if (ret == port_id) + return 1; + + TELEMETRY_LOG_ERR("port_id: %d is invalid, not active", + port_id); + + return 0; +} + +static int32_t +rte_telemetry_update_metrics_ethdev(struct telemetry_impl *telemetry, + uint16_t port_id, int reg_start_index) +{ + int ret, num_xstats, i; + 
struct rte_eth_xstat *eth_xstats; + + if (!rte_eth_dev_is_valid_port(port_id)) { + TELEMETRY_LOG_ERR("port_id: %d is invalid", port_id); + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + } + + ret = rte_telemetry_is_port_active(port_id); + if (ret < 1) { + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + } + + num_xstats = rte_eth_xstats_get(port_id, NULL, 0); + if (num_xstats < 0) { + TELEMETRY_LOG_ERR("rte_eth_xstats_get(%u) failed: %d", port_id, + num_xstats); + ret = rte_telemetry_send_error_response(telemetry, -EPERM); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + } + + eth_xstats = malloc(sizeof(struct rte_eth_xstat) * num_xstats); + if (eth_xstats == NULL) { + TELEMETRY_LOG_ERR("Failed to malloc memory for xstats"); + ret = rte_telemetry_send_error_response(telemetry, -ENOMEM); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + } + + ret = rte_eth_xstats_get(port_id, eth_xstats, num_xstats); + if (ret < 0 || ret > num_xstats) { + free(eth_xstats); + TELEMETRY_LOG_ERR("rte_eth_xstats_get(%u) len%i failed: %d", + port_id, num_xstats, ret); + ret = rte_telemetry_send_error_response(telemetry, -EPERM); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + } + + uint64_t xstats_values[num_xstats]; + for (i = 0; i < num_xstats; i++) + xstats_values[i] = eth_xstats[i].value; + + ret = rte_metrics_update_values(port_id, reg_start_index, xstats_values, + num_xstats); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not update metrics values"); + ret = rte_telemetry_send_error_response(telemetry, -EPERM); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + free(eth_xstats); + return -1; + } + + free(eth_xstats); + return 0; +} + +int32_t +rte_telemetry_write_to_socket(struct telemetry_impl *telemetry, + const char *json_string) +{ + int ret; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Could not initialise TELEMETRY_API"); + return -1; + } + + if (telemetry->request_client == NULL) { + TELEMETRY_LOG_ERR("No client has been chosen to write to"); + return -1; + } + + if (json_string == NULL) { + TELEMETRY_LOG_ERR("Invalid JSON string!"); + return -1; + } + + ret = send(telemetry->request_client->fd, + json_string, strlen(json_string), 0); + if (ret < 0) { + TELEMETRY_LOG_ERR("Failed to write to socket for client: %s", + telemetry->request_client->file_path); + return -1; + } + + return 0; +} + +int32_t +rte_telemetry_send_error_response(struct telemetry_impl *telemetry, + int error_type) +{ + int ret; + const char *status_code, *json_buffer; + json_t *root; + + if (error_type == -EPERM) + status_code = "Status Error: Unknown"; + else if (error_type == -EINVAL) + status_code = "Status Error: Invalid Argument 404"; + else if (error_type == -ENOMEM) + status_code = "Status Error: Memory Allocation Error"; + else { + TELEMETRY_LOG_ERR("Invalid error type"); + return -EINVAL; + } + + root = json_object(); + + if (root == NULL) { + TELEMETRY_LOG_ERR("Could not create root JSON object"); + return -EPERM; + } + + ret = json_object_set_new(root, "status_code", json_string(status_code)); + if (ret < 0) { + TELEMETRY_LOG_ERR("Status code field cannot be set"); + json_decref(root); + return -EPERM; + } + + ret = json_object_set_new(root, "data", json_null()); + if (ret < 0) { + TELEMETRY_LOG_ERR("Data field cannot be set"); + json_decref(root); + 
return -EPERM; + } + + json_buffer = json_dumps(root, 0); + json_decref(root); + + ret = rte_telemetry_write_to_socket(telemetry, json_buffer); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not write to socket"); + return -EPERM; + } + + return 0; +} + +static int +rte_telemetry_get_metrics(struct telemetry_impl *telemetry, uint32_t port_id, + struct rte_metric_value *metrics, struct rte_metric_name *names, + int num_metrics) +{ + int ret, num_values; + + if (num_metrics < 0) { + TELEMETRY_LOG_ERR("Invalid metrics count"); + goto einval_fail; + } else if (num_metrics == 0) { + TELEMETRY_LOG_ERR("No metrics to display (none have been registered)"); + goto eperm_fail; + } + + if (metrics == NULL) { + TELEMETRY_LOG_ERR("Metrics must be initialised."); + goto einval_fail; + } + + if (names == NULL) { + TELEMETRY_LOG_ERR("Names must be initialised."); + goto einval_fail; + } + + ret = rte_metrics_get_names(names, num_metrics); + if (ret < 0 || ret > num_metrics) { + TELEMETRY_LOG_ERR("Cannot get metrics names"); + goto eperm_fail; + } + + num_values = rte_metrics_get_values(port_id, NULL, 0); + ret = rte_metrics_get_values(port_id, metrics, num_values); + if (ret < 0 || ret > num_values) { + TELEMETRY_LOG_ERR("Cannot get metrics values"); + goto eperm_fail; + } + + return 0; + +eperm_fail: + ret = rte_telemetry_send_error_response(telemetry, -EPERM); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + +einval_fail: + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + +} + +static int32_t +rte_telemetry_json_format_stat(struct telemetry_impl *telemetry, json_t *stats, + const char *metric_name, uint64_t metric_value) +{ + int ret; + json_t *stat = json_object(); + + if (stat == NULL) { + TELEMETRY_LOG_ERR("Could not create stat JSON object"); + goto eperm_fail; + } + + ret = json_object_set_new(stat, "name", json_string(metric_name)); + if (ret < 0) { + TELEMETRY_LOG_ERR("Stat Name field cannot be set"); + goto eperm_fail; + } + + ret = json_object_set_new(stat, "value", json_integer(metric_value)); + if (ret < 0) { + TELEMETRY_LOG_ERR("Stat Value field cannot be set"); + goto eperm_fail; + } + + ret = json_array_append_new(stats, stat); + if (ret < 0) { + TELEMETRY_LOG_ERR("Stat cannot be added to stats json array"); + goto eperm_fail; + } + + return 0; + +eperm_fail: + ret = rte_telemetry_send_error_response(telemetry, -EPERM); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + +} + +static int32_t +rte_telemetry_json_format_port(struct telemetry_impl *telemetry, + uint32_t port_id, json_t *ports, uint32_t *metric_ids, + uint32_t num_metric_ids) +{ + struct rte_metric_value *metrics = 0; + struct rte_metric_name *names = 0; + int num_metrics, ret, err_ret; + json_t *port, *stats; + uint32_t i; + + num_metrics = rte_metrics_get_names(NULL, 0); + if (num_metrics < 0) { + TELEMETRY_LOG_ERR("Cannot get metrics count"); + goto einval_fail; + } else if (num_metrics == 0) { + TELEMETRY_LOG_ERR("No metrics to display (none have been registered)"); + goto eperm_fail; + } + + metrics = malloc(sizeof(struct rte_metric_value) * num_metrics); + names = malloc(sizeof(struct rte_metric_name) * num_metrics); + if (metrics == NULL || names == NULL) { + TELEMETRY_LOG_ERR("Cannot allocate memory"); + free(metrics); + free(names); + + err_ret = rte_telemetry_send_error_response(telemetry, -ENOMEM); + if (err_ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + } + + ret 
= rte_telemetry_get_metrics(telemetry, port_id, metrics, names, + num_metrics); + if (ret < 0) { + free(metrics); + free(names); + TELEMETRY_LOG_ERR("rte_telemetry_get_metrics failed"); + return -1; + } + + port = json_object(); + stats = json_array(); + if (port == NULL || stats == NULL) { + TELEMETRY_LOG_ERR("Could not create port/stats JSON objects"); + goto eperm_fail; + } + + ret = json_object_set_new(port, "port", json_integer(port_id)); + if (ret < 0) { + TELEMETRY_LOG_ERR("Port field cannot be set"); + goto eperm_fail; + } + + for (i = 0; i < num_metric_ids; i++) { + int metric_id = metric_ids[i]; + int metric_index = -1; + int metric_name_key = -1; + int32_t j; + uint64_t metric_value; + + if (metric_id >= num_metrics) { + TELEMETRY_LOG_ERR("Metric_id: %d is not valid", + metric_id); + goto einval_fail; + } + + for (j = 0; j < num_metrics; j++) { + if (metrics[j].key == metric_id) { + metric_name_key = metrics[j].key; + metric_index = j; + break; + } + } + + const char *metric_name = names[metric_name_key].name; + metric_value = metrics[metric_index].value; + + if (metric_name_key < 0 || metric_index < 0) { + TELEMETRY_LOG_ERR("Could not get metric name/index"); + goto eperm_fail; + } + + ret = rte_telemetry_json_format_stat(telemetry, stats, + metric_name, metric_value); + if (ret < 0) { + TELEMETRY_LOG_ERR("Format stat with id: %u failed", + metric_id); + free(metrics); + free(names); + return -1; + } + } + + if (json_array_size(stats) == 0) + ret = json_object_set_new(port, "stats", json_null()); + else + ret = json_object_set_new(port, "stats", stats); + + if (ret < 0) { + TELEMETRY_LOG_ERR("Stats object cannot be set"); + goto eperm_fail; + } + + ret = json_array_append_new(ports, port); + if (ret < 0) { + TELEMETRY_LOG_ERR("Port object cannot be added to ports array"); + goto eperm_fail; + } + + free(metrics); + free(names); + return 0; + +eperm_fail: + free(metrics); + free(names); + ret = rte_telemetry_send_error_response(telemetry, -EPERM); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + +einval_fail: + free(metrics); + free(names); + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; +} + +static int32_t +rte_telemetry_encode_json_format(struct telemetry_impl *telemetry, + uint32_t *port_ids, uint32_t num_port_ids, uint32_t *metric_ids, + uint32_t num_metric_ids, char **json_buffer) +{ + int ret; + json_t *root, *ports; + uint32_t i; + + if (num_port_ids <= 0 || num_metric_ids <= 0) { + TELEMETRY_LOG_ERR("Please provide port and metric ids to query"); + goto einval_fail; + } + + ports = json_array(); + if (ports == NULL) { + TELEMETRY_LOG_ERR("Could not create ports JSON array"); + goto eperm_fail; + } + + for (i = 0; i < num_port_ids; i++) { + if (!rte_eth_dev_is_valid_port(port_ids[i])) { + TELEMETRY_LOG_ERR("Port: %d invalid", port_ids[i]); + goto einval_fail; + } + } + + for (i = 0; i < num_port_ids; i++) { + ret = rte_telemetry_json_format_port(telemetry, port_ids[i], + ports, metric_ids, num_metric_ids); + if (ret < 0) { + TELEMETRY_LOG_ERR("Format port in JSON failed"); + return -1; + } + } + + root = json_object(); + if (root == NULL) { + TELEMETRY_LOG_ERR("Could not create root JSON object"); + goto eperm_fail; + } + + ret = json_object_set_new(root, "status_code", + json_string("Status OK: 200")); + if (ret < 0) { + TELEMETRY_LOG_ERR("Status code field cannot be set"); + goto eperm_fail; + } + + ret = json_object_set_new(root, "data", ports); + if (ret 
< 0) { + TELEMETRY_LOG_ERR("Data field cannot be set"); + goto eperm_fail; + } + + *json_buffer = json_dumps(root, JSON_INDENT(2)); + json_decref(root); + return 0; + +eperm_fail: + ret = rte_telemetry_send_error_response(telemetry, -EPERM); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + +einval_fail: + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; +} + +int32_t +rte_telemetry_send_ports_stats_values(uint32_t *metric_ids, int num_metric_ids, + uint32_t *port_ids, int num_port_ids, struct telemetry_impl *telemetry) +{ + int ret, i; + char *json_buffer = NULL; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Invalid telemetry argument"); + return -1; + } + + if (metric_ids == NULL) { + TELEMETRY_LOG_ERR("Invalid metric_ids array"); + goto einval_fail; + } + + if (num_metric_ids < 0) { + TELEMETRY_LOG_ERR("Invalid num_metric_ids, must be positive"); + goto einval_fail; + } + + if (port_ids == NULL) { + TELEMETRY_LOG_ERR("Invalid port_ids array"); + goto einval_fail; + } + + if (num_port_ids < 0) { + TELEMETRY_LOG_ERR("Invalid num_port_ids, must be positive"); + goto einval_fail; + } + + for (i = 0; i < num_port_ids; i++) { + if (!rte_eth_dev_is_valid_port(port_ids[i])) { + TELEMETRY_LOG_ERR("Port: %d invalid", port_ids[i]); + goto einval_fail; + } + + ret = rte_telemetry_update_metrics_ethdev(telemetry, + port_ids[i], telemetry->reg_index); + if (ret < 0) { + TELEMETRY_LOG_ERR("Failed to update ethdev metrics"); + return -1; + } + } + + ret = rte_telemetry_encode_json_format(telemetry, port_ids, + num_port_ids, metric_ids, num_metric_ids, &json_buffer); + if (ret < 0) { + TELEMETRY_LOG_ERR("JSON encode function failed"); + return -1; + } + + ret = rte_telemetry_write_to_socket(telemetry, json_buffer); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not write to socket"); + return -1; + } + + return 0; + +einval_fail: + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; +} + + +static int32_t +rte_telemetry_reg_ethdev_to_metrics(uint16_t port_id) +{ + int ret, num_xstats, ret_val, i; + struct rte_eth_xstat *eth_xstats = NULL; + struct rte_eth_xstat_name *eth_xstats_names = NULL; + + if (!rte_eth_dev_is_valid_port(port_id)) { + TELEMETRY_LOG_ERR("port_id: %d is invalid", port_id); + return -EINVAL; + } + + num_xstats = rte_eth_xstats_get(port_id, NULL, 0); + if (num_xstats < 0) { + TELEMETRY_LOG_ERR("rte_eth_xstats_get(%u) failed: %d", + port_id, num_xstats); + return -EPERM; + } + + eth_xstats = malloc(sizeof(struct rte_eth_xstat) * num_xstats); + if (eth_xstats == NULL) { + TELEMETRY_LOG_ERR("Failed to malloc memory for xstats"); + return -ENOMEM; + } + + ret = rte_eth_xstats_get(port_id, eth_xstats, num_xstats); + const char *xstats_names[num_xstats]; + eth_xstats_names = malloc(sizeof(struct rte_eth_xstat_name) * num_xstats); + if (ret < 0 || ret > num_xstats) { + TELEMETRY_LOG_ERR("rte_eth_xstats_get(%u) len%i failed: %d", + port_id, num_xstats, ret); + ret_val = -EPERM; + goto free_xstats; + } + + if (eth_xstats_names == NULL) { + TELEMETRY_LOG_ERR("Failed to malloc memory for xstats_names"); + ret_val = -ENOMEM; + goto free_xstats; + } + + ret = rte_eth_xstats_get_names(port_id, eth_xstats_names, num_xstats); + if (ret < 0 || ret > num_xstats) { + TELEMETRY_LOG_ERR("rte_eth_xstats_get_names(%u) len%i failed: %d", + port_id, num_xstats, ret); + ret_val = -EPERM; + goto free_xstats; + } + + for (i 
= 0; i < num_xstats; i++) + xstats_names[i] = eth_xstats_names[eth_xstats[i].id].name; + + ret_val = rte_metrics_reg_names(xstats_names, num_xstats); + if (ret_val < 0) { + TELEMETRY_LOG_ERR("rte_metrics_reg_names failed - metrics may already be registered"); + ret_val = -1; + goto free_xstats; + } + + goto free_xstats; + +free_xstats: + free(eth_xstats); + free(eth_xstats_names); + return ret_val; +} + +static int32_t +rte_telemetry_initial_accept(struct telemetry_impl *telemetry) +{ + uint16_t pid; + int ret; + int selftest = 0; + + RTE_ETH_FOREACH_DEV(pid) { + telemetry->reg_index = rte_telemetry_reg_ethdev_to_metrics(pid); + break; + } + + if (telemetry->reg_index < 0) { + TELEMETRY_LOG_ERR("Failed to register ethdev metrics"); + return -1; + } + + telemetry->metrics_register_done = 1; + if (selftest) { + ret = rte_telemetry_socket_messaging_testing(telemetry->reg_index, + telemetry->server_fd); + if (ret < 0) + return -1; + + ret = rte_telemetry_parser_test(telemetry); + if (ret < 0) { + TELEMETRY_LOG_ERR("Parser Tests Failed"); + return -1; + } + + TELEMETRY_LOG_INFO("Success - All Parser Tests Passed"); + } + + return 0; +} + +static int32_t +rte_telemetry_read_client(struct telemetry_impl *telemetry) +{ + char buf[BUF_SIZE]; + int ret, buffer_read; + + buffer_read = read(telemetry->accept_fd, buf, BUF_SIZE-1); + + if (buffer_read == -1) { + TELEMETRY_LOG_ERR("Read error"); + return -1; + } else if (buffer_read == 0) { + goto close_socket; + } else { + buf[buffer_read] = '\0'; + ret = rte_telemetry_parse_client_message(telemetry, buf); + if (ret < 0) + TELEMETRY_LOG_WARN("Parse message failed"); + goto close_socket; + } + +close_socket: + if (close(telemetry->accept_fd) < 0) { + TELEMETRY_LOG_ERR("Close TELEMETRY socket failed"); + free(telemetry); + return -EPERM; + } + telemetry->accept_fd = 0; + + return 0; +} + +static int32_t +rte_telemetry_accept_new_client(struct telemetry_impl *telemetry) +{ + int ret; + + if (telemetry->accept_fd <= 0) { + ret = listen(telemetry->server_fd, 1); + if (ret < 0) { + TELEMETRY_LOG_ERR("Listening error with server fd"); + return -1; + } + + telemetry->accept_fd = accept(telemetry->server_fd, NULL, NULL); + if (telemetry->accept_fd >= 0 && + telemetry->metrics_register_done == 0) { + ret = rte_telemetry_initial_accept(telemetry); + if (ret < 0) { + TELEMETRY_LOG_ERR("Failed to run initial configurations/tests"); + return -1; + } + } + } else { + ret = rte_telemetry_read_client(telemetry); + if (ret < 0) { + TELEMETRY_LOG_ERR("Failed to read socket buffer"); + return -1; + } + } + + return 0; +} + +static int32_t +rte_telemetry_read_client_sockets(struct telemetry_impl *telemetry) +{ + int ret; + telemetry_client *client; + char client_buf[BUF_SIZE]; + int bytes; + + TAILQ_FOREACH(client, &telemetry->client_list_head, client_list) { + bytes = read(client->fd, client_buf, BUF_SIZE-1); + + if (bytes > 0) { + client_buf[bytes] = '\0'; + telemetry->request_client = client; + ret = rte_telemetry_parse(telemetry, client_buf); + if (ret < 0) { + TELEMETRY_LOG_WARN("Parse socket input failed: %i", + ret); + return -1; + } + } + } + + return 0; +} + +static int32_t +rte_telemetry_run(void *userdata) +{ + int ret; + struct telemetry_impl *telemetry = userdata; + + if (telemetry == NULL) { + TELEMETRY_LOG_WARN("TELEMETRY could not be initialised"); + return -1; + } + + ret = rte_telemetry_accept_new_client(telemetry); + if (ret < 0) { + TELEMETRY_LOG_ERR("Accept and read new client failed"); + return -1; + } + + ret = 
rte_telemetry_read_client_sockets(telemetry); + if (ret < 0) { + TELEMETRY_LOG_ERR("Client socket read failed"); + return -1; + } + + return 0; +} + +static void +*rte_telemetry_run_thread_func(void *userdata) +{ + int ret; + struct telemetry_impl *telemetry = userdata; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("%s passed a NULL instance", __func__); + pthread_exit(0); + } + + while (telemetry->thread_status) { + rte_telemetry_run(telemetry); + ret = usleep(SLEEP_TIME); + if (ret < 0) + TELEMETRY_LOG_ERR("Calling thread could not be put to sleep"); + } + pthread_exit(0); +} + +static int32_t +rte_telemetry_set_socket_nonblock(int fd) +{ + int flags; + + if (fd < 0) { + TELEMETRY_LOG_ERR("Invalid fd provided"); + return -1; + } + + flags = fcntl(fd, F_GETFL, 0); + if (flags < 0) + flags = 0; + + return fcntl(fd, F_SETFL, flags | O_NONBLOCK); +} + +static int32_t +rte_telemetry_create_socket(struct telemetry_impl *telemetry) +{ + int ret; + struct sockaddr_un addr; + char socket_path[BUF_SIZE]; + + if (telemetry == NULL) + return -1; + + telemetry->server_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0); + if (telemetry->server_fd == -1) { + TELEMETRY_LOG_ERR("Failed to open socket"); + return -1; + } + + ret = rte_telemetry_set_socket_nonblock(telemetry->server_fd); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not set socket to NONBLOCK"); + goto close_socket; + } + + addr.sun_family = AF_UNIX; + rte_telemetry_get_runtime_dir(socket_path, sizeof(socket_path)); + strlcpy(addr.sun_path, socket_path, sizeof(addr.sun_path)); + unlink(socket_path); + + if (bind(telemetry->server_fd, (struct sockaddr *)&addr, + sizeof(addr)) < 0) { + TELEMETRY_LOG_ERR("Socket binding error"); + goto close_socket; + } + + return 0; + +close_socket: + if (close(telemetry->server_fd) < 0) { + TELEMETRY_LOG_ERR("Close TELEMETRY socket failed"); + return -EPERM; + } + + return -1; +} + +int32_t __rte_experimental +rte_telemetry_init() +{ + int ret; + pthread_attr_t attr; + const char *telemetry_ctrl_thread = "telemetry"; + + if (static_telemetry) { + TELEMETRY_LOG_WARN("TELEMETRY structure already initialised"); + return -EALREADY; + } + + static_telemetry = calloc(1, sizeof(struct telemetry_impl)); + if (static_telemetry == NULL) { + TELEMETRY_LOG_ERR("Memory could not be allocated"); + return -ENOMEM; + } + + static_telemetry->socket_id = rte_socket_id(); + rte_metrics_init(static_telemetry->socket_id); + + ret = pthread_attr_init(&attr); + if (ret != 0) { + TELEMETRY_LOG_ERR("Pthread attribute init failed"); + return -EPERM; + } + + ret = rte_telemetry_create_socket(static_telemetry); + if (ret < 0) { + ret = rte_telemetry_cleanup(); + if (ret < 0) + TELEMETRY_LOG_ERR("TELEMETRY cleanup failed"); + return -EPERM; + } + TAILQ_INIT(&static_telemetry->client_list_head); + + ret = rte_ctrl_thread_create(&static_telemetry->thread_id, + telemetry_ctrl_thread, &attr, rte_telemetry_run_thread_func, + (void *)static_telemetry); + static_telemetry->thread_status = 1; + + if (ret < 0) { + ret = rte_telemetry_cleanup(); + if (ret < 0) + TELEMETRY_LOG_ERR("TELEMETRY cleanup failed"); + return -EPERM; + } + + return 0; +} + +static int32_t +rte_telemetry_client_cleanup(struct telemetry_client *client) +{ + int ret; + + ret = close(client->fd); + free(client->file_path); + free(client); + + if (ret < 0) { + TELEMETRY_LOG_ERR("Close client socket failed"); + return -EPERM; + } + + return 0; +} + +int32_t __rte_experimental +rte_telemetry_cleanup(void) +{ + int ret; + struct telemetry_impl *telemetry = static_telemetry; + 
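+	/*
+	 * Descriptive note: cleanup proceeds in three steps below - every
+	 * registered client is removed from the list and its socket closed,
+	 * the server socket is closed, and the control thread is stopped and
+	 * joined before the telemetry instance itself is freed.
+	 */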
telemetry_client *client, *temp_client; + + TAILQ_FOREACH_SAFE(client, &telemetry->client_list_head, client_list, + temp_client) { + TAILQ_REMOVE(&telemetry->client_list_head, client, client_list); + ret = rte_telemetry_client_cleanup(client); + if (ret < 0) { + TELEMETRY_LOG_ERR("Client cleanup failed"); + return -EPERM; + } + } + + ret = close(telemetry->server_fd); + if (ret < 0) { + TELEMETRY_LOG_ERR("Close TELEMETRY socket failed"); + free(telemetry); + return -EPERM; + } + + telemetry->thread_status = 0; + pthread_join(telemetry->thread_id, NULL); + free(telemetry); + static_telemetry = NULL; + + return 0; +} + +int32_t +rte_telemetry_unregister_client(struct telemetry_impl *telemetry, + const char *client_path) +{ + int ret; + telemetry_client *client, *temp_client; + + if (telemetry == NULL) { + TELEMETRY_LOG_WARN("TELEMETRY is not initialised"); + return -ENODEV; + } + + if (client_path == NULL) { + TELEMETRY_LOG_ERR("Invalid client path"); + goto einval_fail; + } + + if (TAILQ_EMPTY(&telemetry->client_list_head)) { + TELEMETRY_LOG_ERR("There are no clients currently registered"); + return -EPERM; + } + + TAILQ_FOREACH_SAFE(client, &telemetry->client_list_head, client_list, + temp_client) { + if (strcmp(client_path, client->file_path) == 0) { + TAILQ_REMOVE(&telemetry->client_list_head, client, + client_list); + ret = rte_telemetry_client_cleanup(client); + + if (ret < 0) { + TELEMETRY_LOG_ERR("Client cleanup failed"); + return -EPERM; + } + + return 0; + } + } + + TELEMETRY_LOG_WARN("Couldn't find client, possibly not registered yet."); + return -1; + +einval_fail: + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -EINVAL; +} + +int32_t +rte_telemetry_register_client(struct telemetry_impl *telemetry, + const char *client_path) +{ + int ret, fd; + struct sockaddr_un addrs; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Could not initialize TELEMETRY API"); + return -ENODEV; + } + + if (client_path == NULL) { + TELEMETRY_LOG_ERR("Invalid client path"); + return -EINVAL; + } + + telemetry_client *client; + TAILQ_FOREACH(client, &telemetry->client_list_head, client_list) { + if (strcmp(client_path, client->file_path) == 0) { + TELEMETRY_LOG_WARN("'%s' already registered", + client_path); + return -EINVAL; + } + } + + fd = socket(AF_UNIX, SOCK_SEQPACKET, 0); + if (fd == -1) { + TELEMETRY_LOG_ERR("Client socket error"); + return -EACCES; + } + + ret = rte_telemetry_set_socket_nonblock(fd); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not set socket to NONBLOCK"); + return -EPERM; + } + + addrs.sun_family = AF_UNIX; + strlcpy(addrs.sun_path, client_path, sizeof(addrs.sun_path)); + telemetry_client *new_client = malloc(sizeof(telemetry_client)); + new_client->file_path = strdup(client_path); + new_client->fd = fd; + + if (connect(fd, (struct sockaddr *)&addrs, sizeof(addrs)) == -1) { + TELEMETRY_LOG_ERR("TELEMETRY client connect to %s didn't work", + client_path); + ret = rte_telemetry_client_cleanup(new_client); + if (ret < 0) { + TELEMETRY_LOG_ERR("Client cleanup failed"); + return -EPERM; + } + return -EINVAL; + } + + TAILQ_INSERT_HEAD(&telemetry->client_list_head, new_client, client_list); + + return 0; +} + +int32_t +rte_telemetry_parse_client_message(struct telemetry_impl *telemetry, char *buf) +{ + int ret, action_int; + json_error_t error; + json_t *root = json_loads(buf, 0, &error); + + if (root == NULL) { + TELEMETRY_LOG_WARN("Could not load JSON object from data passed in : %s", + error.text); 
+ goto fail; + } else if (!json_is_object(root)) { + TELEMETRY_LOG_WARN("JSON Request is not a JSON object"); + goto fail; + } + + json_t *action = json_object_get(root, "action"); + if (action == NULL) { + TELEMETRY_LOG_WARN("Request does not have action field"); + goto fail; + } else if (!json_is_integer(action)) { + TELEMETRY_LOG_WARN("Action value is not an integer"); + goto fail; + } + + json_t *command = json_object_get(root, "command"); + if (command == NULL) { + TELEMETRY_LOG_WARN("Request does not have command field"); + goto fail; + } else if (!json_is_string(command)) { + TELEMETRY_LOG_WARN("Command value is not a string"); + goto fail; + } + + action_int = json_integer_value(action); + if (action_int != ACTION_POST) { + TELEMETRY_LOG_WARN("Invalid action code"); + goto fail; + } + + if (strcmp(json_string_value(command), "clients") != 0) { + TELEMETRY_LOG_WARN("Invalid command"); + goto fail; + } + + json_t *data = json_object_get(root, "data"); + if (data == NULL) { + TELEMETRY_LOG_WARN("Request does not have data field"); + goto fail; + } + + json_t *client_path = json_object_get(data, "client_path"); + if (client_path == NULL) { + TELEMETRY_LOG_WARN("Request does not have client_path field"); + goto fail; + } + + if (!json_is_string(client_path)) { + TELEMETRY_LOG_WARN("Client_path value is not a string"); + goto fail; + } + + ret = rte_telemetry_register_client(telemetry, + json_string_value(client_path)); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not register client"); + telemetry->register_fail_count++; + goto fail; + } + + return 0; + +fail: + TELEMETRY_LOG_WARN("Client attempted to register with invalid message"); + json_decref(root); + return -1; +} + +int32_t +rte_telemetry_dummy_client_socket(const char *valid_client_path) +{ + int sockfd = socket(AF_UNIX, SOCK_SEQPACKET, 0); + struct sockaddr_un addr = {0}; + + if (sockfd < 0) { + TELEMETRY_LOG_ERR("Test socket creation failure"); + return -1; + } + + addr.sun_family = AF_UNIX; + strlcpy(addr.sun_path, valid_client_path, sizeof(addr.sun_path)); + unlink(valid_client_path); + + if (bind(sockfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + TELEMETRY_LOG_ERR("Test socket binding failure"); + return -1; + } + + if (listen(sockfd, 1) < 0) { + TELEMETRY_LOG_ERR("Listen failure"); + return -1; + } + + return sockfd; +} + +int32_t __rte_experimental +rte_telemetry_selftest(void) +{ + const char *invalid_client_path = SELFTEST_INVALID_CLIENT; + const char *valid_client_path = SELFTEST_VALID_CLIENT; + int ret, sockfd; + + TELEMETRY_LOG_INFO("Selftest"); + + ret = rte_telemetry_init(); + if (ret < 0) { + TELEMETRY_LOG_ERR("Valid initialisation test failed"); + return -1; + } + + TELEMETRY_LOG_INFO("Success - Valid initialisation test passed"); + + ret = rte_telemetry_init(); + if (ret != -EALREADY) { + TELEMETRY_LOG_ERR("Invalid initialisation test failed"); + return -1; + } + + TELEMETRY_LOG_INFO("Success - Invalid initialisation test passed"); + + ret = rte_telemetry_unregister_client(static_telemetry, + invalid_client_path); + if (ret != -EPERM) { + TELEMETRY_LOG_ERR("Invalid unregister test failed"); + return -1; + } + + TELEMETRY_LOG_INFO("Success - Invalid unregister test passed"); + + sockfd = rte_telemetry_dummy_client_socket(valid_client_path); + if (sockfd < 0) { + TELEMETRY_LOG_ERR("Test socket creation failed"); + return -1; + } + + ret = rte_telemetry_register_client(static_telemetry, valid_client_path); + if (ret != 0) { + TELEMETRY_LOG_ERR("Valid register test failed: %i", ret); + return -1; + } + + 
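+	/*
+	 * Descriptive note: the successful registration above connects back to
+	 * the dummy client socket created earlier, so accept that pending
+	 * connection before moving on to the duplicate-registration checks.
+	 */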
accept(sockfd, NULL, NULL); + TELEMETRY_LOG_INFO("Success - Valid register test passed"); + + ret = rte_telemetry_register_client(static_telemetry, valid_client_path); + if (ret != -EINVAL) { + TELEMETRY_LOG_ERR("Invalid register test failed: %i", ret); + return -1; + } + + TELEMETRY_LOG_INFO("Success - Invalid register test passed"); + + ret = rte_telemetry_unregister_client(static_telemetry, + invalid_client_path); + if (ret != -1) { + TELEMETRY_LOG_ERR("Invalid unregister test failed: %i", ret); + return -1; + } + + TELEMETRY_LOG_INFO("Success - Invalid unregister test passed"); + + ret = rte_telemetry_unregister_client(static_telemetry, valid_client_path); + if (ret != 0) { + TELEMETRY_LOG_ERR("Valid unregister test failed: %i", ret); + return -1; + } + + TELEMETRY_LOG_INFO("Success - Valid unregister test passed"); + + ret = rte_telemetry_cleanup(); + if (ret < 0) { + TELEMETRY_LOG_ERR("Cleanup test failed"); + return -1; + } + + TELEMETRY_LOG_INFO("Success - Valid cleanup test passed"); + + return 0; +} + +int32_t +rte_telemetry_socket_messaging_testing(int index, int socket) +{ + struct telemetry_impl *telemetry = calloc(1, sizeof(telemetry_impl)); + int fd, bad_send_fd, send_fd, bad_fd, bad_recv_fd, recv_fd, ret; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Could not initialize Telemetry API"); + return -1; + } + + telemetry->server_fd = socket; + telemetry->reg_index = index; + TELEMETRY_LOG_INFO("Beginning Telemetry socket message Selftest"); + rte_telemetry_socket_test_setup(telemetry, &send_fd, &recv_fd); + TELEMETRY_LOG_INFO("Register valid client test"); + + ret = rte_telemetry_socket_register_test(telemetry, &fd, send_fd, + recv_fd); + if (ret < 0) { + TELEMETRY_LOG_ERR("Register valid client test failed!"); + free(telemetry); + return -1; + } + + TELEMETRY_LOG_INFO("Success - Register valid client test passed!"); + + TELEMETRY_LOG_INFO("Register invalid/same client test"); + ret = rte_telemetry_socket_test_setup(telemetry, &bad_send_fd, + &bad_recv_fd); + ret = rte_telemetry_socket_register_test(telemetry, &bad_fd, + bad_send_fd, bad_recv_fd); + if (!ret) { + TELEMETRY_LOG_ERR("Register invalid/same client test failed!"); + free(telemetry); + return -1; + } + + TELEMETRY_LOG_INFO("Success - Register invalid/same client test passed!"); + + ret = rte_telemetry_json_socket_message_test(telemetry, fd); + if (ret < 0) { + free(telemetry); + return -1; + } + + free(telemetry); + return 0; +} + +int32_t +rte_telemetry_socket_register_test(struct telemetry_impl *telemetry, int *fd, + int send_fd, int recv_fd) +{ + int ret; + char good_req_string[BUF_SIZE]; + + snprintf(good_req_string, sizeof(good_req_string), + "{\"action\":1,\"command\":\"clients\",\"data\":{\"client_path\"" + ":\"%s\"}}", SOCKET_TEST_CLIENT_PATH); + + listen(recv_fd, 1); + + ret = send(send_fd, good_req_string, strlen(good_req_string), 0); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not send message over socket"); + return -1; + } + + rte_telemetry_run(telemetry); + + if (telemetry->register_fail_count != 0) + return -1; + + *fd = accept(recv_fd, NULL, NULL); + + return 0; +} + +int32_t +rte_telemetry_socket_test_setup(struct telemetry_impl *telemetry, int *send_fd, + int *recv_fd) +{ + int ret; + const char *client_path = SOCKET_TEST_CLIENT_PATH; + char socket_path[BUF_SIZE]; + struct sockaddr_un addr = {0}; + struct sockaddr_un addrs = {0}; + *send_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0); + *recv_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0); + + listen(telemetry->server_fd, 5); + addr.sun_family = AF_UNIX; + 
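+	/*
+	 * Descriptive note: the send socket is pointed at the telemetry server
+	 * socket under the DPDK runtime directory, while the receive socket is
+	 * bound at the test client path so the library can connect back to it.
+	 */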
rte_telemetry_get_runtime_dir(socket_path, sizeof(socket_path)); + strlcpy(addr.sun_path, socket_path, sizeof(addr.sun_path)); + + ret = connect(*send_fd, (struct sockaddr *) &addr, sizeof(addr)); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not connect socket"); + return -1; + } + + telemetry->accept_fd = accept(telemetry->server_fd, NULL, NULL); + + addrs.sun_family = AF_UNIX; + strlcpy(addrs.sun_path, client_path, sizeof(addrs.sun_path)); + unlink(client_path); + + ret = bind(*recv_fd, (struct sockaddr *)&addrs, sizeof(addrs)); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not bind socket"); + return -1; + } + + return 0; +} + +static int32_t +rte_telemetry_stat_parse(char *buf, struct json_data *json_data_struct) +{ + json_error_t error; + json_t *root = json_loads(buf, 0, &error); + int arraylen, i; + json_t *status, *dataArray, *port, *stats, *name, *value, *dataArrayObj, + *statsArrayObj; + + stats = NULL; + port = NULL; + name = NULL; + + if (buf == NULL) { + TELEMETRY_LOG_ERR("JSON message is NULL"); + return -EINVAL; + } + + if (root == NULL) { + TELEMETRY_LOG_ERR("Could not load JSON object from data passed in : %s", + error.text); + return -EPERM; + } else if (!json_is_object(root)) { + TELEMETRY_LOG_ERR("JSON Request is not a JSON object"); + json_decref(root); + return -EINVAL; + } + + status = json_object_get(root, "status_code"); + if (!status) { + TELEMETRY_LOG_ERR("Request does not have status field"); + return -EINVAL; + } else if (!json_is_string(status)) { + TELEMETRY_LOG_ERR("Status value is not a string"); + return -EINVAL; + } + + json_data_struct->status_code = strdup(json_string_value(status)); + + dataArray = json_object_get(root, "data"); + if (dataArray == NULL) { + TELEMETRY_LOG_ERR("Request does not have data field"); + return -EINVAL; + } + + arraylen = json_array_size(dataArray); + if (arraylen == 0) { + json_data_struct->data = "null"; + return -EINVAL; + } + + for (i = 0; i < arraylen; i++) { + dataArrayObj = json_array_get(dataArray, i); + port = json_object_get(dataArrayObj, "port"); + stats = json_object_get(dataArrayObj, "stats"); + } + + if (port == NULL) { + TELEMETRY_LOG_ERR("Request does not have port field"); + return -EINVAL; + } + + if (!json_is_integer(port)) { + TELEMETRY_LOG_ERR("Port value is not an integer"); + return -EINVAL; + } + + json_data_struct->port = json_integer_value(port); + + if (stats == NULL) { + TELEMETRY_LOG_ERR("Request does not have stats field"); + return -EINVAL; + } + + arraylen = json_array_size(stats); + for (i = 0; i < arraylen; i++) { + statsArrayObj = json_array_get(stats, i); + name = json_object_get(statsArrayObj, "name"); + value = json_object_get(statsArrayObj, "value"); + } + + if (name == NULL) { + TELEMETRY_LOG_ERR("Request does not have name field"); + return -EINVAL; + } + + if (!json_is_string(name)) { + TELEMETRY_LOG_ERR("Stat name value is not a string"); + return -EINVAL; + } + + json_data_struct->stat_name = strdup(json_string_value(name)); + + if (value == NULL) { + TELEMETRY_LOG_ERR("Request does not have value field"); + return -EINVAL; + } + + if (!json_is_integer(value)) { + TELEMETRY_LOG_ERR("Stat value is not an integer"); + return -EINVAL; + } + + json_data_struct->stat_value = json_integer_value(value); + + return 0; +} + +static void +rte_telemetry_free_test_data(struct json_data *data) +{ + free(data->status_code); + free(data->stat_name); + free(data); +} + +int32_t +rte_telemetry_valid_json_test(struct telemetry_impl *telemetry, int fd) +{ + int ret; + int port = 0; + int value = 0; + int 
fail_count = 0; + int buffer_read = 0; + char buf[BUF_SIZE]; + struct json_data *data_struct; + errno = 0; + const char *status = "Status OK: 200"; + const char *name = "rx_good_packets"; + const char *valid_json_message = "{\"action\":0,\"command\":" + "\"ports_stats_values_by_name\",\"data\":{\"ports\"" + ":[0],\"stats\":[\"rx_good_packets\"]}}"; + + ret = send(fd, valid_json_message, strlen(valid_json_message), 0); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not send message over socket"); + return -1; + } + + rte_telemetry_run(telemetry); + buffer_read = recv(fd, buf, BUF_SIZE-1, 0); + + if (buffer_read == -1) { + TELEMETRY_LOG_ERR("Read error"); + return -1; + } + + buf[buffer_read] = '\0'; + data_struct = calloc(1, sizeof(struct json_data)); + ret = rte_telemetry_stat_parse(buf, data_struct); + + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not parse stats"); + fail_count++; + } + + if (strcmp(data_struct->status_code, status) != 0) { + TELEMETRY_LOG_ERR("Status code is invalid"); + fail_count++; + } + + if (data_struct->port != port) { + TELEMETRY_LOG_ERR("Port is invalid"); + fail_count++; + } + + if (strcmp(data_struct->stat_name, name) != 0) { + TELEMETRY_LOG_ERR("Stat name is invalid"); + fail_count++; + } + + if (data_struct->stat_value != value) { + TELEMETRY_LOG_ERR("Stat value is invalid"); + fail_count++; + } + + rte_telemetry_free_test_data(data_struct); + if (fail_count > 0) + return -1; + + TELEMETRY_LOG_INFO("Success - Passed valid JSON message test passed"); + + return 0; +} + +int32_t +rte_telemetry_invalid_json_test(struct telemetry_impl *telemetry, int fd) +{ + int ret; + char buf[BUF_SIZE]; + int fail_count = 0; + const char *invalid_json = "{]"; + const char *status = "Status Error: Unknown"; + const char *data = "null"; + struct json_data *data_struct; + int buffer_read = 0; + errno = 0; + + ret = send(fd, invalid_json, strlen(invalid_json), 0); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not send message over socket"); + return -1; + } + + rte_telemetry_run(telemetry); + buffer_read = recv(fd, buf, BUF_SIZE-1, 0); + + if (buffer_read == -1) { + TELEMETRY_LOG_ERR("Read error"); + return -1; + } + + buf[buffer_read] = '\0'; + + data_struct = calloc(1, sizeof(struct json_data)); + ret = rte_telemetry_stat_parse(buf, data_struct); + + if (ret < 0) + TELEMETRY_LOG_ERR("Could not parse stats"); + + if (strcmp(data_struct->status_code, status) != 0) { + TELEMETRY_LOG_ERR("Status code is invalid"); + fail_count++; + } + + if (strcmp(data_struct->data, data) != 0) { + TELEMETRY_LOG_ERR("Data status is invalid"); + fail_count++; + } + + rte_telemetry_free_test_data(data_struct); + if (fail_count > 0) + return -1; + + TELEMETRY_LOG_INFO("Success - Passed invalid JSON message test"); + + return 0; +} + +int32_t +rte_telemetry_json_contents_test(struct telemetry_impl *telemetry, int fd) +{ + int ret; + char buf[BUF_SIZE]; + int fail_count = 0; + char *status = "Status Error: Invalid Argument 404"; + char *data = "null"; + struct json_data *data_struct; + const char *invalid_contents = "{\"action\":0,\"command\":" + "\"ports_stats_values_by_name\",\"data\":{\"ports\"" + ":[0],\"stats\":[\"some_invalid_param\"," + "\"another_invalid_param\"]}}"; + int buffer_read = 0; + errno = 0; + + ret = send(fd, invalid_contents, strlen(invalid_contents), 0); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not send message over socket"); + return -1; + } + + rte_telemetry_run(telemetry); + buffer_read = recv(fd, buf, BUF_SIZE-1, 0); + + if (buffer_read == -1) { + TELEMETRY_LOG_ERR("Read error"); 
+ return -1; + } + + buf[buffer_read] = '\0'; + data_struct = calloc(1, sizeof(struct json_data)); + ret = rte_telemetry_stat_parse(buf, data_struct); + + if (ret < 0) + TELEMETRY_LOG_ERR("Could not parse stats"); + + if (strcmp(data_struct->status_code, status) != 0) { + TELEMETRY_LOG_ERR("Status code is invalid"); + fail_count++; + } + + if (strcmp(data_struct->data, data) != 0) { + TELEMETRY_LOG_ERR("Data status is invalid"); + fail_count++; + } + + rte_telemetry_free_test_data(data_struct); + if (fail_count > 0) + return -1; + + TELEMETRY_LOG_INFO("Success - Passed invalid JSON content test"); + + return 0; +} + +int32_t +rte_telemetry_json_empty_test(struct telemetry_impl *telemetry, int fd) +{ + int ret; + char buf[BUF_SIZE]; + int fail_count = 0; + const char *status = "Status Error: Invalid Argument 404"; + char *data = "null"; + struct json_data *data_struct; + const char *empty_json = "{}"; + int buffer_read = 0; + errno = 0; + + ret = (send(fd, empty_json, strlen(empty_json), 0)); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not send message over socket"); + return -1; + } + + rte_telemetry_run(telemetry); + buffer_read = recv(fd, buf, BUF_SIZE-1, 0); + + if (buffer_read == -1) { + TELEMETRY_LOG_ERR("Read error"); + return -1; + } + + buf[buffer_read] = '\0'; + data_struct = calloc(1, sizeof(struct json_data)); + ret = rte_telemetry_stat_parse(buf, data_struct); + + if (ret < 0) + TELEMETRY_LOG_ERR("Could not parse stats"); + + if (strcmp(data_struct->status_code, status) != 0) { + TELEMETRY_LOG_ERR("Status code is invalid"); + fail_count++; + } + + if (strcmp(data_struct->data, data) != 0) { + TELEMETRY_LOG_ERR("Data status is invalid"); + fail_count++; + } + + rte_telemetry_free_test_data(data_struct); + + if (fail_count > 0) + return -1; + + TELEMETRY_LOG_INFO("Success - Passed JSON empty message test"); + + return 0; +} + +int32_t +rte_telemetry_json_socket_message_test(struct telemetry_impl *telemetry, int fd) +{ + uint16_t i; + int ret, fail_count; + + fail_count = 0; + struct telemetry_message_test socket_json_tests[] = { + {.test_name = "Invalid JSON test", + .test_func_ptr = rte_telemetry_invalid_json_test}, + {.test_name = "Valid JSON test", + .test_func_ptr = rte_telemetry_valid_json_test}, + {.test_name = "JSON contents test", + .test_func_ptr = rte_telemetry_json_contents_test}, + {.test_name = "JSON empty tests", + .test_func_ptr = rte_telemetry_json_empty_test} + }; + +#define NUM_TESTS RTE_DIM(socket_json_tests) + + for (i = 0; i < NUM_TESTS; i++) { + TELEMETRY_LOG_INFO("%s", socket_json_tests[i].test_name); + ret = (socket_json_tests[i].test_func_ptr) + (telemetry, fd); + if (ret < 0) { + TELEMETRY_LOG_ERR("%s failed", + socket_json_tests[i].test_name); + fail_count++; + } + } + + if (fail_count > 0) { + TELEMETRY_LOG_ERR("Failed %i JSON socket message test(s)", + fail_count); + return -1; + } + + TELEMETRY_LOG_INFO("Success - All JSON tests passed"); + + return 0; +} + +int telemetry_log_level; + +static struct rte_option option = { + .opt_str = "--telemetry", + .cb = &rte_telemetry_init, + .enabled = 0 +}; + +RTE_INIT(rte_telemetry_register) +{ + telemetry_log_level = rte_log_register("lib.telemetry"); + if (telemetry_log_level >= 0) + rte_log_set_level(telemetry_log_level, RTE_LOG_ERR); + + rte_option_register(&option); +} diff --git a/lib/librte_telemetry/rte_telemetry.h b/lib/librte_telemetry/rte_telemetry.h new file mode 100644 index 00000000..119db16f --- /dev/null +++ b/lib/librte_telemetry/rte_telemetry.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: 
BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include + +#ifndef _RTE_TELEMETRY_H_ +#define _RTE_TELEMETRY_H_ + +/** + * @file + * RTE Telemetry + * + * The telemetry library provides a method to retrieve statistics from + * DPDK by sending a JSON encoded message over a socket. DPDK will send + * a JSON encoded response containing telemetry data. + ***/ + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Initialize Telemetry + * + * @return + * 0 on successful initialisation. + * @return + * -ENOMEM on memory allocation error + * @return + * -EPERM on unknown error failure + * @return + * -EALREADY if Telemetry is already initialised. + */ +int32_t __rte_experimental +rte_telemetry_init(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Clean up and free memory. + * + * @return + * 0 on success + * @return + * -EPERM on failure + */ +int32_t __rte_experimental +rte_telemetry_cleanup(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Runs various tests to ensure telemetry initialisation and register/unregister + * functions are working correctly. + * + * @return + * 0 on success when all tests have passed + * @return + * -1 on failure when the test has failed + */ +int32_t __rte_experimental +rte_telemetry_selftest(void); + +#endif diff --git a/lib/librte_telemetry/rte_telemetry_internal.h b/lib/librte_telemetry/rte_telemetry_internal.h new file mode 100644 index 00000000..de7afda3 --- /dev/null +++ b/lib/librte_telemetry/rte_telemetry_internal.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include + +#ifndef _RTE_TELEMETRY_INTERNAL_H_ +#define _RTE_TELEMETRY_INTERNAL_H_ + +/* Logging Macros */ +extern int telemetry_log_level; + +#define TELEMETRY_LOG(level, fmt, args...) \ + rte_log(RTE_LOG_ ##level, telemetry_log_level, "%s(): "fmt "\n", \ + __func__, ##args) + +#define TELEMETRY_LOG_ERR(fmt, args...) \ + TELEMETRY_LOG(ERR, fmt, ## args) + +#define TELEMETRY_LOG_WARN(fmt, args...) \ + TELEMETRY_LOG(WARNING, fmt, ## args) + +#define TELEMETRY_LOG_INFO(fmt, args...) \ + TELEMETRY_LOG(INFO, fmt, ## args) + +typedef struct telemetry_client { + char *file_path; + int fd; + TAILQ_ENTRY(telemetry_client) client_list; +} telemetry_client; + +typedef struct telemetry_impl { + int accept_fd; + int server_fd; + pthread_t thread_id; + int thread_status; + uint32_t socket_id; + int reg_index; + int metrics_register_done; + TAILQ_HEAD(, telemetry_client) client_list_head; + struct telemetry_client *request_client; + int register_fail_count; +} telemetry_impl; + +enum rte_telemetry_parser_actions { + ACTION_GET = 0, + ACTION_DELETE = 2 +}; + +int32_t +rte_telemetry_parse_client_message(struct telemetry_impl *telemetry, char *buf); + +int32_t +rte_telemetry_send_error_response(struct telemetry_impl *telemetry, + int error_type); + +int32_t +rte_telemetry_register_client(struct telemetry_impl *telemetry, + const char *client_path); + +int32_t +rte_telemetry_unregister_client(struct telemetry_impl *telemetry, + const char *client_path); + +/** + * This is a wrapper for the ethdev api rte_eth_find_next(). + * If rte_eth_find_next() returns the same port id that we passed it, + * then we know that that port is active. 
+ */ +int32_t +rte_telemetry_is_port_active(int port_id); + +int32_t +rte_telemetry_send_ports_stats_values(uint32_t *metric_ids, int num_metric_ids, + uint32_t *port_ids, int num_port_ids, struct telemetry_impl *telemetry); + +int32_t +rte_telemetry_socket_messaging_testing(int index, int socket); + +#endif diff --git a/lib/librte_telemetry/rte_telemetry_parser.c b/lib/librte_telemetry/rte_telemetry_parser.c new file mode 100644 index 00000000..03a58a2f --- /dev/null +++ b/lib/librte_telemetry/rte_telemetry_parser.c @@ -0,0 +1,586 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "rte_telemetry_internal.h" + +typedef int (*command_func)(struct telemetry_impl *, int, json_t *); + +struct rte_telemetry_command { + char *text; + command_func fn; +} command; + +static int32_t +rte_telemetry_command_clients(struct telemetry_impl *telemetry, int action, + json_t *data) +{ + int ret; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Invalid telemetry argument"); + return -1; + } + + if (action != ACTION_DELETE) { + TELEMETRY_LOG_WARN("Invalid action for this command"); + goto einval_fail; + } + + if (!json_is_object(data)) { + TELEMETRY_LOG_WARN("Invalid data provided for this command"); + goto einval_fail; + } + + json_t *client_path = json_object_get(data, "client_path"); + if (!json_is_string(client_path)) { + TELEMETRY_LOG_WARN("Command value is not a string"); + goto einval_fail; + } + + ret = rte_telemetry_unregister_client(telemetry, + json_string_value(client_path)); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not unregister client"); + goto einval_fail; + } + + return 0; + +einval_fail: + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; +} + +static int32_t +rte_telemetry_command_ports(struct telemetry_impl *telemetry, int action, + json_t *data) +{ + int ret; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Invalid telemetry argument"); + return -1; + } + + if (!json_is_null(data)) { + TELEMETRY_LOG_WARN("Data should be NULL JSON object for 'ports' command"); + goto einval_fail; + } + + if (action != ACTION_GET) { + TELEMETRY_LOG_WARN("Invalid action for this command"); + goto einval_fail; + } + + return 0; + +einval_fail: + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; +} + +static int32_t +rte_telemetry_command_ports_details(struct telemetry_impl *telemetry, + int action, json_t *data) +{ + json_t *value, *port_ids_json = json_object_get(data, "ports"); + uint64_t num_port_ids = json_array_size(port_ids_json); + int ret, port_ids[num_port_ids]; + RTE_SET_USED(port_ids); + size_t index; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Invalid telemetry argument"); + return -1; + } + + if (action != ACTION_GET) { + TELEMETRY_LOG_WARN("Invalid action for this command"); + goto einval_fail; + } + + if (!json_is_object(data)) { + TELEMETRY_LOG_WARN("Invalid data provided for this command"); + goto einval_fail; + } + + if (!json_is_array(port_ids_json)) { + TELEMETRY_LOG_WARN("Invalid Port ID array"); + goto einval_fail; + } + + json_array_foreach(port_ids_json, index, value) { + if (!json_is_integer(value)) { + TELEMETRY_LOG_WARN("Port ID given is invalid"); + goto einval_fail; + } + port_ids[index] = json_integer_value(value); + } + + return 0; + +einval_fail: + ret = 
rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; +} + +static int32_t +rte_telemetry_command_port_stats(struct telemetry_impl *telemetry, int action, + json_t *data) +{ + int ret; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Invalid telemetry argument"); + return -1; + } + + if (!json_is_null(data)) { + TELEMETRY_LOG_WARN("Data should be NULL JSON object for 'port_stats' command"); + goto einval_fail; + } + + if (action != ACTION_GET) { + TELEMETRY_LOG_WARN("Invalid action for this command"); + goto einval_fail; + } + + return 0; + +einval_fail: + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; +} + +static int32_t +rte_telemetry_stat_names_to_ids(struct telemetry_impl *telemetry, + const char * const *stat_names, uint32_t *stat_ids, + uint64_t num_stat_names) +{ + struct rte_metric_name *names; + int ret, num_metrics; + uint32_t i, k; + + if (stat_names == NULL) { + TELEMETRY_LOG_WARN("Invalid stat_names argument"); + goto einval_fail; + } + + if (num_stat_names <= 0) { + TELEMETRY_LOG_WARN("Invalid num_stat_names argument"); + goto einval_fail; + } + + num_metrics = rte_metrics_get_names(NULL, 0); + if (num_metrics < 0) { + TELEMETRY_LOG_ERR("Cannot get metrics count"); + goto eperm_fail; + } else if (num_metrics == 0) { + TELEMETRY_LOG_WARN("No metrics have been registered"); + goto eperm_fail; + } + + names = malloc(sizeof(struct rte_metric_name) * num_metrics); + if (names == NULL) { + TELEMETRY_LOG_ERR("Cannot allocate memory for names"); + + ret = rte_telemetry_send_error_response(telemetry, -ENOMEM); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + + return -1; + } + + ret = rte_metrics_get_names(names, num_metrics); + if (ret < 0 || ret > num_metrics) { + TELEMETRY_LOG_ERR("Cannot get metrics names"); + free(names); + goto eperm_fail; + } + + k = 0; + for (i = 0; i < (uint32_t)num_stat_names; i++) { + uint32_t j; + for (j = 0; j < (uint32_t)num_metrics; j++) { + if (strcmp(stat_names[i], names[j].name) == 0) { + stat_ids[k] = j; + k++; + break; + } + } + } + + if (k != num_stat_names) { + TELEMETRY_LOG_WARN("Invalid stat names provided"); + free(names); + goto einval_fail; + } + + free(names); + return 0; + +einval_fail: + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + +eperm_fail: + ret = rte_telemetry_send_error_response(telemetry, -EPERM); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; +} + +int32_t +rte_telemetry_command_ports_all_stat_values(struct telemetry_impl *telemetry, + int action, json_t *data) +{ + int ret, num_metrics, i, p; + struct rte_metric_name *names; + uint64_t num_port_ids = 0; + uint32_t port_ids[RTE_MAX_ETHPORTS]; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Invalid telemetry argument"); + return -1; + } + + if (action != ACTION_GET) { + TELEMETRY_LOG_WARN("Invalid action for this command"); + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + } + + if (json_is_object(data)) { + TELEMETRY_LOG_WARN("Invalid data provided for this command"); + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + } + + num_metrics = rte_metrics_get_names(NULL, 0); + if (num_metrics < 0) { + 
TELEMETRY_LOG_ERR("Cannot get metrics count"); + + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + + return -1; + } else if (num_metrics == 0) { + TELEMETRY_LOG_ERR("No metrics to display (none have been registered)"); + + ret = rte_telemetry_send_error_response(telemetry, -EPERM); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + + return -1; + } + + names = malloc(sizeof(struct rte_metric_name) * num_metrics); + if (names == NULL) { + TELEMETRY_LOG_ERR("Cannot allocate memory"); + ret = rte_telemetry_send_error_response(telemetry, + -ENOMEM); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + } + + const char *stat_names[num_metrics]; + uint32_t stat_ids[num_metrics]; + + RTE_ETH_FOREACH_DEV(p) { + port_ids[num_port_ids] = p; + num_port_ids++; + } + + if (!num_port_ids) { + TELEMETRY_LOG_WARN("No active ports"); + + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + + goto fail; + } + + ret = rte_metrics_get_names(names, num_metrics); + for (i = 0; i < num_metrics; i++) + stat_names[i] = names[i].name; + + ret = rte_telemetry_stat_names_to_ids(telemetry, stat_names, stat_ids, + num_metrics); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not convert stat names to IDs"); + goto fail; + } + + ret = rte_telemetry_send_ports_stats_values(stat_ids, num_metrics, + port_ids, num_port_ids, telemetry); + if (ret < 0) { + TELEMETRY_LOG_ERR("Sending ports stats values failed"); + goto fail; + } + + return 0; + +fail: + free(names); + return -1; +} + +int32_t +rte_telemetry_command_ports_stats_values_by_name(struct telemetry_impl + *telemetry, int action, json_t *data) +{ + int ret; + json_t *port_ids_json = json_object_get(data, "ports"); + json_t *stat_names_json = json_object_get(data, "stats"); + uint64_t num_port_ids = json_array_size(port_ids_json); + uint64_t num_stat_names = json_array_size(stat_names_json); + const char *stat_names[num_stat_names]; + uint32_t port_ids[num_port_ids], stat_ids[num_stat_names]; + size_t index; + json_t *value; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Invalid telemetry argument"); + return -1; + } + + if (action != ACTION_GET) { + TELEMETRY_LOG_WARN("Invalid action for this command"); + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + } + + if (!json_is_object(data)) { + TELEMETRY_LOG_WARN("Invalid data provided for this command"); + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + } + + if (!json_is_array(port_ids_json) || + !json_is_array(stat_names_json)) { + TELEMETRY_LOG_WARN("Invalid input data array(s)"); + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + } + + json_array_foreach(port_ids_json, index, value) { + if (!json_is_integer(value)) { + TELEMETRY_LOG_WARN("Port ID given is not valid"); + ret = rte_telemetry_send_error_response(telemetry, + -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + } + port_ids[index] = json_integer_value(value); + ret = rte_telemetry_is_port_active(port_ids[index]); + if (ret < 1) { + ret = rte_telemetry_send_error_response(telemetry, + -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -1; + } + } + + 
json_array_foreach(stat_names_json, index, value) { + if (!json_is_string(value)) { + TELEMETRY_LOG_WARN("Stat Name given is not a string"); + + ret = rte_telemetry_send_error_response(telemetry, + -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + + return -1; + } + stat_names[index] = json_string_value(value); + } + + ret = rte_telemetry_stat_names_to_ids(telemetry, stat_names, stat_ids, + num_stat_names); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not convert stat names to IDs"); + return -1; + } + + ret = rte_telemetry_send_ports_stats_values(stat_ids, num_stat_names, + port_ids, num_port_ids, telemetry); + if (ret < 0) { + TELEMETRY_LOG_ERR("Sending ports stats values failed"); + return -1; + } + + return 0; +} + +static int32_t +rte_telemetry_parse_command(struct telemetry_impl *telemetry, int action, + const char *command, json_t *data) +{ + int ret; + uint32_t i; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Invalid telemetry argument"); + return -1; + } + + struct rte_telemetry_command commands[] = { + { + .text = "clients", + .fn = &rte_telemetry_command_clients + }, + { + .text = "ports", + .fn = &rte_telemetry_command_ports + }, + { + .text = "ports_details", + .fn = &rte_telemetry_command_ports_details + }, + { + .text = "port_stats", + .fn = &rte_telemetry_command_port_stats + }, + { + .text = "ports_stats_values_by_name", + .fn = &rte_telemetry_command_ports_stats_values_by_name + }, + { + .text = "ports_all_stat_values", + .fn = &rte_telemetry_command_ports_all_stat_values + } + }; + + const uint32_t num_commands = RTE_DIM(commands); + + for (i = 0; i < num_commands; i++) { + if (strcmp(command, commands[i].text) == 0) { + ret = commands[i].fn(telemetry, action, data); + if (ret < 0) { + TELEMETRY_LOG_ERR("Command Function for %s failed", + commands[i].text); + return -1; + } + return 0; + } + } + + TELEMETRY_LOG_WARN("\"%s\" command not found", command); + + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + + return -1; +} + +int32_t __rte_experimental +rte_telemetry_parse(struct telemetry_impl *telemetry, char *socket_rx_data) +{ + int ret, action_int; + json_error_t error; + json_t *root, *action, *command, *data; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Invalid telemetry argument"); + return -1; + } + + root = json_loads(socket_rx_data, 0, &error); + if (root == NULL) { + TELEMETRY_LOG_WARN("Could not load JSON object from data passed in : %s", + error.text); + ret = rte_telemetry_send_error_response(telemetry, -EPERM); + if (ret < 0) + TELEMETRY_LOG_ERR("Could not send error"); + return -EPERM; + } else if (!json_is_object(root)) { + TELEMETRY_LOG_WARN("JSON Request is not a JSON object"); + json_decref(root); + goto einval_fail; + } + + action = json_object_get(root, "action"); + if (action == NULL) { + TELEMETRY_LOG_WARN("Request does not have action field"); + goto einval_fail; + } else if (!json_is_integer(action)) { + TELEMETRY_LOG_WARN("Action value is not an integer"); + goto einval_fail; + } + + command = json_object_get(root, "command"); + if (command == NULL) { + TELEMETRY_LOG_WARN("Request does not have command field"); + goto einval_fail; + } else if (!json_is_string(command)) { + TELEMETRY_LOG_WARN("Command value is not a string"); + goto einval_fail; + } + + action_int = json_integer_value(action); + if (action_int != ACTION_GET && action_int != ACTION_DELETE) { + TELEMETRY_LOG_WARN("Invalid action code"); + goto einval_fail; + } + + const char 
*command_string = json_string_value(command); + data = json_object_get(root, "data"); + if (data == NULL) { + TELEMETRY_LOG_WARN("Request does not have data field"); + goto einval_fail; + } + + ret = rte_telemetry_parse_command(telemetry, action_int, command_string, + data); + if (ret < 0) { + TELEMETRY_LOG_WARN("Could not parse command"); + return -EINVAL; + } + + return 0; + +einval_fail: + ret = rte_telemetry_send_error_response(telemetry, -EINVAL); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not send error"); + return -EPERM; + } + return -EINVAL; +} diff --git a/lib/librte_telemetry/rte_telemetry_parser.h b/lib/librte_telemetry/rte_telemetry_parser.h new file mode 100644 index 00000000..b7051945 --- /dev/null +++ b/lib/librte_telemetry/rte_telemetry_parser.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include "rte_telemetry_internal.h" +#include "rte_compat.h" + +#ifndef _RTE_TELEMETRY_PARSER_H_ +#define _RTE_TELEMETRY_PARSER_H_ + +int32_t __rte_experimental +rte_telemetry_parse(struct telemetry_impl *telemetry, char *socket_rx_data); + +#endif diff --git a/lib/librte_telemetry/rte_telemetry_parser_test.c b/lib/librte_telemetry/rte_telemetry_parser_test.c new file mode 100644 index 00000000..5fe93fa6 --- /dev/null +++ b/lib/librte_telemetry/rte_telemetry_parser_test.c @@ -0,0 +1,534 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "rte_telemetry_parser.h" + +enum choices { + INV_ACTION_VAL, + INV_COMMAND_VAL, + INV_DATA_VAL, + INV_ACTION_FIELD, + INV_COMMAND_FIELD, + INV_DATA_FIELD, + INV_JSON_FORMAT, + VALID_REQ +}; + + +#define TEST_CLIENT "/var/run/dpdk/test_client" + +int32_t +rte_telemetry_create_test_socket(struct telemetry_impl *telemetry, + const char *test_client_path) +{ + int ret, sockfd; + struct sockaddr_un addr = {0}; + struct telemetry_client *client; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Telemetry argument has not been initialised"); + return -EINVAL; + } + + sockfd = socket(AF_UNIX, SOCK_SEQPACKET, 0); + if (sockfd < 0) { + TELEMETRY_LOG_ERR("Test socket creation failure"); + return -1; + } + + addr.sun_family = AF_UNIX; + strlcpy(addr.sun_path, test_client_path, sizeof(addr.sun_path)); + unlink(test_client_path); + + if (bind(sockfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + TELEMETRY_LOG_ERR("Test socket binding failure"); + return -1; + } + + if (listen(sockfd, 1) < 0) { + TELEMETRY_LOG_ERR("Listen failure"); + return -1; + } + + ret = rte_telemetry_register_client(telemetry, test_client_path); + if (ret < 0) { + TELEMETRY_LOG_ERR("Register dummy client failed: %i", ret); + return -1; + } + + ret = accept(sockfd, NULL, NULL); + if (ret < 0) { + TELEMETRY_LOG_ERR("Socket accept failed"); + return -1; + } + + TAILQ_FOREACH(client, &telemetry->client_list_head, client_list) + telemetry->request_client = client; + + return 0; +} + +int32_t +rte_telemetry_format_port_stat_ids(int *port_ids, int num_port_ids, + const char * const *stat_names, int num_stat_names, json_t **data) +{ + + int ret; + json_t *stat_names_json_array = NULL; + json_t *port_ids_json_array = NULL; + uint32_t i; + + if (num_port_ids < 0) { + TELEMETRY_LOG_ERR("Port Ids Count invalid"); + goto fail; + } + + *data = json_object(); + if (*data == NULL) { + TELEMETRY_LOG_ERR("Data json object creation failed"); + goto fail; + } + + port_ids_json_array = 
json_array(); + if (port_ids_json_array == NULL) { + TELEMETRY_LOG_ERR("port_ids_json_array creation failed"); + goto fail; + } + + for (i = 0; i < (uint32_t)num_port_ids; i++) { + ret = json_array_append(port_ids_json_array, + json_integer(port_ids[i])); + if (ret < 0) { + TELEMETRY_LOG_ERR("JSON array creation failed"); + goto fail; + } + } + + ret = json_object_set_new(*data, "ports", port_ids_json_array); + if (ret < 0) { + TELEMETRY_LOG_ERR("Setting 'ports' value in data object failed"); + goto fail; + } + + if (stat_names) { + if (num_stat_names < 0) { + TELEMETRY_LOG_ERR("Stat Names Count invalid"); + goto fail; + } + + stat_names_json_array = json_array(); + if (stat_names_json_array == NULL) { + TELEMETRY_LOG_ERR("stat_names_json_array creation failed"); + goto fail; + } + + uint32_t i; + for (i = 0; i < (uint32_t)num_stat_names; i++) { + ret = json_array_append(stat_names_json_array, + json_string(stat_names[i])); + if (ret < 0) { + TELEMETRY_LOG_ERR("JSON array creation failed"); + goto fail; + } + } + + ret = json_object_set_new(*data, "stats", stat_names_json_array); + if (ret < 0) { + TELEMETRY_LOG_ERR("Setting 'stats' value in data object failed"); + goto fail; + } + } + + return 0; + +fail: + if (*data) + json_decref(*data); + if (stat_names_json_array) + json_decref(stat_names_json_array); + if (port_ids_json_array) + json_decref(port_ids_json_array); + return -1; +} + +int32_t +rte_telemetry_create_json_request(int action, char *command, + const char *client_path, int *port_ids, int num_port_ids, + const char * const *stat_names, int num_stat_names, char **request, + int inv_choice) +{ + int ret; + json_t *root = json_object(); + json_t *data; + + if (root == NULL) { + TELEMETRY_LOG_ERR("Could not create root json object"); + goto fail; + } + + if (inv_choice == INV_ACTION_FIELD) { + ret = json_object_set_new(root, "ac--on", json_integer(action)); + if (ret < 0) { + TELEMETRY_LOG_ERR("Setting invalid action field in root object failed"); + goto fail; + } + } else { + ret = json_object_set_new(root, "action", json_integer(action)); + if (ret < 0) { + TELEMETRY_LOG_ERR("Setting valid action field in root object failed"); + goto fail; + } + } + + if (inv_choice == INV_COMMAND_FIELD) { + ret = json_object_set_new(root, "co---nd", json_string(command)); + if (ret < 0) { + TELEMETRY_LOG_ERR("Setting invalid command field in root object failed"); + goto fail; + } + } else { + ret = json_object_set_new(root, "command", json_string(command)); + if (ret < 0) { + TELEMETRY_LOG_ERR("Setting valid command field in root object failed"); + goto fail; + } + } + + data = json_null(); + if (client_path) { + data = json_object(); + if (data == NULL) { + TELEMETRY_LOG_ERR("Data json object creation failed"); + goto fail; + } + + ret = json_object_set_new(data, "client_path", + json_string(client_path)); + if (ret < 0) { + TELEMETRY_LOG_ERR("Setting valid client_path field in data object failed"); + goto fail; + } + + } else if (port_ids) { + ret = rte_telemetry_format_port_stat_ids(port_ids, num_port_ids, + stat_names, num_stat_names, &data); + if (ret < 0) { + TELEMETRY_LOG_ERR("Formatting Port/Stat arrays failed"); + goto fail; + } + + } + + if (inv_choice == INV_DATA_FIELD) { + ret = json_object_set_new(root, "d--a", data); + if (ret < 0) { + TELEMETRY_LOG_ERR("Setting invalid data field in data object failed"); + goto fail; + } + } else { + ret = json_object_set_new(root, "data", data); + if (ret < 0) { + TELEMETRY_LOG_ERR("Setting valid data field in data object failed"); + goto fail; + 
} + } + + *request = json_dumps(root, 0); + if (*request == NULL) { + TELEMETRY_LOG_ERR("Converting JSON root object to char* failed"); + goto fail; + } + + json_decref(root); + return 0; + +fail: + if (root) + json_decref(root); + return -1; +} + +int32_t +rte_telemetry_send_get_ports_and_stats_request(struct telemetry_impl *telemetry, + int action_choice, char *command_choice, int inv_choice) +{ + int ret; + char *request; + char *client_path_data = NULL; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Telemetry argument has not been initialised"); + return -EINVAL; + } + + + if (inv_choice == INV_ACTION_VAL) + action_choice = -1; + else if (inv_choice == INV_COMMAND_VAL) + command_choice = "INVALID_COMMAND"; + else if (inv_choice == INV_DATA_VAL) + client_path_data = "INVALID_DATA"; + + ret = rte_telemetry_create_json_request(action_choice, command_choice, + client_path_data, NULL, -1, NULL, -1, &request, inv_choice); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not create JSON Request"); + return -1; + } + + if (inv_choice == INV_JSON_FORMAT) + request++; + + ret = rte_telemetry_parse(telemetry, request); + if (ret < 0) { + TELEMETRY_LOG_WARN("Could not parse JSON Request"); + return -1; + } + + return 0; +} + +int32_t +rte_telemetry_send_get_ports_details_request(struct telemetry_impl *telemetry, + int action_choice, int *port_ids, int num_port_ids, int inv_choice) +{ + int ret; + char *request; + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Telemetry argument has not been initialised"); + return -EINVAL; + } + + char *command = "ports_details"; + + if (inv_choice == INV_ACTION_VAL) + action_choice = -1; + else if (inv_choice == INV_COMMAND_VAL) + command = "INVALID_COMMAND"; + else if (inv_choice == INV_DATA_VAL) + port_ids = NULL; + + + ret = rte_telemetry_create_json_request(action_choice, command, NULL, + port_ids, num_port_ids, NULL, -1, &request, inv_choice); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not create JSON Request"); + return -1; + } + + if (inv_choice == INV_JSON_FORMAT) + request++; + + ret = rte_telemetry_parse(telemetry, request); + if (ret < 0) { + TELEMETRY_LOG_WARN("Could not parse JSON Request"); + return -1; + } + + return 0; +} + +int32_t +rte_telemetry_send_stats_values_by_name_request(struct telemetry_impl + *telemetry, int action_choice, int *port_ids, int num_port_ids, + const char * const *stat_names, int num_stat_names, + int inv_choice) +{ + int ret; + char *request; + char *command = "ports_stats_values_by_name"; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Telemetry argument has not been initialised"); + return -EINVAL; + } + + if (inv_choice == INV_ACTION_VAL) + action_choice = -1; + else if (inv_choice == INV_COMMAND_VAL) + command = "INVALID_COMMAND"; + else if (inv_choice == INV_DATA_VAL) { + port_ids = NULL; + stat_names = NULL; + } + + ret = rte_telemetry_create_json_request(action_choice, command, NULL, + port_ids, num_port_ids, stat_names, num_stat_names, &request, + inv_choice); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not create JSON Request"); + return -1; + } + + if (inv_choice == INV_JSON_FORMAT) + request++; + + ret = rte_telemetry_parse(telemetry, request); + if (ret < 0) { + TELEMETRY_LOG_WARN("Could not parse JSON Request"); + return -1; + } + + return 0; +} + +int32_t +rte_telemetry_send_unreg_request(struct telemetry_impl *telemetry, + int action_choice, const char *client_path, int inv_choice) +{ + int ret; + char *request; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Telemetry argument has not been initialised"); 
+ return -EINVAL; + } + + char *command = "clients"; + + if (inv_choice == INV_ACTION_VAL) + action_choice = -1; + else if (inv_choice == INV_COMMAND_VAL) + command = "INVALID_COMMAND"; + else if (inv_choice == INV_DATA_VAL) + client_path = NULL; + + ret = rte_telemetry_create_json_request(action_choice, command, + client_path, NULL, -1, NULL, -1, &request, inv_choice); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not create JSON Request"); + return -1; + } + + if (inv_choice == INV_JSON_FORMAT) + request++; + + ret = rte_telemetry_parse(telemetry, request); + if (ret < 0) { + TELEMETRY_LOG_WARN("Could not parse JSON Request"); + return -1; + } + + return 0; +} + +int32_t +rte_telemetry_parser_test(struct telemetry_impl *telemetry) +{ + int ret; + const char *client_path = TEST_CLIENT; + + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Telemetry argument has not been initialised"); + return -EINVAL; + } + + ret = rte_telemetry_create_test_socket(telemetry, client_path); + if (ret < 0) { + TELEMETRY_LOG_ERR("Could not create test request client socket"); + return -1; + } + + int port_ids[] = {0, 1}; + int num_port_ids = RTE_DIM(port_ids); + + static const char * const stat_names[] = {"tx_good_packets", + "rx_good_packets"}; + int num_stat_names = RTE_DIM(stat_names); + + static const char * const test_types[] = { + "INVALID ACTION VALUE TESTS", + "INVALID COMMAND VALUE TESTS", + "INVALID DATA VALUE TESTS", + "INVALID ACTION FIELD TESTS", + "INVALID COMMAND FIELD TESTS", + "INVALID DATA FIELD TESTS", + "INVALID JSON FORMAT TESTS", + "VALID TESTS" + }; + + +#define NUM_TEST_TYPES (sizeof(test_types)/sizeof(const char * const)) + + uint32_t i; + for (i = 0; i < NUM_TEST_TYPES; i++) { + TELEMETRY_LOG_INFO("%s", test_types[i]); + + ret = rte_telemetry_send_get_ports_and_stats_request(telemetry, + ACTION_GET, "ports", i); + if (ret != 0 && i == VALID_REQ) { + TELEMETRY_LOG_ERR("Get ports valid test failed"); + return -EPERM; + } else if (ret != -1 && i != VALID_REQ) { + TELEMETRY_LOG_ERR("Get ports invalid test failed"); + return -EPERM; + } + + TELEMETRY_LOG_INFO("Success - Get ports test passed"); + + ret = rte_telemetry_send_get_ports_details_request(telemetry, + ACTION_GET, port_ids, num_port_ids, i); + if (ret != 0 && i == VALID_REQ) { + TELEMETRY_LOG_ERR("Get ports details valid"); + return -EPERM; + } else if (ret != -1 && i != VALID_REQ) { + TELEMETRY_LOG_ERR("Get ports details invalid"); + return -EPERM; + } + + TELEMETRY_LOG_INFO("Success - Get ports details test passed"); + + ret = rte_telemetry_send_get_ports_and_stats_request(telemetry, + ACTION_GET, "port_stats", i); + if (ret != 0 && i == VALID_REQ) { + TELEMETRY_LOG_ERR("Get port stats valid test"); + return -EPERM; + } else if (ret != -1 && i != VALID_REQ) { + TELEMETRY_LOG_ERR("Get ports stats invalid test failed"); + return -EPERM; + } + + TELEMETRY_LOG_INFO("Success - Get ports stats test passed"); + + ret = rte_telemetry_send_stats_values_by_name_request(telemetry, + ACTION_GET, port_ids, num_port_ids, stat_names, + num_stat_names, i); + if (ret != 0 && i == VALID_REQ) { + TELEMETRY_LOG_ERR("Get ports stats values by name valid test failed"); + return -EPERM; + } else if (ret != -1 && i != VALID_REQ) { + TELEMETRY_LOG_ERR("Get ports stats values by name invalid test failed"); + return -EPERM; + } + + TELEMETRY_LOG_INFO("Success - Get ports stats values by name test passed"); + + ret = rte_telemetry_send_unreg_request(telemetry, ACTION_DELETE, + client_path, i); + if (ret != 0 && i == VALID_REQ) { + TELEMETRY_LOG_ERR("Deregister 
valid test failed"); + return -EPERM; + } else if (ret != -1 && i != VALID_REQ) { + TELEMETRY_LOG_ERR("Deregister invalid test failed"); + return -EPERM; + } + + TELEMETRY_LOG_INFO("Success - Deregister test passed"); + } + + return 0; +} diff --git a/lib/librte_telemetry/rte_telemetry_parser_test.h b/lib/librte_telemetry/rte_telemetry_parser_test.h new file mode 100644 index 00000000..6ada8527 --- /dev/null +++ b/lib/librte_telemetry/rte_telemetry_parser_test.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _RTE_TELEMETRY_PARSER_TEST_H_ +#define _RTE_TELEMETRY_PARSER_TEST_H_ + +int32_t +rte_telemetry_parser_test(struct telemetry_impl *telemetry); + +int32_t +rte_telemetry_format_port_stat_ids(int *port_ids, int num_port_ids, + const char * const stat_names, int num_stat_names, json_t **data); + +int32_t +rte_telemetry_create_json_request(int action, char *command, + const char *client_path, int *port_ids, int num_port_ids, + const char * const stat_names, int num_stat_names, char **request, + int inv_choice); + +int32_t +rte_telemetry_send_get_ports_and_stats_request(struct telemetry_impl *telemetry, + int action_choice, char *command_choice, int inv_choice); + +int32_t +rte_telemetry_send_get_ports_details_request(struct telemetry_impl *telemetry, + int action_choice, int *port_ids, int num_port_ids, int inv_choice); + +int32_t +rte_telemetry_send_stats_values_by_name_request(struct telemetry_impl + *telemetry, int action_choice, int *port_ids, int num_port_ids, + const char * const stat_names, int num_stat_names, + int inv_choice); + +int32_t +rte_telemetry_send_unreg_request(int action_choice, const char *client_path, + int inv_choice); + +#endif diff --git a/lib/librte_telemetry/rte_telemetry_socket_tests.h b/lib/librte_telemetry/rte_telemetry_socket_tests.h new file mode 100644 index 00000000..db9167c5 --- /dev/null +++ b/lib/librte_telemetry/rte_telemetry_socket_tests.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include + +#include "rte_telemetry_internal.h" + +#ifndef _RTE_TELEMETRY_SOCKET_TESTING_H_ +#define _RTE_TELEMETRY_SOCKET_TESTING_H_ + +int32_t +rte_telemetry_json_socket_message_test(struct telemetry_impl *telemetry, + int fd); + +int32_t +rte_telemetry_invalid_json_test(struct telemetry_impl *telemetry, int fd); + +int32_t +rte_telemetry_valid_json_test(struct telemetry_impl *telemetry, int fd); + +int32_t +rte_telemetry_json_contents_test(struct telemetry_impl *telemetry, int fd); + +int32_t +rte_telemetry_json_empty_test(struct telemetry_impl *telemetry, int fd); + +int32_t +rte_telemetry_socket_register_test(struct telemetry_impl *telemetry, int *fd, + int send_fd, int recv_fd); + +int32_t +rte_telemetry_socket_test_setup(struct telemetry_impl *telemetry, int *send_fd, + int *recv_fd); + +#endif diff --git a/lib/librte_telemetry/rte_telemetry_version.map b/lib/librte_telemetry/rte_telemetry_version.map new file mode 100644 index 00000000..fa62d771 --- /dev/null +++ b/lib/librte_telemetry/rte_telemetry_version.map @@ -0,0 +1,10 @@ +EXPERIMENTAL { + global: + + rte_telemetry_cleanup; + rte_telemetry_init; + rte_telemetry_parse; + rte_telemetry_selftest; + + local: *; +}; diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index de431fbb..5dd31898 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -13,13 +13,13 @@ LIBABIVER := 4 CFLAGS += -DALLOW_EXPERIMENTAL_API CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 
-D_FILE_OFFSET_BITS=64 CFLAGS += -I vhost_user +CFLAGS += -fno-strict-aliasing LDLIBS += -lpthread ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) LDLIBS += -lnuma endif -LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev -lrte_net \ - -lrte_cryptodev -lrte_hash +LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev -lrte_net # all source are stored in SRCS-y SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \ @@ -30,6 +30,7 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h # only compile vhost crypto when cryptodev is enabled ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y) +LDLIBS += -lrte_cryptodev -lrte_hash SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost_crypto.c SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost_crypto.h endif diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build index bd62e0e3..e33e6fc1 100644 --- a/lib/librte_vhost/meson.build +++ b/lib/librte_vhost/meson.build @@ -7,8 +7,11 @@ endif if has_libnuma == 1 dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true) endif +dpdk_conf.set('RTE_LIBRTE_VHOST_POSTCOPY', + cc.has_header('linux/userfaultfd.h')) version = 4 allow_experimental_apis = true +cflags += '-fno-strict-aliasing' sources = files('fd_man.c', 'iotlb.c', 'socket.c', 'vdpa.c', 'vhost.c', 'vhost_user.c', 'virtio_net.c', 'vhost_crypto.c') diff --git a/lib/librte_vhost/rte_vdpa.h b/lib/librte_vhost/rte_vdpa.h index 90465ca2..a418da47 100644 --- a/lib/librte_vhost/rte_vdpa.h +++ b/lib/librte_vhost/rte_vdpa.h @@ -21,67 +21,138 @@ enum vdpa_addr_type { VDPA_ADDR_MAX }; +/** + * vdpa device address + */ struct rte_vdpa_dev_addr { + /** vdpa address type */ enum vdpa_addr_type type; + + /** vdpa pci address */ union { uint8_t __dummy[64]; struct rte_pci_addr pci_addr; }; }; +/** + * vdpa device operations + */ struct rte_vdpa_dev_ops { - /* Get capabilities of this device */ + /** Get capabilities of this device */ int (*get_queue_num)(int did, uint32_t *queue_num); + + /** Get supported features of this device */ int (*get_features)(int did, uint64_t *features); + + /** Get supported protocol features of this device */ int (*get_protocol_features)(int did, uint64_t *protocol_features); - /* Driver configure/close the device */ + /** Driver configure/close the device */ int (*dev_conf)(int vid); int (*dev_close)(int vid); - /* Enable/disable this vring */ + /** Enable/disable this vring */ int (*set_vring_state)(int vid, int vring, int state); - /* Set features when changed */ + /** Set features when changed */ int (*set_features)(int vid); - /* Destination operations when migration done */ + /** Destination operations when migration done */ int (*migration_done)(int vid); - /* Get the vfio group fd */ + /** Get the vfio group fd */ int (*get_vfio_group_fd)(int vid); - /* Get the vfio device fd */ + /** Get the vfio device fd */ int (*get_vfio_device_fd)(int vid); - /* Get the notify area info of the queue */ + /** Get the notify area info of the queue */ int (*get_notify_area)(int vid, int qid, uint64_t *offset, uint64_t *size); - /* Reserved for future extension */ + /** Reserved for future extension */ void *reserved[5]; }; +/** + * vdpa device structure includes device address and device operations. 
+ */ struct rte_vdpa_device { + /** vdpa device address */ struct rte_vdpa_dev_addr addr; + /** vdpa device operations */ struct rte_vdpa_dev_ops *ops; } __rte_cache_aligned; -/* Register a vdpa device, return did if successful, -1 on failure */ +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Register a vdpa device + * + * @param addr + * the vdpa device address + * @param ops + * the vdpa device operations + * @return + * device id on success, -1 on failure + */ int __rte_experimental rte_vdpa_register_device(struct rte_vdpa_dev_addr *addr, struct rte_vdpa_dev_ops *ops); -/* Unregister a vdpa device, return -1 on failure */ +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Unregister a vdpa device + * + * @param did + * vdpa device id + * @return + * device id on success, -1 on failure + */ int __rte_experimental rte_vdpa_unregister_device(int did); -/* Find did of a vdpa device, return -1 on failure */ +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Find the device id of a vdpa device + * + * @param addr + * the vdpa device address + * @return + * device id on success, -1 on failure + */ int __rte_experimental rte_vdpa_find_device_id(struct rte_vdpa_dev_addr *addr); -/* Find a vdpa device based on did */ +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Find a vdpa device based on device id + * + * @param did + * device id + * @return + * rte_vdpa_device on success, NULL on failure + */ struct rte_vdpa_device * __rte_experimental rte_vdpa_get_device(int did); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Get current available vdpa device number + * + * @return + * available vdpa device number + */ +int __rte_experimental +rte_vdpa_get_device_num(void); #endif /* _RTE_VDPA_H_ */ diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h index b02673d4..d280ac42 100644 --- a/lib/librte_vhost/rte_vhost.h +++ b/lib/librte_vhost/rte_vhost.h @@ -28,6 +28,7 @@ extern "C" { #define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1) #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2) #define RTE_VHOST_USER_IOMMU_SUPPORT (1ULL << 3) +#define RTE_VHOST_USER_POSTCOPY_SUPPORT (1ULL << 4) /** Protocol features. */ #ifndef VHOST_USER_PROTOCOL_F_MQ @@ -58,6 +59,10 @@ extern "C" { #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7 #endif +#ifndef VHOST_USER_PROTOCOL_F_PAGEFAULT +#define VHOST_USER_PROTOCOL_F_PAGEFAULT 8 +#endif + #ifndef VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 #endif diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map index da220dd0..ae39b6e2 100644 --- a/lib/librte_vhost/rte_vhost_version.map +++ b/lib/librte_vhost/rte_vhost_version.map @@ -67,6 +67,7 @@ EXPERIMENTAL { rte_vdpa_unregister_device; rte_vdpa_find_device_id; rte_vdpa_get_device; + rte_vdpa_get_device_num; rte_vhost_driver_attach_vdpa_device; rte_vhost_driver_detach_vdpa_device; rte_vhost_driver_get_vdpa_device_id; diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c index d6303174..01b60ff9 100644 --- a/lib/librte_vhost/socket.c +++ b/lib/librte_vhost/socket.c @@ -51,6 +51,8 @@ struct vhost_user_socket { uint64_t supported_features; uint64_t features; + uint64_t protocol_features; + /* * Device id to identify a specific backend device. * It's set to -1 for the default software implementation. 
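For reference, the new RTE_VHOST_USER_POSTCOPY_SUPPORT flag declared above is purely opt-in at socket registration time. Below is a minimal sketch of how an application might request it, assuming the usual rte_vhost_driver_register()/rte_vhost_driver_start() flow; the helper name and error handling are illustrative only, not part of this patch:

#include <stdint.h>
#include <rte_vhost.h>

static int
register_postcopy_socket(const char *path)
{
	/* Opt in to postcopy live-migration support for this socket. */
	uint64_t flags = RTE_VHOST_USER_POSTCOPY_SUPPORT;

	/* Fails if the library was built without RTE_LIBRTE_VHOST_POSTCOPY. */
	if (rte_vhost_driver_register(path, flags) < 0)
		return -1;

	return rte_vhost_driver_start(path);
}

As the socket.c hunk further below shows, postcopy is also masked out automatically when dequeue zero-copy is requested, because zero-copy requires the guest memory to be populated.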
@@ -94,18 +96,23 @@ static struct vhost_user vhost_user = { .mutex = PTHREAD_MUTEX_INITIALIZER, }; -/* return bytes# of read on success or negative val on failure. */ +/* + * return bytes# of read on success or negative val on failure. Update fdnum + * with number of fds read. + */ int -read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds, + int *fd_num) { struct iovec iov; struct msghdr msgh; - size_t fdsize = fd_num * sizeof(int); - char control[CMSG_SPACE(fdsize)]; + char control[CMSG_SPACE(max_fds * sizeof(int))]; struct cmsghdr *cmsg; int got_fds = 0; int ret; + *fd_num = 0; + memset(&msgh, 0, sizeof(msgh)); iov.iov_base = buf; iov.iov_len = buflen; @@ -131,13 +138,14 @@ read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) if ((cmsg->cmsg_level == SOL_SOCKET) && (cmsg->cmsg_type == SCM_RIGHTS)) { got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + *fd_num = got_fds; memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int)); break; } } /* Clear out unused file descriptors */ - while (got_fds < fd_num) + while (got_fds < max_fds) fds[got_fds++] = -1; return ret; @@ -720,7 +728,7 @@ rte_vhost_driver_get_protocol_features(const char *path, did = vsocket->vdpa_dev_id; vdpa_dev = rte_vdpa_get_device(did); if (!vdpa_dev || !vdpa_dev->ops->get_protocol_features) { - *protocol_features = VHOST_USER_PROTOCOL_FEATURES; + *protocol_features = vsocket->protocol_features; goto unlock_exit; } @@ -733,7 +741,7 @@ rte_vhost_driver_get_protocol_features(const char *path, goto unlock_exit; } - *protocol_features = VHOST_USER_PROTOCOL_FEATURES + *protocol_features = vsocket->protocol_features & vdpa_protocol_features; unlock_exit: @@ -852,11 +860,21 @@ rte_vhost_driver_register(const char *path, uint64_t flags) vsocket->use_builtin_virtio_net = true; vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES; vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES; + vsocket->protocol_features = VHOST_USER_PROTOCOL_FEATURES; - /* Dequeue zero copy can't assure descriptors returned in order */ + /* + * Dequeue zero copy can't assure descriptors returned in order. + * Also, it requires that the guest memory is populated, which is + * not compatible with postcopy. 
+ */ if (vsocket->dequeue_zero_copy) { vsocket->supported_features &= ~(1ULL << VIRTIO_F_IN_ORDER); vsocket->features &= ~(1ULL << VIRTIO_F_IN_ORDER); + + RTE_LOG(INFO, VHOST_CONFIG, + "Dequeue zero copy requested, disabling postcopy support\n"); + vsocket->protocol_features &= + ~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT); } if (!(flags & RTE_VHOST_USER_IOMMU_SUPPORT)) { @@ -864,6 +882,18 @@ rte_vhost_driver_register(const char *path, uint64_t flags) vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM); } + if (!(flags & RTE_VHOST_USER_POSTCOPY_SUPPORT)) { + vsocket->protocol_features &= + ~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT); + } else { +#ifndef RTE_LIBRTE_VHOST_POSTCOPY + RTE_LOG(ERR, VHOST_CONFIG, + "Postcopy requested but not compiled\n"); + ret = -1; + goto out_mutex; +#endif + } + if ((flags & RTE_VHOST_USER_CLIENT) != 0) { vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT); if (vsocket->reconnect && reconn_tid == 0) { diff --git a/lib/librte_vhost/vdpa.c b/lib/librte_vhost/vdpa.c index c82fd437..c2c5dff1 100644 --- a/lib/librte_vhost/vdpa.c +++ b/lib/librte_vhost/vdpa.c @@ -113,3 +113,9 @@ rte_vdpa_get_device(int did) return vdpa_devices[did]; } + +int +rte_vdpa_get_device_num(void) +{ + return vdpa_device_num; +} diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index 3c9be10a..70ac6bc9 100644 --- a/lib/librte_vhost/vhost.c +++ b/lib/librte_vhost/vhost.c @@ -8,6 +8,7 @@ #include #include #ifdef RTE_LIBRTE_VHOST_NUMA +#include #include #endif @@ -343,6 +344,7 @@ vhost_new_device(void) dev->flags = VIRTIO_DEV_BUILTIN_VIRTIO_NET; dev->slave_req_fd = -1; dev->vdpa_dev_id = -1; + dev->postcopy_ufd = -1; rte_spinlock_init(&dev->slave_req_lock); return i; @@ -480,7 +482,7 @@ rte_vhost_get_numa_node(int vid) int numa_node; int ret; - if (dev == NULL) + if (dev == NULL || numa_available() != 0) return -1; ret = get_mempolicy(&numa_node, NULL, 0, dev, @@ -646,12 +648,18 @@ rte_vhost_avail_entries(int vid, uint16_t queue_id) } static inline void -vhost_enable_notify_split(struct vhost_virtqueue *vq, int enable) +vhost_enable_notify_split(struct virtio_net *dev, + struct vhost_virtqueue *vq, int enable) { - if (enable) - vq->used->flags &= ~VRING_USED_F_NO_NOTIFY; - else - vq->used->flags |= VRING_USED_F_NO_NOTIFY; + if (!(dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))) { + if (enable) + vq->used->flags &= ~VRING_USED_F_NO_NOTIFY; + else + vq->used->flags |= VRING_USED_F_NO_NOTIFY; + } else { + if (enable) + vhost_avail_event(vq) = vq->last_avail_idx; + } } static inline void @@ -660,8 +668,10 @@ vhost_enable_notify_packed(struct virtio_net *dev, { uint16_t flags; - if (!enable) + if (!enable) { vq->device_event->flags = VRING_EVENT_F_DISABLE; + return; + } flags = VRING_EVENT_F_ENABLE; if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { @@ -689,7 +699,7 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) if (vq_is_packed(dev)) vhost_enable_notify_packed(dev, vq, enable); else - vhost_enable_notify_split(vq, enable); + vhost_enable_notify_split(dev, vq, enable); return 0; } diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 760a09c0..b4abad30 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -284,6 +284,16 @@ struct guest_page { uint64_t size; }; +/* The possible results of a message handling function */ +enum vh_result { + /* Message handling failed */ + VH_RESULT_ERR = -1, + /* Message handling successful */ + VH_RESULT_OK = 0, + /* Message handling successful and reply prepared */ 
+ VH_RESULT_REPLY = 1, +}; + /** * function prototype for the vhost backend to handler specific vhost user * messages prior to the master message handling @@ -292,17 +302,15 @@ struct guest_page { * vhost device id * @param msg * Message pointer. - * @param require_reply - * If the handler requires sending a reply, this varaible shall be written 1, - * otherwise 0. * @param skip_master * If the handler requires skipping the master message handling, this variable * shall be written 1, otherwise 0. * @return - * 0 on success, -1 on failure + * VH_RESULT_OK on success, VH_RESULT_REPLY on success with reply, + * VH_RESULT_ERR on failure */ -typedef int (*vhost_msg_pre_handle)(int vid, void *msg, - uint32_t *require_reply, uint32_t *skip_master); +typedef enum vh_result (*vhost_msg_pre_handle)(int vid, void *msg, + uint32_t *skip_master); /** * function prototype for the vhost backend to handler specific vhost user @@ -312,14 +320,11 @@ typedef int (*vhost_msg_pre_handle)(int vid, void *msg, * vhost device id * @param msg * Message pointer. - * @param require_reply - * If the handler requires sending a reply, this varaible shall be written 1, - * otherwise 0. * @return - * 0 on success, -1 on failure + * VH_RESULT_OK on success, VH_RESULT_REPLY on success with reply, + * VH_RESULT_ERR on failure */ -typedef int (*vhost_msg_post_handle)(int vid, void *msg, - uint32_t *require_reply); +typedef enum vh_result (*vhost_msg_post_handle)(int vid, void *msg); /** * pre and post vhost user message handlers @@ -363,6 +368,9 @@ struct virtio_net { int slave_req_fd; rte_spinlock_t slave_req_lock; + int postcopy_ufd; + int postcopy_listening; + /* * Device id to identify a specific backend device. * It's set to -1 for the default software implementation. @@ -648,6 +656,8 @@ vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, return __vhost_iova_to_vva(dev, vq, iova, len, perm); } +#define vhost_avail_event(vr) \ + (*(volatile uint16_t*)&(vr)->used->ring[(vr)->size]) #define vhost_used_event(vr) \ (*(volatile uint16_t*)&(vr)->avail->ring[(vr)->size]) diff --git a/lib/librte_vhost/vhost_crypto.c b/lib/librte_vhost/vhost_crypto.c index 57341ef8..9811a232 100644 --- a/lib/librte_vhost/vhost_crypto.c +++ b/lib/librte_vhost/vhost_crypto.c @@ -425,35 +425,34 @@ vhost_crypto_close_sess(struct vhost_crypto *vcrypto, uint64_t session_id) return 0; } -static int -vhost_crypto_msg_post_handler(int vid, void *msg, uint32_t *require_reply) +static enum vh_result +vhost_crypto_msg_post_handler(int vid, void *msg) { struct virtio_net *dev = get_device(vid); struct vhost_crypto *vcrypto; VhostUserMsg *vmsg = msg; - int ret = 0; + enum vh_result ret = VH_RESULT_OK; - if (dev == NULL || require_reply == NULL) { + if (dev == NULL) { VC_LOG_ERR("Invalid vid %i", vid); - return -EINVAL; + return VH_RESULT_ERR; } vcrypto = dev->extern_data; if (vcrypto == NULL) { VC_LOG_ERR("Cannot find required data, is it initialized?"); - return -ENOENT; + return VH_RESULT_ERR; } - *require_reply = 0; - if (vmsg->request.master == VHOST_USER_CRYPTO_CREATE_SESS) { vhost_crypto_create_sess(vcrypto, &vmsg->payload.crypto_session); - *require_reply = 1; - } else if (vmsg->request.master == VHOST_USER_CRYPTO_CLOSE_SESS) - ret = vhost_crypto_close_sess(vcrypto, vmsg->payload.u64); - else - ret = -EINVAL; + vmsg->fd_num = 0; + ret = VH_RESULT_REPLY; + } else if (vmsg->request.master == VHOST_USER_CRYPTO_CLOSE_SESS) { + if (vhost_crypto_close_sess(vcrypto, vmsg->payload.u64)) + ret = VH_RESULT_ERR; + } return ret; } diff --git 
a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c index a2d4c9ff..508228a3 100644 --- a/lib/librte_vhost/vhost_user.c +++ b/lib/librte_vhost/vhost_user.c @@ -24,13 +24,19 @@ #include #include #include +#include +#include #include #include #include +#include #include #ifdef RTE_LIBRTE_VHOST_NUMA #include #endif +#ifdef RTE_LIBRTE_VHOST_POSTCOPY +#include +#endif #include #include @@ -69,8 +75,14 @@ static const char *vhost_message_str[VHOST_USER_MAX] = { [VHOST_USER_IOTLB_MSG] = "VHOST_USER_IOTLB_MSG", [VHOST_USER_CRYPTO_CREATE_SESS] = "VHOST_USER_CRYPTO_CREATE_SESS", [VHOST_USER_CRYPTO_CLOSE_SESS] = "VHOST_USER_CRYPTO_CLOSE_SESS", + [VHOST_USER_POSTCOPY_ADVISE] = "VHOST_USER_POSTCOPY_ADVISE", + [VHOST_USER_POSTCOPY_LISTEN] = "VHOST_USER_POSTCOPY_LISTEN", + [VHOST_USER_POSTCOPY_END] = "VHOST_USER_POSTCOPY_END", }; +static int send_vhost_reply(int sockfd, struct VhostUserMsg *msg); +static int read_vhost_message(int sockfd, struct VhostUserMsg *msg); + static uint64_t get_blk_size(int fd) { @@ -120,6 +132,13 @@ vhost_backend_cleanup(struct virtio_net *dev) close(dev->slave_req_fd); dev->slave_req_fd = -1; } + + if (dev->postcopy_ufd >= 0) { + close(dev->postcopy_ufd); + dev->postcopy_ufd = -1; + } + + dev->postcopy_listening = 0; } /* @@ -127,51 +146,73 @@ vhost_backend_cleanup(struct virtio_net *dev) * the device hasn't been initialised. */ static int -vhost_user_set_owner(void) +vhost_user_set_owner(struct virtio_net **pdev __rte_unused, + struct VhostUserMsg *msg __rte_unused, + int main_fd __rte_unused) { - return 0; + return VH_RESULT_OK; } static int -vhost_user_reset_owner(struct virtio_net *dev) +vhost_user_reset_owner(struct virtio_net **pdev, + struct VhostUserMsg *msg __rte_unused, + int main_fd __rte_unused) { + struct virtio_net *dev = *pdev; vhost_destroy_device_notify(dev); cleanup_device(dev, 0); reset_device(dev); - return 0; + return VH_RESULT_OK; } /* * The features that we support are requested. */ -static uint64_t -vhost_user_get_features(struct virtio_net *dev) +static int +vhost_user_get_features(struct virtio_net **pdev, struct VhostUserMsg *msg, + int main_fd __rte_unused) { + struct virtio_net *dev = *pdev; uint64_t features = 0; rte_vhost_driver_get_features(dev->ifname, &features); - return features; + + msg->payload.u64 = features; + msg->size = sizeof(msg->payload.u64); + msg->fd_num = 0; + + return VH_RESULT_REPLY; } /* * The queue number that we support are requested. */ -static uint32_t -vhost_user_get_queue_num(struct virtio_net *dev) +static int +vhost_user_get_queue_num(struct virtio_net **pdev, struct VhostUserMsg *msg, + int main_fd __rte_unused) { + struct virtio_net *dev = *pdev; uint32_t queue_num = 0; rte_vhost_driver_get_queue_num(dev->ifname, &queue_num); - return queue_num; + + msg->payload.u64 = (uint64_t)queue_num; + msg->size = sizeof(msg->payload.u64); + msg->fd_num = 0; + + return VH_RESULT_REPLY; } /* * We receive the negotiated features supported by us and the virtio device. 
*/ static int -vhost_user_set_features(struct virtio_net *dev, uint64_t features) +vhost_user_set_features(struct virtio_net **pdev, struct VhostUserMsg *msg, + int main_fd __rte_unused) { + struct virtio_net *dev = *pdev; + uint64_t features = msg->payload.u64; uint64_t vhost_features = 0; struct rte_vdpa_device *vdpa_dev; int did = -1; @@ -181,12 +222,12 @@ vhost_user_set_features(struct virtio_net *dev, uint64_t features) RTE_LOG(ERR, VHOST_CONFIG, "(%d) received invalid negotiated features.\n", dev->vid); - return -1; + return VH_RESULT_ERR; } if (dev->flags & VIRTIO_DEV_RUNNING) { if (dev->features == features) - return 0; + return VH_RESULT_OK; /* * Error out if master tries to change features while device is @@ -197,7 +238,7 @@ vhost_user_set_features(struct virtio_net *dev, uint64_t features) RTE_LOG(ERR, VHOST_CONFIG, "(%d) features changed while device is running.\n", dev->vid); - return -1; + return VH_RESULT_ERR; } if (dev->notify_ops->features_changed) @@ -242,16 +283,18 @@ vhost_user_set_features(struct virtio_net *dev, uint64_t features) if (vdpa_dev && vdpa_dev->ops->set_features) vdpa_dev->ops->set_features(dev->vid); - return 0; + return VH_RESULT_OK; } /* * The virtio device sends us the size of the descriptor ring. */ static int -vhost_user_set_vring_num(struct virtio_net *dev, - VhostUserMsg *msg) +vhost_user_set_vring_num(struct virtio_net **pdev, + struct VhostUserMsg *msg, + int main_fd __rte_unused) { + struct virtio_net *dev = *pdev; struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; vq->size = msg->payload.state.num; @@ -264,7 +307,7 @@ vhost_user_set_vring_num(struct virtio_net *dev, if ((vq->size & (vq->size - 1)) || vq->size > 32768) { RTE_LOG(ERR, VHOST_CONFIG, "invalid virtqueue size %u\n", vq->size); - return -1; + return VH_RESULT_ERR; } if (dev->dequeue_zero_copy) { @@ -290,7 +333,7 @@ vhost_user_set_vring_num(struct virtio_net *dev, if (!vq->shadow_used_packed) { RTE_LOG(ERR, VHOST_CONFIG, "failed to allocate memory for shadow used ring.\n"); - return -1; + return VH_RESULT_ERR; } } else { @@ -300,7 +343,7 @@ vhost_user_set_vring_num(struct virtio_net *dev, if (!vq->shadow_used_split) { RTE_LOG(ERR, VHOST_CONFIG, "failed to allocate memory for shadow used ring.\n"); - return -1; + return VH_RESULT_ERR; } } @@ -310,10 +353,10 @@ vhost_user_set_vring_num(struct virtio_net *dev, if (!vq->batch_copy_elems) { RTE_LOG(ERR, VHOST_CONFIG, "failed to allocate memory for batching copy.\n"); - return -1; + return VH_RESULT_ERR; } - return 0; + return VH_RESULT_OK; } /* @@ -357,11 +400,13 @@ numa_realloc(struct virtio_net *dev, int index) memcpy(vq, old_vq, sizeof(*vq)); TAILQ_INIT(&vq->zmbuf_list); - new_zmbuf = rte_malloc_socket(NULL, vq->zmbuf_size * - sizeof(struct zcopy_mbuf), 0, newnode); - if (new_zmbuf) { - rte_free(vq->zmbufs); - vq->zmbufs = new_zmbuf; + if (dev->dequeue_zero_copy) { + new_zmbuf = rte_malloc_socket(NULL, vq->zmbuf_size * + sizeof(struct zcopy_mbuf), 0, newnode); + if (new_zmbuf) { + rte_free(vq->zmbufs); + vq->zmbufs = new_zmbuf; + } } if (vq_is_packed(dev)) { @@ -609,14 +654,15 @@ translate_ring_addresses(struct virtio_net *dev, int vq_index) * This function then converts these to our address space. 
*/ static int -vhost_user_set_vring_addr(struct virtio_net **pdev, VhostUserMsg *msg) +vhost_user_set_vring_addr(struct virtio_net **pdev, struct VhostUserMsg *msg, + int main_fd __rte_unused) { + struct virtio_net *dev = *pdev; struct vhost_virtqueue *vq; struct vhost_vring_addr *addr = &msg->payload.addr; - struct virtio_net *dev = *pdev; if (dev->mem == NULL) - return -1; + return VH_RESULT_ERR; /* addr->index refers to the queue index. The txq 1, rxq is 0. */ vq = dev->virtqueue[msg->payload.addr.index]; @@ -633,27 +679,29 @@ vhost_user_set_vring_addr(struct virtio_net **pdev, VhostUserMsg *msg) (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) { dev = translate_ring_addresses(dev, msg->payload.addr.index); if (!dev) - return -1; + return VH_RESULT_ERR; *pdev = dev; } - return 0; + return VH_RESULT_OK; } /* * The virtio device sends us the available ring last used index. */ static int -vhost_user_set_vring_base(struct virtio_net *dev, - VhostUserMsg *msg) +vhost_user_set_vring_base(struct virtio_net **pdev, + struct VhostUserMsg *msg, + int main_fd __rte_unused) { + struct virtio_net *dev = *pdev; dev->virtqueue[msg->payload.state.index]->last_used_idx = msg->payload.state.num; dev->virtqueue[msg->payload.state.index]->last_avail_idx = msg->payload.state.num; - return 0; + return VH_RESULT_OK; } static int @@ -778,10 +826,11 @@ vhost_memory_changed(struct VhostUserMemory *new, } static int -vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *pmsg) +vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg, + int main_fd) { struct virtio_net *dev = *pdev; - struct VhostUserMemory memory = pmsg->payload.memory; + struct VhostUserMemory *memory = &msg->payload.memory; struct rte_vhost_mem_region *reg; void *mmap_addr; uint64_t mmap_size; @@ -791,20 +840,20 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *pmsg) int populate; int fd; - if (memory.nregions > VHOST_MEMORY_MAX_NREGIONS) { + if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) { RTE_LOG(ERR, VHOST_CONFIG, - "too many memory regions (%u)\n", memory.nregions); - return -1; + "too many memory regions (%u)\n", memory->nregions); + return VH_RESULT_ERR; } - if (dev->mem && !vhost_memory_changed(&memory, dev->mem)) { + if (dev->mem && !vhost_memory_changed(memory, dev->mem)) { RTE_LOG(INFO, VHOST_CONFIG, "(%d) memory regions not changed\n", dev->vid); - for (i = 0; i < memory.nregions; i++) - close(pmsg->fds[i]); + for (i = 0; i < memory->nregions; i++) + close(msg->fds[i]); - return 0; + return VH_RESULT_OK; } if (dev->mem) { @@ -828,30 +877,30 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *pmsg) "(%d) failed to allocate memory " "for dev->guest_pages\n", dev->vid); - return -1; + return VH_RESULT_ERR; } } dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) + - sizeof(struct rte_vhost_mem_region) * memory.nregions, 0); + sizeof(struct rte_vhost_mem_region) * memory->nregions, 0); if (dev->mem == NULL) { RTE_LOG(ERR, VHOST_CONFIG, "(%d) failed to allocate memory for dev->mem\n", dev->vid); - return -1; + return VH_RESULT_ERR; } - dev->mem->nregions = memory.nregions; + dev->mem->nregions = memory->nregions; - for (i = 0; i < memory.nregions; i++) { - fd = pmsg->fds[i]; + for (i = 0; i < memory->nregions; i++) { + fd = msg->fds[i]; reg = &dev->mem->regions[i]; - reg->guest_phys_addr = memory.regions[i].guest_phys_addr; - reg->guest_user_addr = memory.regions[i].userspace_addr; - reg->size = memory.regions[i].memory_size; + 
reg->guest_phys_addr = memory->regions[i].guest_phys_addr; + reg->guest_user_addr = memory->regions[i].userspace_addr; + reg->size = memory->regions[i].memory_size; reg->fd = fd; - mmap_offset = memory.regions[i].mmap_offset; + mmap_offset = memory->regions[i].mmap_offset; /* Check for memory_size + mmap_offset overflow */ if (mmap_offset >= -reg->size) { @@ -920,6 +969,70 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *pmsg) mmap_size, alignment, mmap_offset); + + if (dev->postcopy_listening) { + /* + * We haven't a better way right now than sharing + * DPDK's virtual address with Qemu, so that Qemu can + * retrieve the region offset when handling userfaults. + */ + memory->regions[i].userspace_addr = + reg->host_user_addr; + } + } + if (dev->postcopy_listening) { + /* Send the addresses back to qemu */ + msg->fd_num = 0; + send_vhost_reply(main_fd, msg); + + /* Wait for qemu to acknolwedge it's got the addresses + * we've got to wait before we're allowed to generate faults. + */ + VhostUserMsg ack_msg; + if (read_vhost_message(main_fd, &ack_msg) <= 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to read qemu ack on postcopy set-mem-table\n"); + goto err_mmap; + } + if (ack_msg.request.master != VHOST_USER_SET_MEM_TABLE) { + RTE_LOG(ERR, VHOST_CONFIG, + "Bad qemu ack on postcopy set-mem-table (%d)\n", + ack_msg.request.master); + goto err_mmap; + } + + /* Now userfault register and we can use the memory */ + for (i = 0; i < memory->nregions; i++) { +#ifdef RTE_LIBRTE_VHOST_POSTCOPY + reg = &dev->mem->regions[i]; + struct uffdio_register reg_struct; + + /* + * Let's register all the mmap'ed area to ensure + * alignment on page boundary. + */ + reg_struct.range.start = + (uint64_t)(uintptr_t)reg->mmap_addr; + reg_struct.range.len = reg->mmap_size; + reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; + + if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, + ®_struct)) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to register ufd for region %d: (ufd = %d) %s\n", + i, dev->postcopy_ufd, + strerror(errno)); + goto err_mmap; + } + RTE_LOG(INFO, VHOST_CONFIG, + "\t userfaultfd registered for range : %llx - %llx\n", + reg_struct.range.start, + reg_struct.range.start + + reg_struct.range.len - 1); +#else + goto err_mmap; +#endif + } } for (i = 0; i < dev->nr_vring; i++) { @@ -934,8 +1047,10 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *pmsg) vring_invalidate(dev, vq); dev = translate_ring_addresses(dev, i); - if (!dev) - return -1; + if (!dev) { + dev = *pdev; + goto err_mmap; + } *pdev = dev; } @@ -943,13 +1058,13 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *pmsg) dump_guest_pages(dev); - return 0; + return VH_RESULT_OK; err_mmap: free_mem_region(dev); rte_free(dev->mem); dev->mem = NULL; - return -1; + return VH_RESULT_ERR; } static bool @@ -991,17 +1106,19 @@ virtio_is_ready(struct virtio_net *dev) return 1; } -static void -vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) +static int +vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg, + int main_fd __rte_unused) { + struct virtio_net *dev = *pdev; struct vhost_vring_file file; struct vhost_virtqueue *vq; - file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; - if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) file.fd = VIRTIO_INVALID_EVENTFD; else - file.fd = pmsg->fds[0]; + file.fd = msg->fds[0]; 
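The postcopy branch of vhost_user_set_mem_table() above reduces to the standard userfaultfd registration sequence: open one userfaultfd per device (done in the POSTCOPY_ADVISE handler), negotiate the API, then register every mmap'd region so missing-page faults are reported on that fd. A condensed, self-contained sketch of that sequence, assuming a page-aligned mapping and a kernel with userfaultfd support; the helper name is illustrative only:

#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/userfaultfd.h>

static int
register_range_for_userfault(void *addr, uint64_t len)
{
	struct uffdio_api api = { .api = UFFD_API, .features = 0 };
	struct uffdio_register reg;
	int ufd;

	/* One userfaultfd per device, as in vhost_user_set_postcopy_advise(). */
	ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (ufd < 0)
		return -1;

	if (ioctl(ufd, UFFDIO_API, &api) < 0) {
		close(ufd);
		return -1;
	}

	/* Register the whole mmap'd area; faults are then read from ufd. */
	reg.range.start = (uint64_t)(uintptr_t)addr;
	reg.range.len = len;
	reg.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (ioctl(ufd, UFFDIO_REGISTER, &reg) < 0) {
		close(ufd);
		return -1;
	}

	return ufd;
}

In the patch itself the fd is not consumed locally: it is handed back to QEMU in the POSTCOPY_ADVISE reply so that QEMU can serve the faults during postcopy migration.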
RTE_LOG(INFO, VHOST_CONFIG, "vring call idx:%d file:%d\n", file.index, file.fd); @@ -1010,27 +1127,41 @@ vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) close(vq->callfd); vq->callfd = file.fd; + + return VH_RESULT_OK; } -static void -vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *pmsg) +static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused, + struct VhostUserMsg *msg, + int main_fd __rte_unused) { + if (!(msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) + close(msg->fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); + + return VH_RESULT_OK; +} + +static int +vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg, + int main_fd __rte_unused) +{ + struct virtio_net *dev = *pdev; struct vhost_vring_file file; struct vhost_virtqueue *vq; - struct virtio_net *dev = *pdev; - file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; - if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) file.fd = VIRTIO_INVALID_EVENTFD; else - file.fd = pmsg->fds[0]; + file.fd = msg->fds[0]; RTE_LOG(INFO, VHOST_CONFIG, "vring kick idx:%d file:%d\n", file.index, file.fd); /* Interpret ring addresses only when ring is started. */ dev = translate_ring_addresses(dev, file.index); if (!dev) - return; + return VH_RESULT_ERR; *pdev = dev; @@ -1047,6 +1178,8 @@ vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *pmsg) if (vq->kickfd >= 0) close(vq->kickfd); vq->kickfd = file.fd; + + return VH_RESULT_OK; } static void @@ -1069,9 +1202,11 @@ free_zmbufs(struct vhost_virtqueue *vq) * when virtio is stopped, qemu will send us the GET_VRING_BASE message. */ static int -vhost_user_get_vring_base(struct virtio_net *dev, - VhostUserMsg *msg) +vhost_user_get_vring_base(struct virtio_net **pdev, + struct VhostUserMsg *msg, + int main_fd __rte_unused) { + struct virtio_net *dev = *pdev; struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; /* We have to stop the queue (virtio) if it is running. */ @@ -1114,7 +1249,10 @@ vhost_user_get_vring_base(struct virtio_net *dev, rte_free(vq->batch_copy_elems); vq->batch_copy_elems = NULL; - return 0; + msg->size = sizeof(msg->payload.state); + msg->fd_num = 0; + + return VH_RESULT_REPLY; } /* @@ -1122,9 +1260,11 @@ vhost_user_get_vring_base(struct virtio_net *dev, * enable the virtio queue pair. 
*/ static int -vhost_user_set_vring_enable(struct virtio_net *dev, - VhostUserMsg *msg) +vhost_user_set_vring_enable(struct virtio_net **pdev, + struct VhostUserMsg *msg, + int main_fd __rte_unused) { + struct virtio_net *dev = *pdev; int enable = (int)msg->payload.state.num; int index = (int)msg->payload.state.index; struct rte_vdpa_device *vdpa_dev; @@ -1145,13 +1285,15 @@ vhost_user_set_vring_enable(struct virtio_net *dev, dev->virtqueue[index]->enabled = enable; - return 0; + return VH_RESULT_OK; } -static void -vhost_user_get_protocol_features(struct virtio_net *dev, - struct VhostUserMsg *msg) +static int +vhost_user_get_protocol_features(struct virtio_net **pdev, + struct VhostUserMsg *msg, + int main_fd __rte_unused) { + struct virtio_net *dev = *pdev; uint64_t features, protocol_features; rte_vhost_driver_get_features(dev->ifname, &features); @@ -1168,35 +1310,53 @@ vhost_user_get_protocol_features(struct virtio_net *dev, msg->payload.u64 = protocol_features; msg->size = sizeof(msg->payload.u64); + msg->fd_num = 0; + + return VH_RESULT_REPLY; } -static void -vhost_user_set_protocol_features(struct virtio_net *dev, - uint64_t protocol_features) +static int +vhost_user_set_protocol_features(struct virtio_net **pdev, + struct VhostUserMsg *msg, + int main_fd __rte_unused) { - if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES) - return; + struct virtio_net *dev = *pdev; + uint64_t protocol_features = msg->payload.u64; + uint64_t slave_protocol_features = 0; + + rte_vhost_driver_get_protocol_features(dev->ifname, + &slave_protocol_features); + if (protocol_features & ~slave_protocol_features) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) received invalid protocol features.\n", + dev->vid); + return VH_RESULT_ERR; + } dev->protocol_features = protocol_features; + + return VH_RESULT_OK; } static int -vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) +vhost_user_set_log_base(struct virtio_net **pdev, struct VhostUserMsg *msg, + int main_fd __rte_unused) { + struct virtio_net *dev = *pdev; int fd = msg->fds[0]; uint64_t size, off; void *addr; if (fd < 0) { RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd); - return -1; + return VH_RESULT_ERR; } if (msg->size != sizeof(VhostUserLog)) { RTE_LOG(ERR, VHOST_CONFIG, "invalid log base msg size: %"PRId32" != %d\n", msg->size, (int)sizeof(VhostUserLog)); - return -1; + return VH_RESULT_ERR; } size = msg->payload.log.mmap_size; @@ -1207,7 +1367,7 @@ vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) RTE_LOG(ERR, VHOST_CONFIG, "log offset %#"PRIx64" exceeds log size %#"PRIx64"\n", off, size); - return -1; + return VH_RESULT_ERR; } RTE_LOG(INFO, VHOST_CONFIG, @@ -1222,7 +1382,7 @@ vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) close(fd); if (addr == MAP_FAILED) { RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n"); - return -1; + return VH_RESULT_ERR; } /* @@ -1236,7 +1396,24 @@ vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) dev->log_base = dev->log_addr + off; dev->log_size = size; - return 0; + /* + * The spec is not clear about it (yet), but QEMU doesn't expect + * any payload in the reply. 
+ */ + msg->size = 0; + msg->fd_num = 0; + + return VH_RESULT_REPLY; +} + +static int vhost_user_set_log_fd(struct virtio_net **pdev __rte_unused, + struct VhostUserMsg *msg, + int main_fd __rte_unused) +{ + close(msg->fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); + + return VH_RESULT_OK; } /* @@ -1248,8 +1425,10 @@ vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. */ static int -vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg) +vhost_user_send_rarp(struct virtio_net **pdev, struct VhostUserMsg *msg, + int main_fd __rte_unused) { + struct virtio_net *dev = *pdev; uint8_t *mac = (uint8_t *)&msg->payload.u64; struct rte_vdpa_device *vdpa_dev; int did = -1; @@ -1273,40 +1452,44 @@ vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg) if (vdpa_dev && vdpa_dev->ops->migration_done) vdpa_dev->ops->migration_done(dev->vid); - return 0; + return VH_RESULT_OK; } static int -vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg) +vhost_user_net_set_mtu(struct virtio_net **pdev, struct VhostUserMsg *msg, + int main_fd __rte_unused) { + struct virtio_net *dev = *pdev; if (msg->payload.u64 < VIRTIO_MIN_MTU || msg->payload.u64 > VIRTIO_MAX_MTU) { RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n", msg->payload.u64); - return -1; + return VH_RESULT_ERR; } dev->mtu = msg->payload.u64; - return 0; + return VH_RESULT_OK; } static int -vhost_user_set_req_fd(struct virtio_net *dev, struct VhostUserMsg *msg) +vhost_user_set_req_fd(struct virtio_net **pdev, struct VhostUserMsg *msg, + int main_fd __rte_unused) { + struct virtio_net *dev = *pdev; int fd = msg->fds[0]; if (fd < 0) { RTE_LOG(ERR, VHOST_CONFIG, "Invalid file descriptor for slave channel (%d)\n", fd); - return -1; + return VH_RESULT_ERR; } dev->slave_req_fd = fd; - return 0; + return VH_RESULT_OK; } static int @@ -1359,7 +1542,8 @@ is_vring_iotlb_invalidate(struct vhost_virtqueue *vq, } static int -vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg) +vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg, + int main_fd __rte_unused) { struct virtio_net *dev = *pdev; struct vhost_iotlb_msg *imsg = &msg->payload.iotlb; @@ -1371,7 +1555,7 @@ vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg) len = imsg->size; vva = qva_to_vva(dev, imsg->uaddr, &len); if (!vva) - return -1; + return VH_RESULT_ERR; for (i = 0; i < dev->nr_vring; i++) { struct vhost_virtqueue *vq = dev->virtqueue[i]; @@ -1397,12 +1581,118 @@ vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg) default: RTE_LOG(ERR, VHOST_CONFIG, "Invalid IOTLB message type (%d)\n", imsg->type); - return -1; + return VH_RESULT_ERR; } - return 0; + return VH_RESULT_OK; } +static int +vhost_user_set_postcopy_advise(struct virtio_net **pdev, + struct VhostUserMsg *msg, + int main_fd __rte_unused) +{ + struct virtio_net *dev = *pdev; +#ifdef RTE_LIBRTE_VHOST_POSTCOPY + struct uffdio_api api_struct; + + dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + + if (dev->postcopy_ufd == -1) { + RTE_LOG(ERR, VHOST_CONFIG, "Userfaultfd not available: %s\n", + strerror(errno)); + return VH_RESULT_ERR; + } + api_struct.api = UFFD_API; + api_struct.features = 0; + if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) { + RTE_LOG(ERR, VHOST_CONFIG, "UFFDIO_API ioctl failure: %s\n", + strerror(errno)); + close(dev->postcopy_ufd); + dev->postcopy_ufd = 
-1; + return VH_RESULT_ERR; + } + msg->fds[0] = dev->postcopy_ufd; + msg->fd_num = 1; + + return VH_RESULT_REPLY; +#else + dev->postcopy_ufd = -1; + msg->fd_num = 0; + + return VH_RESULT_ERR; +#endif +} + +static int +vhost_user_set_postcopy_listen(struct virtio_net **pdev, + struct VhostUserMsg *msg __rte_unused, + int main_fd __rte_unused) +{ + struct virtio_net *dev = *pdev; + + if (dev->mem && dev->mem->nregions) { + RTE_LOG(ERR, VHOST_CONFIG, + "Regions already registered at postcopy-listen\n"); + return VH_RESULT_ERR; + } + dev->postcopy_listening = 1; + + return VH_RESULT_OK; +} + +static int +vhost_user_postcopy_end(struct virtio_net **pdev, struct VhostUserMsg *msg, + int main_fd __rte_unused) +{ + struct virtio_net *dev = *pdev; + + dev->postcopy_listening = 0; + if (dev->postcopy_ufd >= 0) { + close(dev->postcopy_ufd); + dev->postcopy_ufd = -1; + } + + msg->payload.u64 = 0; + msg->size = sizeof(msg->payload.u64); + msg->fd_num = 0; + + return VH_RESULT_REPLY; +} + +typedef int (*vhost_message_handler_t)(struct virtio_net **pdev, + struct VhostUserMsg *msg, + int main_fd); +static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = { + [VHOST_USER_NONE] = NULL, + [VHOST_USER_GET_FEATURES] = vhost_user_get_features, + [VHOST_USER_SET_FEATURES] = vhost_user_set_features, + [VHOST_USER_SET_OWNER] = vhost_user_set_owner, + [VHOST_USER_RESET_OWNER] = vhost_user_reset_owner, + [VHOST_USER_SET_MEM_TABLE] = vhost_user_set_mem_table, + [VHOST_USER_SET_LOG_BASE] = vhost_user_set_log_base, + [VHOST_USER_SET_LOG_FD] = vhost_user_set_log_fd, + [VHOST_USER_SET_VRING_NUM] = vhost_user_set_vring_num, + [VHOST_USER_SET_VRING_ADDR] = vhost_user_set_vring_addr, + [VHOST_USER_SET_VRING_BASE] = vhost_user_set_vring_base, + [VHOST_USER_GET_VRING_BASE] = vhost_user_get_vring_base, + [VHOST_USER_SET_VRING_KICK] = vhost_user_set_vring_kick, + [VHOST_USER_SET_VRING_CALL] = vhost_user_set_vring_call, + [VHOST_USER_SET_VRING_ERR] = vhost_user_set_vring_err, + [VHOST_USER_GET_PROTOCOL_FEATURES] = vhost_user_get_protocol_features, + [VHOST_USER_SET_PROTOCOL_FEATURES] = vhost_user_set_protocol_features, + [VHOST_USER_GET_QUEUE_NUM] = vhost_user_get_queue_num, + [VHOST_USER_SET_VRING_ENABLE] = vhost_user_set_vring_enable, + [VHOST_USER_SEND_RARP] = vhost_user_send_rarp, + [VHOST_USER_NET_SET_MTU] = vhost_user_net_set_mtu, + [VHOST_USER_SET_SLAVE_REQ_FD] = vhost_user_set_req_fd, + [VHOST_USER_IOTLB_MSG] = vhost_user_iotlb_msg, + [VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise, + [VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen, + [VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end, +}; + + /* return bytes# of read on success or negative val on failure. 
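The handler table above replaces the old switch-based dispatch in vhost_user_msg_handler(), and every entry follows the vh_result convention introduced in vhost.h: fill in the message and return VH_RESULT_REPLY when the caller should answer, VH_RESULT_OK when it should not, and VH_RESULT_ERR on failure. The same convention now applies to the external pre/post handlers, as vhost_crypto_msg_post_handler() shows. A minimal sketch of such a post-handler follows; the request id MY_BACKEND_REQUEST is purely hypothetical and would have to be defined by the backend:

#include "vhost.h"
#include "vhost_user.h"

static enum vh_result
my_backend_post_msg_handler(int vid, void *msg)
{
	VhostUserMsg *vmsg = msg;

	if (get_device(vid) == NULL)
		return VH_RESULT_ERR;

	/* Ignore anything that is not ours; normal processing continues. */
	if (vmsg->request.master != MY_BACKEND_REQUEST)
		return VH_RESULT_OK;

	/* Prepare the reply in place; the caller emits send_vhost_reply(). */
	vmsg->payload.u64 = 0;
	vmsg->size = sizeof(vmsg->payload.u64);
	vmsg->fd_num = 0;

	return VH_RESULT_REPLY;
}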
*/ static int read_vhost_message(int sockfd, struct VhostUserMsg *msg) @@ -1410,7 +1700,7 @@ read_vhost_message(int sockfd, struct VhostUserMsg *msg) int ret; ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, - msg->fds, VHOST_MEMORY_MAX_NREGIONS); + msg->fds, VHOST_MEMORY_MAX_NREGIONS, &msg->fd_num); if (ret <= 0) return ret; @@ -1434,13 +1724,13 @@ read_vhost_message(int sockfd, struct VhostUserMsg *msg) } static int -send_vhost_message(int sockfd, struct VhostUserMsg *msg, int *fds, int fd_num) +send_vhost_message(int sockfd, struct VhostUserMsg *msg) { if (!msg) return 0; return send_fd_message(sockfd, (char *)msg, - VHOST_USER_HDR_SIZE + msg->size, fds, fd_num); + VHOST_USER_HDR_SIZE + msg->size, msg->fds, msg->fd_num); } static int @@ -1454,19 +1744,18 @@ send_vhost_reply(int sockfd, struct VhostUserMsg *msg) msg->flags |= VHOST_USER_VERSION; msg->flags |= VHOST_USER_REPLY_MASK; - return send_vhost_message(sockfd, msg, NULL, 0); + return send_vhost_message(sockfd, msg); } static int -send_vhost_slave_message(struct virtio_net *dev, struct VhostUserMsg *msg, - int *fds, int fd_num) +send_vhost_slave_message(struct virtio_net *dev, struct VhostUserMsg *msg) { int ret; if (msg->flags & VHOST_USER_NEED_REPLY) rte_spinlock_lock(&dev->slave_req_lock); - ret = send_vhost_message(dev->slave_req_fd, msg, fds, fd_num); + ret = send_vhost_message(dev->slave_req_fd, msg); if (ret < 0 && (msg->flags & VHOST_USER_NEED_REPLY)) rte_spinlock_unlock(&dev->slave_req_lock); @@ -1477,7 +1766,8 @@ send_vhost_slave_message(struct virtio_net *dev, struct VhostUserMsg *msg, * Allocate a queue pair if it hasn't been allocated yet */ static int -vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg) +vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, + struct VhostUserMsg *msg) { uint16_t vring_idx; @@ -1555,6 +1845,7 @@ vhost_user_msg_handler(int vid, int fd) int ret; int unlock_required = 0; uint32_t skip_master = 0; + int request; dev = get_device(vid); if (dev == NULL) @@ -1633,132 +1924,54 @@ vhost_user_msg_handler(int vid, int fd) } if (dev->extern_ops.pre_msg_handle) { - uint32_t need_reply; - ret = (*dev->extern_ops.pre_msg_handle)(dev->vid, - (void *)&msg, &need_reply, &skip_master); - if (ret < 0) + (void *)&msg, &skip_master); + if (ret == VH_RESULT_ERR) goto skip_to_reply; - - if (need_reply) + else if (ret == VH_RESULT_REPLY) send_vhost_reply(fd, &msg); if (skip_master) goto skip_to_post_handle; } - switch (msg.request.master) { - case VHOST_USER_GET_FEATURES: - msg.payload.u64 = vhost_user_get_features(dev); - msg.size = sizeof(msg.payload.u64); - send_vhost_reply(fd, &msg); - break; - case VHOST_USER_SET_FEATURES: - ret = vhost_user_set_features(dev, msg.payload.u64); - if (ret) - return -1; - break; - - case VHOST_USER_GET_PROTOCOL_FEATURES: - vhost_user_get_protocol_features(dev, &msg); - send_vhost_reply(fd, &msg); - break; - case VHOST_USER_SET_PROTOCOL_FEATURES: - vhost_user_set_protocol_features(dev, msg.payload.u64); - break; - - case VHOST_USER_SET_OWNER: - vhost_user_set_owner(); - break; - case VHOST_USER_RESET_OWNER: - vhost_user_reset_owner(dev); - break; - - case VHOST_USER_SET_MEM_TABLE: - ret = vhost_user_set_mem_table(&dev, &msg); - break; - - case VHOST_USER_SET_LOG_BASE: - vhost_user_set_log_base(dev, &msg); - - /* it needs a reply */ - msg.size = sizeof(msg.payload.u64); - send_vhost_reply(fd, &msg); - break; - case VHOST_USER_SET_LOG_FD: - close(msg.fds[0]); - RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); - break; - - 
case VHOST_USER_SET_VRING_NUM:
-        vhost_user_set_vring_num(dev, &msg);
-        break;
-    case VHOST_USER_SET_VRING_ADDR:
-        vhost_user_set_vring_addr(&dev, &msg);
-        break;
-    case VHOST_USER_SET_VRING_BASE:
-        vhost_user_set_vring_base(dev, &msg);
-        break;
-
-    case VHOST_USER_GET_VRING_BASE:
-        vhost_user_get_vring_base(dev, &msg);
-        msg.size = sizeof(msg.payload.state);
-        send_vhost_reply(fd, &msg);
-        break;
-
-    case VHOST_USER_SET_VRING_KICK:
-        vhost_user_set_vring_kick(&dev, &msg);
-        break;
-    case VHOST_USER_SET_VRING_CALL:
-        vhost_user_set_vring_call(dev, &msg);
-        break;
-
-    case VHOST_USER_SET_VRING_ERR:
-        if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
-            close(msg.fds[0]);
-        RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
-        break;
-
-    case VHOST_USER_GET_QUEUE_NUM:
-        msg.payload.u64 = (uint64_t)vhost_user_get_queue_num(dev);
-        msg.size = sizeof(msg.payload.u64);
-        send_vhost_reply(fd, &msg);
-        break;
-
-    case VHOST_USER_SET_VRING_ENABLE:
-        vhost_user_set_vring_enable(dev, &msg);
-        break;
-    case VHOST_USER_SEND_RARP:
-        vhost_user_send_rarp(dev, &msg);
-        break;
-
-    case VHOST_USER_NET_SET_MTU:
-        ret = vhost_user_net_set_mtu(dev, &msg);
-        break;
-
-    case VHOST_USER_SET_SLAVE_REQ_FD:
-        ret = vhost_user_set_req_fd(dev, &msg);
-        break;
-
-    case VHOST_USER_IOTLB_MSG:
-        ret = vhost_user_iotlb_msg(&dev, &msg);
-        break;
+    request = msg.request.master;
+    if (request > VHOST_USER_NONE && request < VHOST_USER_MAX) {
+        if (!vhost_message_handlers[request])
+            goto skip_to_post_handle;
+        ret = vhost_message_handlers[request](&dev, &msg, fd);

-    default:
-        ret = -1;
-        break;
+        switch (ret) {
+        case VH_RESULT_ERR:
+            RTE_LOG(ERR, VHOST_CONFIG,
+                "Processing %s failed.\n",
+                vhost_message_str[request]);
+            break;
+        case VH_RESULT_OK:
+            RTE_LOG(DEBUG, VHOST_CONFIG,
+                "Processing %s succeeded.\n",
+                vhost_message_str[request]);
+            break;
+        case VH_RESULT_REPLY:
+            RTE_LOG(DEBUG, VHOST_CONFIG,
+                "Processing %s succeeded and needs reply.\n",
+                vhost_message_str[request]);
+            send_vhost_reply(fd, &msg);
+            break;
+        }
+    } else {
+        RTE_LOG(ERR, VHOST_CONFIG,
+            "Requested invalid message type %d.\n", request);
+        ret = VH_RESULT_ERR;
     }

 skip_to_post_handle:
-    if (dev->extern_ops.post_msg_handle) {
-        uint32_t need_reply;
-
+    if (ret != VH_RESULT_ERR && dev->extern_ops.post_msg_handle) {
         ret = (*dev->extern_ops.post_msg_handle)(
-                dev->vid, (void *)&msg, &need_reply);
-        if (ret < 0)
+                dev->vid, (void *)&msg);
+        if (ret == VH_RESULT_ERR)
             goto skip_to_reply;
-
-        if (need_reply)
+        else if (ret == VH_RESULT_REPLY)
             send_vhost_reply(fd, &msg);
     }

@@ -1766,10 +1979,20 @@ skip_to_reply:
     if (unlock_required)
         vhost_user_unlock_all_queue_pairs(dev);

+    /*
+     * If the request required a reply that was already sent,
+     * this optional reply-ack won't be sent as the
+     * VHOST_USER_NEED_REPLY was cleared in send_vhost_reply().
+     */
     if (msg.flags & VHOST_USER_NEED_REPLY) {
-        msg.payload.u64 = !!ret;
+        msg.payload.u64 = ret == VH_RESULT_ERR;
         msg.size = sizeof(msg.payload.u64);
+        msg.fd_num = 0;
         send_vhost_reply(fd, &msg);
+    } else if (ret == VH_RESULT_ERR) {
+        RTE_LOG(ERR, VHOST_CONFIG,
+            "vhost message handling failed.\n");
+        return -1;
     }

     if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) {
@@ -1805,9 +2028,9 @@ skip_to_reply:
 }

 static int process_slave_message_reply(struct virtio_net *dev,
-        const VhostUserMsg *msg)
+        const struct VhostUserMsg *msg)
 {
-    VhostUserMsg msg_reply;
+    struct VhostUserMsg msg_reply;
     int ret;

     if ((msg->flags & VHOST_USER_NEED_REPLY) == 0)
@@ -1848,7 +2071,7 @@ vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm)
         },
     };

-    ret = send_vhost_message(dev->slave_req_fd, &msg, NULL, 0);
+    ret = send_vhost_message(dev->slave_req_fd, &msg);
     if (ret < 0) {
         RTE_LOG(ERR, VHOST_CONFIG,
                 "Failed to send IOTLB miss message (%d)\n",
@@ -1864,8 +2087,6 @@ static int vhost_user_slave_set_vring_host_notifier(struct virtio_net *dev,
                 uint64_t offset,
                 uint64_t size)
 {
-    int *fdp = NULL;
-    size_t fd_num = 0;
     int ret;
     struct VhostUserMsg msg = {
         .request.slave = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG,
@@ -1881,11 +2102,11 @@ static int vhost_user_slave_set_vring_host_notifier(struct virtio_net *dev,
     if (fd < 0)
         msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK;
     else {
-        fdp = &fd;
-        fd_num = 1;
+        msg.fds[0] = fd;
+        msg.fd_num = 1;
     }

-    ret = send_vhost_slave_message(dev, &msg, fdp, fd_num);
+    ret = send_vhost_slave_message(dev, &msg);
     if (ret < 0) {
         RTE_LOG(ERR, VHOST_CONFIG,
                 "Failed to set host notifier (%d)\n", ret);
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
index 42166adf..dc97be84 100644
--- a/lib/librte_vhost/vhost_user.h
+++ b/lib/librte_vhost/vhost_user.h
@@ -22,7 +22,8 @@
      (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \
      (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \
      (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
-     (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER))
+     (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
+     (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT))

 typedef enum VhostUserRequest {
     VHOST_USER_NONE = 0,
@@ -50,7 +51,10 @@ typedef enum VhostUserRequest {
     VHOST_USER_IOTLB_MSG = 22,
     VHOST_USER_CRYPTO_CREATE_SESS = 26,
     VHOST_USER_CRYPTO_CLOSE_SESS = 27,
-    VHOST_USER_MAX = 28
+    VHOST_USER_POSTCOPY_ADVISE = 28,
+    VHOST_USER_POSTCOPY_LISTEN = 29,
+    VHOST_USER_POSTCOPY_END = 30,
+    VHOST_USER_MAX = 31
 } VhostUserRequest;

 typedef enum VhostUserSlaveRequest {
@@ -132,6 +136,7 @@ typedef struct VhostUserMsg {
         VhostUserVringArea area;
     } payload;
     int fds[VHOST_MEMORY_MAX_NREGIONS];
+    int fd_num;
 } __attribute((packed)) VhostUserMsg;

 #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
@@ -146,7 +151,8 @@ int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm);
 int vhost_user_host_notifier_ctrl(int vid, bool enable);

 /* socket.c */
-int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
+int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
+        int *fd_num);
 int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);

 #endif
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 99c7afc8..8ad30c94 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -122,7 +122,7 @@ flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)

 static __rte_always_inline void
 update_shadow_used_ring_split(struct vhost_virtqueue *vq,
-            uint16_t desc_idx, uint16_t len)
+            uint16_t desc_idx, uint32_t len)
 {
     uint16_t i = vq->shadow_used_idx++;

@@ -186,7 +186,7 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,

 static __rte_always_inline void
 update_shadow_used_ring_packed(struct vhost_virtqueue *vq,
-            uint16_t desc_idx, uint16_t len, uint16_t count)
+            uint16_t desc_idx, uint32_t len, uint16_t count)
 {
     uint16_t i = vq->shadow_used_idx++;

@@ -329,7 +329,7 @@ static __rte_always_inline int
 fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
             uint32_t avail_idx, uint16_t *vec_idx,
             struct buf_vector *buf_vec, uint16_t *desc_chain_head,
-            uint16_t *desc_chain_len, uint8_t perm)
+            uint32_t *desc_chain_len, uint8_t perm)
 {
     uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
     uint16_t vec_id = *vec_idx;
@@ -409,7 +409,7 @@ reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
     uint16_t max_tries, tries = 0;

     uint16_t head_idx = 0;
-    uint16_t len = 0;
+    uint32_t len = 0;

     *num_buffers = 0;
     cur_idx = vq->last_avail_idx;
@@ -452,7 +452,7 @@ static __rte_always_inline int
 fill_vec_buf_packed_indirect(struct virtio_net *dev,
             struct vhost_virtqueue *vq,
             struct vring_packed_desc *desc, uint16_t *vec_idx,
-            struct buf_vector *buf_vec, uint16_t *len, uint8_t perm)
+            struct buf_vector *buf_vec, uint32_t *len, uint8_t perm)
 {
     uint16_t i;
     uint32_t nr_descs;
@@ -508,7 +508,7 @@ static __rte_always_inline int
 fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
             uint16_t avail_idx, uint16_t *desc_count,
             struct buf_vector *buf_vec, uint16_t *vec_idx,
-            uint16_t *buf_id, uint16_t *len, uint8_t perm)
+            uint16_t *buf_id, uint32_t *len, uint8_t perm)
 {
     bool wrap_counter = vq->avail_wrap_counter;
     struct vring_packed_desc *descs = vq->desc_packed;
@@ -521,6 +521,7 @@ fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
         return -1;

     *desc_count = 0;
+    *len = 0;

     while (1) {
         if (unlikely(vec_id >= BUF_VECTOR_MAX))
@@ -573,7 +574,7 @@ reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
     uint16_t max_tries, tries = 0;

     uint16_t buf_id = 0;
-    uint16_t len = 0;
+    uint32_t len = 0;
     uint16_t desc_count;

     *num_buffers = 0;
@@ -888,6 +889,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
     struct rte_mbuf **pkts, uint32_t count)
 {
     struct vhost_virtqueue *vq;
+    uint32_t nb_tx = 0;

     VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
     if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
@@ -915,9 +917,9 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
         goto out;

     if (vq_is_packed(dev))
-        count = virtio_dev_rx_packed(dev, vq, pkts, count);
+        nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
     else
-        count = virtio_dev_rx_split(dev, vq, pkts, count);
+        nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);

 out:
     if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
@@ -926,7 +928,7 @@ out:
 out_access_unlock:
     rte_spinlock_unlock(&vq->access_lock);

-    return count;
+    return nb_tx;
 }

 uint16_t
@@ -1358,8 +1360,10 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
             }
         }

-        flush_shadow_used_ring_split(dev, vq);
-        vhost_vring_call_split(dev, vq);
+        if (likely(vq->shadow_used_idx)) {
+            flush_shadow_used_ring_split(dev, vq);
+            vhost_vring_call_split(dev, vq);
+        }
     }

     rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
@@ -1378,7 +1382,8 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,

     for (i = 0; i < count; i++) {
         struct buf_vector buf_vec[BUF_VECTOR_MAX];
-        uint16_t head_idx, dummy_len;
+        uint16_t head_idx;
+        uint32_t dummy_len;
         uint16_t nr_vec = 0;
         int err;

@@ -1437,8 +1442,10 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
         do_data_copy_dequeue(vq);
         if (unlikely(i < count))
             vq->shadow_used_idx = i;
-        flush_shadow_used_ring_split(dev, vq);
-        vhost_vring_call_split(dev, vq);
+        if (likely(vq->shadow_used_idx)) {
+            flush_shadow_used_ring_split(dev, vq);
+            vhost_vring_call_split(dev, vq);
+        }
     }

     return i;
@@ -1473,8 +1480,10 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
             }
         }

-        flush_shadow_used_ring_packed(dev, vq);
-        vhost_vring_call_packed(dev, vq);
+        if (likely(vq->shadow_used_idx)) {
+            flush_shadow_used_ring_packed(dev, vq);
+            vhost_vring_call_packed(dev, vq);
+        }
     }

     VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
@@ -1485,7 +1494,8 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,

     for (i = 0; i < count; i++) {
         struct buf_vector buf_vec[BUF_VECTOR_MAX];
-        uint16_t buf_id, dummy_len;
+        uint16_t buf_id;
+        uint32_t dummy_len;
         uint16_t desc_count, nr_vec = 0;
         int err;

@@ -1551,8 +1561,10 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
         do_data_copy_dequeue(vq);
         if (unlikely(i < count))
             vq->shadow_used_idx = i;
-        flush_shadow_used_ring_packed(dev, vq);
-        vhost_vring_call_packed(dev, vq);
+        if (likely(vq->shadow_used_idx)) {
+            flush_shadow_used_ring_packed(dev, vq);
+            vhost_vring_call_packed(dev, vq);
+        }
     }

     return i;
diff --git a/lib/meson.build b/lib/meson.build
index eb91f100..bb7f443f 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -9,13 +9,14 @@
 # given as a dep, no need to mention ring. This is especially true for the
 # core libs which are widely reused, so their deps are kept to a minimum.
 libraries = [ 'compat', # just a header, used for versioning
-    'kvargs',
+    'cmdline', # ethdev depends on cmdline for parsing functions
+    'kvargs', # eal depends on kvargs
     'eal', 'ring', 'mempool', 'mbuf', 'net', 'ethdev', 'pci', # core
     'metrics', # bitrate/latency stats depends on this
     'hash', # efd depends on this
     'timer', # eventdev depends on this
     'acl', 'bbdev', 'bitratestats', 'cfgfile',
-    'cmdline', 'compressdev', 'cryptodev',
+    'compressdev', 'cryptodev',
     'distributor', 'efd', 'eventdev',
     'gro', 'gso', 'ip_frag', 'jobstats',
     'kni', 'latencystats', 'lpm', 'member',
@@ -24,12 +25,18 @@ libraries = [ 'compat', # just a header, used for versioning
     # add pkt framework libs which use other libs from above
     'port', 'table', 'pipeline',
     # flow_classify lib depends on pkt framework table lib
-    'flow_classify', 'bpf']
+    'flow_classify', 'bpf', 'telemetry']

 default_cflags = machine_args
 if cc.has_argument('-Wno-format-truncation')
     default_cflags += '-Wno-format-truncation'
 endif
+
+enabled_libs = [] # used to print summary at the end
+
+# -D_GNU_SOURCE unconditionally
+default_cflags += '-D_GNU_SOURCE'
+
 foreach l:libraries
     build = true
     name = l
@@ -45,18 +52,17 @@ foreach l:libraries
     # use "deps" for internal DPDK dependencies, and "ext_deps" for
     # external package/library requirements
     ext_deps = []
-    deps = ['eal'] # eal is standard dependency except for itself
-    if l == 'kvargs'
-        deps = []
-    endif
-    if l == 'eal'
-        deps = ['kvargs']
+    deps = []
+    # eal is standard dependency once built
+    if dpdk_conf.has('RTE_LIBRTE_EAL')
+        deps += ['eal']
     endif

     dir_name = 'librte_' + l
     subdir(dir_name)

     if build
+        enabled_libs += name
         dpdk_conf.set('RTE_LIBRTE_' + name.to_upper(), 1)
         install_headers(headers)

@@ -87,10 +93,8 @@ foreach l:libraries
             lib_version = '@0@.1'.format(version)
             so_version = '@0@'.format(version)
         else
-            prj_ver = meson.project_version().split('.')
-            lib_version = '@0@.@1@'.format(
-                    prj_ver.get(0), prj_ver.get(1))
-            so_version = lib_version
+            lib_version = major_version
+            so_version = major_version
         endif

         # first build static lib
@@ -126,6 +130,7 @@ foreach l:libraries
                 dependencies: shared_deps)

             dpdk_libraries = [shared_lib] + dpdk_libraries
+            dpdk_static_libraries = [static_lib] + dpdk_static_libraries
         endif # sources.length() > 0

         set_variable('shared_' + libname, shared_dep)
-- 
cgit 1.2.3-korg