Diffstat (limited to 'lib')
-rw-r--r--  lib/Makefile | 7
-rw-r--r--  lib/librte_acl/rte_acl.c | 8
-rw-r--r--  lib/librte_acl/rte_acl.h | 2
-rw-r--r--  lib/librte_bpf/bpf_load.c | 2
-rw-r--r--  lib/librte_bpf/rte_bpf_ethdev.h | 2
-rw-r--r--  lib/librte_cmdline/Makefile | 1
-rw-r--r--  lib/librte_cmdline/cmdline.c | 24
-rw-r--r--  lib/librte_cmdline/meson.build | 4
-rw-r--r--  lib/librte_compressdev/rte_comp.c | 4
-rw-r--r--  lib/librte_compressdev/rte_comp.h | 4
-rw-r--r--  lib/librte_compressdev/rte_compressdev.c | 32
-rw-r--r--  lib/librte_compressdev/rte_compressdev_pmd.c | 23
-rw-r--r--  lib/librte_compressdev/rte_compressdev_pmd.h | 5
-rw-r--r--  lib/librte_cryptodev/Makefile | 2
-rw-r--r--  lib/librte_cryptodev/meson.build | 2
-rw-r--r--  lib/librte_cryptodev/rte_cryptodev.c | 37
-rw-r--r--  lib/librte_cryptodev/rte_cryptodev_pmd.c | 23
-rw-r--r--  lib/librte_cryptodev/rte_cryptodev_pmd.h | 3
-rw-r--r--  lib/librte_eal/bsdapp/eal/Makefile | 9
-rw-r--r--  lib/librte_eal/bsdapp/eal/eal.c | 54
-rw-r--r--  lib/librte_eal/bsdapp/eal/eal_dev.c | 14
-rw-r--r--  lib/librte_eal/bsdapp/eal/eal_memalloc.c | 21
-rw-r--r--  lib/librte_eal/bsdapp/eal/eal_memory.c | 9
-rw-r--r--  lib/librte_eal/common/Makefile | 1
-rw-r--r--  lib/librte_eal/common/arch/arm/meson.build | 2
-rw-r--r--  lib/librte_eal/common/arch/ppc_64/meson.build | 5
-rw-r--r--  lib/librte_eal/common/arch/x86/meson.build | 2
-rw-r--r--  lib/librte_eal/common/eal_common_bus.c | 45
-rw-r--r--  lib/librte_eal/common/eal_common_class.c | 2
-rw-r--r--  lib/librte_eal/common/eal_common_dev.c | 347
-rw-r--r--  lib/librte_eal/common/eal_common_devargs.c | 52
-rw-r--r--  lib/librte_eal/common/eal_common_fbarray.c | 5
-rw-r--r--  lib/librte_eal/common/eal_common_memory.c | 210
-rw-r--r--  lib/librte_eal/common/eal_common_memzone.c | 8
-rw-r--r--  lib/librte_eal/common/eal_common_options.c | 42
-rw-r--r--  lib/librte_eal/common/eal_common_proc.c | 10
-rw-r--r--  lib/librte_eal/common/eal_common_string_fns.c | 26
-rw-r--r--  lib/librte_eal/common/eal_common_timer.c | 24
-rw-r--r--  lib/librte_eal/common/eal_filesystem.h | 15
-rw-r--r--  lib/librte_eal/common/eal_internal_cfg.h | 1
-rw-r--r--  lib/librte_eal/common/eal_memalloc.h | 11
-rw-r--r--  lib/librte_eal/common/eal_options.h | 2
-rw-r--r--  lib/librte_eal/common/eal_private.h | 90
-rw-r--r--  lib/librte_eal/common/hotplug_mp.c | 426
-rw-r--r--  lib/librte_eal/common/hotplug_mp.h | 46
-rw-r--r--  lib/librte_eal/common/include/arch/arm/rte_cycles_32.h | 4
-rw-r--r--  lib/librte_eal/common/include/arch/ppc_64/meson.build | 16
-rw-r--r--  lib/librte_eal/common/include/arch/ppc_64/rte_pause.h | 7
-rw-r--r--  lib/librte_eal/common/include/generic/rte_cycles.h | 11
-rw-r--r--  lib/librte_eal/common/include/rte_bitmap.h | 14
-rw-r--r--  lib/librte_eal/common/include/rte_bus.h | 34
-rw-r--r--  lib/librte_eal/common/include/rte_common.h | 11
-rw-r--r--  lib/librte_eal/common/include/rte_dev.h | 123
-rw-r--r--  lib/librte_eal/common/include/rte_devargs.h | 81
-rw-r--r--  lib/librte_eal/common/include/rte_eal.h | 20
-rw-r--r--  lib/librte_eal/common/include/rte_eal_interrupts.h | 1
-rw-r--r--  lib/librte_eal/common/include/rte_eal_memconfig.h | 18
-rw-r--r--  lib/librte_eal/common/include/rte_malloc.h | 192
-rw-r--r--  lib/librte_eal/common/include/rte_malloc_heap.h | 3
-rw-r--r--  lib/librte_eal/common/include/rte_memory.h | 109
-rw-r--r--  lib/librte_eal/common/include/rte_option.h | 63
-rw-r--r--  lib/librte_eal/common/include/rte_string_fns.h | 26
-rw-r--r--  lib/librte_eal/common/include/rte_version.h | 6
-rw-r--r--  lib/librte_eal/common/include/rte_vfio.h | 31
-rw-r--r--  lib/librte_eal/common/malloc_elem.c | 10
-rw-r--r--  lib/librte_eal/common/malloc_heap.c | 340
-rw-r--r--  lib/librte_eal/common/malloc_heap.h | 17
-rw-r--r--  lib/librte_eal/common/malloc_mp.c | 4
-rw-r--r--  lib/librte_eal/common/meson.build | 5
-rw-r--r--  lib/librte_eal/common/rte_malloc.c | 436
-rw-r--r--  lib/librte_eal/common/rte_option.c | 54
-rw-r--r--  lib/librte_eal/linuxapp/eal/Makefile | 20
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal.c | 119
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_dev.c | 172
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 1
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_interrupts.c | 79
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_memalloc.c | 466
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_memory.c | 264
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_thread.c | 4
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_timer.c | 5
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_vfio.c | 216
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_vfio.h | 4
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 11
-rw-r--r--  lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h | 6
-rw-r--r--  lib/librte_eal/meson.build | 3
-rw-r--r--  lib/librte_eal/rte_eal_version.map | 39
-rw-r--r--  lib/librte_ethdev/Makefile | 6
-rw-r--r--  lib/librte_ethdev/ethdev_private.c | 121
-rw-r--r--  lib/librte_ethdev/ethdev_private.h | 38
-rw-r--r--  lib/librte_ethdev/ethdev_profile.c | 103
-rw-r--r--  lib/librte_ethdev/ethdev_profile.h | 6
-rw-r--r--  lib/librte_ethdev/meson.build | 8
-rw-r--r--  lib/librte_ethdev/rte_class_eth.c | 173
-rw-r--r--  lib/librte_ethdev/rte_ethdev.c | 463
-rw-r--r--  lib/librte_ethdev/rte_ethdev.h | 183
-rw-r--r--  lib/librte_ethdev/rte_ethdev_core.h | 50
-rw-r--r--  lib/librte_ethdev/rte_ethdev_driver.h | 37
-rw-r--r--  lib/librte_ethdev/rte_ethdev_pci.h | 11
-rw-r--r--  lib/librte_ethdev/rte_ethdev_version.map | 17
-rw-r--r--  lib/librte_ethdev/rte_flow.c | 686
-rw-r--r--  lib/librte_ethdev/rte_flow.h | 484
-rw-r--r--  lib/librte_ethdev/rte_tm.h | 4
-rw-r--r--  lib/librte_eventdev/Makefile | 4
-rw-r--r--  lib/librte_eventdev/meson.build | 8
-rw-r--r--  lib/librte_eventdev/rte_event_eth_rx_adapter.c | 6
-rw-r--r--  lib/librte_eventdev/rte_event_eth_rx_adapter.h | 4
-rw-r--r--  lib/librte_eventdev/rte_event_eth_tx_adapter.c | 1138
-rw-r--r--  lib/librte_eventdev/rte_event_eth_tx_adapter.h | 462
-rw-r--r--  lib/librte_eventdev/rte_eventdev.c | 70
-rw-r--r--  lib/librte_eventdev/rte_eventdev.h | 74
-rw-r--r--  lib/librte_eventdev/rte_eventdev_pmd.h | 225
-rw-r--r--  lib/librte_eventdev/rte_eventdev_version.map | 13
-rw-r--r--  lib/librte_flow_classify/rte_flow_classify.c | 3
-rw-r--r--  lib/librte_hash/rte_cuckoo_hash.c | 1061
-rw-r--r--  lib/librte_hash/rte_cuckoo_hash.h | 34
-rw-r--r--  lib/librte_hash/rte_hash.h | 85
-rw-r--r--  lib/librte_hash/rte_hash_version.map | 7
-rw-r--r--  lib/librte_ip_frag/ip_frag_common.h | 23
-rw-r--r--  lib/librte_ip_frag/ip_frag_internal.c | 18
-rw-r--r--  lib/librte_ip_frag/rte_ip_frag.h | 19
-rw-r--r--  lib/librte_ip_frag/rte_ip_frag_common.c | 21
-rw-r--r--  lib/librte_ip_frag/rte_ip_frag_version.map | 6
-rw-r--r--  lib/librte_kni/rte_kni.c | 534
-rw-r--r--  lib/librte_kni/rte_kni.h | 26
-rw-r--r--  lib/librte_kni/rte_kni_fifo.h | 43
-rw-r--r--  lib/librte_kni/rte_kni_version.map | 6
-rw-r--r--  lib/librte_kvargs/rte_kvargs.c | 17
-rw-r--r--  lib/librte_kvargs/rte_kvargs.h | 7
-rw-r--r--  lib/librte_latencystats/rte_latencystats.c | 7
-rw-r--r--  lib/librte_lpm/Makefile | 2
-rw-r--r--  lib/librte_lpm/meson.build | 1
-rw-r--r--  lib/librte_lpm/rte_lpm6.c | 1050
-rw-r--r--  lib/librte_mbuf/meson.build | 2
-rw-r--r--  lib/librte_mbuf/rte_mbuf.c | 53
-rw-r--r--  lib/librte_mbuf/rte_mbuf.h | 130
-rw-r--r--  lib/librte_mbuf/rte_mbuf_ptype.c | 3
-rw-r--r--  lib/librte_mbuf/rte_mbuf_ptype.h | 22
-rw-r--r--  lib/librte_mempool/rte_mempool.c | 57
-rw-r--r--  lib/librte_net/Makefile | 2
-rw-r--r--  lib/librte_net/meson.build | 3
-rw-r--r--  lib/librte_net/net_crc_sse.h | 4
-rw-r--r--  lib/librte_net/rte_ether.h | 2
-rw-r--r--  lib/librte_net/rte_mpls.h | 42
-rw-r--r--  lib/librte_net/rte_net.c | 21
-rw-r--r--  lib/librte_net/rte_net.h | 20
-rw-r--r--  lib/librte_pdump/Makefile | 2
-rw-r--r--  lib/librte_pipeline/Makefile | 2
-rw-r--r--  lib/librte_pipeline/meson.build | 2
-rw-r--r--  lib/librte_pipeline/rte_pipeline.c | 3
-rw-r--r--  lib/librte_pipeline/rte_pipeline_version.map | 1
-rw-r--r--  lib/librte_pipeline/rte_table_action.c | 1048
-rw-r--r--  lib/librte_pipeline/rte_table_action.h | 203
-rw-r--r--  lib/librte_port/Makefile | 4
-rw-r--r--  lib/librte_port/meson.build | 8
-rw-r--r--  lib/librte_port/rte_port_sym_crypto.c | 552
-rw-r--r--  lib/librte_port/rte_port_sym_crypto.h | 93
-rw-r--r--  lib/librte_port/rte_port_version.map | 9
-rw-r--r--  lib/librte_power/Makefile | 5
-rw-r--r--  lib/librte_power/channel_commands.h | 5
-rw-r--r--  lib/librte_power/meson.build | 6
-rw-r--r--  lib/librte_power/rte_power_empty_poll.c | 545
-rw-r--r--  lib/librte_power/rte_power_empty_poll.h | 223
-rw-r--r--  lib/librte_power/rte_power_version.map | 13
-rw-r--r--  lib/librte_rawdev/rte_rawdev.c | 10
-rw-r--r--  lib/librte_rawdev/rte_rawdev_pmd.h | 2
-rw-r--r--  lib/librte_ring/meson.build | 1
-rw-r--r--  lib/librte_ring/rte_ring.h | 4
-rw-r--r--  lib/librte_sched/Makefile | 2
-rw-r--r--  lib/librte_sched/rte_sched.c | 5
-rw-r--r--  lib/librte_security/rte_security.c | 4
-rw-r--r--  lib/librte_security/rte_security.h | 90
-rw-r--r--  lib/librte_table/Makefile | 2
-rw-r--r--  lib/librte_table/meson.build | 2
-rw-r--r--  lib/librte_table/rte_table_hash_func.h | 245
-rw-r--r--  lib/librte_table/rte_table_hash_func_arm64.h | 21
-rw-r--r--  lib/librte_telemetry/Makefile | 30
-rw-r--r--  lib/librte_telemetry/meson.build | 15
-rw-r--r--  lib/librte_telemetry/rte_telemetry.c | 1813
-rw-r--r--  lib/librte_telemetry/rte_telemetry.h | 66
-rw-r--r--  lib/librte_telemetry/rte_telemetry_internal.h | 81
-rw-r--r--  lib/librte_telemetry/rte_telemetry_parser.c | 586
-rw-r--r--  lib/librte_telemetry/rte_telemetry_parser.h | 14
-rw-r--r--  lib/librte_telemetry/rte_telemetry_parser_test.c | 534
-rw-r--r--  lib/librte_telemetry/rte_telemetry_parser_test.h | 39
-rw-r--r--  lib/librte_telemetry/rte_telemetry_socket_tests.h | 36
-rw-r--r--  lib/librte_telemetry/rte_telemetry_version.map | 10
-rw-r--r--  lib/librte_vhost/Makefile | 5
-rw-r--r--  lib/librte_vhost/meson.build | 3
-rw-r--r--  lib/librte_vhost/rte_vdpa.h | 97
-rw-r--r--  lib/librte_vhost/rte_vhost.h | 5
-rw-r--r--  lib/librte_vhost/rte_vhost_version.map | 1
-rw-r--r--  lib/librte_vhost/socket.c | 46
-rw-r--r--  lib/librte_vhost/vdpa.c | 6
-rw-r--r--  lib/librte_vhost/vhost.c | 26
-rw-r--r--  lib/librte_vhost/vhost.h | 34
-rw-r--r--  lib/librte_vhost/vhost_crypto.c | 25
-rw-r--r--  lib/librte_vhost/vhost_user.c | 683
-rw-r--r--  lib/librte_vhost/vhost_user.h | 12
-rw-r--r--  lib/librte_vhost/virtio_net.c | 52
-rw-r--r--  lib/meson.build | 31
200 files changed, 17096 insertions, 2700 deletions
diff --git a/lib/Makefile b/lib/Makefile
index afa604e2..b7370ef9 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,6 +25,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_ETHER) += librte_ethdev
DEPDIRS-librte_ethdev := librte_net librte_eal librte_mempool librte_ring
DEPDIRS-librte_ethdev += librte_mbuf
DEPDIRS-librte_ethdev += librte_kvargs
+DEPDIRS-librte_ethdev += librte_cmdline
DIRS-$(CONFIG_RTE_LIBRTE_BBDEV) += librte_bbdev
DEPDIRS-librte_bbdev := librte_eal librte_mempool librte_mbuf
DIRS-$(CONFIG_RTE_LIBRTE_CRYPTODEV) += librte_cryptodev
@@ -50,7 +51,7 @@ DEPDIRS-librte_hash := librte_eal librte_ring
DIRS-$(CONFIG_RTE_LIBRTE_EFD) += librte_efd
DEPDIRS-librte_efd := librte_eal librte_ring librte_hash
DIRS-$(CONFIG_RTE_LIBRTE_LPM) += librte_lpm
-DEPDIRS-librte_lpm := librte_eal
+DEPDIRS-librte_lpm := librte_eal librte_hash
DIRS-$(CONFIG_RTE_LIBRTE_ACL) += librte_acl
DEPDIRS-librte_acl := librte_eal
DIRS-$(CONFIG_RTE_LIBRTE_MEMBER) += librte_member
@@ -71,7 +72,7 @@ DEPDIRS-librte_bitratestats := librte_eal librte_metrics librte_ethdev
DIRS-$(CONFIG_RTE_LIBRTE_LATENCY_STATS) += librte_latencystats
DEPDIRS-librte_latencystats := librte_eal librte_metrics librte_ethdev librte_mbuf
DIRS-$(CONFIG_RTE_LIBRTE_POWER) += librte_power
-DEPDIRS-librte_power := librte_eal
+DEPDIRS-librte_power := librte_eal librte_timer
DIRS-$(CONFIG_RTE_LIBRTE_METER) += librte_meter
DEPDIRS-librte_meter := librte_eal
DIRS-$(CONFIG_RTE_LIBRTE_FLOW_CLASSIFY) += librte_flow_classify
@@ -105,6 +106,8 @@ DEPDIRS-librte_gso := librte_eal librte_mbuf librte_ethdev librte_net
DEPDIRS-librte_gso += librte_mempool
DIRS-$(CONFIG_RTE_LIBRTE_BPF) += librte_bpf
DEPDIRS-librte_bpf := librte_eal librte_mempool librte_mbuf librte_ethdev
+DIRS-$(CONFIG_RTE_LIBRTE_TELEMETRY) += librte_telemetry
+DEPDIRS-librte_telemetry := librte_eal librte_metrics librte_ethdev
ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_acl/rte_acl.c b/lib/librte_acl/rte_acl.c
index 2f1243cd..db7d3221 100644
--- a/lib/librte_acl/rte_acl.c
+++ b/lib/librte_acl/rte_acl.c
@@ -16,7 +16,7 @@ EAL_REGISTER_TAILQ(rte_acl_tailq)
* If the compiler doesn't support AVX2 instructions,
* then the dummy one would be used instead for AVX2 classify method.
*/
-int __attribute__ ((weak))
+__rte_weak int
rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx,
__rte_unused const uint8_t **data,
__rte_unused uint32_t *results,
@@ -26,7 +26,7 @@ rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx,
return -ENOTSUP;
}
-int __attribute__ ((weak))
+__rte_weak int
rte_acl_classify_sse(__rte_unused const struct rte_acl_ctx *ctx,
__rte_unused const uint8_t **data,
__rte_unused uint32_t *results,
@@ -36,7 +36,7 @@ rte_acl_classify_sse(__rte_unused const struct rte_acl_ctx *ctx,
return -ENOTSUP;
}
-int __attribute__ ((weak))
+__rte_weak int
rte_acl_classify_neon(__rte_unused const struct rte_acl_ctx *ctx,
__rte_unused const uint8_t **data,
__rte_unused uint32_t *results,
@@ -46,7 +46,7 @@ rte_acl_classify_neon(__rte_unused const struct rte_acl_ctx *ctx,
return -ENOTSUP;
}
-int __attribute__ ((weak))
+__rte_weak int
rte_acl_classify_altivec(__rte_unused const struct rte_acl_ctx *ctx,
__rte_unused const uint8_t **data,
__rte_unused uint32_t *results,
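
Example (not part of the patch): __rte_weak wraps the compiler weak attribute that the removed lines spelled out by hand, so a dummy fallback compiled on every target can be silently overridden at link time by an optimized definition built only where, say, AVX2 is available. A minimal sketch of the pattern follows; the function name is illustrative only.

#include <errno.h>
#include <stdint.h>
#include <rte_common.h>

/* Weak dummy, always compiled. A strong definition of the same symbol,
 * built only when the compiler supports the ISA extension, takes precedence
 * at link time, so call sites need no #ifdef. */
__rte_weak int
classify_avx2(const uint8_t **data, uint32_t *results, uint32_t num)
{
	RTE_SET_USED(data);
	RTE_SET_USED(results);
	RTE_SET_USED(num);
	return -ENOTSUP;
}
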
diff --git a/lib/librte_acl/rte_acl.h b/lib/librte_acl/rte_acl.h
index 34c3b9c6..aa22e70c 100644
--- a/lib/librte_acl/rte_acl.h
+++ b/lib/librte_acl/rte_acl.h
@@ -88,7 +88,7 @@ enum {
RTE_ACL_TYPE_SHIFT = 29,
RTE_ACL_MAX_INDEX = RTE_LEN2MASK(RTE_ACL_TYPE_SHIFT, uint32_t),
RTE_ACL_MAX_PRIORITY = RTE_ACL_MAX_INDEX,
- RTE_ACL_MIN_PRIORITY = 0,
+ RTE_ACL_MIN_PRIORITY = 1,
};
#define RTE_ACL_MASKLEN_TO_BITMASK(v, s) \
diff --git a/lib/librte_bpf/bpf_load.c b/lib/librte_bpf/bpf_load.c
index 2b84fe72..d9d163b7 100644
--- a/lib/librte_bpf/bpf_load.c
+++ b/lib/librte_bpf/bpf_load.c
@@ -131,7 +131,7 @@ rte_bpf_load(const struct rte_bpf_prm *prm)
return bpf;
}
-__rte_experimental __attribute__ ((weak)) struct rte_bpf *
+__rte_experimental __rte_weak struct rte_bpf *
rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname,
const char *sname)
{
diff --git a/lib/librte_bpf/rte_bpf_ethdev.h b/lib/librte_bpf/rte_bpf_ethdev.h
index 31731e7a..11d09cdc 100644
--- a/lib/librte_bpf/rte_bpf_ethdev.h
+++ b/lib/librte_bpf/rte_bpf_ethdev.h
@@ -75,7 +75,7 @@ rte_bpf_eth_tx_unload(uint16_t port, uint16_t queue);
* @param prm
* Parameters used to create and initialise the BPF exeution context.
* @param flags
- * Flags that define expected expected behavior of the loaded filter
+ * Flags that define expected behavior of the loaded filter
* (i.e. jited/non-jited version to use).
* @return
* Zero on successful completion or negative error code otherwise.
diff --git a/lib/librte_cmdline/Makefile b/lib/librte_cmdline/Makefile
index ddae1cfd..c64142b8 100644
--- a/lib/librte_cmdline/Makefile
+++ b/lib/librte_cmdline/Makefile
@@ -25,7 +25,6 @@ SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_vt100.c
SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_socket.c
SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_portlist.c
-CFLAGS += -D_GNU_SOURCE
LDLIBS += -lrte_eal
# install includes
diff --git a/lib/librte_cmdline/cmdline.c b/lib/librte_cmdline/cmdline.c
index 591b78b0..d9042f04 100644
--- a/lib/librte_cmdline/cmdline.c
+++ b/lib/librte_cmdline/cmdline.c
@@ -126,35 +126,11 @@ cmdline_printf(const struct cmdline *cl, const char *fmt, ...)
if (!cl || !fmt)
return;
-#ifdef _GNU_SOURCE
if (cl->s_out < 0)
return;
va_start(ap, fmt);
vdprintf(cl->s_out, fmt, ap);
va_end(ap);
-#else
- int ret;
- char *buf;
-
- if (cl->s_out < 0)
- return;
-
- buf = malloc(BUFSIZ);
- if (buf == NULL)
- return;
- va_start(ap, fmt);
- ret = vsnprintf(buf, BUFSIZ, fmt, ap);
- va_end(ap);
- if (ret < 0) {
- free(buf);
- return;
- }
- if (ret >= BUFSIZ)
- ret = BUFSIZ - 1;
- ret = write(cl->s_out, buf, ret);
- (void)ret;
- free(buf);
-#endif
}
int
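
Example (not part of the patch): the fallback buffer path is removed, presumably because the build system now defines _GNU_SOURCE globally, leaving cmdline_printf() to rely on vdprintf(), which formats straight into a file descriptor. A minimal sketch of that pattern, with an illustrative helper name:

#include <stdarg.h>
#include <stdio.h>

/* Format directly to a file descriptor; no temporary malloc'd buffer is
 * needed, which is what the simplified cmdline_printf() now does. */
static void
fd_printf(int fd, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vdprintf(fd, fmt, ap);
	va_end(ap);
}
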
diff --git a/lib/librte_cmdline/meson.build b/lib/librte_cmdline/meson.build
index 5741817a..30498906 100644
--- a/lib/librte_cmdline/meson.build
+++ b/lib/librte_cmdline/meson.build
@@ -1,6 +1,10 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright(c) 2017 Intel Corporation
+# This library is processed before EAL
+includes = [global_inc]
+includes += include_directories('../librte_eal/common/include')
+
version = 2
sources = files('cmdline.c',
'cmdline_cirbuf.c',
diff --git a/lib/librte_compressdev/rte_comp.c b/lib/librte_compressdev/rte_comp.c
index 98ad0cfd..c663be59 100644
--- a/lib/librte_compressdev/rte_comp.c
+++ b/lib/librte_compressdev/rte_comp.c
@@ -83,8 +83,8 @@ struct rte_comp_op_pool_private {
* @param nb_ops
* Number of operations to allocate
* @return
- * - 0: Success
- * - -ENOENT: Not enough entries in the mempool; no ops are retrieved.
+ * - nb_ops: Success, the nb_ops requested was allocated
+ * - 0: Not enough entries in the mempool; no ops are retrieved.
*/
static inline int
rte_comp_op_raw_bulk_alloc(struct rte_mempool *mempool,
diff --git a/lib/librte_compressdev/rte_comp.h b/lib/librte_compressdev/rte_comp.h
index ee9056ea..395ce29f 100644
--- a/lib/librte_compressdev/rte_comp.h
+++ b/lib/librte_compressdev/rte_comp.h
@@ -448,8 +448,8 @@ rte_comp_op_alloc(struct rte_mempool *mempool);
* @param nb_ops
* Number of operations to allocate
* @return
- * - 0: Success
- * - -ENOENT: Not enough entries in the mempool; no ops are retrieved.
+ * - nb_ops: Success, the nb_ops requested was allocated
+ * - 0: Not enough entries in the mempool; no ops are retrieved.
*/
int __rte_experimental
rte_comp_op_bulk_alloc(struct rte_mempool *mempool,
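
Example (not part of the patch): the corrected documentation describes a return convention of nb_ops on success and 0 when the mempool runs short, so callers check the returned count rather than a 0/-ENOENT code. A hedged sketch, assuming ops_pool was created earlier with rte_comp_op_pool_create(); all names are illustrative.

#include <rte_comp.h>
#include <rte_mempool.h>

#define OP_BURST 32

static int
alloc_comp_ops(struct rte_mempool *ops_pool,
		struct rte_comp_op *ops[OP_BURST])
{
	/* Returns nb_ops on success, 0 if the mempool had too few entries. */
	if (rte_comp_op_bulk_alloc(ops_pool, ops, OP_BURST) == 0)
		return -1; /* nothing was retrieved */
	return 0;
}
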
diff --git a/lib/librte_compressdev/rte_compressdev.c b/lib/librte_compressdev/rte_compressdev.c
index 9091dd6e..10101ebb 100644
--- a/lib/librte_compressdev/rte_compressdev.c
+++ b/lib/librte_compressdev/rte_compressdev.c
@@ -18,19 +18,15 @@
#define RTE_COMPRESSDEV_DETACHED (0)
#define RTE_COMPRESSDEV_ATTACHED (1)
-struct rte_compressdev rte_comp_devices[RTE_COMPRESS_MAX_DEVS];
-
-struct rte_compressdev *rte_compressdevs = &rte_comp_devices[0];
+static struct rte_compressdev rte_comp_devices[RTE_COMPRESS_MAX_DEVS];
static struct rte_compressdev_global compressdev_globals = {
- .devs = &rte_comp_devices[0],
+ .devs = rte_comp_devices,
.data = { NULL },
.nb_devs = 0,
.max_devs = RTE_COMPRESS_MAX_DEVS
};
-struct rte_compressdev_global *rte_compressdev_globals = &compressdev_globals;
-
const struct rte_compressdev_capabilities * __rte_experimental
rte_compressdev_capability_get(uint8_t dev_id,
enum rte_comp_algorithm algo)
@@ -78,7 +74,7 @@ rte_compressdev_get_feature_name(uint64_t flag)
static struct rte_compressdev *
rte_compressdev_get_dev(uint8_t dev_id)
{
- return &rte_compressdev_globals->devs[dev_id];
+ return &compressdev_globals.devs[dev_id];
}
struct rte_compressdev * __rte_experimental
@@ -90,8 +86,8 @@ rte_compressdev_pmd_get_named_dev(const char *name)
if (name == NULL)
return NULL;
- for (i = 0; i < rte_compressdev_globals->max_devs; i++) {
- dev = &rte_compressdev_globals->devs[i];
+ for (i = 0; i < compressdev_globals.max_devs; i++) {
+ dev = &compressdev_globals.devs[i];
if ((dev->attached == RTE_COMPRESSDEV_ATTACHED) &&
(strcmp(dev->data->name, name) == 0))
@@ -106,7 +102,7 @@ rte_compressdev_is_valid_dev(uint8_t dev_id)
{
struct rte_compressdev *dev = NULL;
- if (dev_id >= rte_compressdev_globals->nb_devs)
+ if (dev_id >= compressdev_globals.nb_devs)
return 0;
dev = rte_compressdev_get_dev(dev_id);
@@ -125,10 +121,10 @@ rte_compressdev_get_dev_id(const char *name)
if (name == NULL)
return -1;
- for (i = 0; i < rte_compressdev_globals->nb_devs; i++)
- if ((strcmp(rte_compressdev_globals->devs[i].data->name, name)
+ for (i = 0; i < compressdev_globals.nb_devs; i++)
+ if ((strcmp(compressdev_globals.devs[i].data->name, name)
== 0) &&
- (rte_compressdev_globals->devs[i].attached ==
+ (compressdev_globals.devs[i].attached ==
RTE_COMPRESSDEV_ATTACHED))
return i;
@@ -138,7 +134,7 @@ rte_compressdev_get_dev_id(const char *name)
uint8_t __rte_experimental
rte_compressdev_count(void)
{
- return rte_compressdev_globals->nb_devs;
+ return compressdev_globals.nb_devs;
}
uint8_t __rte_experimental
@@ -146,8 +142,8 @@ rte_compressdev_devices_get(const char *driver_name, uint8_t *devices,
uint8_t nb_devices)
{
uint8_t i, count = 0;
- struct rte_compressdev *devs = rte_compressdev_globals->devs;
- uint8_t max_devs = rte_compressdev_globals->max_devs;
+ struct rte_compressdev *devs = compressdev_globals.devs;
+ uint8_t max_devs = compressdev_globals.max_devs;
for (i = 0; i < max_devs && count < nb_devices; i++) {
@@ -578,7 +574,7 @@ uint16_t __rte_experimental
rte_compressdev_dequeue_burst(uint8_t dev_id, uint16_t qp_id,
struct rte_comp_op **ops, uint16_t nb_ops)
{
- struct rte_compressdev *dev = &rte_compressdevs[dev_id];
+ struct rte_compressdev *dev = &rte_comp_devices[dev_id];
nb_ops = (*dev->dequeue_burst)
(dev->data->queue_pairs[qp_id], ops, nb_ops);
@@ -590,7 +586,7 @@ uint16_t __rte_experimental
rte_compressdev_enqueue_burst(uint8_t dev_id, uint16_t qp_id,
struct rte_comp_op **ops, uint16_t nb_ops)
{
- struct rte_compressdev *dev = &rte_compressdevs[dev_id];
+ struct rte_compressdev *dev = &rte_comp_devices[dev_id];
return (*dev->enqueue_burst)(
dev->data->queue_pairs[qp_id], ops, nb_ops);
diff --git a/lib/librte_compressdev/rte_compressdev_pmd.c b/lib/librte_compressdev/rte_compressdev_pmd.c
index 7de4f339..95beb26a 100644
--- a/lib/librte_compressdev/rte_compressdev_pmd.c
+++ b/lib/librte_compressdev/rte_compressdev_pmd.c
@@ -92,24 +92,20 @@ rte_compressdev_pmd_create(const char *name,
struct rte_compressdev *compressdev;
if (params->name[0] != '\0') {
- COMPRESSDEV_LOG(INFO, "[%s] User specified device name = %s\n",
- device->driver->name, params->name);
+ COMPRESSDEV_LOG(INFO, "User specified device name = %s\n",
+ params->name);
name = params->name;
}
- COMPRESSDEV_LOG(INFO, "[%s] - Creating compressdev %s\n",
- device->driver->name, name);
+ COMPRESSDEV_LOG(INFO, "Creating compressdev %s\n", name);
- COMPRESSDEV_LOG(INFO,
- "[%s] - Init parameters - name: %s, socket id: %d",
- device->driver->name, name,
- params->socket_id);
+ COMPRESSDEV_LOG(INFO, "Init parameters - name: %s, socket id: %d",
+ name, params->socket_id);
/* allocate device structure */
compressdev = rte_compressdev_pmd_allocate(name, params->socket_id);
if (compressdev == NULL) {
- COMPRESSDEV_LOG(ERR, "[%s] Failed to allocate comp device %s",
- device->driver->name, name);
+ COMPRESSDEV_LOG(ERR, "Failed to allocate comp device %s", name);
return NULL;
}
@@ -123,8 +119,8 @@ rte_compressdev_pmd_create(const char *name,
if (compressdev->data->dev_private == NULL) {
COMPRESSDEV_LOG(ERR,
- "[%s] Cannot allocate memory for compressdev %s private data",
- device->driver->name, name);
+ "Cannot allocate memory for compressdev"
+ " %s private data", name);
rte_compressdev_pmd_release_device(compressdev);
return NULL;
@@ -141,8 +137,7 @@ rte_compressdev_pmd_destroy(struct rte_compressdev *compressdev)
{
int retval;
- COMPRESSDEV_LOG(INFO, "[%s] Closing comp device %s",
- compressdev->device->driver->name,
+ COMPRESSDEV_LOG(INFO, "Closing comp device %s",
compressdev->device->name);
/* free comp device */
diff --git a/lib/librte_compressdev/rte_compressdev_pmd.h b/lib/librte_compressdev/rte_compressdev_pmd.h
index 38e9ea02..043353c9 100644
--- a/lib/librte_compressdev/rte_compressdev_pmd.h
+++ b/lib/librte_compressdev/rte_compressdev_pmd.h
@@ -51,11 +51,6 @@ struct rte_compressdev_global {
uint8_t max_devs; /**< Max number of devices */
};
-/** Pointer to global array of comp devices */
-extern struct rte_compressdev *rte_compressdevs;
-/** Pointer to global comp devices data structure */
-extern struct rte_compressdev_global *rte_compressdev_globals;
-
/**
* Get the rte_compressdev structure device pointer for the named device.
*
diff --git a/lib/librte_cryptodev/Makefile b/lib/librte_cryptodev/Makefile
index c1148887..a8f94c09 100644
--- a/lib/librte_cryptodev/Makefile
+++ b/lib/librte_cryptodev/Makefile
@@ -7,7 +7,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
LIB = librte_cryptodev.a
# library version
-LIBABIVER := 4
+LIBABIVER := 5
# build flags
CFLAGS += -O3
diff --git a/lib/librte_cryptodev/meson.build b/lib/librte_cryptodev/meson.build
index 295f509e..990dd3d4 100644
--- a/lib/librte_cryptodev/meson.build
+++ b/lib/librte_cryptodev/meson.build
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright(c) 2017 Intel Corporation
-version = 4
+version = 5
sources = files('rte_cryptodev.c', 'rte_cryptodev_pmd.c')
headers = files('rte_cryptodev.h',
'rte_cryptodev_pmd.h',
diff --git a/lib/librte_cryptodev/rte_cryptodev.c b/lib/librte_cryptodev/rte_cryptodev.c
index 63ae23f0..a52eaaa4 100644
--- a/lib/librte_cryptodev/rte_cryptodev.c
+++ b/lib/librte_cryptodev/rte_cryptodev.c
@@ -43,19 +43,17 @@
static uint8_t nb_drivers;
-struct rte_cryptodev rte_crypto_devices[RTE_CRYPTO_MAX_DEVS];
+static struct rte_cryptodev rte_crypto_devices[RTE_CRYPTO_MAX_DEVS];
-struct rte_cryptodev *rte_cryptodevs = &rte_crypto_devices[0];
+struct rte_cryptodev *rte_cryptodevs = rte_crypto_devices;
static struct rte_cryptodev_global cryptodev_globals = {
- .devs = &rte_crypto_devices[0],
+ .devs = rte_crypto_devices,
.data = { NULL },
.nb_devs = 0,
.max_devs = RTE_CRYPTO_MAX_DEVS
};
-struct rte_cryptodev_global *rte_cryptodev_globals = &cryptodev_globals;
-
/* spinlock for crypto device callbacks */
static rte_spinlock_t rte_cryptodev_cb_lock = RTE_SPINLOCK_INITIALIZER;
@@ -486,7 +484,7 @@ rte_cryptodev_get_feature_name(uint64_t flag)
struct rte_cryptodev *
rte_cryptodev_pmd_get_dev(uint8_t dev_id)
{
- return &rte_cryptodev_globals->devs[dev_id];
+ return &cryptodev_globals.devs[dev_id];
}
struct rte_cryptodev *
@@ -498,8 +496,8 @@ rte_cryptodev_pmd_get_named_dev(const char *name)
if (name == NULL)
return NULL;
- for (i = 0; i < rte_cryptodev_globals->max_devs; i++) {
- dev = &rte_cryptodev_globals->devs[i];
+ for (i = 0; i < cryptodev_globals.max_devs; i++) {
+ dev = &cryptodev_globals.devs[i];
if ((dev->attached == RTE_CRYPTODEV_ATTACHED) &&
(strcmp(dev->data->name, name) == 0))
@@ -514,7 +512,7 @@ rte_cryptodev_pmd_is_valid_dev(uint8_t dev_id)
{
struct rte_cryptodev *dev = NULL;
- if (dev_id >= rte_cryptodev_globals->nb_devs)
+ if (dev_id >= cryptodev_globals.nb_devs)
return 0;
dev = rte_cryptodev_pmd_get_dev(dev_id);
@@ -533,10 +531,10 @@ rte_cryptodev_get_dev_id(const char *name)
if (name == NULL)
return -1;
- for (i = 0; i < rte_cryptodev_globals->nb_devs; i++)
- if ((strcmp(rte_cryptodev_globals->devs[i].data->name, name)
+ for (i = 0; i < cryptodev_globals.nb_devs; i++)
+ if ((strcmp(cryptodev_globals.devs[i].data->name, name)
== 0) &&
- (rte_cryptodev_globals->devs[i].attached ==
+ (cryptodev_globals.devs[i].attached ==
RTE_CRYPTODEV_ATTACHED))
return i;
@@ -546,7 +544,7 @@ rte_cryptodev_get_dev_id(const char *name)
uint8_t
rte_cryptodev_count(void)
{
- return rte_cryptodev_globals->nb_devs;
+ return cryptodev_globals.nb_devs;
}
uint8_t
@@ -554,9 +552,9 @@ rte_cryptodev_device_count_by_driver(uint8_t driver_id)
{
uint8_t i, dev_count = 0;
- for (i = 0; i < rte_cryptodev_globals->max_devs; i++)
- if (rte_cryptodev_globals->devs[i].driver_id == driver_id &&
- rte_cryptodev_globals->devs[i].attached ==
+ for (i = 0; i < cryptodev_globals.max_devs; i++)
+ if (cryptodev_globals.devs[i].driver_id == driver_id &&
+ cryptodev_globals.devs[i].attached ==
RTE_CRYPTODEV_ATTACHED)
dev_count++;
@@ -568,8 +566,8 @@ rte_cryptodev_devices_get(const char *driver_name, uint8_t *devices,
uint8_t nb_devices)
{
uint8_t i, count = 0;
- struct rte_cryptodev *devs = rte_cryptodev_globals->devs;
- uint8_t max_devs = rte_cryptodev_globals->max_devs;
+ struct rte_cryptodev *devs = cryptodev_globals.devs;
+ uint8_t max_devs = cryptodev_globals.max_devs;
for (i = 0; i < max_devs && count < nb_devices; i++) {
@@ -1477,6 +1475,9 @@ rte_crypto_op_pool_create(const char *name, enum rte_crypto_op_type type,
elt_size += sizeof(struct rte_crypto_sym_op);
} else if (type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC) {
elt_size += sizeof(struct rte_crypto_asym_op);
+ } else if (type == RTE_CRYPTO_OP_TYPE_UNDEFINED) {
+ elt_size += RTE_MAX(sizeof(struct rte_crypto_sym_op),
+ sizeof(struct rte_crypto_asym_op));
} else {
CDEV_LOG_ERR("Invalid op_type\n");
return NULL;
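
Example (not part of the patch): with the new branch above, a pool created with RTE_CRYPTO_OP_TYPE_UNDEFINED reserves RTE_MAX(sym, asym) bytes per element, so a single pool can carry either operation type. A sketch follows; the pool name, element count and cache size are assumptions for illustration.

#include <rte_crypto.h>
#include <rte_lcore.h>

static struct rte_mempool *
create_any_op_pool(void)
{
	/* Elements are sized for the larger of the sym/asym payloads. */
	return rte_crypto_op_pool_create("crypto_op_pool_any",
			RTE_CRYPTO_OP_TYPE_UNDEFINED,
			1024 /* nb_elts */, 128 /* cache_size */,
			0 /* priv_size */, rte_socket_id());
}
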
diff --git a/lib/librte_cryptodev/rte_cryptodev_pmd.c b/lib/librte_cryptodev/rte_cryptodev_pmd.c
index 2088ac3f..f03bdbd5 100644
--- a/lib/librte_cryptodev/rte_cryptodev_pmd.c
+++ b/lib/librte_cryptodev/rte_cryptodev_pmd.c
@@ -93,24 +93,20 @@ rte_cryptodev_pmd_create(const char *name,
struct rte_cryptodev *cryptodev;
if (params->name[0] != '\0') {
- CDEV_LOG_INFO("[%s] User specified device name = %s\n",
- device->driver->name, params->name);
+ CDEV_LOG_INFO("User specified device name = %s\n", params->name);
name = params->name;
}
- CDEV_LOG_INFO("[%s] - Creating cryptodev %s\n",
- device->driver->name, name);
+ CDEV_LOG_INFO("Creating cryptodev %s\n", name);
- CDEV_LOG_INFO("[%s] - Initialisation parameters - name: %s,"
+ CDEV_LOG_INFO("Initialisation parameters - name: %s,"
"socket id: %d, max queue pairs: %u",
- device->driver->name, name,
- params->socket_id, params->max_nb_queue_pairs);
+ name, params->socket_id, params->max_nb_queue_pairs);
/* allocate device structure */
cryptodev = rte_cryptodev_pmd_allocate(name, params->socket_id);
if (cryptodev == NULL) {
- CDEV_LOG_ERR("[%s] Failed to allocate crypto device for %s",
- device->driver->name, name);
+ CDEV_LOG_ERR("Failed to allocate crypto device for %s", name);
return NULL;
}
@@ -123,9 +119,8 @@ rte_cryptodev_pmd_create(const char *name,
params->socket_id);
if (cryptodev->data->dev_private == NULL) {
- CDEV_LOG_ERR("[%s] Cannot allocate memory for "
- "cryptodev %s private data",
- device->driver->name, name);
+ CDEV_LOG_ERR("Cannot allocate memory for cryptodev %s"
+ " private data", name);
rte_cryptodev_pmd_release_device(cryptodev);
return NULL;
@@ -145,9 +140,7 @@ rte_cryptodev_pmd_destroy(struct rte_cryptodev *cryptodev)
{
int retval;
- CDEV_LOG_INFO("[%s] Closing crypto device %s",
- cryptodev->device->driver->name,
- cryptodev->device->name);
+ CDEV_LOG_INFO("Closing crypto device %s", cryptodev->device->name);
/* free crypto device */
retval = rte_cryptodev_pmd_release_device(cryptodev);
diff --git a/lib/librte_cryptodev/rte_cryptodev_pmd.h b/lib/librte_cryptodev/rte_cryptodev_pmd.h
index 6ff49d64..1b6cafd1 100644
--- a/lib/librte_cryptodev/rte_cryptodev_pmd.h
+++ b/lib/librte_cryptodev/rte_cryptodev_pmd.h
@@ -71,9 +71,6 @@ struct cryptodev_driver {
uint8_t id;
};
-/** pointer to global crypto devices data structure. */
-extern struct rte_cryptodev_global *rte_cryptodev_globals;
-
/**
* Get the rte_cryptodev structure device pointer for the device. Assumes a
* valid device index.
diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index d27da3d1..bfeddaad 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -22,7 +22,7 @@ LDLIBS += -lrte_kvargs
EXPORT_MAP := ../../rte_eal_version.map
-LIBABIVER := 8
+LIBABIVER := 9
# specific to bsdapp exec-env
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) := eal.c
@@ -62,10 +62,12 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_proc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_uuid.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_malloc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += hotplug_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_heap.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_keepalive.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_option.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_service.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_reciprocal.c
@@ -77,11 +79,6 @@ SRCS-y += rte_cycles.c
CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
-CFLAGS_eal.o := -D_GNU_SOURCE
-#CFLAGS_eal_thread.o := -D_GNU_SOURCE
-CFLAGS_eal_log.o := -D_GNU_SOURCE
-CFLAGS_eal_common_log.o := -D_GNU_SOURCE
-
# workaround for a gcc bug with noreturn attribute
# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index d7ae9d68..508cbc46 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -42,6 +42,7 @@
#include <rte_devargs.h>
#include <rte_version.h>
#include <rte_vfio.h>
+#include <rte_option.h>
#include <rte_atomic.h>
#include <malloc_heap.h>
@@ -141,7 +142,7 @@ eal_create_runtime_dir(void)
}
const char *
-eal_get_runtime_dir(void)
+rte_eal_get_runtime_dir(void)
{
return runtime_dir;
}
@@ -414,12 +415,20 @@ eal_parse_args(int argc, char **argv)
argvopt = argv;
optind = 1;
optreset = 1;
+ opterr = 0;
while ((opt = getopt_long(argc, argvopt, eal_short_options,
eal_long_options, &option_index)) != EOF) {
- /* getopt is not happy, stop right now */
+ /*
+ * getopt didn't recognise the option, lets parse the
+ * registered options to see if the flag is valid
+ */
if (opt == '?') {
+ ret = rte_option_parse(argv[optind-1]);
+ if (ret == 0)
+ continue;
+
eal_usage(prgname);
ret = -1;
goto out;
@@ -502,6 +511,9 @@ check_socket(const struct rte_memseg_list *msl, void *arg)
{
int *socket_id = arg;
+ if (msl->external)
+ return 0;
+
if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0)
return 1;
@@ -607,7 +619,7 @@ rte_eal_init(int argc, char **argv)
internal_config.legacy_mem = true;
if (eal_plugins_init() < 0) {
- rte_eal_init_alert("Cannot init plugins\n");
+ rte_eal_init_alert("Cannot init plugins");
rte_errno = EINVAL;
rte_atomic32_clear(&run_once);
return -1;
@@ -622,7 +634,7 @@ rte_eal_init(int argc, char **argv)
rte_config_init();
if (rte_eal_intr_init() < 0) {
- rte_eal_init_alert("Cannot init interrupt-handling thread\n");
+ rte_eal_init_alert("Cannot init interrupt-handling thread");
return -1;
}
@@ -630,7 +642,7 @@ rte_eal_init(int argc, char **argv)
* bus through mp channel in the secondary process before the bus scan.
*/
if (rte_mp_channel_init() < 0) {
- rte_eal_init_alert("failed to init mp channel\n");
+ rte_eal_init_alert("failed to init mp channel");
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
rte_errno = EFAULT;
return -1;
@@ -638,14 +650,21 @@ rte_eal_init(int argc, char **argv)
}
if (rte_bus_scan()) {
- rte_eal_init_alert("Cannot scan the buses for devices\n");
+ rte_eal_init_alert("Cannot scan the buses for devices");
rte_errno = ENODEV;
rte_atomic32_clear(&run_once);
return -1;
}
- /* autodetect the iova mapping mode (default is iova_pa) */
- rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class();
+ /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
+ if (internal_config.iova_mode == RTE_IOVA_DC) {
+ /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
+ rte_eal_get_configuration()->iova_mode =
+ rte_bus_get_iommu_class();
+ } else {
+ rte_eal_get_configuration()->iova_mode =
+ internal_config.iova_mode;
+ }
if (internal_config.no_hugetlbfs == 0) {
/* rte_config isn't initialized yet */
@@ -685,37 +704,37 @@ rte_eal_init(int argc, char **argv)
* initialize memzones first.
*/
if (rte_eal_memzone_init() < 0) {
- rte_eal_init_alert("Cannot init memzone\n");
+ rte_eal_init_alert("Cannot init memzone");
rte_errno = ENODEV;
return -1;
}
if (rte_eal_memory_init() < 0) {
- rte_eal_init_alert("Cannot init memory\n");
+ rte_eal_init_alert("Cannot init memory");
rte_errno = ENOMEM;
return -1;
}
if (rte_eal_malloc_heap_init() < 0) {
- rte_eal_init_alert("Cannot init malloc heap\n");
+ rte_eal_init_alert("Cannot init malloc heap");
rte_errno = ENODEV;
return -1;
}
if (rte_eal_tailqs_init() < 0) {
- rte_eal_init_alert("Cannot init tail queues for objects\n");
+ rte_eal_init_alert("Cannot init tail queues for objects");
rte_errno = EFAULT;
return -1;
}
if (rte_eal_alarm_init() < 0) {
- rte_eal_init_alert("Cannot init interrupt-handling thread\n");
+ rte_eal_init_alert("Cannot init interrupt-handling thread");
/* rte_eal_alarm_init sets rte_errno on failure. */
return -1;
}
if (rte_eal_timer_init() < 0) {
- rte_eal_init_alert("Cannot init HPET or TSC timers\n");
+ rte_eal_init_alert("Cannot init HPET or TSC timers");
rte_errno = ENOTSUP;
return -1;
}
@@ -765,14 +784,14 @@ rte_eal_init(int argc, char **argv)
/* initialize services so vdevs register service during bus_probe. */
ret = rte_service_init();
if (ret) {
- rte_eal_init_alert("rte_service_init() failed\n");
+ rte_eal_init_alert("rte_service_init() failed");
rte_errno = ENOEXEC;
return -1;
}
/* Probe all the buses and devices/drivers on them */
if (rte_bus_probe()) {
- rte_eal_init_alert("Cannot probe devices\n");
+ rte_eal_init_alert("Cannot probe devices");
rte_errno = ENOTSUP;
return -1;
}
@@ -788,6 +807,9 @@ rte_eal_init(int argc, char **argv)
rte_eal_mcfg_complete();
+ /* Call each registered callback, if enabled */
+ rte_option_init();
+
return fctret;
}
diff --git a/lib/librte_eal/bsdapp/eal/eal_dev.c b/lib/librte_eal/bsdapp/eal/eal_dev.c
index 1c6c51bd..255d611e 100644
--- a/lib/librte_eal/bsdapp/eal/eal_dev.c
+++ b/lib/librte_eal/bsdapp/eal/eal_dev.c
@@ -19,3 +19,17 @@ rte_dev_event_monitor_stop(void)
RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n");
return -1;
}
+
+int __rte_experimental
+rte_dev_hotplug_handle_enable(void)
+{
+ RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n");
+ return -1;
+}
+
+int __rte_experimental
+rte_dev_hotplug_handle_disable(void)
+{
+ RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n");
+ return -1;
+}
diff --git a/lib/librte_eal/bsdapp/eal/eal_memalloc.c b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
index f7f07abd..a5847f0b 100644
--- a/lib/librte_eal/bsdapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
@@ -4,6 +4,7 @@
#include <inttypes.h>
+#include <rte_errno.h>
#include <rte_log.h>
#include <rte_memory.h>
@@ -48,6 +49,26 @@ eal_memalloc_sync_with_primary(void)
}
int
+eal_memalloc_get_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused)
+{
+ return -ENOTSUP;
+}
+
+int
+eal_memalloc_set_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused,
+ int fd __rte_unused)
+{
+ return -ENOTSUP;
+}
+
+int
+eal_memalloc_get_seg_fd_offset(int list_idx __rte_unused,
+ int seg_idx __rte_unused, size_t *offset __rte_unused)
+{
+ return -ENOTSUP;
+}
+
+int
eal_memalloc_init(void)
{
return 0;
diff --git a/lib/librte_eal/bsdapp/eal/eal_memory.c b/lib/librte_eal/bsdapp/eal/eal_memory.c
index 16d2bc7c..4b092e1f 100644
--- a/lib/librte_eal/bsdapp/eal/eal_memory.c
+++ b/lib/librte_eal/bsdapp/eal/eal_memory.c
@@ -79,6 +79,7 @@ rte_eal_hugepage_init(void)
}
msl->base_va = addr;
msl->page_sz = page_sz;
+ msl->len = internal_config.memory;
msl->socket_id = 0;
/* populate memsegs. each memseg is 1 page long */
@@ -235,12 +236,15 @@ struct attach_walk_args {
int seg_idx;
};
static int
-attach_segment(const struct rte_memseg_list *msl __rte_unused,
- const struct rte_memseg *ms, void *arg)
+attach_segment(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+ void *arg)
{
struct attach_walk_args *wa = arg;
void *addr;
+ if (msl->external)
+ return 0;
+
addr = mmap(ms->addr, ms->len, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, wa->fd_hugepage,
wa->seg_idx * EAL_PAGE_SIZE);
@@ -370,6 +374,7 @@ alloc_va_space(struct rte_memseg_list *msl)
return -1;
}
msl->base_va = addr;
+ msl->len = mem_sz;
return 0;
}
diff --git a/lib/librte_eal/common/Makefile b/lib/librte_eal/common/Makefile
index cca68826..87d8c455 100644
--- a/lib/librte_eal/common/Makefile
+++ b/lib/librte_eal/common/Makefile
@@ -12,6 +12,7 @@ INC += rte_tailq.h rte_interrupts.h rte_alarm.h
INC += rte_string_fns.h rte_version.h
INC += rte_eal_memconfig.h rte_malloc_heap.h
INC += rte_hexdump.h rte_devargs.h rte_bus.h rte_dev.h rte_class.h
+INC += rte_option.h
INC += rte_pci_dev_feature_defs.h rte_pci_dev_features.h
INC += rte_malloc.h rte_keepalive.h rte_time.h
INC += rte_service.h rte_service_component.h
diff --git a/lib/librte_eal/common/arch/arm/meson.build b/lib/librte_eal/common/arch/arm/meson.build
index c6bd9227..79731e1a 100644
--- a/lib/librte_eal/common/arch/arm/meson.build
+++ b/lib/librte_eal/common/arch/arm/meson.build
@@ -2,4 +2,4 @@
# Copyright(c) 2017 Intel Corporation.
eal_common_arch_sources = files('rte_cpuflags.c',
- 'rte_cycles.c')
+ 'rte_cycles.c', 'rte_hypervisor.c')
diff --git a/lib/librte_eal/common/arch/ppc_64/meson.build b/lib/librte_eal/common/arch/ppc_64/meson.build
new file mode 100644
index 00000000..40b3dc53
--- /dev/null
+++ b/lib/librte_eal/common/arch/ppc_64/meson.build
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Luca Boccassi <bluca@debian.org>
+
+eal_common_arch_sources = files('rte_cpuflags.c',
+ 'rte_cycles.c', 'rte_hypervisor.c')
diff --git a/lib/librte_eal/common/arch/x86/meson.build b/lib/librte_eal/common/arch/x86/meson.build
index 4e0f7790..14bf204c 100644
--- a/lib/librte_eal/common/arch/x86/meson.build
+++ b/lib/librte_eal/common/arch/x86/meson.build
@@ -2,4 +2,4 @@
# Copyright(c) 2017 Intel Corporation
eal_common_arch_sources = files('rte_spinlock.c', 'rte_cpuflags.c',
- 'rte_cycles.c')
+ 'rte_cycles.c', 'rte_hypervisor.c')
diff --git a/lib/librte_eal/common/eal_common_bus.c b/lib/librte_eal/common/eal_common_bus.c
index 0943851c..c8f1901f 100644
--- a/lib/librte_eal/common/eal_common_bus.c
+++ b/lib/librte_eal/common/eal_common_bus.c
@@ -37,10 +37,11 @@
#include <rte_bus.h>
#include <rte_debug.h>
#include <rte_string_fns.h>
+#include <rte_errno.h>
#include "eal_private.h"
-struct rte_bus_list rte_bus_list =
+static struct rte_bus_list rte_bus_list =
TAILQ_HEAD_INITIALIZER(rte_bus_list);
void
@@ -242,3 +243,45 @@ rte_bus_get_iommu_class(void)
}
return mode;
}
+
+static int
+bus_handle_sigbus(const struct rte_bus *bus,
+ const void *failure_addr)
+{
+ int ret;
+
+ if (!bus->sigbus_handler)
+ return -1;
+
+ ret = bus->sigbus_handler(failure_addr);
+
+ /* find bus but handle failed, keep the errno be set. */
+ if (ret < 0 && rte_errno == 0)
+ rte_errno = ENOTSUP;
+
+ return ret > 0;
+}
+
+int
+rte_bus_sigbus_handler(const void *failure_addr)
+{
+ struct rte_bus *bus;
+
+ int ret = 0;
+ int old_errno = rte_errno;
+
+ rte_errno = 0;
+
+ bus = rte_bus_find(NULL, bus_handle_sigbus, failure_addr);
+ /* can not find bus. */
+ if (!bus)
+ return 1;
+ /* find bus but handle failed, pass on the new errno. */
+ else if (rte_errno != 0)
+ return -1;
+
+ /* restore the old errno. */
+ rte_errno = old_errno;
+
+ return ret;
+}
diff --git a/lib/librte_eal/common/eal_common_class.c b/lib/librte_eal/common/eal_common_class.c
index 404a9065..d922266d 100644
--- a/lib/librte_eal/common/eal_common_class.c
+++ b/lib/librte_eal/common/eal_common_class.c
@@ -9,7 +9,7 @@
#include <rte_class.h>
#include <rte_debug.h>
-struct rte_class_list rte_class_list =
+static struct rte_class_list rte_class_list =
TAILQ_HEAD_INITIALIZER(rte_class_list);
__rte_experimental void
diff --git a/lib/librte_eal/common/eal_common_dev.c b/lib/librte_eal/common/eal_common_dev.c
index 678dbcac..62e9ed47 100644
--- a/lib/librte_eal/common/eal_common_dev.c
+++ b/lib/librte_eal/common/eal_common_dev.c
@@ -19,8 +19,10 @@
#include <rte_log.h>
#include <rte_spinlock.h>
#include <rte_malloc.h>
+#include <rte_string_fns.h>
#include "eal_private.h"
+#include "hotplug_mp.h"
/**
* The device event callback description.
@@ -74,119 +76,110 @@ static int cmp_dev_name(const struct rte_device *dev, const void *_name)
return strcmp(dev->name, name);
}
-int rte_eal_dev_attach(const char *name, const char *devargs)
+int __rte_experimental
+rte_dev_is_probed(const struct rte_device *dev)
{
- struct rte_bus *bus;
+ /* The field driver should be set only when the probe is successful. */
+ return dev->driver != NULL;
+}
- if (name == NULL || devargs == NULL) {
- RTE_LOG(ERR, EAL, "Invalid device or arguments provided\n");
+/* helper function to build devargs, caller should free the memory */
+static int
+build_devargs(const char *busname, const char *devname,
+ const char *drvargs, char **devargs)
+{
+ int length;
+
+ length = snprintf(NULL, 0, "%s:%s,%s", busname, devname, drvargs);
+ if (length < 0)
return -EINVAL;
- }
- bus = rte_bus_find_by_device_name(name);
- if (bus == NULL) {
- RTE_LOG(ERR, EAL, "Unable to find a bus for the device '%s'\n",
- name);
+ *devargs = malloc(length + 1);
+ if (*devargs == NULL)
+ return -ENOMEM;
+
+ length = snprintf(*devargs, length + 1, "%s:%s,%s",
+ busname, devname, drvargs);
+ if (length < 0) {
+ free(*devargs);
return -EINVAL;
}
- if (strcmp(bus->name, "pci") == 0 || strcmp(bus->name, "vdev") == 0)
- return rte_eal_hotplug_add(bus->name, name, devargs);
-
- RTE_LOG(ERR, EAL,
- "Device attach is only supported for PCI and vdev devices.\n");
- return -ENOTSUP;
+ return 0;
}
-int rte_eal_dev_detach(struct rte_device *dev)
+int
+rte_eal_hotplug_add(const char *busname, const char *devname,
+ const char *drvargs)
{
- struct rte_bus *bus;
- int ret;
- if (dev == NULL) {
- RTE_LOG(ERR, EAL, "Invalid device provided.\n");
- return -EINVAL;
- }
+ char *devargs;
+ int ret;
- bus = rte_bus_find_by_device(dev);
- if (bus == NULL) {
- RTE_LOG(ERR, EAL, "Cannot find bus for device (%s)\n",
- dev->name);
- return -EINVAL;
- }
+ ret = build_devargs(busname, devname, drvargs, &devargs);
+ if (ret != 0)
+ return ret;
- if (bus->unplug == NULL) {
- RTE_LOG(ERR, EAL, "Bus function not supported\n");
- return -ENOTSUP;
- }
+ ret = rte_dev_probe(devargs);
+ free(devargs);
- ret = bus->unplug(dev);
- if (ret)
- RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n",
- dev->name);
return ret;
}
-int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devname,
- const char *devargs)
+/* probe device at local process. */
+int
+local_dev_probe(const char *devargs, struct rte_device **new_dev)
{
- struct rte_bus *bus;
struct rte_device *dev;
struct rte_devargs *da;
int ret;
- bus = rte_bus_find_by_name(busname);
- if (bus == NULL) {
- RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", busname);
- return -ENOENT;
- }
-
- if (bus->plug == NULL) {
- RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n",
- bus->name);
- return -ENOTSUP;
- }
-
+ *new_dev = NULL;
da = calloc(1, sizeof(*da));
if (da == NULL)
return -ENOMEM;
- ret = rte_devargs_parsef(da, "%s:%s,%s",
- busname, devname, devargs);
+ ret = rte_devargs_parse(da, devargs);
if (ret)
goto err_devarg;
+ if (da->bus->plug == NULL) {
+ RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n",
+ da->bus->name);
+ ret = -ENOTSUP;
+ goto err_devarg;
+ }
+
ret = rte_devargs_insert(da);
if (ret)
goto err_devarg;
- ret = bus->scan();
+ ret = da->bus->scan();
if (ret)
goto err_devarg;
- dev = bus->find_device(NULL, cmp_dev_name, devname);
+ dev = da->bus->find_device(NULL, cmp_dev_name, da->name);
if (dev == NULL) {
RTE_LOG(ERR, EAL, "Cannot find device (%s)\n",
- devname);
+ da->name);
ret = -ENODEV;
goto err_devarg;
}
- if (dev->driver != NULL) {
- RTE_LOG(ERR, EAL, "Device is already plugged\n");
- return -EEXIST;
- }
-
- ret = bus->plug(dev);
+ ret = dev->bus->plug(dev);
if (ret) {
+ if (rte_dev_is_probed(dev)) /* if already succeeded earlier */
+ return ret; /* no rollback */
RTE_LOG(ERR, EAL, "Driver cannot attach the device (%s)\n",
dev->name);
goto err_devarg;
}
+
+ *new_dev = dev;
return 0;
err_devarg:
- if (rte_devargs_remove(busname, devname)) {
+ if (rte_devargs_remove(da) != 0) {
free(da->args);
free(da);
}
@@ -194,40 +187,235 @@ err_devarg:
}
int __rte_experimental
-rte_eal_hotplug_remove(const char *busname, const char *devname)
+rte_dev_probe(const char *devargs)
{
- struct rte_bus *bus;
+ struct eal_dev_mp_req req;
struct rte_device *dev;
int ret;
+ memset(&req, 0, sizeof(req));
+ req.t = EAL_DEV_REQ_TYPE_ATTACH;
+ strlcpy(req.devargs, devargs, EAL_DEV_MP_DEV_ARGS_MAX_LEN);
+
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+ /**
+ * If in secondary process, just send IPC request to
+ * primary process.
+ */
+ ret = eal_dev_hotplug_request_to_primary(&req);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to send hotplug request to primary\n");
+ return -ENOMSG;
+ }
+ if (req.result != 0)
+ RTE_LOG(ERR, EAL,
+ "Failed to hotplug add device\n");
+ return req.result;
+ }
+
+ /* attach a shared device from primary start from here: */
+
+ /* primary attach the new device itself. */
+ ret = local_dev_probe(devargs, &dev);
+
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to attach device on primary process\n");
+
+ /**
+ * it is possible that secondary process failed to attached a
+ * device that primary process have during initialization,
+ * so for -EEXIST case, we still need to sync with secondary
+ * process.
+ */
+ if (ret != -EEXIST)
+ return ret;
+ }
+
+ /* primary send attach sync request to secondary. */
+ ret = eal_dev_hotplug_request_to_secondary(&req);
+
+ /* if any communication error, we need to rollback. */
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to send hotplug add request to secondary\n");
+ ret = -ENOMSG;
+ goto rollback;
+ }
+
+ /**
+ * if any secondary failed to attach, we need to consider if rollback
+ * is necessary.
+ */
+ if (req.result != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to attach device on secondary process\n");
+ ret = req.result;
+
+ /* for -EEXIST, we don't need to rollback. */
+ if (ret == -EEXIST)
+ return ret;
+ goto rollback;
+ }
+
+ return 0;
+
+rollback:
+ req.t = EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK;
+
+ /* primary send rollback request to secondary. */
+ if (eal_dev_hotplug_request_to_secondary(&req) != 0)
+ RTE_LOG(WARNING, EAL,
+ "Failed to rollback device attach on secondary."
+ "Devices in secondary may not sync with primary\n");
+
+ /* primary rollback itself. */
+ if (local_dev_remove(dev) != 0)
+ RTE_LOG(WARNING, EAL,
+ "Failed to rollback device attach on primary."
+ "Devices in secondary may not sync with primary\n");
+
+ return ret;
+}
+
+int
+rte_eal_hotplug_remove(const char *busname, const char *devname)
+{
+ struct rte_device *dev;
+ struct rte_bus *bus;
+
bus = rte_bus_find_by_name(busname);
if (bus == NULL) {
RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", busname);
return -ENOENT;
}
- if (bus->unplug == NULL) {
- RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n",
- bus->name);
- return -ENOTSUP;
- }
-
dev = bus->find_device(NULL, cmp_dev_name, devname);
if (dev == NULL) {
RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", devname);
return -EINVAL;
}
- if (dev->driver == NULL) {
- RTE_LOG(ERR, EAL, "Device is already unplugged\n");
- return -ENOENT;
+ return rte_dev_remove(dev);
+}
+
+/* remove device at local process. */
+int
+local_dev_remove(struct rte_device *dev)
+{
+ int ret;
+
+ if (dev->bus->unplug == NULL) {
+ RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n",
+ dev->bus->name);
+ return -ENOTSUP;
}
- ret = bus->unplug(dev);
- if (ret)
+ ret = dev->bus->unplug(dev);
+ if (ret) {
RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n",
dev->name);
- rte_devargs_remove(busname, devname);
+ return ret;
+ }
+
+ return 0;
+}
+
+int __rte_experimental
+rte_dev_remove(struct rte_device *dev)
+{
+ struct eal_dev_mp_req req;
+ char *devargs;
+ int ret;
+
+ if (!rte_dev_is_probed(dev)) {
+ RTE_LOG(ERR, EAL, "Device is not probed\n");
+ return -ENOENT;
+ }
+
+ ret = build_devargs(dev->bus->name, dev->name, "", &devargs);
+ if (ret != 0)
+ return ret;
+
+ memset(&req, 0, sizeof(req));
+ req.t = EAL_DEV_REQ_TYPE_DETACH;
+ strlcpy(req.devargs, devargs, EAL_DEV_MP_DEV_ARGS_MAX_LEN);
+ free(devargs);
+
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+ /**
+ * If in secondary process, just send IPC request to
+ * primary process.
+ */
+ ret = eal_dev_hotplug_request_to_primary(&req);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to send hotplug request to primary\n");
+ return -ENOMSG;
+ }
+ if (req.result != 0)
+ RTE_LOG(ERR, EAL,
+ "Failed to hotplug remove device\n");
+ return req.result;
+ }
+
+ /* detach a device from primary start from here: */
+
+ /* primary send detach sync request to secondary */
+ ret = eal_dev_hotplug_request_to_secondary(&req);
+
+ /**
+ * if communication error, we need to rollback, because it is possible
+ * part of the secondary processes still detached it successfully.
+ */
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to send device detach request to secondary\n");
+ ret = -ENOMSG;
+ goto rollback;
+ }
+
+ /**
+ * if any secondary failed to detach, we need to consider if rollback
+ * is necessary.
+ */
+ if (req.result != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to detach device on secondary process\n");
+ ret = req.result;
+ /**
+ * if -ENOENT, we don't need to rollback, since devices is
+ * already detached on secondary process.
+ */
+ if (ret != -ENOENT)
+ goto rollback;
+ }
+
+ /* primary detach the device itself. */
+ ret = local_dev_remove(dev);
+
+ /* if primary failed, still need to consider if rollback is necessary */
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to detach device on primary process\n");
+ /* if -ENOENT, we don't need to rollback */
+ if (ret == -ENOENT)
+ return ret;
+ goto rollback;
+ }
+
+ return 0;
+
+rollback:
+ req.t = EAL_DEV_REQ_TYPE_DETACH_ROLLBACK;
+
+ /* primary send rollback request to secondary. */
+ if (eal_dev_hotplug_request_to_secondary(&req) != 0)
+ RTE_LOG(WARNING, EAL,
+ "Failed to rollback device detach on secondary."
+ "Devices in secondary may not sync with primary\n");
+
return ret;
}
@@ -342,8 +530,9 @@ rte_dev_event_callback_unregister(const char *device_name,
return ret;
}
-void
-dev_callback_process(char *device_name, enum rte_dev_event_type event)
+void __rte_experimental
+rte_dev_event_callback_process(const char *device_name,
+ enum rte_dev_event_type event)
{
struct dev_event_callback *cb_lst;
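
Example (not part of the patch): rte_dev_probe() takes a full devargs string of the form bus:device,args (as composed by build_devargs() above) and performs the primary/secondary IPC synchronisation and rollback shown in this file, while removal can still go through rte_eal_hotplug_remove() with separate bus and device names. A sketch using a hypothetical null vdev; the device name and argument are examples only, and rte_dev_probe() is experimental API at this point.

#include <rte_dev.h>

static int
hotplug_roundtrip(void)
{
	/* attach: probed in the primary and synced to secondaries */
	if (rte_dev_probe("vdev:net_null0,size=64") != 0)
		return -1;

	/* ... use the device ... */

	/* detach: secondaries are asked to release it first */
	return rte_eal_hotplug_remove("vdev", "net_null0");
}
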
diff --git a/lib/librte_eal/common/eal_common_devargs.c b/lib/librte_eal/common/eal_common_devargs.c
index dac2402a..b7b9cb69 100644
--- a/lib/librte_eal/common/eal_common_devargs.c
+++ b/lib/librte_eal/common/eal_common_devargs.c
@@ -4,9 +4,6 @@
/* This file manages the list of devices and their arguments, as given
* by the user at startup
- *
- * Code here should not call rte_log since the EAL environment
- * may not be initialized.
*/
#include <stdio.h>
@@ -28,39 +25,9 @@
TAILQ_HEAD(rte_devargs_list, rte_devargs);
/** Global list of user devices */
-struct rte_devargs_list devargs_list =
+static struct rte_devargs_list devargs_list =
TAILQ_HEAD_INITIALIZER(devargs_list);
-int
-rte_eal_parse_devargs_str(const char *devargs_str,
- char **drvname, char **drvargs)
-{
- char *sep;
-
- if ((devargs_str) == NULL || (drvname) == NULL || (drvargs == NULL))
- return -1;
-
- *drvname = strdup(devargs_str);
- if (*drvname == NULL)
- return -1;
-
- /* set the first ',' to '\0' to split name and arguments */
- sep = strchr(*drvname, ',');
- if (sep != NULL) {
- sep[0] = '\0';
- *drvargs = strdup(sep + 1);
- } else {
- *drvargs = strdup("");
- }
-
- if (*drvargs == NULL) {
- free(*drvname);
- *drvname = NULL;
- return -1;
- }
- return 0;
-}
-
static size_t
devargs_layer_count(const char *s)
{
@@ -270,6 +237,7 @@ rte_devargs_parsef(struct rte_devargs *da, const char *format, ...)
va_list ap;
size_t len;
char *dev;
+ int ret;
if (da == NULL)
return -EINVAL;
@@ -288,7 +256,10 @@ rte_devargs_parsef(struct rte_devargs *da, const char *format, ...)
vsnprintf(dev, len + 1, format, ap);
va_end(ap);
- return rte_devargs_parse(da, dev);
+ ret = rte_devargs_parse(da, dev);
+
+ free(dev);
+ return ret;
}
int __rte_experimental
@@ -296,7 +267,7 @@ rte_devargs_insert(struct rte_devargs *da)
{
int ret;
- ret = rte_devargs_remove(da->bus->name, da->name);
+ ret = rte_devargs_remove(da);
if (ret < 0)
return ret;
TAILQ_INSERT_TAIL(&devargs_list, da, next);
@@ -342,14 +313,17 @@ fail:
}
int __rte_experimental
-rte_devargs_remove(const char *busname, const char *devname)
+rte_devargs_remove(struct rte_devargs *devargs)
{
struct rte_devargs *d;
void *tmp;
+ if (devargs == NULL || devargs->bus == NULL)
+ return -1;
+
TAILQ_FOREACH_SAFE(d, &devargs_list, next, tmp) {
- if (strcmp(d->bus->name, busname) == 0 &&
- strcmp(d->name, devname) == 0) {
+ if (strcmp(d->bus->name, devargs->bus->name) == 0 &&
+ strcmp(d->name, devargs->name) == 0) {
TAILQ_REMOVE(&devargs_list, d, next);
free(d->args);
free(d);
diff --git a/lib/librte_eal/common/eal_common_fbarray.c b/lib/librte_eal/common/eal_common_fbarray.c
index 43caf3ce..ea0735cb 100644
--- a/lib/librte_eal/common/eal_common_fbarray.c
+++ b/lib/librte_eal/common/eal_common_fbarray.c
@@ -2,6 +2,7 @@
* Copyright(c) 2017-2018 Intel Corporation
*/
+#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <sys/mman.h>
@@ -878,6 +879,10 @@ rte_fbarray_destroy(struct rte_fbarray *arr)
if (ret)
return ret;
+ /* with no shconf, there were never any files to begin with */
+ if (internal_config.no_shconf)
+ return 0;
+
/* try deleting the file */
eal_get_fbarray_path(path, sizeof(path), arr->name);
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index fbfb1b05..12dcedf5 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -2,6 +2,7 @@
* Copyright(c) 2010-2014 Intel Corporation
*/
+#include <fcntl.h>
#include <errno.h>
#include <stdio.h>
#include <stdint.h>
@@ -37,6 +38,23 @@
static void *next_baseaddr;
static uint64_t system_page_sz;
+#ifdef RTE_ARCH_64
+/*
+ * Linux kernel uses a really high address as starting address for serving
+ * mmap calls. If addressing limitations exist and the IOVA mode is VA,
+ * this starting address is likely too high for those devices. However, it
+ * is possible to use a lower address in the process virtual address space
+ * as with 64 bits there is a lot of available space.
+ *
+ * Current known limitations are 39 or 40 bits. Setting the starting address
+ * at 4GB implies there are 508GB or 1020GB for mapping the available
+ * hugepages. This is likely enough for most systems, although a device with
+ * addressing limitations should call rte_eal_check_dma_mask to ensure all
+ * memory is within the supported range.
+ */
+static uint64_t baseaddr = 0x100000000;
+#endif
+
void *
eal_get_virtual_area(void *requested_addr, size_t *size,
size_t page_sz, int flags, int mmap_flags)
@@ -60,6 +78,11 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
rte_eal_process_type() == RTE_PROC_PRIMARY)
next_baseaddr = (void *) internal_config.base_virtaddr;
+#ifdef RTE_ARCH_64
+ if (next_baseaddr == NULL && internal_config.base_virtaddr == 0 &&
+ rte_eal_process_type() == RTE_PROC_PRIMARY)
+ next_baseaddr = (void *) baseaddr;
+#endif
if (requested_addr == NULL && next_baseaddr != NULL) {
requested_addr = next_baseaddr;
requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
@@ -91,7 +114,17 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
mmap_flags, -1, 0);
if (mapped_addr == MAP_FAILED && allow_shrink)
*size -= page_sz;
- } while (allow_shrink && mapped_addr == MAP_FAILED && *size > 0);
+
+ if (mapped_addr != MAP_FAILED && addr_is_hint &&
+ mapped_addr != requested_addr) {
+ /* hint was not used. Try with another offset */
+ munmap(mapped_addr, map_sz);
+ mapped_addr = MAP_FAILED;
+ next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
+ requested_addr = next_baseaddr;
+ }
+ } while ((allow_shrink || addr_is_hint) &&
+ mapped_addr == MAP_FAILED && *size > 0);
/* align resulting address - if map failed, we will ignore the value
* anyway, so no need to add additional checks.
@@ -171,7 +204,7 @@ virt2memseg(const void *addr, const struct rte_memseg_list *msl)
/* a memseg list was specified, check if it's the right one */
start = msl->base_va;
- end = RTE_PTR_ADD(start, (size_t)msl->page_sz * msl->memseg_arr.len);
+ end = RTE_PTR_ADD(start, msl->len);
if (addr < start || addr >= end)
return NULL;
@@ -194,8 +227,7 @@ virt2memseg_list(const void *addr)
msl = &mcfg->memsegs[msl_idx];
start = msl->base_va;
- end = RTE_PTR_ADD(start,
- (size_t)msl->page_sz * msl->memseg_arr.len);
+ end = RTE_PTR_ADD(start, msl->len);
if (addr >= start && addr < end)
break;
}
@@ -273,6 +305,9 @@ physmem_size(const struct rte_memseg_list *msl, void *arg)
{
uint64_t *total_len = arg;
+ if (msl->external)
+ return 0;
+
*total_len += msl->memseg_arr.count * msl->page_sz;
return 0;
@@ -294,7 +329,7 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
void *arg)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int msl_idx, ms_idx;
+ int msl_idx, ms_idx, fd;
FILE *f = arg;
msl_idx = msl - mcfg->memsegs;
@@ -305,10 +340,11 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
if (ms_idx < 0)
return -1;
+ fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx);
fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
"virt:%p, socket_id:%"PRId32", "
"hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
- "nrank:%"PRIx32"\n",
+ "nrank:%"PRIx32" fd:%i\n",
msl_idx, ms_idx,
ms->iova,
ms->len,
@@ -316,7 +352,8 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
ms->socket_id,
ms->hugepage_sz,
ms->nchannel,
- ms->nrank);
+ ms->nrank,
+ fd);
return 0;
}
@@ -383,6 +420,66 @@ rte_dump_physmem_layout(FILE *f)
rte_memseg_walk(dump_memseg, f);
}
+static int
+check_iova(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
+{
+ uint64_t *mask = arg;
+ rte_iova_t iova;
+
+ /* higher address within segment */
+ iova = (ms->iova + ms->len) - 1;
+ if (!(iova & *mask))
+ return 0;
+
+ RTE_LOG(DEBUG, EAL, "memseg iova %"PRIx64", len %zx, out of range\n",
+ ms->iova, ms->len);
+
+ RTE_LOG(DEBUG, EAL, "\tusing dma mask %"PRIx64"\n", *mask);
+ return 1;
+}
+
+#if defined(RTE_ARCH_64)
+#define MAX_DMA_MASK_BITS 63
+#else
+#define MAX_DMA_MASK_BITS 31
+#endif
+
+/* check memseg iovas are within the required range based on dma mask */
+int __rte_experimental
+rte_eal_check_dma_mask(uint8_t maskbits)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ uint64_t mask;
+
+ /* sanity check */
+ if (maskbits > MAX_DMA_MASK_BITS) {
+ RTE_LOG(ERR, EAL, "wrong dma mask size %u (Max: %u)\n",
+ maskbits, MAX_DMA_MASK_BITS);
+ return -1;
+ }
+
+ /* create dma mask */
+ mask = ~((1ULL << maskbits) - 1);
+
+ if (rte_memseg_walk(check_iova, &mask))
+ /*
+ * Dma mask precludes hugepage usage.
+ * This device can not be used and we do not need to keep
+ * the dma mask.
+ */
+ return 1;
+
+ /*
+ * we need to keep the more restricted maskbit for checking
+ * potential dynamic memory allocation in the future.
+ */
+ mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
+ RTE_MIN(mcfg->dma_maskbits, maskbits);
+
+ return 0;
+}
+
/* return the number of memory channels */
unsigned rte_memory_get_nchannel(void)
{
@@ -548,6 +645,105 @@ rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
return ret;
}
+int __rte_experimental
+rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ int msl_idx, seg_idx, ret;
+
+ if (ms == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ msl = rte_mem_virt2memseg_list(ms->addr);
+ if (msl == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ arr = &msl->memseg_arr;
+
+ msl_idx = msl - mcfg->memsegs;
+ seg_idx = rte_fbarray_find_idx(arr, ms);
+
+ if (!rte_fbarray_is_used(arr, seg_idx)) {
+ rte_errno = ENOENT;
+ return -1;
+ }
+
+ ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx);
+ if (ret < 0) {
+ rte_errno = -ret;
+ ret = -1;
+ }
+ return ret;
+}
+
+int __rte_experimental
+rte_memseg_get_fd(const struct rte_memseg *ms)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int ret;
+
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+ ret = rte_memseg_get_fd_thread_unsafe(ms);
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
+}
+
+int __rte_experimental
+rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
+ size_t *offset)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ int msl_idx, seg_idx, ret;
+
+ if (ms == NULL || offset == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ msl = rte_mem_virt2memseg_list(ms->addr);
+ if (msl == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ arr = &msl->memseg_arr;
+
+ msl_idx = msl - mcfg->memsegs;
+ seg_idx = rte_fbarray_find_idx(arr, ms);
+
+ if (!rte_fbarray_is_used(arr, seg_idx)) {
+ rte_errno = ENOENT;
+ return -1;
+ }
+
+ ret = eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset);
+ if (ret < 0) {
+ rte_errno = -ret;
+ ret = -1;
+ }
+ return ret;
+}
+
+int __rte_experimental
+rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int ret;
+
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+ ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset);
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
+}
+
/* init memory subsystem */
int
rte_eal_memory_init(void)
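The two additions above pair naturally: a device with addressing limits can validate hugepage IOVAs with rte_eal_check_dma_mask(), and the new accessors expose the file descriptor backing each segment. A sketch, assuming the prototypes are exported via rte_memory.h and using an illustrative 40-bit limit:

#include <stdio.h>
#include <rte_common.h>
#include <rte_memory.h>

/* hypothetical probe-time check for a device limited to 40-bit IOVA */
static int
check_device_addressing(void)
{
	if (rte_eal_check_dma_mask(40) != 0)
		return -1; /* some hugepage memory lies outside the 40-bit range */
	return 0;
}

/* rte_memseg_walk() callback printing the new per-segment fd information */
static int
dump_seg_fd(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, void *arg __rte_unused)
{
	size_t offset;
	int fd = rte_memseg_get_fd(ms);

	if (fd >= 0 && rte_memseg_get_fd_offset(ms, &offset) == 0)
		printf("seg %p: fd %d, offset %zu\n", ms->addr, fd, offset);
	return 0;
}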
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 7300fe05..b7081afb 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -120,13 +120,15 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
return NULL;
}
- if ((socket_id != SOCKET_ID_ANY) &&
- (socket_id >= RTE_MAX_NUMA_NODES || socket_id < 0)) {
+ if ((socket_id != SOCKET_ID_ANY) && socket_id < 0) {
rte_errno = EINVAL;
return NULL;
}
- if (!rte_eal_has_hugepages())
+ /* only set socket to SOCKET_ID_ANY if we aren't allocating for an
+ * external heap.
+ */
+ if (!rte_eal_has_hugepages() && socket_id < RTE_MAX_NUMA_NODES)
socket_id = SOCKET_ID_ANY;
contig = (flags & RTE_MEMZONE_IOVA_CONTIG) != 0;
diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index dd5f9740..b82f3ddd 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -58,6 +58,7 @@ eal_long_options[] = {
{OPT_HELP, 0, NULL, OPT_HELP_NUM },
{OPT_HUGE_DIR, 1, NULL, OPT_HUGE_DIR_NUM },
{OPT_HUGE_UNLINK, 0, NULL, OPT_HUGE_UNLINK_NUM },
+ {OPT_IOVA_MODE, 1, NULL, OPT_IOVA_MODE_NUM },
{OPT_LCORES, 1, NULL, OPT_LCORES_NUM },
{OPT_LOG_LEVEL, 1, NULL, OPT_LOG_LEVEL_NUM },
{OPT_MASTER_LCORE, 1, NULL, OPT_MASTER_LCORE_NUM },
@@ -205,6 +206,7 @@ eal_reset_internal_config(struct internal_config *internal_cfg)
#endif
internal_cfg->vmware_tsc_map = 0;
internal_cfg->create_uio_dev = 0;
+ internal_cfg->iova_mode = RTE_IOVA_DC;
internal_cfg->user_mbuf_pool_ops_name = NULL;
internal_cfg->init_complete = 0;
}
@@ -1075,6 +1077,25 @@ eal_parse_proc_type(const char *arg)
return RTE_PROC_INVALID;
}
+static int
+eal_parse_iova_mode(const char *name)
+{
+ int mode;
+
+ if (name == NULL)
+ return -1;
+
+ if (!strcmp("pa", name))
+ mode = RTE_IOVA_PA;
+ else if (!strcmp("va", name))
+ mode = RTE_IOVA_VA;
+ else
+ return -1;
+
+ internal_config.iova_mode = mode;
+ return 0;
+}
+
int
eal_parse_common_option(int opt, const char *optarg,
struct internal_config *conf)
@@ -1281,6 +1302,13 @@ eal_parse_common_option(int opt, const char *optarg,
case OPT_SINGLE_FILE_SEGMENTS_NUM:
conf->single_file_segments = 1;
break;
+ case OPT_IOVA_MODE_NUM:
+ if (eal_parse_iova_mode(optarg) < 0) {
+ RTE_LOG(ERR, EAL, "invalid parameters for --"
+ OPT_IOVA_MODE "\n");
+ return -1;
+ }
+ break;
/* don't know what to do, leave this to caller */
default:
@@ -1384,10 +1412,16 @@ eal_check_common_options(struct internal_config *internal_cfg)
" is only supported in non-legacy memory mode\n");
}
if (internal_cfg->single_file_segments &&
- internal_cfg->hugepage_unlink) {
+ internal_cfg->hugepage_unlink &&
+ !internal_cfg->in_memory) {
RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE_SEGMENTS" is "
- "not compatible with neither --"OPT_IN_MEMORY" nor "
- "--"OPT_HUGE_UNLINK"\n");
+ "not compatible with --"OPT_HUGE_UNLINK"\n");
+ return -1;
+ }
+ if (internal_cfg->legacy_mem &&
+ internal_cfg->in_memory) {
+ RTE_LOG(ERR, EAL, "Option --"OPT_LEGACY_MEM" is not compatible "
+ "with --"OPT_IN_MEMORY"\n");
return -1;
}
@@ -1428,6 +1462,8 @@ eal_common_usage(void)
" --"OPT_VDEV" Add a virtual device.\n"
" The argument format is <driver><id>[,key=val,...]\n"
" (ex: --vdev=net_pcap0,iface=eth2).\n"
+ " --"OPT_IOVA_MODE" Set IOVA mode. 'pa' for IOVA_PA\n"
+ " 'va' for IOVA_VA\n"
" -d LIB.so|DIR Add a driver or driver directory\n"
" (can be used multiple times)\n"
" --"OPT_VMWARE_TSC_MAP" Use VMware TSC map instead of native RDTSC\n"
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 9fcb9121..97663d3b 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -939,13 +939,17 @@ rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply,
if (check_input(req) == false)
return -1;
+ reply->nb_sent = 0;
+ reply->nb_received = 0;
+ reply->msgs = NULL;
+
if (internal_config.no_shconf) {
RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n");
return 0;
}
if (gettimeofday(&now, NULL) < 0) {
- RTE_LOG(ERR, EAL, "Faile to get current time\n");
+ RTE_LOG(ERR, EAL, "Failed to get current time\n");
rte_errno = errno;
return -1;
}
@@ -954,10 +958,6 @@ rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply,
end.tv_sec = now.tv_sec + ts->tv_sec +
(now.tv_usec * 1000 + ts->tv_nsec) / 1000000000;
- reply->nb_sent = 0;
- reply->nb_received = 0;
- reply->msgs = NULL;
-
/* for secondary process, send request to the primary process only */
if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
pthread_mutex_lock(&pending_requests.lock);
diff --git a/lib/librte_eal/common/eal_common_string_fns.c b/lib/librte_eal/common/eal_common_string_fns.c
index 6ac5f828..60c5dd66 100644
--- a/lib/librte_eal/common/eal_common_string_fns.c
+++ b/lib/librte_eal/common/eal_common_string_fns.c
@@ -38,3 +38,29 @@ einval_error:
errno = EINVAL;
return -1;
}
+
+/* Copy src string into dst.
+ *
+ * Return a negative value and NUL-terminate if dst is too short;
+ * otherwise return the number of bytes copied.
+ */
+ssize_t
+rte_strscpy(char *dst, const char *src, size_t dsize)
+{
+ size_t nleft = dsize;
+ size_t res = 0;
+
+ /* Copy as many bytes as will fit. */
+ while (nleft != 0) {
+ dst[res] = src[res];
+ if (src[res] == '\0')
+ return res;
+ res++;
+ nleft--;
+ }
+
+ /* Not enough room in dst, set NUL and return error. */
+ if (res != 0)
+ dst[res - 1] = '\0';
+ return -E2BIG;
+}
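A short usage sketch for rte_strscpy() as implemented above (buffer size and strings are arbitrary):

#include <stdio.h>
#include <rte_string_fns.h>

static void
strscpy_demo(void)
{
	char ifname[8];
	ssize_t n;

	/* fits: returns the number of bytes copied (4), NUL not counted */
	n = rte_strscpy(ifname, "eth0", sizeof(ifname));

	/* too long: returns -E2BIG, ifname stays NUL-terminated ("a-very-") */
	if (rte_strscpy(ifname, "a-very-long-name", sizeof(ifname)) < 0)
		printf("truncated (previous copy was %zd bytes)\n", n);
}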
diff --git a/lib/librte_eal/common/eal_common_timer.c b/lib/librte_eal/common/eal_common_timer.c
index 2e2b770f..dcf26bfe 100644
--- a/lib/librte_eal/common/eal_common_timer.c
+++ b/lib/librte_eal/common/eal_common_timer.c
@@ -7,9 +7,11 @@
#include <unistd.h>
#include <inttypes.h>
#include <sys/types.h>
+#include <time.h>
#include <errno.h>
#include <rte_common.h>
+#include <rte_compat.h>
#include <rte_log.h>
#include <rte_cycles.h>
#include <rte_pause.h>
@@ -31,6 +33,28 @@ rte_delay_us_block(unsigned int us)
rte_pause();
}
+void __rte_experimental
+rte_delay_us_sleep(unsigned int us)
+{
+ struct timespec wait[2];
+ int ind = 0;
+
+ wait[0].tv_sec = 0;
+ if (us >= US_PER_S) {
+ wait[0].tv_sec = us / US_PER_S;
+ us -= wait[0].tv_sec * US_PER_S;
+ }
+ wait[0].tv_nsec = 1000 * us;
+
+ while (nanosleep(&wait[ind], &wait[1 - ind]) && errno == EINTR) {
+ /*
+ * Sleep was interrupted. Flip the index, so the 'remainder'
+ * will become the 'request' for a next call.
+ */
+ ind = 1 - ind;
+ }
+}
+
uint64_t
rte_get_tsc_hz(void)
{
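rte_delay_us_sleep() yields the core instead of busy-waiting. A sketch of direct use, plus one possible way (not mandated by this patch) to install it behind rte_delay_us() via the existing callback hook:

#include <rte_cycles.h>

static void
use_sleep_delay(void)
{
	/* sleep ~2.5 ms without spinning the core */
	rte_delay_us_sleep(2500);

	/* optionally make it the implementation behind rte_delay_us() */
	rte_delay_us_callback_register(rte_delay_us_sleep);
}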
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index de05febf..b3e8ae5e 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -27,7 +27,7 @@ eal_create_runtime_dir(void);
/* returns runtime dir */
const char *
-eal_get_runtime_dir(void);
+rte_eal_get_runtime_dir(void);
#define RUNTIME_CONFIG_FNAME "config"
static inline const char *
@@ -35,7 +35,7 @@ eal_runtime_config_path(void)
{
static char buffer[PATH_MAX]; /* static so auto-zeroed */
- snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(),
+ snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(),
RUNTIME_CONFIG_FNAME);
return buffer;
}
@@ -47,7 +47,7 @@ eal_mp_socket_path(void)
{
static char buffer[PATH_MAX]; /* static so auto-zeroed */
- snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(),
+ snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(),
MP_SOCKET_FNAME);
return buffer;
}
@@ -55,7 +55,8 @@ eal_mp_socket_path(void)
#define FBARRAY_NAME_FMT "%s/fbarray_%s"
static inline const char *
eal_get_fbarray_path(char *buffer, size_t buflen, const char *name) {
- snprintf(buffer, buflen, FBARRAY_NAME_FMT, eal_get_runtime_dir(), name);
+ snprintf(buffer, buflen, FBARRAY_NAME_FMT, rte_eal_get_runtime_dir(),
+ name);
return buffer;
}
@@ -66,7 +67,7 @@ eal_hugepage_info_path(void)
{
static char buffer[PATH_MAX]; /* static so auto-zeroed */
- snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(),
+ snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(),
HUGEPAGE_INFO_FNAME);
return buffer;
}
@@ -78,7 +79,7 @@ eal_hugepage_data_path(void)
{
static char buffer[PATH_MAX]; /* static so auto-zeroed */
- snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(),
+ snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(),
HUGEPAGE_DATA_FNAME);
return buffer;
}
@@ -99,7 +100,7 @@ eal_get_hugefile_path(char *buffer, size_t buflen, const char *hugedir, int f_id
static inline const char *
eal_get_hugefile_lock_path(char *buffer, size_t buflen, int f_id)
{
- snprintf(buffer, buflen, HUGEFILE_LOCK_FMT, eal_get_runtime_dir(),
+ snprintf(buffer, buflen, HUGEFILE_LOCK_FMT, rte_eal_get_runtime_dir(),
f_id);
buffer[buflen - 1] = '\0';
return buffer;
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 00ee6e06..737f17e3 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -70,6 +70,7 @@ struct internal_config {
/**< user defined mbuf pool ops name */
unsigned num_hugepage_sizes; /**< how many sizes on this system */
struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES];
+ enum rte_iova_mode iova_mode; /**< Set IOVA mode on this system */
volatile unsigned int init_complete;
/**< indicates whether EAL has completed initialization */
};
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index 36bb1a02..af917c2f 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -76,6 +76,17 @@ eal_memalloc_mem_alloc_validator_unregister(const char *name, int socket_id);
int
eal_memalloc_mem_alloc_validate(int socket_id, size_t new_len);
+/* returns fd or -errno */
+int
+eal_memalloc_get_seg_fd(int list_idx, int seg_idx);
+
+/* returns 0 or -errno */
+int
+eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd);
+
+int
+eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset);
+
int
eal_memalloc_init(void);
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index 96e16678..5271f944 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -63,6 +63,8 @@ enum {
OPT_LEGACY_MEM_NUM,
#define OPT_SINGLE_FILE_SEGMENTS "single-file-segments"
OPT_SINGLE_FILE_SEGMENTS_NUM,
+#define OPT_IOVA_MODE "iova-mode"
+ OPT_IOVA_MODE_NUM,
OPT_LONG_MAX_NUM
};
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 4f809a83..442c6dc4 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -259,18 +259,6 @@ struct rte_bus *rte_bus_find_by_device_name(const char *str);
int rte_mp_channel_init(void);
/**
- * Internal Executes all the user application registered callbacks for
- * the specific device. It is for DPDK internal user only. User
- * application should not call it directly.
- *
- * @param device_name
- * The device name.
- * @param event
- * the device event type.
- */
-void dev_callback_process(char *device_name, enum rte_dev_event_type event);
-
-/**
* @internal
* Parse a device string and store its information in an
* rte_devargs structure.
@@ -304,4 +292,82 @@ int
rte_devargs_layers_parse(struct rte_devargs *devargs,
const char *devstr);
+/*
+ * Probe a device in the local process.
+ *
+ * @param devargs
+ * Device arguments including bus, class and driver properties.
+ * @param new_dev
+ * The newly probed device, returned as output.
+ * @return
+ * 0 on success, negative on error.
+ */
+int local_dev_probe(const char *devargs, struct rte_device **new_dev);
+
+/**
+ * Hotplug remove a given device from a specific bus at local process.
+ *
+ * @param dev
+ * Data structure of the device to remove.
+ * @return
+ * 0 on success, negative on error.
+ */
+int local_dev_remove(struct rte_device *dev);
+
+/**
+ * Iterate over all buses to find the corresponding bus to handle the sigbus
+ * error.
+ * @param failure_addr
+ * Pointer to the fault address of the sigbus error.
+ *
+ * @return
+ * 0 if the sigbus was handled successfully.
+ * -1 if handling the sigbus failed.
+ * 1 if no bus can handle the sigbus.
+ */
+int rte_bus_sigbus_handler(const void *failure_addr);
+
+/**
+ * @internal
+ * Register the sigbus handler.
+ *
+ * @return
+ * - On success, zero.
+ * - On failure, a negative value.
+ */
+int
+dev_sigbus_handler_register(void);
+
+/**
+ * @internal
+ * Unregister the sigbus handler.
+ *
+ * @return
+ * - On success, zero.
+ * - On failure, a negative value.
+ */
+int
+dev_sigbus_handler_unregister(void);
+
+/**
+ * Check if the option is registered.
+ *
+ * @param option
+ * The option to be parsed.
+ *
+ * @return
+ * 0 on success
+ * @return
+ * -1 on fail
+ */
+int
+rte_option_parse(const char *opt);
+
+/**
+ * Iterate through the registered options and execute the associated
+ * callback if enabled.
+ */
+void
+rte_option_init(void);
+
#endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/hotplug_mp.c b/lib/librte_eal/common/hotplug_mp.c
new file mode 100644
index 00000000..84f59d95
--- /dev/null
+++ b/lib/librte_eal/common/hotplug_mp.c
@@ -0,0 +1,426 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+#include <string.h>
+
+#include <rte_eal.h>
+#include <rte_alarm.h>
+#include <rte_string_fns.h>
+#include <rte_devargs.h>
+
+#include "hotplug_mp.h"
+#include "eal_private.h"
+
+#define MP_TIMEOUT_S 5 /**< 5 seconds timeouts */
+
+struct mp_reply_bundle {
+ struct rte_mp_msg msg;
+ void *peer;
+};
+
+static int cmp_dev_name(const struct rte_device *dev, const void *_name)
+{
+ const char *name = _name;
+
+ return strcmp(dev->name, name);
+}
+
+/**
+ * Secondary to primary request.
+ * start from function eal_dev_hotplug_request_to_primary.
+ *
+ * device attach on secondary:
+ * a) secondary send sync request to the primary.
+ * b) primary receive the request and attach the new device if
+ * failed goto i).
+ * c) primary forward attach sync request to all secondary.
+ * d) secondary receive the request and attach the device and send a reply.
+ * e) primary check the reply if all success goes to j).
+ * f) primary send attach rollback sync request to all secondary.
+ * g) secondary receive the request and detach the device and send a reply.
+ * h) primary receive the reply and detach device as rollback action.
+ * i) send attach fail to secondary as a reply of step a), goto k).
+ * j) send attach success to secondary as a reply of step a).
+ * k) secondary receive reply and return.
+ *
+ * device detach on secondary:
+ * a) secondary send sync request to the primary.
+ * b) primary send detach sync request to all secondary.
+ * c) secondary detach the device and send a reply.
+ * d) primary check the reply if all success goes to g).
+ * e) primary send detach rollback sync request to all secondary.
+ * f) secondary receive the request and attach back device. goto h).
+ * g) primary detach the device if success goto i), else goto e).
+ * h) primary send detach fail to secondary as a reply of step a), goto j).
+ * i) primary send detach success to secondary as a reply of step a).
+ * j) secondary receive reply and return.
+ */
+
+static int
+send_response_to_secondary(const struct eal_dev_mp_req *req,
+ int result,
+ const void *peer)
+{
+ struct rte_mp_msg mp_resp;
+ struct eal_dev_mp_req *resp =
+ (struct eal_dev_mp_req *)mp_resp.param;
+ int ret;
+
+ memset(&mp_resp, 0, sizeof(mp_resp));
+ mp_resp.len_param = sizeof(*resp);
+ strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name));
+ memcpy(resp, req, sizeof(*req));
+ resp->result = result;
+
+ ret = rte_mp_reply(&mp_resp, peer);
+ if (ret != 0)
+ RTE_LOG(ERR, EAL, "failed to send response to secondary\n");
+
+ return ret;
+}
+
+static void
+__handle_secondary_request(void *param)
+{
+ struct mp_reply_bundle *bundle = param;
+ const struct rte_mp_msg *msg = &bundle->msg;
+ const struct eal_dev_mp_req *req =
+ (const struct eal_dev_mp_req *)msg->param;
+ struct eal_dev_mp_req tmp_req;
+ struct rte_devargs *da;
+ struct rte_device *dev;
+ struct rte_bus *bus;
+ int ret = 0;
+
+ tmp_req = *req;
+
+ if (req->t == EAL_DEV_REQ_TYPE_ATTACH) {
+ ret = local_dev_probe(req->devargs, &dev);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Failed to hotplug add device on primary\n");
+ if (ret != -EEXIST)
+ goto finish;
+ }
+ ret = eal_dev_hotplug_request_to_secondary(&tmp_req);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Failed to send hotplug request to secondary\n");
+ ret = -ENOMSG;
+ goto rollback;
+ }
+ if (tmp_req.result != 0) {
+ ret = tmp_req.result;
+ RTE_LOG(ERR, EAL, "Failed to hotplug add device on secondary\n");
+ if (ret != -EEXIST)
+ goto rollback;
+ }
+ } else if (req->t == EAL_DEV_REQ_TYPE_DETACH) {
+ da = calloc(1, sizeof(*da));
+ if (da == NULL) {
+ ret = -ENOMEM;
+ goto finish;
+ }
+
+ ret = rte_devargs_parse(da, req->devargs);
+ if (ret != 0)
+ goto finish;
+
+ ret = eal_dev_hotplug_request_to_secondary(&tmp_req);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Failed to send hotplug request to secondary\n");
+ ret = -ENOMSG;
+ goto rollback;
+ }
+
+ bus = rte_bus_find_by_name(da->bus->name);
+ if (bus == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", da->bus->name);
+ ret = -ENOENT;
+ goto finish;
+ }
+
+ dev = bus->find_device(NULL, cmp_dev_name, da->name);
+ if (dev == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", da->name);
+ ret = -ENOENT;
+ goto finish;
+ }
+
+ if (tmp_req.result != 0) {
+ RTE_LOG(ERR, EAL, "Failed to hotplug remove device on secondary\n");
+ ret = tmp_req.result;
+ if (ret != -ENOENT)
+ goto rollback;
+ }
+
+ ret = local_dev_remove(dev);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Failed to hotplug remove device on primary\n");
+ if (ret != -ENOENT)
+ goto rollback;
+ }
+ } else {
+ RTE_LOG(ERR, EAL, "unsupported secondary to primary request\n");
+ ret = -ENOTSUP;
+ }
+ goto finish;
+
+rollback:
+ if (req->t == EAL_DEV_REQ_TYPE_ATTACH) {
+ tmp_req.t = EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK;
+ eal_dev_hotplug_request_to_secondary(&tmp_req);
+ local_dev_remove(dev);
+ } else {
+ tmp_req.t = EAL_DEV_REQ_TYPE_DETACH_ROLLBACK;
+ eal_dev_hotplug_request_to_secondary(&tmp_req);
+ }
+
+finish:
+ ret = send_response_to_secondary(&tmp_req, ret, bundle->peer);
+ if (ret)
+ RTE_LOG(ERR, EAL, "failed to send response to secondary\n");
+
+ free(bundle->peer);
+ free(bundle);
+}
+
+static int
+handle_secondary_request(const struct rte_mp_msg *msg, const void *peer)
+{
+ struct mp_reply_bundle *bundle;
+ const struct eal_dev_mp_req *req =
+ (const struct eal_dev_mp_req *)msg->param;
+ int ret = 0;
+
+ bundle = malloc(sizeof(*bundle));
+ if (bundle == NULL) {
+ RTE_LOG(ERR, EAL, "not enough memory\n");
+ return send_response_to_secondary(req, -ENOMEM, peer);
+ }
+
+ bundle->msg = *msg;
+ /**
+ * We need to send the reply on the interrupt thread, but peer can't be
+ * parsed directly, so this is a temporary hack that needs to be fixed
+ * when it is ready.
+ */
+ bundle->peer = strdup(peer);
+
+ /**
+ * We are in the IPC callback thread; sync IPC is not allowed there due
+ * to possible deadlock, so we delegate the task to the interrupt thread.
+ */
+ ret = rte_eal_alarm_set(1, __handle_secondary_request, bundle);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "failed to add mp task\n");
+ return send_response_to_secondary(req, ret, peer);
+ }
+ return 0;
+}
+
+static void __handle_primary_request(void *param)
+{
+ struct mp_reply_bundle *bundle = param;
+ struct rte_mp_msg *msg = &bundle->msg;
+ const struct eal_dev_mp_req *req =
+ (const struct eal_dev_mp_req *)msg->param;
+ struct rte_mp_msg mp_resp;
+ struct eal_dev_mp_req *resp =
+ (struct eal_dev_mp_req *)mp_resp.param;
+ struct rte_devargs *da;
+ struct rte_device *dev;
+ struct rte_bus *bus;
+ int ret = 0;
+
+ memset(&mp_resp, 0, sizeof(mp_resp));
+
+ switch (req->t) {
+ case EAL_DEV_REQ_TYPE_ATTACH:
+ case EAL_DEV_REQ_TYPE_DETACH_ROLLBACK:
+ ret = local_dev_probe(req->devargs, &dev);
+ break;
+ case EAL_DEV_REQ_TYPE_DETACH:
+ case EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK:
+ da = calloc(1, sizeof(*da));
+ if (da == NULL) {
+ ret = -ENOMEM;
+ goto quit;
+ }
+
+ ret = rte_devargs_parse(da, req->devargs);
+ if (ret != 0)
+ goto quit;
+
+ bus = rte_bus_find_by_name(da->bus->name);
+ if (bus == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", da->bus->name);
+ ret = -ENOENT;
+ goto quit;
+ }
+
+ dev = bus->find_device(NULL, cmp_dev_name, da->name);
+ if (dev == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", da->name);
+ ret = -ENOENT;
+ goto quit;
+ }
+
+ ret = local_dev_remove(dev);
+quit:
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name));
+ mp_resp.len_param = sizeof(*req);
+ memcpy(resp, req, sizeof(*resp));
+ resp->result = ret;
+ if (rte_mp_reply(&mp_resp, bundle->peer) < 0)
+ RTE_LOG(ERR, EAL, "failed to send reply to primary request\n");
+
+ free(bundle->peer);
+ free(bundle);
+}
+
+static int
+handle_primary_request(const struct rte_mp_msg *msg, const void *peer)
+{
+ struct rte_mp_msg mp_resp;
+ const struct eal_dev_mp_req *req =
+ (const struct eal_dev_mp_req *)msg->param;
+ struct eal_dev_mp_req *resp =
+ (struct eal_dev_mp_req *)mp_resp.param;
+ struct mp_reply_bundle *bundle;
+ int ret = 0;
+
+ memset(&mp_resp, 0, sizeof(mp_resp));
+ strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name));
+ mp_resp.len_param = sizeof(*req);
+ memcpy(resp, req, sizeof(*resp));
+
+ bundle = calloc(1, sizeof(*bundle));
+ if (bundle == NULL) {
+ resp->result = -ENOMEM;
+ ret = rte_mp_reply(&mp_resp, peer);
+ if (ret)
+ RTE_LOG(ERR, EAL, "failed to send reply to primary request\n");
+ return ret;
+ }
+
+ bundle->msg = *msg;
+ /**
+ * We need to send the reply on the interrupt thread, but peer can't be
+ * parsed directly, so this is a temporary hack that needs to be fixed
+ * when it is ready.
+ */
+ bundle->peer = (void *)strdup(peer);
+
+ /**
+ * We are in the IPC callback thread; sync IPC is not allowed there due
+ * to possible deadlock, so we delegate the task to the interrupt thread.
+ */
+ ret = rte_eal_alarm_set(1, __handle_primary_request, bundle);
+ if (ret != 0) {
+ resp->result = ret;
+ ret = rte_mp_reply(&mp_resp, peer);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "failed to send reply to primary request\n");
+ return ret;
+ }
+ }
+ return 0;
+}
+
+int eal_dev_hotplug_request_to_primary(struct eal_dev_mp_req *req)
+{
+ struct rte_mp_msg mp_req;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = MP_TIMEOUT_S, .tv_nsec = 0};
+ struct eal_dev_mp_req *resp;
+ int ret;
+
+ memset(&mp_req, 0, sizeof(mp_req));
+ memcpy(mp_req.param, req, sizeof(*req));
+ mp_req.len_param = sizeof(*req);
+ strlcpy(mp_req.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_req.name));
+
+ ret = rte_mp_request_sync(&mp_req, &mp_reply, &ts);
+ if (ret || mp_reply.nb_received != 1) {
+ RTE_LOG(ERR, EAL, "cannot send request to primary");
+ if (!ret)
+ return -1;
+ return ret;
+ }
+
+ resp = (struct eal_dev_mp_req *)mp_reply.msgs[0].param;
+ req->result = resp->result;
+
+ return ret;
+}
+
+int eal_dev_hotplug_request_to_secondary(struct eal_dev_mp_req *req)
+{
+ struct rte_mp_msg mp_req;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = MP_TIMEOUT_S, .tv_nsec = 0};
+ int ret;
+ int i;
+
+ memset(&mp_req, 0, sizeof(mp_req));
+ memcpy(mp_req.param, req, sizeof(*req));
+ mp_req.len_param = sizeof(*req);
+ strlcpy(mp_req.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_req.name));
+
+ ret = rte_mp_request_sync(&mp_req, &mp_reply, &ts);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "rte_mp_request_sync failed\n");
+ return ret;
+ }
+
+ if (mp_reply.nb_sent != mp_reply.nb_received) {
+ RTE_LOG(ERR, EAL, "not all secondary reply\n");
+ return -1;
+ }
+
+ req->result = 0;
+ for (i = 0; i < mp_reply.nb_received; i++) {
+ struct eal_dev_mp_req *resp =
+ (struct eal_dev_mp_req *)mp_reply.msgs[i].param;
+ if (resp->result != 0) {
+ req->result = resp->result;
+ if (req->t == EAL_DEV_REQ_TYPE_ATTACH &&
+ req->result != -EEXIST)
+ break;
+ if (req->t == EAL_DEV_REQ_TYPE_DETACH &&
+ req->result != -ENOENT)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+int rte_mp_dev_hotplug_init(void)
+{
+ int ret;
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ ret = rte_mp_action_register(EAL_DEV_MP_ACTION_REQUEST,
+ handle_secondary_request);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ EAL_DEV_MP_ACTION_REQUEST);
+ return ret;
+ }
+ } else {
+ ret = rte_mp_action_register(EAL_DEV_MP_ACTION_REQUEST,
+ handle_primary_request);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ EAL_DEV_MP_ACTION_REQUEST);
+ return ret;
+ }
+ }
+
+ return 0;
+}
diff --git a/lib/librte_eal/common/hotplug_mp.h b/lib/librte_eal/common/hotplug_mp.h
new file mode 100644
index 00000000..597fde3d
--- /dev/null
+++ b/lib/librte_eal/common/hotplug_mp.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _HOTPLUG_MP_H_
+#define _HOTPLUG_MP_H_
+
+#include "rte_dev.h"
+#include "rte_bus.h"
+
+#define EAL_DEV_MP_ACTION_REQUEST "eal_dev_mp_request"
+#define EAL_DEV_MP_ACTION_RESPONSE "eal_dev_mp_response"
+
+#define EAL_DEV_MP_DEV_NAME_MAX_LEN RTE_DEV_NAME_MAX_LEN
+#define EAL_DEV_MP_BUS_NAME_MAX_LEN 32
+#define EAL_DEV_MP_DEV_ARGS_MAX_LEN 128
+
+enum eal_dev_req_type {
+ EAL_DEV_REQ_TYPE_ATTACH,
+ EAL_DEV_REQ_TYPE_DETACH,
+ EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK,
+ EAL_DEV_REQ_TYPE_DETACH_ROLLBACK,
+};
+
+struct eal_dev_mp_req {
+ enum eal_dev_req_type t;
+ char devargs[EAL_DEV_MP_DEV_ARGS_MAX_LEN];
+ int result;
+};
+
+/**
+ * This is a synchronous wrapper for a secondary process to send
+ * a request to the primary process; it is invoked when an attach
+ * or detach request is issued from a secondary process.
+ */
+int eal_dev_hotplug_request_to_primary(struct eal_dev_mp_req *req);
+
+/**
+ * This is a synchronous wrapper for the primary process to send
+ * a request to all secondary processes; it is invoked when an attach
+ * or detach request is issued from the primary process.
+ */
+int eal_dev_hotplug_request_to_secondary(struct eal_dev_mp_req *req);
+
+
+#endif /* _HOTPLUG_MP_H_ */
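Both wrappers exchange the same fixed-size request over IPC. Roughly, an attach initiated on a secondary process fills the structure like this (internal EAL path; the device string is hypothetical):

#include <string.h>
#include <rte_string_fns.h>
#include "hotplug_mp.h"

static int
request_attach_from_secondary(const char *devargs)
{
	struct eal_dev_mp_req req;

	memset(&req, 0, sizeof(req));
	req.t = EAL_DEV_REQ_TYPE_ATTACH;
	strlcpy(req.devargs, devargs, sizeof(req.devargs));

	if (eal_dev_hotplug_request_to_primary(&req) != 0)
		return -1;
	/* req.result carries the outcome decided by the primary */
	return req.result;
}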
diff --git a/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h b/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h
index c4f974fe..859b0974 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h
@@ -29,8 +29,8 @@ extern "C" {
#ifndef RTE_ARM_EAL_RDTSC_USE_PMU
/**
- * This call is easily portable to any ARM architecture, however,
- * it may be damn slow and inprecise for some tasks.
+ * This call is easily portable to any architecture, however,
+ * it may require a system call and be imprecise for some tasks.
*/
static inline uint64_t
__rte_rdtsc_syscall(void)
diff --git a/lib/librte_eal/common/include/arch/ppc_64/meson.build b/lib/librte_eal/common/include/arch/ppc_64/meson.build
new file mode 100644
index 00000000..00f96117
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/ppc_64/meson.build
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Luca Boccassi <bluca@debian.org>
+
+install_headers(
+ 'rte_atomic.h',
+ 'rte_byteorder.h',
+ 'rte_cpuflags.h',
+ 'rte_cycles.h',
+ 'rte_io.h',
+ 'rte_memcpy.h',
+ 'rte_pause.h',
+ 'rte_prefetch.h',
+ 'rte_rwlock.h',
+ 'rte_spinlock.h',
+ 'rte_vect.h',
+ subdir: get_option('include_subdir_arch'))
diff --git a/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h b/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h
index 8bd83576..16e47ce2 100644
--- a/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h
+++ b/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h
@@ -9,10 +9,17 @@
extern "C" {
#endif
+#include "rte_atomic.h"
+
#include "generic/rte_pause.h"
static inline void rte_pause(void)
{
+ /* Set hardware multi-threading low priority */
+ asm volatile("or 1,1,1");
+ /* Set hardware multi-threading medium priority */
+ asm volatile("or 2,2,2");
+ rte_compiler_barrier();
}
#ifdef __cplusplus
diff --git a/lib/librte_eal/common/include/generic/rte_cycles.h b/lib/librte_eal/common/include/generic/rte_cycles.h
index 0ff1af50..ac379e87 100644
--- a/lib/librte_eal/common/include/generic/rte_cycles.h
+++ b/lib/librte_eal/common/include/generic/rte_cycles.h
@@ -13,6 +13,7 @@
*/
#include <stdint.h>
+#include <rte_compat.h>
#include <rte_debug.h>
#include <rte_atomic.h>
@@ -158,6 +159,16 @@ rte_delay_ms(unsigned ms)
void rte_delay_us_block(unsigned int us);
/**
+ * Delay function that uses system sleep.
+ * Does not block the CPU core.
+ *
+ * @param us
+ * Number of microseconds to wait.
+ */
+void __rte_experimental
+rte_delay_us_sleep(unsigned int us);
+
+/**
* Replace rte_delay_us with user defined function.
*
* @param userfunc
diff --git a/lib/librte_eal/common/include/rte_bitmap.h b/lib/librte_eal/common/include/rte_bitmap.h
index d9facc64..7a36ce73 100644
--- a/lib/librte_eal/common/include/rte_bitmap.h
+++ b/lib/librte_eal/common/include/rte_bitmap.h
@@ -88,7 +88,7 @@ __rte_bitmap_index1_inc(struct rte_bitmap *bmp)
static inline uint64_t
__rte_bitmap_mask1_get(struct rte_bitmap *bmp)
{
- return (~1lu) << bmp->offset1;
+ return (~1llu) << bmp->offset1;
}
static inline void
@@ -317,7 +317,7 @@ rte_bitmap_get(struct rte_bitmap *bmp, uint32_t pos)
index2 = pos >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2;
offset2 = pos & RTE_BITMAP_SLAB_BIT_MASK;
slab2 = bmp->array2 + index2;
- return (*slab2) & (1lu << offset2);
+ return (*slab2) & (1llu << offset2);
}
/**
@@ -342,8 +342,8 @@ rte_bitmap_set(struct rte_bitmap *bmp, uint32_t pos)
slab2 = bmp->array2 + index2;
slab1 = bmp->array1 + index1;
- *slab2 |= 1lu << offset2;
- *slab1 |= 1lu << offset1;
+ *slab2 |= 1llu << offset2;
+ *slab1 |= 1llu << offset1;
}
/**
@@ -370,7 +370,7 @@ rte_bitmap_set_slab(struct rte_bitmap *bmp, uint32_t pos, uint64_t slab)
slab1 = bmp->array1 + index1;
*slab2 |= slab;
- *slab1 |= 1lu << offset1;
+ *slab1 |= 1llu << offset1;
}
static inline uint64_t
@@ -408,7 +408,7 @@ rte_bitmap_clear(struct rte_bitmap *bmp, uint32_t pos)
slab2 = bmp->array2 + index2;
/* Return if array2 slab is not all-zeros */
- *slab2 &= ~(1lu << offset2);
+ *slab2 &= ~(1llu << offset2);
if (*slab2){
return;
}
@@ -424,7 +424,7 @@ rte_bitmap_clear(struct rte_bitmap *bmp, uint32_t pos)
index1 = pos >> (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 + RTE_BITMAP_CL_BIT_SIZE_LOG2);
offset1 = (pos >> RTE_BITMAP_CL_BIT_SIZE_LOG2) & RTE_BITMAP_SLAB_BIT_MASK;
slab1 = bmp->array1 + index1;
- *slab1 &= ~(1lu << offset1);
+ *slab1 &= ~(1llu << offset1);
return;
}
diff --git a/lib/librte_eal/common/include/rte_bus.h b/lib/librte_eal/common/include/rte_bus.h
index b7b5b084..6be4b5ca 100644
--- a/lib/librte_eal/common/include/rte_bus.h
+++ b/lib/librte_eal/common/include/rte_bus.h
@@ -168,6 +168,35 @@ typedef int (*rte_bus_unplug_t)(struct rte_device *dev);
typedef int (*rte_bus_parse_t)(const char *name, void *addr);
/**
+ * Implement a specific hot-unplug handler, which is responsible for
+ * handling the failure when a device is hot-unplugged. When a hot-unplug
+ * event is detected, the bus can call this function to handle
+ * the hot-unplug failure and avoid an application crash.
+ * @param dev
+ * Pointer of the device structure.
+ *
+ * @return
+ * 0 on success.
+ * !0 on error.
+ */
+typedef int (*rte_bus_hot_unplug_handler_t)(struct rte_device *dev);
+
+/**
+ * Implement a specific sigbus handler, which is responsible for handling
+ * the sigbus error which is either an original memory error, or a specific
+ * memory error caused by a device being hot-unplugged. When a sigbus error
+ * is captured, the bus can call this function to handle it.
+ * @param failure_addr
+ * Pointer to the fault address of the sigbus error.
+ *
+ * @return
+ * 0 if the sigbus was handled successfully for hot-unplug.
+ * 1 if it was not processed, because it is a generic sigbus error.
+ * -1 if handling the sigbus for hot-unplug failed.
+ */
+typedef int (*rte_bus_sigbus_handler_t)(const void *failure_addr);
+
+/**
* Bus scan policies
*/
enum rte_bus_scan_mode {
@@ -212,6 +241,11 @@ struct rte_bus {
struct rte_bus_conf conf; /**< Bus configuration */
rte_bus_get_iommu_class_t get_iommu_class; /**< Get iommu class */
rte_dev_iterate_t dev_iterate; /**< Device iterator. */
+ rte_bus_hot_unplug_handler_t hot_unplug_handler;
+ /**< handle hot-unplug failure on the bus */
+ rte_bus_sigbus_handler_t sigbus_handler;
+ /**< handle sigbus error on the bus */
+
};
/**
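A bus driver opts into the new failure handling by filling the two new rte_bus members; a hypothetical sketch with invented names and the pre-existing callbacks elided:

#include <rte_common.h>
#include <rte_bus.h>

static int
my_bus_hot_unplug_handler(struct rte_device *dev __rte_unused)
{
	/* e.g. remap the device's resources to dummy memory */
	return 0;
}

static int
my_bus_sigbus_handler(const void *failure_addr __rte_unused)
{
	/* 0: handled, 1: not this bus's address, -1: failed to handle */
	return 1;
}

static struct rte_bus my_bus = {
	/* .scan, .probe, .find_device, ... elided */
	.hot_unplug_handler = my_bus_hot_unplug_handler,
	.sigbus_handler = my_bus_sigbus_handler,
};
/* registered with RTE_REGISTER_BUS(my_bus, my_bus) in a real driver */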
diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h
index 069c13ec..cba7bbc1 100644
--- a/lib/librte_eal/common/include/rte_common.h
+++ b/lib/librte_eal/common/include/rte_common.h
@@ -68,6 +68,11 @@ typedef uint16_t unaligned_uint16_t;
/******* Macro to mark functions and fields scheduled for removal *****/
#define __rte_deprecated __attribute__((__deprecated__))
+/**
+ * Mark a function or variable as a weak reference.
+ */
+#define __rte_weak __attribute__((__weak__))
+
/*********** Macros to eliminate unused variable warnings ********/
/**
@@ -164,6 +169,12 @@ static void __attribute__((destructor(RTE_PRIO(prio)), used)) func(void)
*/
#define RTE_PTR_DIFF(ptr1, ptr2) ((uintptr_t)(ptr1) - (uintptr_t)(ptr2))
+/**
+ * Workaround to cast a const field of a structure to non-const type.
+ */
+#define RTE_CAST_FIELD(var, field, type) \
+ (*(type *)((uintptr_t)(var) + offsetof(typeof(*(var)), field)))
+
/*********** Macros/static functions for doing alignment ********/
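The new __rte_weak macro centralizes the weak-linkage attribute. A hypothetical default definition that an arch- or OS-specific strong definition would silently override:

#include <stdint.h>
#include <rte_common.h>

/* hypothetical symbol; a strong definition elsewhere takes precedence */
uint64_t __rte_weak
my_platform_tsc_freq(void)
{
	return 0;
}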
diff --git a/lib/librte_eal/common/include/rte_dev.h b/lib/librte_eal/common/include/rte_dev.h
index b80a8059..cd6c187c 100644
--- a/lib/librte_eal/common/include/rte_dev.h
+++ b/lib/librte_eal/common/include/rte_dev.h
@@ -39,7 +39,7 @@ struct rte_dev_event {
char *devname; /**< device name */
};
-typedef void (*rte_dev_event_cb_fn)(char *device_name,
+typedef void (*rte_dev_event_cb_fn)(const char *device_name,
enum rte_dev_event_type event,
void *cb_arg);
@@ -156,63 +156,67 @@ struct rte_driver {
struct rte_device {
TAILQ_ENTRY(rte_device) next; /**< Next device */
const char *name; /**< Device name */
- const struct rte_driver *driver;/**< Associated driver */
+ const struct rte_driver *driver; /**< Driver assigned after probing */
+ const struct rte_bus *bus; /**< Bus handle assigned on scan */
int numa_node; /**< NUMA node connection */
- struct rte_devargs *devargs; /**< Device user arguments */
+ struct rte_devargs *devargs; /**< Arguments for latest probing */
};
/**
- * Attach a device to a registered driver.
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
*
- * @param name
- * The device name, that refers to a pci device (or some private
- * way of designating a vdev device). Based on this device name, eal
- * will identify a driver capable of handling it and pass it to the
- * driver probing function.
- * @param devargs
- * Device arguments to be passed to the driver.
- * @return
- * 0 on success, negative on error.
- */
-__rte_deprecated
-int rte_eal_dev_attach(const char *name, const char *devargs);
-
-/**
- * Detach a device from its driver.
+ * Query status of a device.
*
* @param dev
- * A pointer to a rte_device structure.
+ * Generic device pointer.
* @return
- * 0 on success, negative on error.
+ * (int)true if already probed successfully, 0 otherwise.
*/
-__rte_deprecated
-int rte_eal_dev_detach(struct rte_device *dev);
+__rte_experimental
+int rte_dev_is_probed(const struct rte_device *dev);
/**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
* Hotplug add a given device to a specific bus.
*
+ * In multi-process, it will request other processes to add the same device.
+ * A failure, in any process, will rollback the action
+ *
* @param busname
* The bus name the device is added to.
* @param devname
* The device name. Based on this device name, eal will identify a driver
* capable of handling it and pass it to the driver probing function.
- * @param devargs
+ * @param drvargs
* Device arguments to be passed to the driver.
* @return
* 0 on success, negative on error.
*/
-int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devname,
- const char *devargs);
+int rte_eal_hotplug_add(const char *busname, const char *devname,
+ const char *drvargs);
/**
* @warning
* @b EXPERIMENTAL: this API may change without prior notice
*
+ * Add matching devices.
+ *
+ * In multi-process, it will request other processes to add the same device.
+ * A failure, in any process, will rollback the action
+ *
+ * @param devargs
+ * Device arguments including bus, class and driver properties.
+ * @return
+ * 0 on success, negative on error.
+ */
+int __rte_experimental rte_dev_probe(const char *devargs);
+
+/**
* Hotplug remove a given device from a specific bus.
*
+ * In multi-process, it will request other processes to remove the same device.
+ * A failure, in any process, will rollback the action
+ *
* @param busname
* The bus name the device is removed from.
* @param devname
@@ -220,8 +224,23 @@ int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devn
* @return
* 0 on success, negative on error.
*/
-int __rte_experimental rte_eal_hotplug_remove(const char *busname,
- const char *devname);
+int rte_eal_hotplug_remove(const char *busname, const char *devname);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Remove one device.
+ *
+ * In multi-process, it will request other processes to remove the same device.
+ * A failure, in any process, will rollback the action
+ *
+ * @param dev
+ * Data structure of the device to remove.
+ * @return
+ * 0 on success, negative on error.
+ */
+int __rte_experimental rte_dev_remove(struct rte_device *dev);
/**
* Device comparison function.
@@ -438,6 +457,22 @@ rte_dev_event_callback_unregister(const char *device_name,
* @warning
* @b EXPERIMENTAL: this API may change without prior notice
*
+ * Executes all the user application registered callbacks for
+ * the specific device.
+ *
+ * @param device_name
+ * The device name.
+ * @param event
+ * the device event type.
+ */
+void __rte_experimental
+rte_dev_event_callback_process(const char *device_name,
+ enum rte_dev_event_type event);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
* Start the device event monitoring.
*
* @return
@@ -460,4 +495,30 @@ rte_dev_event_monitor_start(void);
int __rte_experimental
rte_dev_event_monitor_stop(void);
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Enable hotplug handling for devices.
+ *
+ * @return
+ * - On success, zero.
+ * - On failure, a negative value.
+ */
+int __rte_experimental
+rte_dev_hotplug_handle_enable(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Disable hotplug handling for devices.
+ *
+ * @return
+ * - On success, zero.
+ * - On failure, a negative value.
+ */
+int __rte_experimental
+rte_dev_hotplug_handle_disable(void);
+
#endif /* _RTE_DEV_H_ */
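Putting the reworked hotplug entry points together from an application's point of view (the PCI address is a placeholder; in multi-process setups the request is propagated to peer processes as documented above):

#include <rte_dev.h>

static int
hotplug_roundtrip(void)
{
	if (rte_eal_hotplug_add("pci", "0000:81:00.0", "") != 0)
		return -1;
	/* single-string equivalent: rte_dev_probe("0000:81:00.0") */

	/* ... use the device ... */

	return rte_eal_hotplug_remove("pci", "0000:81:00.0");
}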
diff --git a/lib/librte_eal/common/include/rte_devargs.h b/lib/librte_eal/common/include/rte_devargs.h
index 097a4ce7..b1f121f8 100644
--- a/lib/librte_eal/common/include/rte_devargs.h
+++ b/lib/librte_eal/common/include/rte_devargs.h
@@ -67,36 +67,6 @@ struct rte_devargs {
};
/**
- * @deprecated
- * Parse a devargs string.
- *
- * For PCI devices, the format of arguments string is "PCI_ADDR" or
- * "PCI_ADDR,key=val,key2=val2,...". Examples: "08:00.1", "0000:5:00.0",
- * "04:00.0,arg=val".
- *
- * For virtual devices, the format of arguments string is "DRIVER_NAME*"
- * or "DRIVER_NAME*,key=val,key2=val2,...". Examples: "net_ring",
- * "net_ring0", "net_pmdAnything,arg=0:arg2=1".
- *
- * The function parses the arguments string to get driver name and driver
- * arguments.
- *
- * @param devargs_str
- * The arguments as given by the user.
- * @param drvname
- * The pointer to the string to store parsed driver name.
- * @param drvargs
- * The pointer to the string to store parsed driver arguments.
- *
- * @return
- * - 0 on success
- * - A negative value on error
- */
-__rte_deprecated
-int rte_eal_parse_devargs_str(const char *devargs_str,
- char **drvname, char **drvargs);
-
-/**
* Parse a device string.
*
* Verify that a bus is capable of handling the device passed
@@ -202,32 +172,12 @@ __rte_experimental
int rte_devargs_add(enum rte_devtype devtype, const char *devargs_str);
/**
- * @deprecated
- * Add a device to the user device list
- * See rte_devargs_parse() for details.
- *
- * @param devtype
- * The type of the device.
- * @param devargs_str
- * The arguments as given by the user.
- *
- * @return
- * - 0 on success
- * - A negative value on error
- */
-__rte_deprecated
-int rte_eal_devargs_add(enum rte_devtype devtype, const char *devargs_str);
-
-/**
* Remove a device from the user device list.
* Its resources are freed.
* If the devargs cannot be found, nothing happens.
*
- * @param busname
- * bus name of the devargs to remove.
- *
- * @param devname
- * device name of the devargs to remove.
+ * @param devargs
+ * The instance or a copy of devargs to remove.
*
* @return
* 0 on success.
@@ -235,8 +185,7 @@ int rte_eal_devargs_add(enum rte_devtype devtype, const char *devargs_str);
* >0 if the devargs was not within the user device list.
*/
__rte_experimental
-int rte_devargs_remove(const char *busname,
- const char *devname);
+int rte_devargs_remove(struct rte_devargs *devargs);
/**
* Count the number of user devices of a specified type
@@ -252,20 +201,6 @@ unsigned int
rte_devargs_type_count(enum rte_devtype devtype);
/**
- * @deprecated
- * Count the number of user devices of a specified type
- *
- * @param devtype
- * The type of the devices to counted.
- *
- * @return
- * The number of devices.
- */
-__rte_deprecated
-unsigned int
-rte_eal_devargs_type_count(enum rte_devtype devtype);
-
-/**
* This function dumps the list of user device and their arguments.
*
* @param f
@@ -275,16 +210,6 @@ __rte_experimental
void rte_devargs_dump(FILE *f);
/**
- * @deprecated
- * This function dumps the list of user device and their arguments.
- *
- * @param f
- * A pointer to a file for output
- */
-__rte_deprecated
-void rte_eal_devargs_dump(FILE *f);
-
-/**
* Find next rte_devargs matching the provided bus name.
*
* @param busname
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index e114dcbd..a0cedd57 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -316,7 +316,7 @@ rte_mp_sendmsg(struct rte_mp_msg *msg);
*
* @param reply
* The reply argument will be for storing all the replied messages;
- * the caller is responsible for free reply->replies.
+ * the caller is responsible for free reply->msgs.
*
* @param ts
* The ts argument specifies how long we can wait for the peer(s) to reply.
@@ -378,6 +378,15 @@ int __rte_experimental
rte_mp_reply(struct rte_mp_msg *msg, const char *peer);
/**
+ * Register all mp action callbacks for hotplug.
+ *
+ * @return
+ * 0 on success, negative on error.
+ */
+int __rte_experimental
+rte_mp_dev_hotplug_init(void);
+
+/**
* Usage function typedef used by the application usage function.
*
* Use this function typedef to define and call rte_set_application_usage_hook()
@@ -498,6 +507,15 @@ enum rte_iova_mode rte_eal_iova_mode(void);
const char *
rte_eal_mbuf_user_pool_ops(void);
+/**
+ * Get the runtime directory of DPDK
+ *
+ * @return
+ * The runtime directory path of DPDK
+ */
+const char *
+rte_eal_get_runtime_dir(void);
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/librte_eal/common/include/rte_eal_interrupts.h b/lib/librte_eal/common/include/rte_eal_interrupts.h
index 6eb49327..9d302f41 100644
--- a/lib/librte_eal/common/include/rte_eal_interrupts.h
+++ b/lib/librte_eal/common/include/rte_eal_interrupts.h
@@ -35,6 +35,7 @@ enum rte_intr_handle_type {
RTE_INTR_HANDLE_EXT, /**< external handler */
RTE_INTR_HANDLE_VDEV, /**< virtual device */
RTE_INTR_HANDLE_DEV_EVENT, /**< device event handle */
+ RTE_INTR_HANDLE_VFIO_REQ, /**< VFIO request handle */
RTE_INTR_HANDLE_MAX /**< count of elements */
};
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index aff0688d..84aabe36 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -30,9 +30,11 @@ struct rte_memseg_list {
uint64_t addr_64;
/**< Makes sure addr is always 64-bits */
};
- int socket_id; /**< Socket ID for all memsegs in this list. */
uint64_t page_sz; /**< Page size for all memsegs in this list. */
+ int socket_id; /**< Socket ID for all memsegs in this list. */
volatile uint32_t version; /**< version number for multiprocess sync. */
+ size_t len; /**< Length of memory area covered by this memseg list. */
+ unsigned int external; /**< 1 if this list points to external memory */
struct rte_fbarray memseg_arr;
};
@@ -70,13 +72,23 @@ struct rte_mem_config {
struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */
- /* Heaps of Malloc per socket */
- struct malloc_heap malloc_heaps[RTE_MAX_NUMA_NODES];
+ /* Heaps of Malloc */
+ struct malloc_heap malloc_heaps[RTE_MAX_HEAPS];
+
+ /* next socket ID for external malloc heap */
+ int next_socket_id;
/* address of mem_config in primary process. used to map shared config into
* exact same address the primary process maps it.
*/
uint64_t mem_cfg_addr;
+
+ /* legacy mem and single file segments options are shared */
+ uint32_t legacy_mem;
+ uint32_t single_file_segments;
+
+ /* keeps the more restricted dma mask */
+ uint8_t dma_maskbits;
} __attribute__((__packed__));
diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h
index a9fb7e45..7249e6aa 100644
--- a/lib/librte_eal/common/include/rte_malloc.h
+++ b/lib/librte_eal/common/include/rte_malloc.h
@@ -264,6 +264,198 @@ rte_malloc_get_socket_stats(int socket,
struct rte_malloc_socket_stats *socket_stats);
/**
+ * Add memory chunk to a heap with specified name.
+ *
+ * @note Multiple memory chunks can be added to the same heap
+ *
+ * @note Before accessing this memory in other processes, it needs to be
+ * attached in each of those processes by calling
+ * ``rte_malloc_heap_memory_attach`` in each other process.
+ *
+ * @note Memory must be previously allocated for DPDK to be able to use it as a
+ * malloc heap. Failing to do so will result in undefined behavior, up to and
+ * including segmentation faults.
+ *
+ * @note Calling this function will erase any contents already present at the
+ * supplied memory address.
+ *
+ * @param heap_name
+ * Name of the heap to add memory chunk to
+ * @param va_addr
+ * Start of virtual area to add to the heap
+ * @param len
+ * Length of virtual area to add to the heap
+ * @param iova_addrs
+ * Array of page IOVA addresses corresponding to each page in this memory
+ * area. Can be NULL, in which case page IOVA addresses will be set to
+ * RTE_BAD_IOVA.
+ * @param n_pages
+ * Number of elements in the iova_addrs array. Ignored if ``iova_addrs``
+ * is NULL.
+ * @param page_sz
+ * Page size of the underlying memory
+ *
+ * @return
+ * - 0 on success
+ * - -1 in case of error, with rte_errno set to one of the following:
+ * EINVAL - one of the parameters was invalid
+ * EPERM - attempted to add memory to a reserved heap
+ * ENOSPC - no more space in internal config to store a new memory chunk
+ */
+int __rte_experimental
+rte_malloc_heap_memory_add(const char *heap_name, void *va_addr, size_t len,
+ rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz);
+
+/**
+ * Remove memory chunk from heap with specified name.
+ *
+ * @note Memory chunk being removed must be the same as one that was added;
+ * partially removing memory chunks is not supported
+ *
+ * @note Memory area must not contain any allocated elements to allow its
+ * removal from the heap
+ *
+ * @note All other processes must detach from the memory chunk prior to it being
+ * removed from the heap.
+ *
+ * @param heap_name
+ * Name of the heap to remove memory from
+ * @param va_addr
+ * Virtual address to remove from the heap
+ * @param len
+ * Length of virtual area to remove from the heap
+ *
+ * @return
+ * - 0 on success
+ * - -1 in case of error, with rte_errno set to one of the following:
+ * EINVAL - one of the parameters was invalid
+ * EPERM - attempted to remove memory from a reserved heap
+ * ENOENT - heap or memory chunk was not found
+ * EBUSY - memory chunk still contains data
+ */
+int __rte_experimental
+rte_malloc_heap_memory_remove(const char *heap_name, void *va_addr, size_t len);
+
+/**
+ * Attach to an already existing chunk of external memory in another process.
+ *
+ * @note This function must be called before any attempt is made to use an
+ * already existing external memory chunk. This function does *not* need to
+ * be called if a call to ``rte_malloc_heap_memory_add`` was made in the
+ * current process.
+ *
+ * @param heap_name
+ * Heap name to which this chunk of memory belongs
+ * @param va_addr
+ * Start address of memory chunk to attach to
+ * @param len
+ * Length of memory chunk to attach to
+ * @return
+ * 0 on successful attach
+ * -1 on unsuccessful attach, with rte_errno set to indicate cause for error:
+ * EINVAL - one of the parameters was invalid
+ * EPERM - attempted to attach memory to a reserved heap
+ * ENOENT - heap or memory chunk was not found
+ */
+int __rte_experimental
+rte_malloc_heap_memory_attach(const char *heap_name, void *va_addr, size_t len);
+
+/**
+ * Detach from a chunk of external memory in secondary process.
+ *
+ * @note This function must be called before any attempt is made to remove
+ * external memory from the heap in another process. This function does *not*
+ * need to be called if ``rte_malloc_heap_memory_remove`` will be called in
+ * the current process.
+ *
+ * @param heap_name
+ * Heap name to which this chunk of memory belongs
+ * @param va_addr
+ * Start address of memory chunk to detach from
+ * @param len
+ * Length of memory chunk to detach from
+ * @return
+ * 0 on successful detach
+ * -1 on unsuccessful detach, with rte_errno set to indicate cause for error:
+ * EINVAL - one of the parameters was invalid
+ * EPERM - attempted to detach memory from a reserved heap
+ * ENOENT - heap or memory chunk was not found
+ */
+int __rte_experimental
+rte_malloc_heap_memory_detach(const char *heap_name, void *va_addr, size_t len);
+
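For reference, a hedged sketch of the secondary-process side; it assumes the heap name, address and length were communicated out of band (for example over the multi-process channel):

#include <stddef.h>
#include <rte_malloc.h>
#include <rte_errno.h>

/* va_addr/len must describe exactly the chunk added by the other process */
static int
use_remote_chunk(const char *heap_name, void *va_addr, size_t len)
{
	if (rte_malloc_heap_memory_attach(heap_name, va_addr, len) != 0)
		return -rte_errno;

	/* ... allocate from the heap or read data placed in it ... */

	/* detach before the owning process removes the chunk */
	return rte_malloc_heap_memory_detach(heap_name, va_addr, len);
}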
+/**
+ * Creates a new empty malloc heap with a specified name.
+ *
+ * @note Heaps created via this call will automatically get assigned a unique
+ * socket ID, which can be found using ``rte_malloc_heap_get_socket()``
+ *
+ * @param heap_name
+ * Name of the heap to create.
+ *
+ * @return
+ * - 0 on successful creation
+ * - -1 in case of error, with rte_errno set to one of the following:
+ * EINVAL - ``heap_name`` was NULL, empty or too long
+ * EEXIST - heap by name of ``heap_name`` already exists
+ * ENOSPC - no more space in internal config to store a new heap
+ */
+int __rte_experimental
+rte_malloc_heap_create(const char *heap_name);
+
+/**
+ * Destroys a previously created malloc heap with specified name.
+ *
+ * @note This function will return a failure result if not all memory allocated
+ * from the heap has been freed back to the heap
+ *
+ * @note This function will return a failure result if not all memory segments
+ * were removed from the heap prior to its destruction
+ *
+ * @param heap_name
+ * Name of the heap to destroy.
+ *
+ * @return
+ * - 0 on success
+ * - -1 in case of error, with rte_errno set to one of the following:
+ * EINVAL - ``heap_name`` was NULL, empty or too long
+ * ENOENT - heap by the name of ``heap_name`` was not found
+ * EPERM - attempting to destroy reserved heap
+ * EBUSY - heap still contains data
+ */
+int __rte_experimental
+rte_malloc_heap_destroy(const char *heap_name);
+
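Putting the calls above together, one possible lifecycle for an external heap (heap name, chunk and sizes are illustrative; error handling is abbreviated):

#include <rte_malloc.h>

static int
ext_heap_lifecycle(void *va_addr, size_t len, size_t page_sz)
{
	int socket_id;
	void *obj;

	if (rte_malloc_heap_create("ext_heap") != 0)
		return -1;
	if (rte_malloc_heap_memory_add("ext_heap", va_addr, len,
			NULL, 0, page_sz) != 0)
		goto destroy;

	/* the heap's socket ID is how the rest of DPDK refers to it */
	socket_id = rte_malloc_heap_get_socket("ext_heap");
	if (socket_id >= 0) {
		obj = rte_malloc_socket("example", 4096, 0, socket_id);
		rte_free(obj);
	}

	/* teardown in reverse order: remove the chunk, then the heap */
	rte_malloc_heap_memory_remove("ext_heap", va_addr, len);
destroy:
	return rte_malloc_heap_destroy("ext_heap");
}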
+/**
+ * Find socket ID corresponding to a named heap.
+ *
+ * @param name
+ * Heap name to find socket ID for
+ * @return
+ * Socket ID in case of success (a non-negative number)
+ * -1 in case of error, with rte_errno set to one of the following:
+ * EINVAL - ``name`` was NULL
+ * ENOENT - heap identified by the name ``name`` was not found
+ */
+int __rte_experimental
+rte_malloc_heap_get_socket(const char *name);
+
+/**
+ * Check if a given socket ID refers to externally allocated memory.
+ *
+ * @note Passing SOCKET_ID_ANY will return 0.
+ *
+ * @param socket_id
+ * Socket ID to check
+ * @return
+ * 1 if socket ID refers to externally allocated memory
+ * 0 if socket ID refers to internal DPDK memory
+ * -1 if socket ID is invalid
+ */
+int __rte_experimental
+rte_malloc_heap_socket_is_external(int socket_id);
+
+/**
* Dump statistics.
*
* Dump for the specified type to a file. If the type argument is
diff --git a/lib/librte_eal/common/include/rte_malloc_heap.h b/lib/librte_eal/common/include/rte_malloc_heap.h
index d43fa909..4a7e0eb1 100644
--- a/lib/librte_eal/common/include/rte_malloc_heap.h
+++ b/lib/librte_eal/common/include/rte_malloc_heap.h
@@ -12,6 +12,7 @@
/* Number of free lists per heap, grouped by size. */
#define RTE_HEAP_NUM_FREELISTS 13
+#define RTE_HEAP_NAME_MAX_LEN 32
/* dummy definition, for pointers */
struct malloc_elem;
@@ -26,7 +27,9 @@ struct malloc_heap {
struct malloc_elem *volatile last;
unsigned alloc_count;
+ unsigned int socket_id;
size_t total_size;
+ char name[RTE_HEAP_NAME_MAX_LEN];
} __rte_cache_aligned;
#endif /* _RTE_MALLOC_HEAP_H_ */
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index c4b7f4cf..ce937058 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -215,6 +215,9 @@ typedef int (*rte_memseg_list_walk_t)(const struct rte_memseg_list *msl,
* @note This function read-locks the memory hotplug subsystem, and thus cannot
* be used within memory-related callback functions.
*
+ * @note This function will also walk through externally allocated segments. It
+ * is up to the user to decide whether to skip these segments.
+ *
* @param func
* Iterator function
* @param arg
@@ -233,6 +236,9 @@ rte_memseg_walk(rte_memseg_walk_t func, void *arg);
* @note This function read-locks the memory hotplug subsystem, and thus cannot
* be used within memory-related callback functions.
*
+ * @note This function will also walk through externally allocated segments. It
+ * is up to the user to decide whether to skip these segments.
+ *
* @param func
* Iterator function
* @param arg
@@ -251,6 +257,9 @@ rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg);
* @note This function read-locks the memory hotplug subsystem, and thus cannot
* be used within memory-related callback functions.
*
+ * @note This function will also walk through externally allocated segments. It
+ * is up to the user to decide whether to skip these segments.
+ *
* @param func
* Iterator function
* @param arg
@@ -318,6 +327,103 @@ int __rte_experimental
rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg);
/**
+ * Return file descriptor associated with a particular memseg (if available).
+ *
+ * @note This function read-locks the memory hotplug subsystem, and thus cannot
+ * be used within memory-related callback functions.
+ *
+ * @note This returns an internal file descriptor. Performing any operations on
+ * this file descriptor is inherently dangerous, so it should be treated
+ * as read-only for all intents and purposes.
+ *
+ * @param ms
+ * A pointer to memseg for which to get file descriptor.
+ *
+ * @return
+ * Valid file descriptor in case of success.
+ * -1 in case of error, with ``rte_errno`` set to the following values:
+ * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg
+ * - ENODEV - ``ms`` fd is not available
+ * - ENOENT - ``ms`` is an unused segment
+ * - ENOTSUP - segment fd's are not supported
+ */
+int __rte_experimental
+rte_memseg_get_fd(const struct rte_memseg *ms);
+
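As an illustrative (and assumed) usage outside of any memory callback, the descriptor backing a DPDK-allocated address could be looked up as follows:

#include <rte_memory.h>

static int
fd_for_addr(const void *addr)
{
	const struct rte_memseg_list *msl = rte_mem_virt2memseg_list(addr);
	const struct rte_memseg *ms;

	if (msl == NULL)
		return -1;
	ms = rte_mem_virt2memseg(addr, msl);
	if (ms == NULL)
		return -1;
	/* takes the memory hotplug read lock internally */
	return rte_memseg_get_fd(ms);
}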
+/**
+ * Return file descriptor associated with a particular memseg (if available).
+ *
+ * @note This function does not perform any locking, and is only safe to call
+ * from within memory-related callback functions.
+ *
+ * @note This returns an internal file descriptor. Performing any operations on
+ * this file descriptor is inherently dangerous, so it should be treated
+ * as read-only for all intents and purposes.
+ *
+ * @param ms
+ * A pointer to memseg for which to get file descriptor.
+ *
+ * @return
+ * Valid file descriptor in case of success.
+ * -1 in case of error, with ``rte_errno`` set to the following values:
+ * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg
+ * - ENODEV - ``ms`` fd is not available
+ * - ENOENT - ``ms`` is an unused segment
+ * - ENOTSUP - segment fd's are not supported
+ */
+int __rte_experimental
+rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms);
+
+/**
+ * Get offset into segment file descriptor associated with a particular memseg
+ * (if available).
+ *
+ * @note This function read-locks the memory hotplug subsystem, and thus cannot
+ * be used within memory-related callback functions.
+ *
+ * @param ms
+ * A pointer to memseg for which to get file descriptor.
+ * @param offset
+ * A pointer to offset value where the result will be stored.
+ *
+ * @return
+ * Zero in case of success, with the offset stored in ``*offset``.
+ * -1 in case of error, with ``rte_errno`` set to the following values:
+ * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg
+ * - EINVAL - ``offset`` pointer was NULL
+ * - ENODEV - ``ms`` fd is not available
+ * - ENOENT - ``ms`` is an unused segment
+ * - ENOTSUP - segment fd's are not supported
+ */
+int __rte_experimental
+rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset);
+
+/**
+ * Get offset into segment file descriptor associated with a particular memseg
+ * (if available).
+ *
+ * @note This function does not perform any locking, and is only safe to call
+ * from within memory-related callback functions.
+ *
+ * @param ms
+ * A pointer to memseg for which to get file descriptor.
+ * @param offset
+ * A pointer to offset value where the result will be stored.
+ *
+ * @return
+ * Zero in case of success, with the offset stored in ``*offset``.
+ * -1 in case of error, with ``rte_errno`` set to the following values:
+ * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg
+ * - EINVAL - ``offset`` pointer was NULL
+ * - ENODEV - ``ms`` fd is not available
+ * - ENOENT - ``ms`` is an unused segment
+ * - ENOTSUP - segment fd's are not supported
+ */
+int __rte_experimental
+rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
+ size_t *offset);
+
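Since ``rte_memseg_walk`` takes the hotplug read lock itself, a callback that wants descriptors has to use the ``_thread_unsafe`` variants; a sketch:

#include <stdio.h>
#include <rte_memory.h>

static int
print_seg_fd(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, void *arg __rte_unused)
{
	size_t offset;
	int fd = rte_memseg_get_fd_thread_unsafe(ms);

	if (fd < 0)
		return 0;	/* no fd for this segment, keep walking */
	if (rte_memseg_get_fd_offset_thread_unsafe(ms, &offset) < 0)
		return 0;
	printf("seg %p: fd %d offset %zu\n", ms->addr, fd, offset);
	return 0;
}

/* usage: rte_memseg_walk(print_seg_fd, NULL); */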
+/**
* Dump the physical memory layout to a file.
*
* @note This function read-locks the memory hotplug subsystem, and thus cannot
@@ -357,6 +463,9 @@ unsigned rte_memory_get_nchannel(void);
*/
unsigned rte_memory_get_nrank(void);
+/* check that memseg IOVAs are within the range addressable by the DMA mask */
+int __rte_experimental rte_eal_check_dma_mask(uint8_t maskbits);
+
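A hypothetical use by a driver whose device can only address 40 bits (the width is an example, not taken from this patch):

#include <rte_memory.h>

static int
check_40bit_dma(void)
{
	/* returns 0 when all memseg IOVAs fit within the 40-bit mask */
	return rte_eal_check_dma_mask(40);
}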
/**
* Drivers based on uio will not load unless physical
* addresses are obtainable. It is only possible to get
diff --git a/lib/librte_eal/common/include/rte_option.h b/lib/librte_eal/common/include/rte_option.h
new file mode 100644
index 00000000..8957b970
--- /dev/null
+++ b/lib/librte_eal/common/include/rte_option.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#ifndef __INCLUDE_RTE_OPTION_H__
+#define __INCLUDE_RTE_OPTION_H__
+
+/**
+ * @file
+ *
+ * This API offers the ability to register options with the EAL command line and
+ * to map those options to functions that will be executed at the end of EAL
+ * initialization. These options are made available as part of the EAL command
+ * line of applications and are dynamically managed.
+ *
+ * This is used primarily by DPDK libraries that offer command line options.
+ * Currently, this API is limited to registering options that take no argument.
+ *
+ * The register API can be used to resolve circular dependency issues
+ * between EAL and the library. The library uses EAL, but is also initialized
+ * by EAL. Hence, EAL depends on the init function of the library. The API
+ * introduced in rte_option allows us to register the library init with EAL
+ * (passing a function pointer) and avoid the circular dependency.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int (*rte_option_cb)(void);
+
+/*
+ * Structure describing the EAL command line option being registered.
+ */
+struct rte_option {
+ TAILQ_ENTRY(rte_option) next; /**< Next entry in the list. */
+ char *opt_str; /**< The option name. */
+ rte_option_cb cb; /**< Function called when option is used. */
+ int enabled; /**< Set when the option is used. */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Register an option to the EAL command line.
+ * When recognized, the associated function will be executed at the end of EAL
+ * initialization.
+ *
+ * The associated structure must remain valid for as long as the option is
+ * registered (i.e. it must not be stack memory).
+ *
+ * @param opt
+ * Structure describing the option to parse.
+ */
+void __rte_experimental
+rte_option_register(struct rte_option *opt);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
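A library using this facility might register its option from a constructor along these lines; the option name and callback are hypothetical:

#include <sys/queue.h>
#include <rte_common.h>
#include <rte_option.h>

static int
mylib_init(void)
{
	/* runs at the end of rte_eal_init() when the flag was passed */
	return 0;
}

static struct rte_option mylib_option = {
	.opt_str = "--enable-mylib",
	.cb = mylib_init,
};

RTE_INIT(mylib_option_reg)
{
	rte_option_register(&mylib_option);
}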
diff --git a/lib/librte_eal/common/include/rte_string_fns.h b/lib/librte_eal/common/include/rte_string_fns.h
index 97597a14..9a2a1ff9 100644
--- a/lib/librte_eal/common/include/rte_string_fns.h
+++ b/lib/librte_eal/common/include/rte_string_fns.h
@@ -16,6 +16,7 @@ extern "C" {
#endif
#include <stdio.h>
+#include <string.h>
/**
* Takes string "string" parameter and splits it at character "delim"
@@ -60,12 +61,10 @@ rte_strlcpy(char *dst, const char *src, size_t size)
/* pull in a strlcpy function */
#ifdef RTE_EXEC_ENV_BSDAPP
-#include <string.h>
#ifndef __BSD_VISIBLE /* non-standard functions are hidden */
#define strlcpy(dst, src, size) rte_strlcpy(dst, src, size)
#endif
-
#else /* non-BSD platforms */
#ifdef RTE_USE_LIBBSD
#include <bsd/string.h>
@@ -76,6 +75,29 @@ rte_strlcpy(char *dst, const char *src, size_t size)
#endif /* RTE_USE_LIBBSD */
#endif /* BSDAPP */
+/**
+ * Copy string src to buffer dst of size dsize.
+ * At most dsize-1 chars will be copied.
+ * Always NUL-terminates, unless (dsize == 0).
+ * Returns the number of bytes copied (terminating NUL-byte excluded) on
+ * success; a negative errno value on error.
+ *
+ * @param dst
+ * The destination string.
+ *
+ * @param src
+ * The input string to be copied.
+ *
+ * @param dsize
+ * Length in bytes of the destination buffer.
+ *
+ * @return
+ * The number of bytes copied on success
+ * -E2BIG if the destination buffer is too small.
+ */
+ssize_t
+rte_strscpy(char *dst, const char *src, size_t dsize);
+
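A brief usage sketch (destination buffer and input are hypothetical):

#include <sys/types.h>
#include <rte_string_fns.h>

static int
copy_name(char *dst, size_t dst_size, const char *user_name)
{
	ssize_t ret = rte_strscpy(dst, user_name, dst_size);

	if (ret < 0)
		return (int)ret; /* -E2BIG: input did not fit in dst_size bytes */
	return 0;
}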
#ifdef __cplusplus
}
#endif
diff --git a/lib/librte_eal/common/include/rte_version.h b/lib/librte_eal/common/include/rte_version.h
index 7c6714a2..412ed2db 100644
--- a/lib/librte_eal/common/include/rte_version.h
+++ b/lib/librte_eal/common/include/rte_version.h
@@ -32,7 +32,7 @@ extern "C" {
/**
* Minor version/month number i.e. the mm in yy.mm.z
*/
-#define RTE_VER_MONTH 8
+#define RTE_VER_MONTH 11
/**
* Patch level number i.e. the z in yy.mm.z
@@ -42,14 +42,14 @@ extern "C" {
/**
* Extra string to be appended to version number
*/
-#define RTE_VER_SUFFIX ""
+#define RTE_VER_SUFFIX "-rc"
/**
* Patch release number
* 0-15 = release candidates
* 16 = release
*/
-#define RTE_VER_RELEASE 16
+#define RTE_VER_RELEASE 1
/**
* Macro to compute a version number usable for comparisons
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index 5ca13fcc..cae96fab 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -14,6 +14,8 @@
extern "C" {
#endif
+#include <stdint.h>
+
/*
* determine if VFIO is present on the system
*/
@@ -22,6 +24,9 @@ extern "C" {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#define VFIO_PRESENT
#endif /* kernel version >= 3.6.0 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0)
+#define HAVE_VFIO_DEV_REQ_INTERFACE
+#endif /* kernel version >= 4.0.0 */
#endif /* RTE_EAL_VFIO */
#ifdef VFIO_PRESENT
@@ -44,6 +49,30 @@ extern "C" {
#define RTE_VFIO_NOIOMMU 8
#endif
+/*
+ * Capabilities are only supported on kernels 4.6+. There were also some API
+ * changes, so add a macro to get the cap offset.
+ */
+#ifdef VFIO_REGION_INFO_FLAG_CAPS
+#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS
+#define VFIO_CAP_OFFSET(x) (x->cap_offset)
+#else
+#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3)
+#define VFIO_CAP_OFFSET(x) (x->resv)
+struct vfio_info_cap_header {
+ uint16_t id;
+ uint16_t version;
+ uint32_t next;
+};
+#endif
+
+/* kernels 4.16+ can map BAR containing MSI-X table */
+#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#else
+#define RTE_VFIO_CAP_MSIX_MAPPABLE 3
+#endif
+
#else /* not VFIO_PRESENT */
/* we don't need an actual definition, only pointer is used */
@@ -227,7 +256,7 @@ rte_vfio_get_group_num(const char *sysfs_base,
const char *dev_addr, int *iommu_group_num);
/**
- * Open VFIO container fd or get an existing one
+ * Open a new VFIO container fd
*
* This function is only relevant to linux and will return
* an error on BSD.
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index e0a8ed15..1a74660d 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -39,10 +39,14 @@ malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align)
contig_seg_start = RTE_PTR_ALIGN_CEIL(data_start, align);
/* if we're in IOVA as VA mode, or if we're in legacy mode with
- * hugepages, all elements are IOVA-contiguous.
+ * hugepages, all elements are IOVA-contiguous. however, we can only
+ * make these assumptions about internal memory - externally allocated
+ * segments have to be checked.
*/
- if (rte_eal_iova_mode() == RTE_IOVA_VA ||
- (internal_config.legacy_mem && rte_eal_has_hugepages()))
+ if (!elem->msl->external &&
+ (rte_eal_iova_mode() == RTE_IOVA_VA ||
+ (internal_config.legacy_mem &&
+ rte_eal_has_hugepages())))
return RTE_PTR_DIFF(data_end, contig_seg_start);
cur_page = RTE_PTR_ALIGN_FLOOR(contig_seg_start, page_sz);
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 12aaf2d7..1973b6e6 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -29,6 +29,10 @@
#include "malloc_heap.h"
#include "malloc_mp.h"
+/* start external socket IDs at a very high number */
+#define CONST_MAX(a, b) ((a) > (b) ? (a) : (b)) /* RTE_MAX is not a constant */
+#define EXTERNAL_HEAP_MIN_SOCKET_ID (CONST_MAX((1 << 8), RTE_MAX_NUMA_NODES))
+
static unsigned
check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
{
@@ -66,6 +70,21 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
return check_flag & flags;
}
+int
+malloc_socket_to_heap_id(unsigned int socket_id)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int i;
+
+ for (i = 0; i < RTE_MAX_HEAPS; i++) {
+ struct malloc_heap *heap = &mcfg->malloc_heaps[i];
+
+ if (heap->socket_id == socket_id)
+ return i;
+ }
+ return -1;
+}
+
/*
* Expand the heap with a memory area.
*/
@@ -93,9 +112,17 @@ malloc_add_seg(const struct rte_memseg_list *msl,
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct rte_memseg_list *found_msl;
struct malloc_heap *heap;
- int msl_idx;
+ int msl_idx, heap_idx;
+
+ if (msl->external)
+ return 0;
- heap = &mcfg->malloc_heaps[msl->socket_id];
+ heap_idx = malloc_socket_to_heap_id(msl->socket_id);
+ if (heap_idx < 0) {
+ RTE_LOG(ERR, EAL, "Memseg list has invalid socket id\n");
+ return -1;
+ }
+ heap = &mcfg->malloc_heaps[heap_idx];
/* msl is const, so find it */
msl_idx = msl - mcfg->memsegs;
@@ -165,7 +192,9 @@ find_biggest_element(struct malloc_heap *heap, size_t *size,
for (elem = LIST_FIRST(&heap->free_head[idx]);
!!elem; elem = LIST_NEXT(elem, free_list)) {
size_t cur_size;
- if (!check_hugepage_sz(flags, elem->msl->page_sz))
+ if ((flags & RTE_MEMZONE_SIZE_HINT_ONLY) == 0 &&
+ !check_hugepage_sz(flags,
+ elem->msl->page_sz))
continue;
if (contig) {
cur_size =
@@ -259,11 +288,13 @@ alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
int socket, unsigned int flags, size_t align, size_t bound,
bool contig, struct rte_memseg **ms, int n_segs)
{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct rte_memseg_list *msl;
struct malloc_elem *elem = NULL;
size_t alloc_sz;
int allocd_pages;
void *ret, *map_addr;
alloc_sz = (size_t)pg_sz * n_segs;
@@ -291,6 +322,16 @@ alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
goto fail;
}
+ if (mcfg->dma_maskbits) {
+ /* rte_eal_check_dma_mask() expects the number of bits, not a mask */
+ if (rte_eal_check_dma_mask(mcfg->dma_maskbits)) {
+ RTE_LOG(ERR, EAL,
+ "%s(): couldn't allocate memory due to DMA mask\n",
+ __func__);
+ goto fail;
+ }
+ }
+
/* add newly minted memsegs to malloc heap */
elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz);
@@ -326,11 +367,9 @@ try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
/* we can't know in advance how many pages we'll need, so we malloc */
ms = malloc(sizeof(*ms) * n_segs);
-
- memset(ms, 0, sizeof(*ms) * n_segs);
-
if (ms == NULL)
return -1;
+ memset(ms, 0, sizeof(*ms) * n_segs);
elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align,
bound, contig, ms, n_segs);
@@ -560,12 +599,14 @@ alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket,
/* this will try lower page sizes first */
static void *
-heap_alloc_on_socket(const char *type, size_t size, int socket,
- unsigned int flags, size_t align, size_t bound, bool contig)
+malloc_heap_alloc_on_heap_id(const char *type, size_t size,
+ unsigned int heap_id, unsigned int flags, size_t align,
+ size_t bound, bool contig)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+ struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
+ int socket_id;
void *ret;
rte_spinlock_lock(&(heap->lock));
@@ -583,12 +624,28 @@ heap_alloc_on_socket(const char *type, size_t size, int socket,
* we may still be able to allocate memory from appropriate page sizes,
* we just need to request more memory first.
*/
+
+ socket_id = rte_socket_id_by_idx(heap_id);
+ /*
+ * if socket ID is negative, we cannot find a socket ID for this heap -
+ * which means it's an external heap. those can have unexpected page
+ * sizes, so if the user asked to allocate from there - assume user
+ * knows what they're doing, and allow allocating from there with any
+ * page size flags.
+ */
+ if (socket_id < 0)
+ size_flags |= RTE_MEMZONE_SIZE_HINT_ONLY;
+
ret = heap_alloc(heap, type, size, size_flags, align, bound, contig);
if (ret != NULL)
goto alloc_unlock;
- if (!alloc_more_mem_on_socket(heap, size, socket, flags, align, bound,
- contig)) {
+ /* if socket ID is invalid, this is an external heap */
+ if (socket_id < 0)
+ goto alloc_unlock;
+
+ if (!alloc_more_mem_on_socket(heap, size, socket_id, flags, align,
+ bound, contig)) {
ret = heap_alloc(heap, type, size, flags, align, bound, contig);
/* this should have succeeded */
@@ -604,14 +661,14 @@ void *
malloc_heap_alloc(const char *type, size_t size, int socket_arg,
unsigned int flags, size_t align, size_t bound, bool contig)
{
- int socket, i, cur_socket;
+ int socket, heap_id, i;
void *ret;
/* return NULL if size is 0 or alignment is not power-of-2 */
if (size == 0 || (align && !rte_is_power_of_2(align)))
return NULL;
- if (!rte_eal_has_hugepages())
+ if (!rte_eal_has_hugepages() && socket_arg < RTE_MAX_NUMA_NODES)
socket_arg = SOCKET_ID_ANY;
if (socket_arg == SOCKET_ID_ANY)
@@ -619,22 +676,25 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
else
socket = socket_arg;
- /* Check socket parameter */
- if (socket >= RTE_MAX_NUMA_NODES)
+ /* turn socket ID into heap ID */
+ heap_id = malloc_socket_to_heap_id(socket);
+ /* if heap id is negative, socket ID was invalid */
+ if (heap_id < 0)
return NULL;
- ret = heap_alloc_on_socket(type, size, socket, flags, align, bound,
- contig);
+ ret = malloc_heap_alloc_on_heap_id(type, size, heap_id, flags, align,
+ bound, contig);
if (ret != NULL || socket_arg != SOCKET_ID_ANY)
return ret;
- /* try other heaps */
+ /* try other heaps. we are only iterating through native DPDK sockets,
+ * so external heaps won't be included.
+ */
for (i = 0; i < (int) rte_socket_count(); i++) {
- cur_socket = rte_socket_id_by_idx(i);
- if (cur_socket == socket)
+ if (i == heap_id)
continue;
- ret = heap_alloc_on_socket(type, size, cur_socket, flags,
- align, bound, contig);
+ ret = malloc_heap_alloc_on_heap_id(type, size, i, flags, align,
+ bound, contig);
if (ret != NULL)
return ret;
}
@@ -642,11 +702,11 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
}
static void *
-heap_alloc_biggest_on_socket(const char *type, int socket, unsigned int flags,
- size_t align, bool contig)
+heap_alloc_biggest_on_heap_id(const char *type, unsigned int heap_id,
+ unsigned int flags, size_t align, bool contig)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+ struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
void *ret;
rte_spinlock_lock(&(heap->lock));
@@ -664,7 +724,7 @@ void *
malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags,
size_t align, bool contig)
{
- int socket, i, cur_socket;
+ int socket, i, cur_socket, heap_id;
void *ret;
/* return NULL if align is not power-of-2 */
@@ -679,11 +739,13 @@ malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags,
else
socket = socket_arg;
- /* Check socket parameter */
- if (socket >= RTE_MAX_NUMA_NODES)
+ /* turn socket ID into heap ID */
+ heap_id = malloc_socket_to_heap_id(socket);
+ /* if heap id is negative, socket ID was invalid */
+ if (heap_id < 0)
return NULL;
- ret = heap_alloc_biggest_on_socket(type, socket, flags, align,
+ ret = heap_alloc_biggest_on_heap_id(type, heap_id, flags, align,
contig);
if (ret != NULL || socket_arg != SOCKET_ID_ANY)
return ret;
@@ -693,8 +755,8 @@ malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags,
cur_socket = rte_socket_id_by_idx(i);
if (cur_socket == socket)
continue;
- ret = heap_alloc_biggest_on_socket(type, cur_socket, flags,
- align, contig);
+ ret = heap_alloc_biggest_on_heap_id(type, i, flags, align,
+ contig);
if (ret != NULL)
return ret;
}
@@ -756,8 +818,10 @@ malloc_heap_free(struct malloc_elem *elem)
/* anything after this is a bonus */
ret = 0;
- /* ...of which we can't avail if we are in legacy mode */
- if (internal_config.legacy_mem)
+ /* ...of which we can't avail if we are in legacy mode, or if this is an
+ * externally allocated segment.
+ */
+ if (internal_config.legacy_mem || (msl->external > 0))
goto free_unlock;
/* check if we can free any memory back to the system */
@@ -914,7 +978,7 @@ malloc_heap_resize(struct malloc_elem *elem, size_t size)
}
/*
- * Function to retrieve data for heap on given socket
+ * Function to retrieve data for a given heap
*/
int
malloc_heap_get_stats(struct malloc_heap *heap,
@@ -952,7 +1016,7 @@ malloc_heap_get_stats(struct malloc_heap *heap,
}
/*
- * Function to retrieve data for heap on given socket
+ * Function to retrieve data for a given heap
*/
void
malloc_heap_dump(struct malloc_heap *heap, FILE *f)
@@ -973,10 +1037,216 @@ malloc_heap_dump(struct malloc_heap *heap, FILE *f)
rte_spinlock_unlock(&heap->lock);
}
+static int
+destroy_seg(struct malloc_elem *elem, size_t len)
+{
+ struct malloc_heap *heap = elem->heap;
+ struct rte_memseg_list *msl;
+
+ msl = elem->msl;
+
+ /* notify all subscribers that a memory area is going to be removed */
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, elem, len);
+
+ /* this element can be removed */
+ malloc_elem_free_list_remove(elem);
+ malloc_elem_hide_region(elem, elem, len);
+
+ heap->total_size -= len;
+
+ memset(elem, 0, sizeof(*elem));
+
+ /* destroy the fbarray backing this memory */
+ if (rte_fbarray_destroy(&msl->memseg_arr) < 0)
+ return -1;
+
+ /* reset the memseg list */
+ memset(msl, 0, sizeof(*msl));
+
+ return 0;
+}
+
+int
+malloc_heap_add_external_memory(struct malloc_heap *heap, void *va_addr,
+ rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ char fbarray_name[RTE_FBARRAY_NAME_LEN];
+ struct rte_memseg_list *msl = NULL;
+ struct rte_fbarray *arr;
+ size_t seg_len = n_pages * page_sz;
+ unsigned int i;
+
+ /* first, find a free memseg list */
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *tmp = &mcfg->memsegs[i];
+ if (tmp->base_va == NULL) {
+ msl = tmp;
+ break;
+ }
+ }
+ if (msl == NULL) {
+ RTE_LOG(ERR, EAL, "Couldn't find empty memseg list\n");
+ rte_errno = ENOSPC;
+ return -1;
+ }
+
+ snprintf(fbarray_name, sizeof(fbarray_name) - 1, "%s_%p",
+ heap->name, va_addr);
+
+ /* create the backing fbarray */
+ if (rte_fbarray_init(&msl->memseg_arr, fbarray_name, n_pages,
+ sizeof(struct rte_memseg)) < 0) {
+ RTE_LOG(ERR, EAL, "Couldn't create fbarray backing the memseg list\n");
+ return -1;
+ }
+ arr = &msl->memseg_arr;
+
+ /* fbarray created, fill it up */
+ for (i = 0; i < n_pages; i++) {
+ struct rte_memseg *ms;
+
+ rte_fbarray_set_used(arr, i);
+ ms = rte_fbarray_get(arr, i);
+ ms->addr = RTE_PTR_ADD(va_addr, i * page_sz);
+ ms->iova = iova_addrs == NULL ? RTE_BAD_IOVA : iova_addrs[i];
+ ms->hugepage_sz = page_sz;
+ ms->len = page_sz;
+ ms->nchannel = rte_memory_get_nchannel();
+ ms->nrank = rte_memory_get_nrank();
+ ms->socket_id = heap->socket_id;
+ }
+
+ /* set up the memseg list */
+ msl->base_va = va_addr;
+ msl->page_sz = page_sz;
+ msl->socket_id = heap->socket_id;
+ msl->len = seg_len;
+ msl->version = 0;
+ msl->external = 1;
+
+ /* erase contents of new memory */
+ memset(va_addr, 0, seg_len);
+
+ /* now, add newly minted memory to the malloc heap */
+ malloc_heap_add_memory(heap, msl, va_addr, seg_len);
+
+ heap->total_size += seg_len;
+
+ /* all done! */
+ RTE_LOG(DEBUG, EAL, "Added segment for heap %s starting at %p\n",
+ heap->name, va_addr);
+
+ /* notify all subscribers that a new memory area has been added */
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
+ va_addr, seg_len);
+
+ return 0;
+}
+
+int
+malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr,
+ size_t len)
+{
+ struct malloc_elem *elem = heap->first;
+
+ /* find element with specified va address */
+ while (elem != NULL && elem != va_addr) {
+ elem = elem->next;
+ /* stop if we've blown past our VA */
+ if (elem > (struct malloc_elem *)va_addr) {
+ rte_errno = ENOENT;
+ return -1;
+ }
+ }
+ /* check if element was found */
+ if (elem == NULL || elem->msl->len != len) {
+ rte_errno = ENOENT;
+ return -1;
+ }
+ /* if element's size is not equal to segment len, segment is busy */
+ if (elem->state == ELEM_BUSY || elem->size != len) {
+ rte_errno = EBUSY;
+ return -1;
+ }
+ return destroy_seg(elem, len);
+}
+
+int
+malloc_heap_create(struct malloc_heap *heap, const char *heap_name)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ uint32_t next_socket_id = mcfg->next_socket_id;
+
+ /* prevent overflow. did you really create 2 billion heaps??? */
+ if (next_socket_id > INT32_MAX) {
+ RTE_LOG(ERR, EAL, "Cannot assign new socket ID's\n");
+ rte_errno = ENOSPC;
+ return -1;
+ }
+
+ /* initialize empty heap */
+ heap->alloc_count = 0;
+ heap->first = NULL;
+ heap->last = NULL;
+ LIST_INIT(heap->free_head);
+ rte_spinlock_init(&heap->lock);
+ heap->total_size = 0;
+ heap->socket_id = next_socket_id;
+
+ /* we hold a global mem hotplug writelock, so it's safe to increment */
+ mcfg->next_socket_id++;
+
+ /* set up name */
+ strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN);
+ return 0;
+}
+
+int
+malloc_heap_destroy(struct malloc_heap *heap)
+{
+ if (heap->alloc_count != 0) {
+ RTE_LOG(ERR, EAL, "Heap is still in use\n");
+ rte_errno = EBUSY;
+ return -1;
+ }
+ if (heap->first != NULL || heap->last != NULL) {
+ RTE_LOG(ERR, EAL, "Heap still contains memory segments\n");
+ rte_errno = EBUSY;
+ return -1;
+ }
+ if (heap->total_size != 0)
+ RTE_LOG(ERR, EAL, "Total size not zero, heap is likely corrupt\n");
+
+ /* after this, the lock will be dropped */
+ memset(heap, 0, sizeof(*heap));
+
+ return 0;
+}
+
int
rte_eal_malloc_heap_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int i;
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* assign min socket ID to external heaps */
+ mcfg->next_socket_id = EXTERNAL_HEAP_MIN_SOCKET_ID;
+
+ /* assign names to default DPDK heaps */
+ for (i = 0; i < rte_socket_count(); i++) {
+ struct malloc_heap *heap = &mcfg->malloc_heaps[i];
+ char heap_name[RTE_HEAP_NAME_MAX_LEN];
+ int socket_id = rte_socket_id_by_idx(i);
+
+ snprintf(heap_name, sizeof(heap_name) - 1,
+ "socket_%i", socket_id);
+ strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN);
+ heap->socket_id = socket_id;
+ }
+ }
+
if (register_mp_requests()) {
RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n");
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index f52cb555..e48996d5 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -34,6 +34,20 @@ malloc_heap_alloc_biggest(const char *type, int socket, unsigned int flags,
size_t align, bool contig);
int
+malloc_heap_create(struct malloc_heap *heap, const char *heap_name);
+
+int
+malloc_heap_destroy(struct malloc_heap *heap);
+
+int
+malloc_heap_add_external_memory(struct malloc_heap *heap, void *va_addr,
+ rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz);
+
+int
+malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr,
+ size_t len);
+
+int
malloc_heap_free(struct malloc_elem *elem);
int
@@ -47,6 +61,9 @@ void
malloc_heap_dump(struct malloc_heap *heap, FILE *f);
int
+malloc_socket_to_heap_id(unsigned int socket_id);
+
+int
rte_eal_malloc_heap_init(void);
#ifdef __cplusplus
diff --git a/lib/librte_eal/common/malloc_mp.c b/lib/librte_eal/common/malloc_mp.c
index 931c14bc..5f2d4e0b 100644
--- a/lib/librte_eal/common/malloc_mp.c
+++ b/lib/librte_eal/common/malloc_mp.c
@@ -194,13 +194,11 @@ handle_alloc_request(const struct malloc_mp_req *m,
/* we can't know in advance how many pages we'll need, so we malloc */
ms = malloc(sizeof(*ms) * n_segs);
-
- memset(ms, 0, sizeof(*ms) * n_segs);
-
if (ms == NULL) {
RTE_LOG(ERR, EAL, "Couldn't allocate memory for request state\n");
goto fail;
}
+ memset(ms, 0, sizeof(*ms) * n_segs);
elem = alloc_pages_on_heap(heap, ar->page_sz, ar->elt_size, ar->socket,
ar->flags, ar->align, ar->bound, ar->contig, ms,
diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build
index 56005bea..2a10d57d 100644
--- a/lib/librte_eal/common/meson.build
+++ b/lib/librte_eal/common/meson.build
@@ -14,6 +14,7 @@ common_sources = files(
'eal_common_errno.c',
'eal_common_fbarray.c',
'eal_common_hexdump.c',
+ 'eal_common_hypervisor.c',
'eal_common_launch.c',
'eal_common_lcore.c',
'eal_common_log.c',
@@ -27,11 +28,13 @@ common_sources = files(
'eal_common_thread.c',
'eal_common_timer.c',
'eal_common_uuid.c',
+ 'hotplug_mp.c',
'malloc_elem.c',
'malloc_heap.c',
'malloc_mp.c',
'rte_keepalive.c',
'rte_malloc.c',
+ 'rte_option.c',
'rte_reciprocal.c',
'rte_service.c'
)
@@ -59,6 +62,7 @@ common_headers = files(
'include/rte_errno.h',
'include/rte_fbarray.h',
'include/rte_hexdump.h',
+ 'include/rte_hypervisor.h',
'include/rte_interrupts.h',
'include/rte_keepalive.h',
'include/rte_launch.h',
@@ -68,6 +72,7 @@ common_headers = files(
'include/rte_malloc_heap.h',
'include/rte_memory.h',
'include/rte_memzone.h',
+ 'include/rte_option.h',
'include/rte_pci_dev_feature_defs.h',
'include/rte_pci_dev_features.h',
'include/rte_per_lcore.h',
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index b51a6d11..9e61dc41 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -8,6 +8,7 @@
#include <string.h>
#include <sys/queue.h>
+#include <rte_errno.h>
#include <rte_memcpy.h>
#include <rte_memory.h>
#include <rte_eal.h>
@@ -23,6 +24,7 @@
#include <rte_malloc.h>
#include "malloc_elem.h"
#include "malloc_heap.h"
+#include "eal_memalloc.h"
/* Free the memory space back to heap */
@@ -44,13 +46,15 @@ rte_malloc_socket(const char *type, size_t size, unsigned int align,
if (size == 0 || (align && !rte_is_power_of_2(align)))
return NULL;
- if (!rte_eal_has_hugepages())
+ /* if there are no hugepages and we are not allocating from an external
+ * heap, use memory from any available socket. The external check may
+ * return -1 for an invalid socket, but that's OK - if there are no
+ * hugepages, it doesn't matter.
+ */
+ if (rte_malloc_heap_socket_is_external(socket_arg) != 1 &&
+ !rte_eal_has_hugepages())
socket_arg = SOCKET_ID_ANY;
- /* Check socket parameter */
- if (socket_arg >= RTE_MAX_NUMA_NODES)
- return NULL;
-
return malloc_heap_alloc(type, size, socket_arg, 0,
align == 0 ? 1 : align, 0, false);
}
@@ -152,11 +156,20 @@ rte_malloc_get_socket_stats(int socket,
struct rte_malloc_socket_stats *socket_stats)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int heap_idx, ret = -1;
- if (socket >= RTE_MAX_NUMA_NODES || socket < 0)
- return -1;
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
+ heap_idx = malloc_socket_to_heap_id(socket);
+ if (heap_idx < 0)
+ goto unlock;
- return malloc_heap_get_stats(&mcfg->malloc_heaps[socket], socket_stats);
+ ret = malloc_heap_get_stats(&mcfg->malloc_heaps[heap_idx],
+ socket_stats);
+unlock:
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
}
/*
@@ -168,12 +181,75 @@ rte_malloc_dump_heaps(FILE *f)
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
unsigned int idx;
- for (idx = 0; idx < rte_socket_count(); idx++) {
- unsigned int socket = rte_socket_id_by_idx(idx);
- fprintf(f, "Heap on socket %i:\n", socket);
- malloc_heap_dump(&mcfg->malloc_heaps[socket], f);
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
+ for (idx = 0; idx < RTE_MAX_HEAPS; idx++) {
+ fprintf(f, "Heap id: %u\n", idx);
+ malloc_heap_dump(&mcfg->malloc_heaps[idx], f);
+ }
+
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+}
+
+int
+rte_malloc_heap_get_socket(const char *name)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = NULL;
+ unsigned int idx;
+ int ret;
+
+ if (name == NULL ||
+ strnlen(name, RTE_HEAP_NAME_MAX_LEN) == 0 ||
+ strnlen(name, RTE_HEAP_NAME_MAX_LEN) ==
+ RTE_HEAP_NAME_MAX_LEN) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+ for (idx = 0; idx < RTE_MAX_HEAPS; idx++) {
+ struct malloc_heap *tmp = &mcfg->malloc_heaps[idx];
+
+ if (!strncmp(name, tmp->name, RTE_HEAP_NAME_MAX_LEN)) {
+ heap = tmp;
+ break;
+ }
+ }
+
+ if (heap != NULL) {
+ ret = heap->socket_id;
+ } else {
+ rte_errno = ENOENT;
+ ret = -1;
+ }
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
+}
+
+int
+rte_malloc_heap_socket_is_external(int socket_id)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int idx;
+ int ret = -1;
+
+ if (socket_id == SOCKET_ID_ANY)
+ return 0;
+
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+ for (idx = 0; idx < RTE_MAX_HEAPS; idx++) {
+ struct malloc_heap *tmp = &mcfg->malloc_heaps[idx];
+
+ if ((int)tmp->socket_id == socket_id) {
+ /* external memory always has large socket ID's */
+ ret = tmp->socket_id >= RTE_MAX_NUMA_NODES;
+ break;
+ }
}
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ return ret;
}
/*
@@ -182,14 +258,20 @@ rte_malloc_dump_heaps(FILE *f)
void
rte_malloc_dump_stats(FILE *f, __rte_unused const char *type)
{
- unsigned int socket;
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int heap_id;
struct rte_malloc_socket_stats sock_stats;
+
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
/* Iterate through all initialised heaps */
- for (socket=0; socket< RTE_MAX_NUMA_NODES; socket++) {
- if ((rte_malloc_get_socket_stats(socket, &sock_stats) < 0))
- continue;
+ for (heap_id = 0; heap_id < RTE_MAX_HEAPS; heap_id++) {
+ struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
+
+ malloc_heap_get_stats(heap, &sock_stats);
- fprintf(f, "Socket:%u\n", socket);
+ fprintf(f, "Heap id:%u\n", heap_id);
+ fprintf(f, "\tHeap name:%s\n", heap->name);
fprintf(f, "\tHeap_size:%zu,\n", sock_stats.heap_totalsz_bytes);
fprintf(f, "\tFree_size:%zu,\n", sock_stats.heap_freesz_bytes);
fprintf(f, "\tAlloc_size:%zu,\n", sock_stats.heap_allocsz_bytes);
@@ -198,6 +280,7 @@ rte_malloc_dump_stats(FILE *f, __rte_unused const char *type)
fprintf(f, "\tAlloc_count:%u,\n",sock_stats.alloc_count);
fprintf(f, "\tFree_count:%u,\n", sock_stats.free_count);
}
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
return;
}
@@ -223,7 +306,7 @@ rte_malloc_virt2iova(const void *addr)
if (elem == NULL)
return RTE_BAD_IOVA;
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ if (!elem->msl->external && rte_eal_iova_mode() == RTE_IOVA_VA)
return (uintptr_t) addr;
ms = rte_mem_virt2memseg(addr, elem->msl);
@@ -235,3 +318,320 @@ rte_malloc_virt2iova(const void *addr)
return ms->iova + RTE_PTR_DIFF(addr, ms->addr);
}
+
+static struct malloc_heap *
+find_named_heap(const char *name)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int i;
+
+ for (i = 0; i < RTE_MAX_HEAPS; i++) {
+ struct malloc_heap *heap = &mcfg->malloc_heaps[i];
+
+ if (!strncmp(name, heap->name, RTE_HEAP_NAME_MAX_LEN))
+ return heap;
+ }
+ return NULL;
+}
+
+int
+rte_malloc_heap_memory_add(const char *heap_name, void *va_addr, size_t len,
+ rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = NULL;
+ unsigned int n;
+ int ret;
+
+ if (heap_name == NULL || va_addr == NULL ||
+ page_sz == 0 || !rte_is_power_of_2(page_sz) ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) ==
+ RTE_HEAP_NAME_MAX_LEN) {
+ rte_errno = EINVAL;
+ /* the hotplug write lock has not been taken yet, return directly */
+ return -1;
+ }
+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+
+ /* find our heap */
+ heap = find_named_heap(heap_name);
+ if (heap == NULL) {
+ rte_errno = ENOENT;
+ ret = -1;
+ goto unlock;
+ }
+ if (heap->socket_id < RTE_MAX_NUMA_NODES) {
+ /* cannot add memory to internal heaps */
+ rte_errno = EPERM;
+ ret = -1;
+ goto unlock;
+ }
+ n = len / page_sz;
+ if (n != n_pages && iova_addrs != NULL) {
+ rte_errno = EINVAL;
+ ret = -1;
+ goto unlock;
+ }
+
+ rte_spinlock_lock(&heap->lock);
+ ret = malloc_heap_add_external_memory(heap, va_addr, iova_addrs, n,
+ page_sz);
+ rte_spinlock_unlock(&heap->lock);
+
+unlock:
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
+}
+
+int
+rte_malloc_heap_memory_remove(const char *heap_name, void *va_addr, size_t len)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = NULL;
+ int ret;
+
+ if (heap_name == NULL || va_addr == NULL || len == 0 ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) ==
+ RTE_HEAP_NAME_MAX_LEN) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+ /* find our heap */
+ heap = find_named_heap(heap_name);
+ if (heap == NULL) {
+ rte_errno = ENOENT;
+ ret = -1;
+ goto unlock;
+ }
+ if (heap->socket_id < RTE_MAX_NUMA_NODES) {
+ /* cannot remove memory from internal heaps */
+ rte_errno = EPERM;
+ ret = -1;
+ goto unlock;
+ }
+
+ rte_spinlock_lock(&heap->lock);
+ ret = malloc_heap_remove_external_memory(heap, va_addr, len);
+ rte_spinlock_unlock(&heap->lock);
+
+unlock:
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
+}
+
+struct sync_mem_walk_arg {
+ void *va_addr;
+ size_t len;
+ int result;
+ bool attach;
+};
+
+static int
+sync_mem_walk(const struct rte_memseg_list *msl, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct sync_mem_walk_arg *wa = arg;
+ size_t len = msl->page_sz * msl->memseg_arr.len;
+
+ if (msl->base_va == wa->va_addr &&
+ len == wa->len) {
+ struct rte_memseg_list *found_msl;
+ int msl_idx, ret;
+
+ /* msl is const */
+ msl_idx = msl - mcfg->memsegs;
+ found_msl = &mcfg->memsegs[msl_idx];
+
+ if (wa->attach) {
+ ret = rte_fbarray_attach(&found_msl->memseg_arr);
+ } else {
+ /* notify all subscribers that a memory area is about to
+ * be removed
+ */
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
+ msl->base_va, msl->len);
+ ret = rte_fbarray_detach(&found_msl->memseg_arr);
+ }
+
+ if (ret < 0) {
+ wa->result = -rte_errno;
+ } else {
+ /* notify all subscribers that a new memory area was
+ * added
+ */
+ if (wa->attach)
+ eal_memalloc_mem_event_notify(
+ RTE_MEM_EVENT_ALLOC,
+ msl->base_va, msl->len);
+ wa->result = 0;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int
+sync_memory(const char *heap_name, void *va_addr, size_t len, bool attach)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = NULL;
+ struct sync_mem_walk_arg wa;
+ int ret;
+
+ if (heap_name == NULL || va_addr == NULL || len == 0 ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) ==
+ RTE_HEAP_NAME_MAX_LEN) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
+ /* find our heap */
+ heap = find_named_heap(heap_name);
+ if (heap == NULL) {
+ rte_errno = ENOENT;
+ ret = -1;
+ goto unlock;
+ }
+ /* we shouldn't be able to sync to internal heaps */
+ if (heap->socket_id < RTE_MAX_NUMA_NODES) {
+ rte_errno = EPERM;
+ ret = -1;
+ goto unlock;
+ }
+
+ /* find corresponding memseg list to sync to */
+ wa.va_addr = va_addr;
+ wa.len = len;
+ wa.result = -ENOENT; /* fail unless explicitly told to succeed */
+ wa.attach = attach;
+
+ /* we're already holding a read lock */
+ rte_memseg_list_walk_thread_unsafe(sync_mem_walk, &wa);
+
+ if (wa.result < 0) {
+ rte_errno = -wa.result;
+ ret = -1;
+ } else {
+ /* notify all subscribers that a new memory area was added */
+ if (attach)
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
+ va_addr, len);
+ ret = 0;
+ }
+unlock:
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ return ret;
+}
+
+int
+rte_malloc_heap_memory_attach(const char *heap_name, void *va_addr, size_t len)
+{
+ return sync_memory(heap_name, va_addr, len, true);
+}
+
+int
+rte_malloc_heap_memory_detach(const char *heap_name, void *va_addr, size_t len)
+{
+ return sync_memory(heap_name, va_addr, len, false);
+}
+
+int
+rte_malloc_heap_create(const char *heap_name)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = NULL;
+ int i, ret;
+
+ if (heap_name == NULL ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) ==
+ RTE_HEAP_NAME_MAX_LEN) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ /* check if there is space in the heap list, or if heap with this name
+ * already exists.
+ */
+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+
+ for (i = 0; i < RTE_MAX_HEAPS; i++) {
+ struct malloc_heap *tmp = &mcfg->malloc_heaps[i];
+ /* existing heap */
+ if (strncmp(heap_name, tmp->name,
+ RTE_HEAP_NAME_MAX_LEN) == 0) {
+ RTE_LOG(ERR, EAL, "Heap %s already exists\n",
+ heap_name);
+ rte_errno = EEXIST;
+ ret = -1;
+ goto unlock;
+ }
+ /* empty heap */
+ if (strnlen(tmp->name, RTE_HEAP_NAME_MAX_LEN) == 0) {
+ heap = tmp;
+ break;
+ }
+ }
+ if (heap == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot create new heap: no space\n");
+ rte_errno = ENOSPC;
+ ret = -1;
+ goto unlock;
+ }
+
+ /* we're sure that we can create a new heap, so do it */
+ ret = malloc_heap_create(heap, heap_name);
+unlock:
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
+}
+
+int
+rte_malloc_heap_destroy(const char *heap_name)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = NULL;
+ int ret;
+
+ if (heap_name == NULL ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) ==
+ RTE_HEAP_NAME_MAX_LEN) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+
+ /* find our heap */
+ heap = find_named_heap(heap_name);
+ if (heap == NULL) {
+ RTE_LOG(ERR, EAL, "Heap %s not found\n", heap_name);
+ rte_errno = ENOENT;
+ ret = -1;
+ goto unlock;
+ }
+ /* we shouldn't be able to destroy internal heaps */
+ if (heap->socket_id < RTE_MAX_NUMA_NODES) {
+ rte_errno = EPERM;
+ ret = -1;
+ goto unlock;
+ }
+ /* sanity checks done, now we can destroy the heap */
+ rte_spinlock_lock(&heap->lock);
+ ret = malloc_heap_destroy(heap);
+
+ /* if we failed, lock is still active */
+ if (ret < 0)
+ rte_spinlock_unlock(&heap->lock);
+unlock:
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
+}
diff --git a/lib/librte_eal/common/rte_option.c b/lib/librte_eal/common/rte_option.c
new file mode 100644
index 00000000..02d59a86
--- /dev/null
+++ b/lib/librte_eal/common/rte_option.c
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <unistd.h>
+#include <string.h>
+
+#include <rte_eal.h>
+#include <rte_option.h>
+
+#include "eal_private.h"
+
+TAILQ_HEAD(rte_option_list, rte_option);
+
+struct rte_option_list rte_option_list =
+ TAILQ_HEAD_INITIALIZER(rte_option_list);
+
+static struct rte_option *option;
+
+int
+rte_option_parse(const char *opt)
+{
+ /* Check if the option is registered */
+ TAILQ_FOREACH(option, &rte_option_list, next) {
+ if (strcmp(opt, option->opt_str) == 0) {
+ option->enabled = 1;
+ return 0;
+ }
+ }
+
+ return -1;
+}
+
+void __rte_experimental
+rte_option_register(struct rte_option *opt)
+{
+ TAILQ_FOREACH(option, &rte_option_list, next) {
+ if (strcmp(opt->opt_str, option->opt_str) == 0) {
+ RTE_LOG(INFO, EAL, "Option %s has already been registered.\n",
+ opt->opt_str);
+ return;
+ }
+ }
+
+ TAILQ_INSERT_HEAD(&rte_option_list, opt, next);
+}
+
+void
+rte_option_init(void)
+{
+ TAILQ_FOREACH(option, &rte_option_list, next) {
+ if (option->enabled)
+ option->cb();
+ }
+}
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index fd92c75c..51deb579 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -10,7 +10,7 @@ ARCH_DIR ?= $(RTE_ARCH)
EXPORT_MAP := ../../rte_eal_version.map
VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR)
-LIBABIVER := 8
+LIBABIVER := 9
VPATH += $(RTE_SDK)/lib/librte_eal/common
@@ -70,10 +70,12 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_uuid.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += hotplug_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_option.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c
@@ -85,22 +87,6 @@ SRCS-y += rte_cycles.c
CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
-CFLAGS_eal.o := -D_GNU_SOURCE
-CFLAGS_eal_interrupts.o := -D_GNU_SOURCE
-CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE
-CFLAGS_eal_timer.o := -D_GNU_SOURCE
-CFLAGS_eal_lcore.o := -D_GNU_SOURCE
-CFLAGS_eal_memalloc.o := -D_GNU_SOURCE
-CFLAGS_eal_thread.o := -D_GNU_SOURCE
-CFLAGS_eal_log.o := -D_GNU_SOURCE
-CFLAGS_eal_common_log.o := -D_GNU_SOURCE
-CFLAGS_eal_hugepage_info.o := -D_GNU_SOURCE
-CFLAGS_eal_common_whitelist.o := -D_GNU_SOURCE
-CFLAGS_eal_common_options.o := -D_GNU_SOURCE
-CFLAGS_eal_common_thread.o := -D_GNU_SOURCE
-CFLAGS_eal_common_lcore.o := -D_GNU_SOURCE
-CFLAGS_rte_cycles.o := -D_GNU_SOURCE
-
# workaround for a gcc bug with noreturn attribute
# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index e59ac657..361744d4 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -48,6 +48,7 @@
#include <rte_atomic.h>
#include <malloc_heap.h>
#include <rte_vfio.h>
+#include <rte_option.h>
#include "eal_private.h"
#include "eal_thread.h"
@@ -149,7 +150,7 @@ eal_create_runtime_dir(void)
}
const char *
-eal_get_runtime_dir(void)
+rte_eal_get_runtime_dir(void)
{
return runtime_dir;
}
@@ -263,6 +264,8 @@ rte_eal_config_create(void)
* processes could later map the config into this exact location */
rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
+ rte_config.mem_config->dma_maskbits = 0;
+
}
/* attach to an existing shared memory config */
@@ -352,6 +355,24 @@ eal_proc_type_detect(void)
return ptype;
}
+/* copies data from internal config to shared config */
+static void
+eal_update_mem_config(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ mcfg->legacy_mem = internal_config.legacy_mem;
+ mcfg->single_file_segments = internal_config.single_file_segments;
+}
+
+/* copies data from shared config to internal config */
+static void
+eal_update_internal_config(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ internal_config.legacy_mem = mcfg->legacy_mem;
+ internal_config.single_file_segments = mcfg->single_file_segments;
+}
+
/* Sets up rte_config structure with the pointer to shared memory config.*/
static void
rte_config_init(void)
@@ -361,11 +382,13 @@ rte_config_init(void)
switch (rte_config.process_type){
case RTE_PROC_PRIMARY:
rte_eal_config_create();
+ eal_update_mem_config();
break;
case RTE_PROC_SECONDARY:
rte_eal_config_attach();
rte_eal_mcfg_wait_complete(rte_config.mem_config);
rte_eal_config_reattach();
+ eal_update_internal_config();
break;
case RTE_PROC_AUTO:
case RTE_PROC_INVALID:
@@ -580,12 +603,20 @@ eal_parse_args(int argc, char **argv)
argvopt = argv;
optind = 1;
+ opterr = 0;
while ((opt = getopt_long(argc, argvopt, eal_short_options,
eal_long_options, &option_index)) != EOF) {
- /* getopt is not happy, stop right now */
+ /*
+ * getopt didn't recognise the option, let's parse the
+ * registered options to see if the flag is valid
+ */
if (opt == '?') {
+ ret = rte_option_parse(argv[optind-1]);
+ if (ret == 0)
+ continue;
+
eal_usage(prgname);
ret = -1;
goto out;
@@ -725,6 +756,9 @@ check_socket(const struct rte_memseg_list *msl, void *arg)
{
int *socket_id = arg;
+ if (msl->external)
+ return 0;
+
return *socket_id == msl->socket_id;
}
@@ -793,7 +827,8 @@ rte_eal_init(int argc, char **argv)
int i, fctret, ret;
pthread_t thread_id;
static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0);
- const char *logid;
+ const char *p;
+ static char logid[PATH_MAX];
char cpuset[RTE_CPU_AFFINITY_STR_LEN];
char thread_name[RTE_MAX_THREAD_NAME_LEN];
@@ -810,9 +845,8 @@ rte_eal_init(int argc, char **argv)
return -1;
}
- logid = strrchr(argv[0], '/');
- logid = strdup(logid ? logid + 1: argv[0]);
-
+ p = strrchr(argv[0], '/');
+ strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid));
thread_id = pthread_self();
eal_reset_internal_config(&internal_config);
@@ -835,7 +869,7 @@ rte_eal_init(int argc, char **argv)
}
if (eal_plugins_init() < 0) {
- rte_eal_init_alert("Cannot init plugins\n");
+ rte_eal_init_alert("Cannot init plugins");
rte_errno = EINVAL;
rte_atomic32_clear(&run_once);
return -1;
@@ -850,7 +884,7 @@ rte_eal_init(int argc, char **argv)
rte_config_init();
if (rte_eal_intr_init() < 0) {
- rte_eal_init_alert("Cannot init interrupt-handling thread\n");
+ rte_eal_init_alert("Cannot init interrupt-handling thread");
return -1;
}
@@ -858,30 +892,43 @@ rte_eal_init(int argc, char **argv)
* bus through mp channel in the secondary process before the bus scan.
*/
if (rte_mp_channel_init() < 0) {
- rte_eal_init_alert("failed to init mp channel\n");
+ rte_eal_init_alert("failed to init mp channel");
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
rte_errno = EFAULT;
return -1;
}
}
+ /* register multi-process action callbacks for hotplug */
+ if (rte_mp_dev_hotplug_init() < 0) {
+ rte_eal_init_alert("failed to register mp callback for hotplug");
+ return -1;
+ }
+
if (rte_bus_scan()) {
- rte_eal_init_alert("Cannot scan the buses for devices\n");
+ rte_eal_init_alert("Cannot scan the buses for devices");
rte_errno = ENODEV;
rte_atomic32_clear(&run_once);
return -1;
}
- /* autodetect the iova mapping mode (default is iova_pa) */
- rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class();
-
- /* Workaround for KNI which requires physical address to work */
- if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
- rte_eal_check_module("rte_kni") == 1) {
- rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
- RTE_LOG(WARNING, EAL,
- "Some devices want IOVA as VA but PA will be used because.. "
- "KNI module inserted\n");
+ /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
+ if (internal_config.iova_mode == RTE_IOVA_DC) {
+ /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
+ rte_eal_get_configuration()->iova_mode =
+ rte_bus_get_iommu_class();
+
+ /* Workaround for KNI which requires physical address to work */
+ if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
+ rte_eal_check_module("rte_kni") == 1) {
+ rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
+ RTE_LOG(WARNING, EAL,
+ "Some devices want IOVA as VA but PA will be used because.. "
+ "KNI module inserted\n");
+ }
+ } else {
+ rte_eal_get_configuration()->iova_mode =
+ internal_config.iova_mode;
}
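/*
 * Illustrative note: with this change, launching an application with the EAL
 * flag "--iova-mode=va" sets internal_config.iova_mode to RTE_IOVA_VA, so the
 * else branch above applies it directly and the bus/IOMMU detection (including
 * the KNI workaround) is skipped; "--iova-mode=pa" behaves the same way for
 * RTE_IOVA_PA.
 */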
if (internal_config.no_hugetlbfs == 0) {
@@ -924,7 +971,7 @@ rte_eal_init(int argc, char **argv)
#ifdef VFIO_PRESENT
if (rte_eal_vfio_setup() < 0) {
- rte_eal_init_alert("Cannot init VFIO\n");
+ rte_eal_init_alert("Cannot init VFIO");
rte_errno = EAGAIN;
rte_atomic32_clear(&run_once);
return -1;
@@ -935,13 +982,13 @@ rte_eal_init(int argc, char **argv)
* initialize memzones first.
*/
if (rte_eal_memzone_init() < 0) {
- rte_eal_init_alert("Cannot init memzone\n");
+ rte_eal_init_alert("Cannot init memzone");
rte_errno = ENODEV;
return -1;
}
if (rte_eal_memory_init() < 0) {
- rte_eal_init_alert("Cannot init memory\n");
+ rte_eal_init_alert("Cannot init memory");
rte_errno = ENOMEM;
return -1;
}
@@ -950,25 +997,25 @@ rte_eal_init(int argc, char **argv)
eal_hugedirs_unlock();
if (rte_eal_malloc_heap_init() < 0) {
- rte_eal_init_alert("Cannot init malloc heap\n");
+ rte_eal_init_alert("Cannot init malloc heap");
rte_errno = ENODEV;
return -1;
}
if (rte_eal_tailqs_init() < 0) {
- rte_eal_init_alert("Cannot init tail queues for objects\n");
+ rte_eal_init_alert("Cannot init tail queues for objects");
rte_errno = EFAULT;
return -1;
}
if (rte_eal_alarm_init() < 0) {
- rte_eal_init_alert("Cannot init interrupt-handling thread\n");
+ rte_eal_init_alert("Cannot init interrupt-handling thread");
/* rte_eal_alarm_init sets rte_errno on failure. */
return -1;
}
if (rte_eal_timer_init() < 0) {
- rte_eal_init_alert("Cannot init HPET or TSC timers\n");
+ rte_eal_init_alert("Cannot init HPET or TSC timers");
rte_errno = ENOTSUP;
return -1;
}
@@ -979,8 +1026,8 @@ rte_eal_init(int argc, char **argv)
ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
- RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
- rte_config.master_lcore, (int)thread_id, cpuset,
+ RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
+ rte_config.master_lcore, (uintptr_t)thread_id, cpuset,
ret == 0 ? "" : "...");
RTE_LCORE_FOREACH_SLAVE(i) {
@@ -1022,14 +1069,14 @@ rte_eal_init(int argc, char **argv)
/* initialize services so vdevs register service during bus_probe. */
ret = rte_service_init();
if (ret) {
- rte_eal_init_alert("rte_service_init() failed\n");
+ rte_eal_init_alert("rte_service_init() failed");
rte_errno = ENOEXEC;
return -1;
}
/* Probe all the buses and devices/drivers on them */
if (rte_bus_probe()) {
- rte_eal_init_alert("Cannot probe devices\n");
+ rte_eal_init_alert("Cannot probe devices");
rte_errno = ENOTSUP;
return -1;
}
@@ -1051,6 +1098,9 @@ rte_eal_init(int argc, char **argv)
rte_eal_mcfg_complete();
+ /* Call each registered callback, if enabled */
+ rte_option_init();
+
return fctret;
}
@@ -1059,7 +1109,12 @@ mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
void *arg __rte_unused)
{
/* ms is const, so find this memseg */
- struct rte_memseg *found = rte_mem_virt2memseg(ms->addr, msl);
+ struct rte_memseg *found;
+
+ if (msl->external)
+ return 0;
+
+ found = rte_mem_virt2memseg(ms->addr, msl);
found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE;
diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c b/lib/librte_eal/linuxapp/eal/eal_dev.c
index 1cf6aebf..d589c692 100644
--- a/lib/librte_eal/linuxapp/eal/eal_dev.c
+++ b/lib/librte_eal/linuxapp/eal/eal_dev.c
@@ -4,6 +4,8 @@
#include <string.h>
#include <unistd.h>
+#include <fcntl.h>
+#include <signal.h>
#include <sys/socket.h>
#include <linux/netlink.h>
@@ -14,15 +16,32 @@
#include <rte_malloc.h>
#include <rte_interrupts.h>
#include <rte_alarm.h>
+#include <rte_bus.h>
+#include <rte_eal.h>
+#include <rte_spinlock.h>
+#include <rte_errno.h>
#include "eal_private.h"
static struct rte_intr_handle intr_handle = {.fd = -1 };
static bool monitor_started;
+static bool hotplug_handle;
#define EAL_UEV_MSG_LEN 4096
#define EAL_UEV_MSG_ELEM_LEN 128
+/*
+ * Spinlock for device hot-unplug failure handling. Any path that accesses
+ * the bus or a device during failure handling (e.g. handling SIGBUS for a
+ * bus, or handling a memory failure for a device) must take this lock to
+ * protect the bus and the device from race conditions.
+ */
+static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER;
+
+static struct sigaction sigbus_action_old;
+
+static int sigbus_need_recover;
+
static void dev_uev_handler(__rte_unused void *param);
/* identify the system layer which reports this event. */
@@ -33,6 +52,55 @@ enum eal_dev_event_subsystem {
EAL_DEV_EVENT_SUBSYSTEM_MAX
};
+static void
+sigbus_action_recover(void)
+{
+ if (sigbus_need_recover) {
+ sigaction(SIGBUS, &sigbus_action_old, NULL);
+ sigbus_need_recover = 0;
+ }
+}
+
+static void sigbus_handler(int signum, siginfo_t *info,
+ void *ctx __rte_unused)
+{
+ int ret;
+
+ RTE_LOG(DEBUG, EAL, "Thread[%d] catch SIGBUS, fault address:%p\n",
+ (int)pthread_self(), info->si_addr);
+
+ rte_spinlock_lock(&failure_handle_lock);
+ ret = rte_bus_sigbus_handler(info->si_addr);
+ rte_spinlock_unlock(&failure_handle_lock);
+ if (ret == -1) {
+ rte_exit(EXIT_FAILURE,
+ "Failed to handle SIGBUS for hot-unplug, "
+ "(rte_errno: %s)!", strerror(rte_errno));
+ } else if (ret == 1) {
+ if (sigbus_action_old.sa_flags == SA_SIGINFO
+ && sigbus_action_old.sa_sigaction) {
+ (*(sigbus_action_old.sa_sigaction))(signum,
+ info, ctx);
+ } else if (sigbus_action_old.sa_flags != SA_SIGINFO
+ && sigbus_action_old.sa_handler) {
+ (*(sigbus_action_old.sa_handler))(signum);
+ } else {
+ rte_exit(EXIT_FAILURE,
+ "Failed to handle generic SIGBUS!");
+ }
+ }
+
+ RTE_LOG(DEBUG, EAL, "Success to handle SIGBUS for hot-unplug!\n");
+}
+
+static int cmp_dev_name(const struct rte_device *dev,
+ const void *_name)
+{
+ const char *name = _name;
+
+ return strcmp(dev->name, name);
+}
+
static int
dev_uev_socket_fd_create(void)
{
@@ -147,6 +215,9 @@ dev_uev_handler(__rte_unused void *param)
struct rte_dev_event uevent;
int ret;
char buf[EAL_UEV_MSG_LEN];
+ struct rte_bus *bus;
+ struct rte_device *dev;
+ const char *busname = "";
memset(&uevent, 0, sizeof(struct rte_dev_event));
memset(buf, 0, EAL_UEV_MSG_LEN);
@@ -171,8 +242,43 @@ dev_uev_handler(__rte_unused void *param)
RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n",
uevent.devname, uevent.type, uevent.subsystem);
- if (uevent.devname)
- dev_callback_process(uevent.devname, uevent.type);
+ switch (uevent.subsystem) {
+ case EAL_DEV_EVENT_SUBSYSTEM_PCI:
+ case EAL_DEV_EVENT_SUBSYSTEM_UIO:
+ busname = "pci";
+ break;
+ default:
+ break;
+ }
+
+ if (uevent.devname) {
+ if (uevent.type == RTE_DEV_EVENT_REMOVE && hotplug_handle) {
+ rte_spinlock_lock(&failure_handle_lock);
+ bus = rte_bus_find_by_name(busname);
+ if (bus == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n",
+ busname);
+ return;
+ }
+
+ dev = bus->find_device(NULL, cmp_dev_name,
+ uevent.devname);
+ if (dev == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find device (%s) on "
+ "bus (%s)\n", uevent.devname, busname);
+ return;
+ }
+
+ ret = bus->hot_unplug_handler(dev);
+ rte_spinlock_unlock(&failure_handle_lock);
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Can not handle hot-unplug "
+ "for device (%s)\n", dev->name);
+ return;
+ }
+ }
+ rte_dev_event_callback_process(uevent.devname, uevent.type);
+ }
}
int __rte_experimental
@@ -220,5 +326,67 @@ rte_dev_event_monitor_stop(void)
close(intr_handle.fd);
intr_handle.fd = -1;
monitor_started = false;
+
return 0;
}
+
+int
+dev_sigbus_handler_register(void)
+{
+ sigset_t mask;
+ struct sigaction action;
+
+ rte_errno = 0;
+
+ if (sigbus_need_recover)
+ return 0;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGBUS);
+ action.sa_flags = SA_SIGINFO;
+ action.sa_mask = mask;
+ action.sa_sigaction = sigbus_handler;
+ sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old);
+
+ return rte_errno;
+}
+
+int
+dev_sigbus_handler_unregister(void)
+{
+ rte_errno = 0;
+
+ sigbus_action_recover();
+
+ return rte_errno;
+}
+
+int __rte_experimental
+rte_dev_hotplug_handle_enable(void)
+{
+ int ret = 0;
+
+ ret = dev_sigbus_handler_register();
+ if (ret < 0)
+ RTE_LOG(ERR, EAL,
+ "fail to register sigbus handler for devices.\n");
+
+ hotplug_handle = true;
+
+ return ret;
+}
+
+int __rte_experimental
+rte_dev_hotplug_handle_disable(void)
+{
+ int ret = 0;
+
+ ret = dev_sigbus_handler_unregister();
+ if (ret < 0)
+ RTE_LOG(ERR, EAL,
+ "fail to unregister sigbus handler for devices.\n");
+
+ hotplug_handle = false;
+
+ return ret;
+}
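As a usage illustration (a minimal sketch, not taken from the patch): an application would typically enable the new hot-unplug handling before starting the uevent monitor and registering a device event callback. The callback name and error handling below are assumptions; the EAL calls are the experimental device-event APIs this file implements or references.

#include <rte_common.h>
#include <rte_dev.h>
#include <rte_log.h>

/* hypothetical application callback run by the monitor thread */
static void
app_dev_event_cb(const char *device_name, enum rte_dev_event_type type,
		 void *cb_arg __rte_unused)
{
	if (type == RTE_DEV_EVENT_REMOVE)
		RTE_LOG(INFO, EAL, "device %s was removed\n", device_name);
}

static int
app_enable_hotplug(void)
{
	/* install the SIGBUS-based hot-unplug failure handler added above */
	if (rte_dev_hotplug_handle_enable() < 0)
		return -1;
	/* start the kernel uevent monitor */
	if (rte_dev_event_monitor_start() < 0)
		return -1;
	/* NULL device name registers the callback for all devices */
	return rte_dev_event_callback_register(NULL, app_dev_event_cb, NULL);
}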
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 3a7d4b22..0eab1cf7 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -6,6 +6,7 @@
#include <sys/types.h>
#include <sys/file.h>
#include <dirent.h>
+#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 4076c6d6..39252a88 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -33,6 +33,7 @@
#include <rte_errno.h>
#include <rte_spinlock.h>
#include <rte_pause.h>
+#include <rte_vfio.h>
#include "eal_private.h"
#include "eal_vfio.h"
@@ -308,6 +309,66 @@ vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
return ret;
}
+
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+/* enable req notifier */
+static int
+vfio_enable_req(const struct rte_intr_handle *intr_handle)
+{
+ int len, ret;
+ char irq_set_buf[IRQ_SET_BUF_LEN];
+ struct vfio_irq_set *irq_set;
+ int *fd_ptr;
+
+ len = sizeof(irq_set_buf);
+
+ irq_set = (struct vfio_irq_set *) irq_set_buf;
+ irq_set->argsz = len;
+ irq_set->count = 1;
+ irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+ VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
+ irq_set->start = 0;
+ fd_ptr = (int *) &irq_set->data;
+ *fd_ptr = intr_handle->fd;
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* disable req notifier */
+static int
+vfio_disable_req(const struct rte_intr_handle *intr_handle)
+{
+ struct vfio_irq_set *irq_set;
+ char irq_set_buf[IRQ_SET_BUF_LEN];
+ int len, ret;
+
+ len = sizeof(struct vfio_irq_set);
+
+ irq_set = (struct vfio_irq_set *) irq_set_buf;
+ irq_set->argsz = len;
+ irq_set->count = 0;
+ irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
+ irq_set->start = 0;
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret)
+ RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
+ intr_handle->fd);
+
+ return ret;
+}
+#endif
#endif
static int
@@ -556,6 +617,12 @@ rte_intr_enable(const struct rte_intr_handle *intr_handle)
if (vfio_enable_intx(intr_handle))
return -1;
break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ case RTE_INTR_HANDLE_VFIO_REQ:
+ if (vfio_enable_req(intr_handle))
+ return -1;
+ break;
+#endif
#endif
/* not used at this moment */
case RTE_INTR_HANDLE_DEV_EVENT:
@@ -606,6 +673,12 @@ rte_intr_disable(const struct rte_intr_handle *intr_handle)
if (vfio_disable_intx(intr_handle))
return -1;
break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ case RTE_INTR_HANDLE_VFIO_REQ:
+ if (vfio_disable_req(intr_handle))
+ return -1;
+ break;
+#endif
#endif
/* not used at this moment */
case RTE_INTR_HANDLE_DEV_EVENT:
@@ -672,6 +745,12 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds)
case RTE_INTR_HANDLE_VFIO_LEGACY:
bytes_read = sizeof(buf.vfio_intr_count);
break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ case RTE_INTR_HANDLE_VFIO_REQ:
+ bytes_read = 0;
+ call = true;
+ break;
+#endif
#endif
case RTE_INTR_HANDLE_VDEV:
case RTE_INTR_HANDLE_EXT:
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index aa95551a..48b9c736 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -34,6 +34,7 @@
#include <rte_log.h>
#include <rte_eal_memconfig.h>
#include <rte_eal.h>
+#include <rte_errno.h>
#include <rte_memory.h>
#include <rte_spinlock.h>
@@ -52,30 +53,55 @@ const int anonymous_hugepages_supported =
#endif
/*
+ * we don't actually care if memfd itself is supported - we only need to check
+ * if memfd supports hugetlbfs, as that already implies memfd support.
+ *
+ * also, this is not a constant, because while we may be *compiled* with memfd
+ * hugetlbfs support, we might not be *running* on a system that supports memfd
+ * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
+ * runtime, and fall back to anonymous memory.
+ */
+static int memfd_create_supported =
+#ifdef MFD_HUGETLB
+#define MEMFD_SUPPORTED
+ 1;
+#else
+ 0;
+#endif
+
+/*
* not all kernel version support fallocate on hugetlbfs, so fall back to
* ftruncate and disallow deallocation if fallocate is not supported.
*/
static int fallocate_supported = -1; /* unknown */
-/* for single-file segments, we need some kind of mechanism to keep track of
+/*
+ * we have two modes - single file segments, and file-per-page mode.
+ *
+ * for single-file segments, we need some kind of mechanism to keep track of
* which hugepages can be freed back to the system, and which cannot. we cannot
* use flock() because they don't allow locking parts of a file, and we cannot
* use fcntl() due to issues with their semantics, so we will have to rely on a
- * bunch of lockfiles for each page.
+ * bunch of lockfiles for each page. so, we will use 'fds' array to keep track
+ * of per-page lockfiles. we will store the actual segment list fd in the
+ * 'memseg_list_fd' field.
+ *
+ * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
+ * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
*
* we cannot know how many pages a system will have in advance, but we do know
* that they come in lists, and we know lengths of these lists. so, simply store
* a malloc'd array of fd's indexed by list and segment index.
*
* they will be initialized at startup, and filled as we allocate/deallocate
- * segments. also, use this to track memseg list proper fd.
+ * segments.
*/
static struct {
int *fds; /**< dynamically allocated array of segment lock fd's */
int memseg_list_fd; /**< memseg list fd */
int len; /**< total length of the array */
int count; /**< entries used in an array */
-} lock_fds[RTE_MAX_MEMSEG_LISTS];
+} fd_list[RTE_MAX_MEMSEG_LISTS];
/** local copy of a memory map, used to synchronize memory hotplug in MP */
static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
@@ -182,6 +208,31 @@ get_file_size(int fd)
return st.st_size;
}
+static inline uint32_t
+bsf64(uint64_t v)
+{
+ return (uint32_t)__builtin_ctzll(v);
+}
+
+static inline uint32_t
+log2_u64(uint64_t v)
+{
+ if (v == 0)
+ return 0;
+ v = rte_align64pow2(v);
+ return bsf64(v);
+}
+
+static int
+pagesz_flags(uint64_t page_sz)
+{
+ /* as per mmap() manpage, all page sizes are log2 of page size
+ * shifted by MAP_HUGE_SHIFT
+ */
+ int log2 = log2_u64(page_sz);
+ return log2 << RTE_MAP_HUGE_SHIFT;
+}
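/*
 * Illustrative example: for a 2 MB hugepage, log2_u64(2 * 1024 * 1024) is 21,
 * so pagesz_flags() returns 21 << RTE_MAP_HUGE_SHIFT, the encoding mmap()
 * and memfd_create() expect (equivalent to MAP_HUGE_2MB/MFD_HUGE_2MB); a
 * 1 GB page yields 30 << RTE_MAP_HUGE_SHIFT.
 */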
+
/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
static int lock(int fd, int type)
{
@@ -209,12 +260,12 @@ static int get_segment_lock_fd(int list_idx, int seg_idx)
char path[PATH_MAX] = {0};
int fd;
- if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds))
+ if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list))
return -1;
- if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len)
+ if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len)
return -1;
- fd = lock_fds[list_idx].fds[seg_idx];
+ fd = fd_list[list_idx].fds[seg_idx];
/* does this lock already exist? */
if (fd >= 0)
return fd;
@@ -236,8 +287,8 @@ static int get_segment_lock_fd(int list_idx, int seg_idx)
return -1;
}
/* store it for future reference */
- lock_fds[list_idx].fds[seg_idx] = fd;
- lock_fds[list_idx].count++;
+ fd_list[list_idx].fds[seg_idx] = fd;
+ fd_list[list_idx].count++;
return fd;
}
@@ -245,12 +296,12 @@ static int unlock_segment(int list_idx, int seg_idx)
{
int fd, ret;
- if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds))
+ if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list))
return -1;
- if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len)
+ if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len)
return -1;
- fd = lock_fds[list_idx].fds[seg_idx];
+ fd = fd_list[list_idx].fds[seg_idx];
/* upgrade lock to exclusive to see if we can remove the lockfile */
ret = lock(fd, LOCK_EX);
@@ -270,8 +321,8 @@ static int unlock_segment(int list_idx, int seg_idx)
* and remove it from list anyway.
*/
close(fd);
- lock_fds[list_idx].fds[seg_idx] = -1;
- lock_fds[list_idx].count--;
+ fd_list[list_idx].fds[seg_idx] = -1;
+ fd_list[list_idx].count--;
if (ret < 0)
return -1;
@@ -279,16 +330,68 @@ static int unlock_segment(int list_idx, int seg_idx)
}
static int
+get_seg_memfd(struct hugepage_info *hi __rte_unused,
+ unsigned int list_idx __rte_unused,
+ unsigned int seg_idx __rte_unused)
+{
+#ifdef MEMFD_SUPPORTED
+ int fd;
+ char segname[250]; /* as per manpage, limit is 249 bytes plus null */
+
+ if (internal_config.single_file_segments) {
+ fd = fd_list[list_idx].memseg_list_fd;
+
+ if (fd < 0) {
+ int flags = MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
+
+ snprintf(segname, sizeof(segname), "seg_%i", list_idx);
+ fd = memfd_create(segname, flags);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ fd_list[list_idx].memseg_list_fd = fd;
+ }
+ } else {
+ fd = fd_list[list_idx].fds[seg_idx];
+
+ if (fd < 0) {
+ int flags = MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
+
+ snprintf(segname, sizeof(segname), "seg_%i-%i",
+ list_idx, seg_idx);
+ fd = memfd_create(segname, flags);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ fd_list[list_idx].fds[seg_idx] = fd;
+ }
+ }
+ return fd;
+#endif
+ return -1;
+}
+
+static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
unsigned int list_idx, unsigned int seg_idx)
{
int fd;
+ /* for in-memory mode, we only make it here when we're sure we support
+ * memfd, and this is a special case.
+ */
+ if (internal_config.in_memory)
+ return get_seg_memfd(hi, list_idx, seg_idx);
+
if (internal_config.single_file_segments) {
/* create a hugepage file path */
eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
- fd = lock_fds[list_idx].memseg_list_fd;
+ fd = fd_list[list_idx].memseg_list_fd;
if (fd < 0) {
fd = open(path, O_CREAT | O_RDWR, 0600);
@@ -304,24 +407,30 @@ get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
close(fd);
return -1;
}
- lock_fds[list_idx].memseg_list_fd = fd;
+ fd_list[list_idx].memseg_list_fd = fd;
}
} else {
/* create a hugepage file path */
eal_get_hugefile_path(path, buflen, hi->hugedir,
list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
- fd = open(path, O_CREAT | O_RDWR, 0600);
+
+ fd = fd_list[list_idx].fds[seg_idx];
+
if (fd < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
- strerror(errno));
- return -1;
- }
- /* take out a read lock */
- if (lock(fd, LOCK_SH) < 0) {
- RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
- __func__, strerror(errno));
- close(fd);
- return -1;
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ /* take out a read lock */
+ if (lock(fd, LOCK_SH) < 0) {
+ RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
+ __func__, strerror(errno));
+ close(fd);
+ return -1;
+ }
+ fd_list[list_idx].fds[seg_idx] = fd;
}
}
return fd;
@@ -332,6 +441,33 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
uint64_t fa_offset, uint64_t page_sz, bool grow)
{
bool again = false;
+
+ /* in-memory mode is a special case, because we don't need to perform
+ * any locking, and we can be sure that fallocate() is supported.
+ */
+ if (internal_config.in_memory) {
+ int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_KEEP_SIZE;
+ int ret;
+
+ /* grow or shrink the file */
+ ret = fallocate(fd, flags, fa_offset, page_sz);
+
+ if (ret < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+ __func__,
+ strerror(errno));
+ return -1;
+ }
+ /* increase/decrease total segment count */
+ fd_list[list_idx].count += (grow ? 1 : -1);
+ if (!grow && fd_list[list_idx].count == 0) {
+ close(fd_list[list_idx].memseg_list_fd);
+ fd_list[list_idx].memseg_list_fd = -1;
+ }
+ return 0;
+ }
+
do {
if (fallocate_supported == 0) {
/* we cannot deallocate memory if fallocate() is not
@@ -410,9 +546,9 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
* page file fd, so that one of the processes
* could then delete the file after shrinking.
*/
- if (ret < 1 && lock_fds[list_idx].count == 0) {
+ if (ret < 1 && fd_list[list_idx].count == 0) {
close(fd);
- lock_fds[list_idx].memseg_list_fd = -1;
+ fd_list[list_idx].memseg_list_fd = -1;
}
if (ret < 0) {
@@ -448,13 +584,13 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
* more segments active in this segment list,
* and remove the file if there aren't.
*/
- if (lock_fds[list_idx].count == 0) {
+ if (fd_list[list_idx].count == 0) {
if (unlink(path))
RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
__func__, path,
strerror(errno));
close(fd);
- lock_fds[list_idx].memseg_list_fd = -1;
+ fd_list[list_idx].memseg_list_fd = -1;
}
}
}
@@ -481,26 +617,34 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
void *new_addr;
alloc_sz = hi->hugepage_sz;
- if (!internal_config.single_file_segments &&
- internal_config.in_memory &&
- anonymous_hugepages_supported) {
- int log2, flags;
-
- log2 = rte_log2_u32(alloc_sz);
- /* as per mmap() manpage, all page sizes are log2 of page size
- * shifted by MAP_HUGE_SHIFT
- */
- flags = (log2 << RTE_MAP_HUGE_SHIFT) | MAP_HUGETLB | MAP_FIXED |
+
+ /* these are checked at init, but code analyzers don't know that */
+ if (internal_config.in_memory && !anonymous_hugepages_supported) {
+ RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
+ return -1;
+ }
+ if (internal_config.in_memory && !memfd_create_supported &&
+ internal_config.single_file_segments) {
+ RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
+ return -1;
+ }
+
+ /* in-memory without memfd is a special case */
+ int mmap_flags;
+
+ if (internal_config.in_memory && !memfd_create_supported) {
+ int pagesz_flag, flags;
+
+ pagesz_flag = pagesz_flags(alloc_sz);
+ flags = pagesz_flag | MAP_HUGETLB | MAP_FIXED |
MAP_PRIVATE | MAP_ANONYMOUS;
fd = -1;
- va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, flags, -1, 0);
-
- /* single-file segments codepath will never be active because
- * in-memory mode is incompatible with it and it's stopped at
- * EAL initialization stage, however the compiler doesn't know
- * that and complains about map_offset being used uninitialized
- * on failure codepaths while having in-memory mode enabled. so,
- * assign a value here.
+ mmap_flags = flags;
+
+ /* single-file segments codepath will never be active
+ * here because in-memory mode is incompatible with the
+ * fallback path, and it's stopped at EAL initialization
+ * stage.
*/
map_offset = 0;
} else {
@@ -524,7 +668,8 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
__func__, strerror(errno));
goto resized;
}
- if (internal_config.hugepage_unlink) {
+ if (internal_config.hugepage_unlink &&
+ !internal_config.in_memory) {
if (unlink(path)) {
RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
__func__, strerror(errno));
@@ -532,16 +677,16 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
}
}
}
-
- /*
- * map the segment, and populate page tables, the kernel fills
- * this segment with zeros if it's a new page.
- */
- va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd,
- map_offset);
+ mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
}
+ /*
+ * map the segment, and populate page tables, the kernel fills
+ * this segment with zeros if it's a new page.
+ */
+ va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
+ map_offset);
+
if (va == MAP_FAILED) {
RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
strerror(errno));
@@ -593,10 +738,6 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
goto mapped;
}
#endif
- /* for non-single file segments that aren't in-memory, we can close fd
- * here */
- if (!internal_config.single_file_segments && !internal_config.in_memory)
- close(fd);
ms->addr = addr;
ms->hugepage_sz = alloc_sz;
@@ -626,7 +767,10 @@ unmapped:
RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
}
resized:
- /* in-memory mode will never be single-file-segments mode */
+ /* some codepaths will return negative fd, so exit early */
+ if (fd < 0)
+ return -1;
+
if (internal_config.single_file_segments) {
resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
alloc_sz, false);
@@ -638,6 +782,7 @@ resized:
lock(fd, LOCK_EX) == 1)
unlink(path);
close(fd);
+ fd_list[list_idx].fds[seg_idx] = -1;
}
return -1;
}
@@ -648,7 +793,8 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
{
uint64_t map_offset;
char path[PATH_MAX];
- int fd, ret;
+ int fd, ret = 0;
+ bool exit_early;
/* erase page data */
memset(ms->addr, 0, ms->len);
@@ -660,8 +806,17 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
return -1;
}
+ exit_early = false;
+
+ /* if we're using anonymous hugepages, nothing to be done */
+ if (internal_config.in_memory && !memfd_create_supported)
+ exit_early = true;
+
/* if we've already unlinked the page, nothing needs to be done */
- if (internal_config.hugepage_unlink) {
+ if (!internal_config.in_memory && internal_config.hugepage_unlink)
+ exit_early = true;
+
+ if (exit_early) {
memset(ms, 0, sizeof(*ms));
return 0;
}
@@ -684,14 +839,17 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
/* if we're able to take out a write lock, we're the last one
* holding onto this page.
*/
- ret = lock(fd, LOCK_EX);
- if (ret >= 0) {
- /* no one else is using this page */
- if (ret == 1)
- unlink(path);
+ if (!internal_config.in_memory) {
+ ret = lock(fd, LOCK_EX);
+ if (ret >= 0) {
+ /* no one else is using this page */
+ if (ret == 1)
+ unlink(path);
+ }
}
/* closing fd will drop the lock */
close(fd);
+ fd_list[list_idx].fds[seg_idx] = -1;
}
memset(ms, 0, sizeof(*ms));
@@ -828,7 +986,7 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg)
int msl_idx, seg_idx, ret, dir_fd = -1;
start_addr = (uintptr_t) msl->base_va;
- end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz;
+ end_addr = start_addr + msl->len;
if ((uintptr_t)wa->ms->addr < start_addr ||
(uintptr_t)wa->ms->addr >= end_addr)
@@ -1250,6 +1408,9 @@ sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
unsigned int i;
int msl_idx;
+ if (msl->external)
+ return 0;
+
msl_idx = msl - mcfg->memsegs;
primary_msl = &mcfg->memsegs[msl_idx];
local_msl = &local_memsegs[msl_idx];
@@ -1298,6 +1459,9 @@ secondary_msl_create_walk(const struct rte_memseg_list *msl,
char name[PATH_MAX];
int msl_idx, ret;
+ if (msl->external)
+ return 0;
+
msl_idx = msl - mcfg->memsegs;
primary_msl = &mcfg->memsegs[msl_idx];
local_msl = &local_memsegs[msl_idx];
@@ -1314,50 +1478,176 @@ secondary_msl_create_walk(const struct rte_memseg_list *msl,
return -1;
}
local_msl->base_va = primary_msl->base_va;
+ local_msl->len = primary_msl->len;
return 0;
}
static int
-secondary_lock_list_create_walk(const struct rte_memseg_list *msl,
- void *arg __rte_unused)
+alloc_list(int list_idx, int len)
{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- unsigned int i, len;
- int msl_idx;
int *data;
+ int i;
- msl_idx = msl - mcfg->memsegs;
- len = msl->memseg_arr.len;
-
- /* ensure we have space to store lock fd per each possible segment */
+ /* ensure we have space to store fd per each possible segment */
data = malloc(sizeof(int) * len);
if (data == NULL) {
- RTE_LOG(ERR, EAL, "Unable to allocate space for lock descriptors\n");
+ RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
return -1;
}
/* set all fd's as invalid */
for (i = 0; i < len; i++)
data[i] = -1;
- lock_fds[msl_idx].fds = data;
- lock_fds[msl_idx].len = len;
- lock_fds[msl_idx].count = 0;
- lock_fds[msl_idx].memseg_list_fd = -1;
+ fd_list[list_idx].fds = data;
+ fd_list[list_idx].len = len;
+ fd_list[list_idx].count = 0;
+ fd_list[list_idx].memseg_list_fd = -1;
+
+ return 0;
+}
+
+static int
+fd_list_create_walk(const struct rte_memseg_list *msl,
+ void *arg __rte_unused)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int len;
+ int msl_idx;
+
+ if (msl->external)
+ return 0;
+
+ msl_idx = msl - mcfg->memsegs;
+ len = msl->memseg_arr.len;
+
+ return alloc_list(msl_idx, len);
+}
+
+int
+eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+
+ /* if list is not allocated, allocate it */
+ if (fd_list[list_idx].len == 0) {
+ int len = mcfg->memsegs[list_idx].memseg_arr.len;
+
+ if (alloc_list(list_idx, len) < 0)
+ return -ENOMEM;
+ }
+ fd_list[list_idx].fds[seg_idx] = fd;
return 0;
}
int
+eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
+{
+ int fd;
+ if (internal_config.single_file_segments) {
+ fd = fd_list[list_idx].memseg_list_fd;
+ } else if (fd_list[list_idx].len == 0) {
+ /* list not initialized */
+ fd = -1;
+ } else {
+ fd = fd_list[list_idx].fds[seg_idx];
+ }
+ if (fd < 0)
+ return -ENODEV;
+ return fd;
+}
+
+static int
+test_memfd_create(void)
+{
+#ifdef MEMFD_SUPPORTED
+ unsigned int i;
+ for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+ uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz;
+ int pagesz_flag = pagesz_flags(pagesz);
+ int flags;
+
+ flags = pagesz_flag | MFD_HUGETLB;
+ int fd = memfd_create("test", flags);
+ if (fd < 0) {
+ /* we failed - let memalloc know this isn't working */
+ if (errno == EINVAL) {
+ memfd_create_supported = 0;
+ return 0; /* not supported */
+ }
+
+ /* we got other error - something's wrong */
+ return -1; /* error */
+ }
+ close(fd);
+ return 1; /* supported */
+ }
+#endif
+ return 0; /* not supported */
+}
+
+int
+eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+
+ /* fd_list not initialized? */
+ if (fd_list[list_idx].len == 0)
+ return -ENODEV;
+ if (internal_config.single_file_segments) {
+ size_t pgsz = mcfg->memsegs[list_idx].page_sz;
+
+ /* segment not active? */
+ if (fd_list[list_idx].memseg_list_fd < 0)
+ return -ENOENT;
+ *offset = pgsz * seg_idx;
+ } else {
+ /* segment not active? */
+ if (fd_list[list_idx].fds[seg_idx] < 0)
+ return -ENOENT;
+ *offset = 0;
+ }
+ return 0;
+}
+
+int
eal_memalloc_init(void)
{
if (rte_eal_process_type() == RTE_PROC_SECONDARY)
if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
return -1;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
+ internal_config.in_memory) {
+ int mfd_res = test_memfd_create();
- /* initialize all of the lock fd lists */
- if (internal_config.single_file_segments)
- if (rte_memseg_list_walk(secondary_lock_list_create_walk, NULL))
+ if (mfd_res < 0) {
+ RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n");
+ return -1;
+ }
+ if (mfd_res == 1)
+ RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
+ else
+ RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n");
+
+ /* we only support single-file segments mode with in-memory mode
+ * if we support hugetlbfs with memfd_create. this code will
+ * test if we do.
+ */
+ if (internal_config.single_file_segments &&
+ mfd_res != 1) {
+ RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n");
return -1;
+ }
+ /* this cannot ever happen but better safe than sorry */
+ if (!anonymous_hugepages_supported) {
+ RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
+ return -1;
+ }
+ }
+
+ /* initialize all of the fd lists */
+ if (rte_memseg_list_walk(fd_list_create_walk, NULL))
+ return -1;
return 0;
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index dbf19499..fce86fda 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -5,6 +5,7 @@
#define _FILE_OFFSET_BITS 64
#include <errno.h>
+#include <fcntl.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdlib.h>
@@ -17,6 +18,7 @@
#include <sys/stat.h>
#include <sys/queue.h>
#include <sys/file.h>
+#include <sys/resource.h>
#include <unistd.h>
#include <limits.h>
#include <sys/ioctl.h>
@@ -263,7 +265,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
int node_id = -1;
int essential_prev = 0;
int oldpolicy;
- struct bitmask *oldmask = numa_allocate_nodemask();
+ struct bitmask *oldmask = NULL;
bool have_numa = true;
unsigned long maxnode = 0;
@@ -275,6 +277,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
if (have_numa) {
RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
+ oldmask = numa_allocate_nodemask();
if (get_mempolicy(&oldpolicy, oldmask->maskp,
oldmask->size + 1, 0, 0) < 0) {
RTE_LOG(ERR, EAL,
@@ -402,7 +405,8 @@ out:
numa_set_localalloc();
}
}
- numa_free_cpumask(oldmask);
+ if (oldmask != NULL)
+ numa_free_cpumask(oldmask);
#endif
return i;
}
@@ -584,7 +588,7 @@ unlink_hugepage_files(struct hugepage_file *hugepg_tbl,
for (page = 0; page < nrpages; page++) {
struct hugepage_file *hp = &hugepg_tbl[page];
- if (hp->final_va != NULL && unlink(hp->filepath)) {
+ if (hp->orig_va != NULL && unlink(hp->filepath)) {
RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n",
__func__, hp->filepath, strerror(errno));
}
@@ -771,7 +775,10 @@ remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
rte_fbarray_set_used(arr, ms_idx);
- close(fd);
+ /* store segment fd internally */
+ if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
+ RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n",
+ rte_strerror(rte_errno));
}
RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n",
(seg_len * page_sz) >> 20, socket_id);
@@ -857,6 +864,7 @@ alloc_va_space(struct rte_memseg_list *msl)
return -1;
}
msl->base_va = addr;
+ msl->len = mem_sz;
return 0;
}
@@ -1365,6 +1373,7 @@ eal_legacy_hugepage_init(void)
msl->base_va = addr;
msl->page_sz = page_sz;
msl->socket_id = 0;
+ msl->len = internal_config.memory;
/* populate memsegs. each memseg is one page long */
for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
@@ -1611,7 +1620,7 @@ eal_legacy_hugepage_init(void)
if (msl->memseg_arr.count > 0)
continue;
/* this is an unused list, deallocate it */
- mem_sz = (size_t)msl->page_sz * msl->memseg_arr.len;
+ mem_sz = msl->len;
munmap(msl->base_va, mem_sz);
msl->base_va = NULL;
@@ -1770,6 +1779,7 @@ getFileSize(int fd)
static int
eal_legacy_hugepage_attach(void)
{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct hugepage_file *hp = NULL;
unsigned int num_hp = 0;
unsigned int i = 0;
@@ -1813,6 +1823,9 @@ eal_legacy_hugepage_attach(void)
struct hugepage_file *hf = &hp[i];
size_t map_sz = hf->size;
void *map_addr = hf->final_va;
+ int msl_idx, ms_idx;
+ struct rte_memseg_list *msl;
+ struct rte_memseg *ms;
/* if size is zero, no more pages left */
if (map_sz == 0)
@@ -1830,25 +1843,50 @@ eal_legacy_hugepage_attach(void)
if (map_addr == MAP_FAILED) {
RTE_LOG(ERR, EAL, "Could not map %s: %s\n",
hf->filepath, strerror(errno));
- close(fd);
- goto error;
+ goto fd_error;
}
/* set shared lock on the file. */
if (flock(fd, LOCK_SH) < 0) {
RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n",
__func__, strerror(errno));
- close(fd);
- goto error;
+ goto fd_error;
}
- close(fd);
+ /* find segment data */
+ msl = rte_mem_virt2memseg_list(map_addr);
+ if (msl == NULL) {
+ RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n",
+ __func__);
+ goto fd_error;
+ }
+ ms = rte_mem_virt2memseg(map_addr, msl);
+ if (ms == NULL) {
+ RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n",
+ __func__);
+ goto fd_error;
+ }
+
+ msl_idx = msl - mcfg->memsegs;
+ ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+ if (ms_idx < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n",
+ __func__);
+ goto fd_error;
+ }
+
+ /* store segment fd internally */
+ if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
+ RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n",
+ rte_strerror(rte_errno));
}
/* unmap the hugepage config file, since we are done using it */
munmap(hp, size);
close(fd_hugepage);
return 0;
+fd_error:
+ close(fd);
error:
/* map all segments into memory to make sure we get the addrs */
cur_seg = 0;
@@ -2093,18 +2131,65 @@ static int __rte_unused
memseg_primary_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int i, socket_id, hpi_idx, msl_idx = 0;
+ struct memtype {
+ uint64_t page_sz;
+ int socket_id;
+ } *memtypes = NULL;
+ int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
struct rte_memseg_list *msl;
- uint64_t max_mem, total_mem;
+ uint64_t max_mem, max_mem_per_type;
+ unsigned int max_seglists_per_type;
+ unsigned int n_memtypes, cur_type;
/* no-huge does not need this at all */
if (internal_config.no_hugetlbfs)
return 0;
- max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
- total_mem = 0;
+ /*
+ * figuring out amount of memory we're going to have is a long and very
+ * involved process. the basic element we're operating with is a memory
+ * type, defined as a combination of NUMA node ID and page size (so that
+ * e.g. 2 sockets with 2 page sizes yield 4 memory types in total).
+ *
+ * deciding amount of memory going towards each memory type is a
+ * balancing act between maximum segments per type, maximum memory per
+ * type, and number of detected NUMA nodes. the goal is to make sure
+ * each memory type gets at least one memseg list.
+ *
+ * the total amount of memory is limited by RTE_MAX_MEM_MB value.
+ *
+ * the total amount of memory per type is limited by either
+ * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
+ * of detected NUMA nodes. additionally, maximum number of segments per
+ * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for
+ * smaller page sizes, it can take hundreds of thousands of segments to
+ * reach the above specified per-type memory limits.
+ *
+ * additionally, each type may have multiple memseg lists associated
+ * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
+ * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
+ *
+ * the number of memseg lists per type is decided based on the above
+ * limits, and also taking number of detected NUMA nodes, to make sure
+ * that we don't run out of memseg lists before we populate all NUMA
+ * nodes with memory.
+ *
+ * we do this in three stages. first, we collect the number of types.
+ * then, we figure out memory constraints and populate the list of
+ * would-be memseg lists. then, we go ahead and allocate the memseg
+ * lists.
+ */
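/*
 * Tracing the arithmetic for the 4-type example above (2 sockets, 2 page
 * sizes): each type gets at most min(RTE_MAX_MEM_MB_PER_TYPE,
 * RTE_MAX_MEM_MB / 4) megabytes and at most RTE_MAX_MEMSEG_LISTS / 4 memseg
 * lists; each of those lists holds at most min(RTE_MAX_MEMSEG_PER_LIST,
 * RTE_MAX_MEM_MB_PER_LIST megabytes / page_sz) segments, further capped by
 * the per-type segment limit, so that all four types still fit.
 */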
- /* create memseg lists */
+ /* create space for mem types */
+ n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count();
+ memtypes = calloc(n_memtypes, sizeof(*memtypes));
+ if (memtypes == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n");
+ return -1;
+ }
+
+ /* populate mem types */
+ cur_type = 0;
for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
hpi_idx++) {
struct hugepage_info *hpi;
@@ -2113,62 +2198,114 @@ memseg_primary_init(void)
hpi = &internal_config.hugepage_info[hpi_idx];
hugepage_sz = hpi->hugepage_sz;
- for (i = 0; i < (int) rte_socket_count(); i++) {
- uint64_t max_type_mem, total_type_mem = 0;
- int type_msl_idx, max_segs, total_segs = 0;
-
- socket_id = rte_socket_id_by_idx(i);
+ for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
+ int socket_id = rte_socket_id_by_idx(i);
#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
if (socket_id > 0)
break;
#endif
+ memtypes[cur_type].page_sz = hugepage_sz;
+ memtypes[cur_type].socket_id = socket_id;
- if (total_mem >= max_mem)
- break;
-
- max_type_mem = RTE_MIN(max_mem - total_mem,
- (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20);
- max_segs = RTE_MAX_MEMSEG_PER_TYPE;
+ RTE_LOG(DEBUG, EAL, "Detected memory type: "
+ "socket_id:%u hugepage_sz:%" PRIu64 "\n",
+ socket_id, hugepage_sz);
+ }
+ }
- type_msl_idx = 0;
- while (total_type_mem < max_type_mem &&
- total_segs < max_segs) {
- uint64_t cur_max_mem, cur_mem;
- unsigned int n_segs;
+ /* set up limits for types */
+ max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
+ max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
+ max_mem / n_memtypes);
+ /*
+ * limit maximum number of segment lists per type to ensure there's
+ * space for memseg lists for all NUMA nodes with all page sizes
+ */
+ max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;
- if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
- RTE_LOG(ERR, EAL,
- "No more space in memseg lists, please increase %s\n",
- RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
- return -1;
- }
+ if (max_seglists_per_type == 0) {
+ RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ goto out;
+ }
- msl = &mcfg->memsegs[msl_idx++];
+ /* go through all mem types and create segment lists */
+ msl_idx = 0;
+ for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
+ unsigned int cur_seglist, n_seglists, n_segs;
+ unsigned int max_segs_per_type, max_segs_per_list;
+ struct memtype *type = &memtypes[cur_type];
+ uint64_t max_mem_per_list, pagesz;
+ int socket_id;
- cur_max_mem = max_type_mem - total_type_mem;
+ pagesz = type->page_sz;
+ socket_id = type->socket_id;
- cur_mem = get_mem_amount(hugepage_sz,
- cur_max_mem);
- n_segs = cur_mem / hugepage_sz;
+ /*
+ * we need to create segment lists for this type. we must take
+ * into account the following things:
+ *
+ * 1. total amount of memory we can use for this memory type
+ * 2. total amount of memory per memseg list allowed
+ * 3. number of segments needed to fit the amount of memory
+ * 4. number of segments allowed per type
+ * 5. number of segments allowed per memseg list
+ * 6. number of memseg lists we are allowed to take up
+ */
- if (alloc_memseg_list(msl, hugepage_sz, n_segs,
- socket_id, type_msl_idx))
- return -1;
+ /* calculate how many segments we will need in total */
+ max_segs_per_type = max_mem_per_type / pagesz;
+ /* limit number of segments to maximum allowed per type */
+ max_segs_per_type = RTE_MIN(max_segs_per_type,
+ (unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
+ /* limit number of segments to maximum allowed per list */
+ max_segs_per_list = RTE_MIN(max_segs_per_type,
+ (unsigned int)RTE_MAX_MEMSEG_PER_LIST);
+
+ /* calculate how much memory we can have per segment list */
+ max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
+ (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);
+
+ /* calculate how many segments each segment list will have */
+ n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);
+
+ /* calculate how many segment lists we can have */
+ n_seglists = RTE_MIN(max_segs_per_type / n_segs,
+ max_mem_per_type / max_mem_per_list);
+
+ /* limit number of segment lists according to our maximum */
+ n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);
+
+ RTE_LOG(DEBUG, EAL, "Creating %i segment lists: "
+ "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n",
+ n_seglists, n_segs, socket_id, pagesz);
+
+ /* create all segment lists */
+ for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
+ if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL,
+ "No more space in memseg lists, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ goto out;
+ }
+ msl = &mcfg->memsegs[msl_idx++];
- total_segs += msl->memseg_arr.len;
- total_type_mem = total_segs * hugepage_sz;
- type_msl_idx++;
+ if (alloc_memseg_list(msl, pagesz, n_segs,
+ socket_id, cur_seglist))
+ goto out;
- if (alloc_va_space(msl)) {
- RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
- return -1;
- }
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
+ goto out;
}
- total_mem += total_type_mem;
}
}
- return 0;
+ /* we're successful */
+ ret = 0;
+out:
+ free(memtypes);
+ return ret;
}
static int
@@ -2204,6 +2341,25 @@ memseg_secondary_init(void)
int
rte_eal_memseg_init(void)
{
+ /* increase rlimit to maximum */
+ struct rlimit lim;
+
+ if (getrlimit(RLIMIT_NOFILE, &lim) == 0) {
+ /* set limit to maximum */
+ lim.rlim_cur = lim.rlim_max;
+
+ if (setrlimit(RLIMIT_NOFILE, &lim) < 0) {
+ RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n",
+ strerror(errno));
+ } else {
+ RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %"
+ PRIu64 "\n",
+ (uint64_t)lim.rlim_cur);
+ }
+ } else {
+ RTE_LOG(ERR, EAL, "Cannot get current resource limits\n");
+ }
+
return rte_eal_process_type() == RTE_PROC_PRIMARY ?
#ifndef RTE_ARCH_64
memseg_primary_init_32() :
diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c
index b496fc71..379773b6 100644
--- a/lib/librte_eal/linuxapp/eal/eal_thread.c
+++ b/lib/librte_eal/linuxapp/eal/eal_thread.c
@@ -121,8 +121,8 @@ eal_thread_loop(__attribute__((unused)) void *arg)
ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
- RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
- lcore_id, (int)thread_id, cpuset, ret == 0 ? "" : "...");
+ RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
+ lcore_id, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "...");
/* read on our pipe to get commands */
while (1) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c
index 2766bd78..bc8f0519 100644
--- a/lib/librte_eal/linuxapp/eal/eal_timer.c
+++ b/lib/librte_eal/linuxapp/eal/eal_timer.c
@@ -87,7 +87,7 @@ static pthread_t msb_inc_thread_id;
* containing used to process MSB of the HPET (unfortunately, we need
* this because hpet is 32 bits by default under linux).
*/
-static void
+static void *
hpet_msb_inc(__attribute__((unused)) void *arg)
{
uint32_t t;
@@ -98,6 +98,7 @@ hpet_msb_inc(__attribute__((unused)) void *arg)
eal_hpet_msb ++;
sleep(10);
}
+ return NULL;
}
uint64_t
@@ -178,7 +179,7 @@ rte_eal_hpet_init(int make_default)
/* create a thread that will increment a global variable for
* msb (hpet is 32 bits by default under linux) */
ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL,
- (void *(*)(void *))hpet_msb_inc, NULL);
+ hpet_msb_inc, NULL);
if (ret != 0) {
RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n");
internal_config.no_hpet = 1;
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index c68dc38e..0516b159 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -345,46 +345,13 @@ get_vfio_cfg_by_group_num(int iommu_group_num)
return NULL;
}
-static struct vfio_config *
-get_vfio_cfg_by_group_fd(int vfio_group_fd)
-{
- struct vfio_config *vfio_cfg;
- int i, j;
-
- for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
- vfio_cfg = &vfio_cfgs[i];
- for (j = 0; j < VFIO_MAX_GROUPS; j++)
- if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
- return vfio_cfg;
- }
-
- return NULL;
-}
-
-static struct vfio_config *
-get_vfio_cfg_by_container_fd(int container_fd)
-{
- int i;
-
- for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
- if (vfio_cfgs[i].vfio_container_fd == container_fd)
- return &vfio_cfgs[i];
- }
-
- return NULL;
-}
-
-int
-rte_vfio_get_group_fd(int iommu_group_num)
+static int
+vfio_get_group_fd(struct vfio_config *vfio_cfg,
+ int iommu_group_num)
{
int i;
int vfio_group_fd;
struct vfio_group *cur_grp;
- struct vfio_config *vfio_cfg;
-
- /* get the vfio_config it belongs to */
- vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
- vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
/* check if we already have the group descriptor open */
for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -423,6 +390,47 @@ rte_vfio_get_group_fd(int iommu_group_num)
return vfio_group_fd;
}
+static struct vfio_config *
+get_vfio_cfg_by_group_fd(int vfio_group_fd)
+{
+ struct vfio_config *vfio_cfg;
+ int i, j;
+
+ for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+ vfio_cfg = &vfio_cfgs[i];
+ for (j = 0; j < VFIO_MAX_GROUPS; j++)
+ if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
+ return vfio_cfg;
+ }
+
+ return NULL;
+}
+
+static struct vfio_config *
+get_vfio_cfg_by_container_fd(int container_fd)
+{
+ int i;
+
+ for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+ if (vfio_cfgs[i].vfio_container_fd == container_fd)
+ return &vfio_cfgs[i];
+ }
+
+ return NULL;
+}
+
+int
+rte_vfio_get_group_fd(int iommu_group_num)
+{
+ struct vfio_config *vfio_cfg;
+
+ /* get the vfio_config it belongs to */
+ vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
+ vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
+
+ return vfio_get_group_fd(vfio_cfg, iommu_group_num);
+}
+
static int
get_vfio_group_idx(int vfio_group_fd)
{
@@ -509,7 +517,7 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
msl = rte_mem_virt2memseg_list(addr);
/* for IOVA as VA mode, no need to care for IOVA addresses */
- if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+ if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
if (type == RTE_MEM_EVENT_ALLOC)
vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
@@ -523,13 +531,19 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
/* memsegs are contiguous in memory */
ms = rte_mem_virt2memseg(addr, msl);
while (cur_len < len) {
+ /* some memory segments may have invalid IOVA */
+ if (ms->iova == RTE_BAD_IOVA) {
+ RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n",
+ ms->addr);
+ goto next;
+ }
if (type == RTE_MEM_EVENT_ALLOC)
vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
ms->iova, ms->len, 1);
else
vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
ms->iova, ms->len, 0);
-
+next:
cur_len += ms->len;
++ms;
}
@@ -896,7 +910,15 @@ rte_vfio_enable(const char *modname)
return 0;
}
- default_vfio_cfg->vfio_container_fd = rte_vfio_get_container_fd();
+ if (internal_config.process_type == RTE_PROC_PRIMARY) {
+ /* open a new container */
+ default_vfio_cfg->vfio_container_fd =
+ rte_vfio_get_container_fd();
+ } else {
+ /* get the default container from the primary process */
+ default_vfio_cfg->vfio_container_fd =
+ vfio_get_default_container_fd();
+ }
/* check if we have VFIO driver enabled */
if (default_vfio_cfg->vfio_container_fd != -1) {
@@ -916,6 +938,45 @@ rte_vfio_is_enabled(const char *modname)
return default_vfio_cfg->vfio_enabled && mod_available;
}
+int
+vfio_get_default_container_fd(void)
+{
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
+ if (default_vfio_cfg->vfio_enabled)
+ return default_vfio_cfg->vfio_container_fd;
+
+ if (internal_config.process_type == RTE_PROC_PRIMARY) {
+ /* if we were secondary process we would try requesting
+ * container fd from the primary, but we're the primary
+ * process so just exit here
+ */
+ return -1;
+ }
+
+ p->req = SOCKET_REQ_DEFAULT_CONTAINER;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ free(mp_reply.msgs);
+ return mp_rep->fds[0];
+ }
+ free(mp_reply.msgs);
+ }
+
+ RTE_LOG(ERR, EAL, " cannot request default container fd\n");
+ return -1;
+}
+
const struct vfio_iommu_type *
vfio_set_iommu_type(int vfio_container_fd)
{
@@ -1028,8 +1089,9 @@ rte_vfio_get_container_fd(void)
mp_rep = &mp_reply.msgs[0];
p = (struct vfio_mp_param *)mp_rep->param;
if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ vfio_container_fd = mp_rep->fds[0];
free(mp_reply.msgs);
- return mp_rep->fds[0];
+ return vfio_container_fd;
}
free(mp_reply.msgs);
}
@@ -1082,11 +1144,14 @@ rte_vfio_get_group_num(const char *sysfs_base,
}
static int
-type1_map(const struct rte_memseg_list *msl __rte_unused,
- const struct rte_memseg *ms, void *arg)
+type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+ void *arg)
{
int *vfio_container_fd = arg;
+ if (msl->external)
+ return 0;
+
return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
ms->len, 1);
}
@@ -1145,8 +1210,22 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
struct vfio_iommu_type1_dma_map dma_map;
struct vfio_iommu_type1_dma_unmap dma_unmap;
int ret;
+ struct vfio_iommu_spapr_register_memory reg = {
+ .argsz = sizeof(reg),
+ .flags = 0
+ };
+ reg.vaddr = (uintptr_t) vaddr;
+ reg.size = len;
if (do_map != 0) {
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
memset(&dma_map, 0, sizeof(dma_map));
dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
dma_map.vaddr = vaddr;
@@ -1163,13 +1242,6 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
}
} else {
- struct vfio_iommu_spapr_register_memory reg = {
- .argsz = sizeof(reg),
- .flags = 0
- };
- reg.vaddr = (uintptr_t) vaddr;
- reg.size = len;
-
ret = ioctl(vfio_container_fd,
VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
if (ret) {
@@ -1196,12 +1268,15 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
}
static int
-vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused,
+vfio_spapr_map_walk(const struct rte_memseg_list *msl,
const struct rte_memseg *ms, void *arg)
{
int *vfio_container_fd = arg;
- return vfio_spapr_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
+ if (msl->external)
+ return 0;
+
+ return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
ms->len, 1);
}
@@ -1210,12 +1285,15 @@ struct spapr_walk_param {
uint64_t hugepage_sz;
};
static int
-vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused,
+vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
const struct rte_memseg *ms, void *arg)
{
struct spapr_walk_param *param = arg;
uint64_t max = ms->iova + ms->len;
+ if (msl->external)
+ return 0;
+
if (max > param->window_size) {
param->hugepage_sz = ms->hugepage_sz;
param->window_size = max;
@@ -1670,9 +1748,6 @@ int
rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
{
struct vfio_config *vfio_cfg;
- struct vfio_group *cur_grp;
- int vfio_group_fd;
- int i;
vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
if (vfio_cfg == NULL) {
@@ -1680,36 +1755,7 @@ rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
return -1;
}
- /* Check room for new group */
- if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
- RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
- return -1;
- }
-
- /* Get an index for the new group */
- for (i = 0; i < VFIO_MAX_GROUPS; i++)
- if (vfio_cfg->vfio_groups[i].group_num == -1) {
- cur_grp = &vfio_cfg->vfio_groups[i];
- break;
- }
-
- /* This should not happen */
- if (i == VFIO_MAX_GROUPS) {
- RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
- return -1;
- }
-
- vfio_group_fd = vfio_open_group_fd(iommu_group_num);
- if (vfio_group_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num);
- return -1;
- }
- cur_grp->group_num = iommu_group_num;
- cur_grp->fd = vfio_group_fd;
- cur_grp->devices = 0;
- vfio_cfg->vfio_active_groups++;
-
- return vfio_group_fd;
+ return vfio_get_group_fd(vfio_cfg, iommu_group_num);
}
int
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 68d4750a..63ae115c 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -115,6 +115,9 @@ struct vfio_iommu_type {
vfio_dma_func_t dma_map_func;
};
+/* get the vfio container that devices are bound to by default */
+int vfio_get_default_container_fd(void);
+
/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
const struct vfio_iommu_type *
vfio_set_iommu_type(int vfio_container_fd);
@@ -129,6 +132,7 @@ int vfio_mp_sync_setup(void);
#define SOCKET_REQ_CONTAINER 0x100
#define SOCKET_REQ_GROUP 0x200
+#define SOCKET_REQ_DEFAULT_CONTAINER 0x400
#define SOCKET_OK 0x0
#define SOCKET_NO_FD 0x1
#define SOCKET_ERR 0xFF
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index 680a24aa..a1e8c834 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -66,6 +66,17 @@ vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
reply.fds[0] = fd;
}
break;
+ case SOCKET_REQ_DEFAULT_CONTAINER:
+ r->req = SOCKET_REQ_DEFAULT_CONTAINER;
+ fd = vfio_get_default_container_fd();
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else {
+ r->result = SOCKET_OK;
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
+ }
+ break;
default:
RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
index cfa9448b..5afa0871 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
@@ -8,6 +8,7 @@
#ifdef __KERNEL__
#include <linux/if.h>
+#include <asm/barrier.h>
#define RTE_STD_C11
#else
#include <rte_common.h>
@@ -54,8 +55,13 @@ struct rte_kni_request {
* Writing should never overwrite the read position
*/
struct rte_kni_fifo {
+#ifdef RTE_USE_C11_MEM_MODEL
+ unsigned write; /**< Next position to be written*/
+ unsigned read; /**< Next position to be read */
+#else
volatile unsigned write; /**< Next position to be written*/
volatile unsigned read; /**< Next position to be read */
+#endif
unsigned len; /**< Circular buffer length */
unsigned elem_size; /**< Pointer size - for 32/64 bit OS */
void *volatile buffer[]; /**< The buffer contains mbuf pointers */
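
With RTE_USE_C11_MEM_MODEL the volatile qualifiers are dropped because, under the C11 memory model, the shared read/write indices are expected to be accessed through explicit atomic loads and stores with acquire/release ordering rather than through volatile. A minimal sketch of a producer under that assumption follows; the function is illustrative only and is not the actual KNI fifo code:

    /* Illustrative enqueue under the C11 memory model (not the actual
     * KNI fifo code). fifo->len is assumed to be a power of two. */
    static unsigned
    fifo_put_c11(struct rte_kni_fifo *fifo, void **data, unsigned num)
    {
    	unsigned i;
    	/* acquire-load the consumer index to observe freed slots */
    	unsigned read = __atomic_load_n(&fifo->read, __ATOMIC_ACQUIRE);
    	unsigned write = fifo->write;

    	for (i = 0; i < num; i++) {
    		unsigned next = (write + 1) & (fifo->len - 1);
    		if (next == read)
    			break;	/* ring full */
    		fifo->buffer[write] = data[i];
    		write = next;
    	}
    	/* release-store publishes the entries before the new index */
    	__atomic_store_n(&fifo->write, write, __ATOMIC_RELEASE);
    	return i;
    }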
diff --git a/lib/librte_eal/meson.build b/lib/librte_eal/meson.build
index e1fde15d..a18f3a82 100644
--- a/lib/librte_eal/meson.build
+++ b/lib/librte_eal/meson.build
@@ -21,11 +21,10 @@ else
error('unsupported system type "@0@"'.format(host_machine.system()))
endif
-version = 8 # the version of the EAL API
+version = 9 # the version of the EAL API
allow_experimental_apis = true
deps += 'compat'
deps += 'kvargs'
-cflags += '-D_GNU_SOURCE'
sources = common_sources + env_sources
objs = common_objs + env_objs
headers = common_headers + env_headers
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 344a43d3..04f62424 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -19,9 +19,6 @@ DPDK_2.0 {
rte_dump_tailq;
rte_eal_alarm_cancel;
rte_eal_alarm_set;
- rte_eal_devargs_add;
- rte_eal_devargs_dump;
- rte_eal_devargs_type_count;
rte_eal_get_configuration;
rte_eal_get_lcore_state;
rte_eal_get_physmem_size;
@@ -32,7 +29,6 @@ DPDK_2.0 {
rte_eal_lcore_role;
rte_eal_mp_remote_launch;
rte_eal_mp_wait_lcore;
- rte_eal_parse_devargs_str;
rte_eal_process_type;
rte_eal_remote_launch;
rte_eal_tailq_lookup;
@@ -134,8 +130,6 @@ DPDK_16.11 {
rte_delay_us_block;
rte_delay_us_callback_register;
- rte_eal_dev_attach;
- rte_eal_dev_detach;
} DPDK_16.07;
@@ -262,6 +256,16 @@ DPDK_18.08 {
} DPDK_18.05;
+DPDK_18.11 {
+ global:
+
+ rte_eal_get_runtime_dir;
+ rte_eal_hotplug_add;
+ rte_eal_hotplug_remove;
+ rte_strscpy;
+
+} DPDK_18.08;
+
EXPERIMENTAL {
global:
@@ -270,12 +274,19 @@ EXPERIMENTAL {
rte_class_register;
rte_class_unregister;
rte_ctrl_thread_create;
+ rte_delay_us_sleep;
+ rte_dev_event_callback_process;
rte_dev_event_callback_register;
rte_dev_event_callback_unregister;
rte_dev_event_monitor_start;
rte_dev_event_monitor_stop;
+ rte_dev_hotplug_handle_disable;
+ rte_dev_hotplug_handle_enable;
+ rte_dev_is_probed;
rte_dev_iterator_init;
rte_dev_iterator_next;
+ rte_dev_probe;
+ rte_dev_remove;
rte_devargs_add;
rte_devargs_dump;
rte_devargs_insert;
@@ -284,9 +295,8 @@ EXPERIMENTAL {
rte_devargs_parsef;
rte_devargs_remove;
rte_devargs_type_count;
+ rte_eal_check_dma_mask;
rte_eal_cleanup;
- rte_eal_hotplug_add;
- rte_eal_hotplug_remove;
rte_fbarray_attach;
rte_fbarray_destroy;
rte_fbarray_detach;
@@ -311,6 +321,14 @@ EXPERIMENTAL {
rte_fbarray_set_used;
rte_log_register_type_and_pick_level;
rte_malloc_dump_heaps;
+ rte_malloc_heap_create;
+ rte_malloc_heap_destroy;
+ rte_malloc_heap_get_socket;
+ rte_malloc_heap_memory_add;
+ rte_malloc_heap_memory_attach;
+ rte_malloc_heap_memory_detach;
+ rte_malloc_heap_memory_remove;
+ rte_malloc_heap_socket_is_external;
rte_mem_alloc_validator_register;
rte_mem_alloc_validator_unregister;
rte_mem_event_callback_register;
@@ -320,6 +338,10 @@ EXPERIMENTAL {
rte_mem_virt2memseg_list;
rte_memseg_contig_walk;
rte_memseg_contig_walk_thread_unsafe;
+ rte_memseg_get_fd;
+ rte_memseg_get_fd_offset;
+ rte_memseg_get_fd_thread_unsafe;
+ rte_memseg_get_fd_offset_thread_unsafe;
rte_memseg_list_walk;
rte_memseg_list_walk_thread_unsafe;
rte_memseg_walk;
@@ -330,6 +352,7 @@ EXPERIMENTAL {
rte_mp_request_sync;
rte_mp_request_async;
rte_mp_sendmsg;
+ rte_option_register;
rte_service_lcore_attr_get;
rte_service_lcore_attr_reset_all;
rte_service_may_be_active;
diff --git a/lib/librte_ethdev/Makefile b/lib/librte_ethdev/Makefile
index 0935a275..3e27ae46 100644
--- a/lib/librte_ethdev/Makefile
+++ b/lib/librte_ethdev/Makefile
@@ -12,13 +12,15 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
CFLAGS += -O3
CFLAGS += $(WERROR_FLAGS)
LDLIBS += -lrte_net -lrte_eal -lrte_mempool -lrte_ring
-LDLIBS += -lrte_mbuf
+LDLIBS += -lrte_mbuf -lrte_kvargs -lrte_cmdline
EXPORT_MAP := rte_ethdev_version.map
-LIBABIVER := 10
+LIBABIVER := 11
+SRCS-y += ethdev_private.c
SRCS-y += rte_ethdev.c
+SRCS-y += rte_class_eth.c
SRCS-y += rte_flow.c
SRCS-y += rte_tm.c
SRCS-y += rte_mtr.c
diff --git a/lib/librte_ethdev/ethdev_private.c b/lib/librte_ethdev/ethdev_private.c
new file mode 100644
index 00000000..162a502f
--- /dev/null
+++ b/lib/librte_ethdev/ethdev_private.c
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Gaëtan Rivet
+ */
+
+#include "rte_ethdev.h"
+#include "rte_ethdev_driver.h"
+#include "ethdev_private.h"
+
+uint16_t
+eth_dev_to_id(const struct rte_eth_dev *dev)
+{
+ if (dev == NULL)
+ return RTE_MAX_ETHPORTS;
+ return dev - rte_eth_devices;
+}
+
+struct rte_eth_dev *
+eth_find_device(const struct rte_eth_dev *start, rte_eth_cmp_t cmp,
+ const void *data)
+{
+ struct rte_eth_dev *edev;
+ ptrdiff_t idx;
+
+ /* Avoid Undefined Behaviour */
+ if (start != NULL &&
+ (start < &rte_eth_devices[0] ||
+ start > &rte_eth_devices[RTE_MAX_ETHPORTS]))
+ return NULL;
+ if (start != NULL)
+ idx = eth_dev_to_id(start) + 1;
+ else
+ idx = 0;
+ for (; idx < RTE_MAX_ETHPORTS; idx++) {
+ edev = &rte_eth_devices[idx];
+ if (cmp(edev, data) == 0)
+ return edev;
+ }
+ return NULL;
+}
+
+int
+rte_eth_devargs_parse_list(char *str, rte_eth_devargs_callback_t callback,
+ void *data)
+{
+ char *str_start;
+ int state;
+ int result;
+
+ if (*str != '[')
+ /* Single element, not a list */
+ return callback(str, data);
+
+ /* Sanity check, then strip the brackets */
+ str_start = &str[strlen(str) - 1];
+ if (*str_start != ']') {
+ RTE_LOG(ERR, EAL, "(%s): List does not end with ']'\n", str);
+ return -EINVAL;
+ }
+ str++;
+ *str_start = '\0';
+
+ /* Process list elements */
+ state = 0;
+ while (1) {
+ if (state == 0) {
+ if (*str == '\0')
+ break;
+ if (*str != ',') {
+ str_start = str;
+ state = 1;
+ }
+ } else if (state == 1) {
+ if (*str == ',' || *str == '\0') {
+ if (str > str_start) {
+ /* Non-empty string fragment */
+ *str = '\0';
+ result = callback(str_start, data);
+ if (result < 0)
+ return result;
+ }
+ state = 0;
+ }
+ }
+ str++;
+ }
+ return 0;
+}
+
+static int
+rte_eth_devargs_process_range(char *str, uint16_t *list, uint16_t *len_list,
+ const uint16_t max_list)
+{
+ uint16_t lo, hi, val;
+ int result;
+
+ result = sscanf(str, "%hu-%hu", &lo, &hi);
+ if (result == 1) {
+ if (*len_list >= max_list)
+ return -ENOMEM;
+ list[(*len_list)++] = lo;
+ } else if (result == 2) {
+ if (lo >= hi || lo > RTE_MAX_ETHPORTS || hi > RTE_MAX_ETHPORTS)
+ return -EINVAL;
+ for (val = lo; val <= hi; val++) {
+ if (*len_list >= max_list)
+ return -ENOMEM;
+ list[(*len_list)++] = val;
+ }
+ } else
+ return -EINVAL;
+ return 0;
+}
+
+int
+rte_eth_devargs_parse_representor_ports(char *str, void *data)
+{
+ struct rte_eth_devargs *eth_da = data;
+
+ return rte_eth_devargs_process_range(str, eth_da->representor_ports,
+ &eth_da->nb_representor_ports, RTE_MAX_ETHPORTS);
+}
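
The parser above accepts either a single value or a bracketed list mixing scalar values and ranges, so "representor=[0,2-4]" selects ports 0, 2, 3 and 4. A hedged sketch of a caller (not part of the patch; assumes <string.h>, rte_ethdev_driver.h and ethdev_private.h are included):

    static void
    representor_parse_example(void)
    {
    	struct rte_eth_devargs da;
    	char list[] = "[0,2-4]";	/* the parser writes '\0' into the string */

    	memset(&da, 0, sizeof(da));
    	/* fills representor_ports = {0, 2, 3, 4}, nb_representor_ports = 4 */
    	rte_eth_devargs_parse_list(list,
    			rte_eth_devargs_parse_representor_ports, &da);
    }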
diff --git a/lib/librte_ethdev/ethdev_private.h b/lib/librte_ethdev/ethdev_private.h
new file mode 100644
index 00000000..7b787bf9
--- /dev/null
+++ b/lib/librte_ethdev/ethdev_private.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Gaëtan Rivet
+ */
+
+#ifndef _RTE_ETH_PRIVATE_H_
+#define _RTE_ETH_PRIVATE_H_
+
+#include "rte_ethdev.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Convert rte_eth_dev pointer to port id.
+ * NULL will be translated to RTE_MAX_ETHPORTS.
+ */
+uint16_t eth_dev_to_id(const struct rte_eth_dev *dev);
+
+/* Generic rte_eth_dev comparison function. */
+typedef int (*rte_eth_cmp_t)(const struct rte_eth_dev *, const void *);
+
+/* Generic rte_eth_dev iterator. */
+struct rte_eth_dev *
+eth_find_device(const struct rte_eth_dev *_start, rte_eth_cmp_t cmp,
+ const void *data);
+
+/* Parse devargs value for representor parameter. */
+typedef int (*rte_eth_devargs_callback_t)(char *str, void *data);
+int rte_eth_devargs_parse_list(char *str, rte_eth_devargs_callback_t callback,
+ void *data);
+int rte_eth_devargs_parse_representor_ports(char *str, void *data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_ETH_PRIVATE_H_ */
diff --git a/lib/librte_ethdev/ethdev_profile.c b/lib/librte_ethdev/ethdev_profile.c
index 0d1dcda3..a3c303f6 100644
--- a/lib/librte_ethdev/ethdev_profile.c
+++ b/lib/librte_ethdev/ethdev_profile.c
@@ -1,87 +1,33 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2017 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/
#include "ethdev_profile.h"
/**
- * This conditional block enables RX queues profiling by tracking wasted
- * iterations, i.e. iterations which yielded no RX packets. Profiling is
- * performed using the Instrumentation and Tracing Technology (ITT) API,
- * employed by the Intel (R) VTune (TM) Amplifier.
+ * This conditional block enables Ethernet device profiling with
+ * Intel (R) VTune (TM) Amplifier.
*/
-#ifdef RTE_ETHDEV_PROFILE_ITT_WASTED_RX_ITERATIONS
-
-#include <ittnotify.h>
-
-#define ITT_MAX_NAME_LEN (100)
-
-/**
- * Auxiliary ITT structure belonging to Ethernet device and using to:
- * - track RX queue state to determine whether it is wasting loop iterations
- * - begin or end ITT task using task domain and task name (handle)
- */
-struct itt_profile_rx_data {
- /**
- * ITT domains for each queue.
- */
- __itt_domain *domains[RTE_MAX_QUEUES_PER_PORT];
- /**
- * ITT task names for each queue.
- */
- __itt_string_handle *handles[RTE_MAX_QUEUES_PER_PORT];
- /**
- * Flags indicating the queues state. Possible values:
- * 1 - queue is wasting iterations,
- * 0 - otherwise.
- */
- uint8_t queue_state[RTE_MAX_QUEUES_PER_PORT];
-};
-
-/**
- * The pool of *itt_profile_rx_data* structures.
- */
-struct itt_profile_rx_data itt_rx_data[RTE_MAX_ETHPORTS];
-
+#ifdef RTE_ETHDEV_PROFILE_WITH_VTUNE
/**
- * This callback function manages ITT tasks collection on given port and queue.
- * It must be registered with rte_eth_add_rx_callback() to be called from
- * rte_eth_rx_burst(). To find more comments see rte_rx_callback_fn function
- * type declaration.
+ * Hook callback to trace rte_eth_rx_burst() calls.
*/
-static uint16_t
-collect_itt_rx_burst_cb(uint16_t port_id, uint16_t queue_id,
+uint16_t
+profile_hook_rx_burst_cb(
+ __rte_unused uint16_t port_id, __rte_unused uint16_t queue_id,
__rte_unused struct rte_mbuf *pkts[], uint16_t nb_pkts,
__rte_unused uint16_t max_pkts, __rte_unused void *user_param)
{
- if (unlikely(nb_pkts == 0)) {
- if (!itt_rx_data[port_id].queue_state[queue_id]) {
- __itt_task_begin(
- itt_rx_data[port_id].domains[queue_id],
- __itt_null, __itt_null,
- itt_rx_data[port_id].handles[queue_id]);
- itt_rx_data[port_id].queue_state[queue_id] = 1;
- }
- } else {
- if (unlikely(itt_rx_data[port_id].queue_state[queue_id])) {
- __itt_task_end(
- itt_rx_data[port_id].domains[queue_id]);
- itt_rx_data[port_id].queue_state[queue_id] = 0;
- }
- }
return nb_pkts;
}
/**
- * Initialization of itt_profile_rx_data for a given Ethernet device.
+ * Set the profiling Rx callback for a given Ethernet device.
* This function must be invoked when ethernet device is being configured.
- * Result will be stored in the global array *itt_rx_data*.
*
* @param port_id
* The port identifier of the Ethernet device.
- * @param port_name
- * The name of the Ethernet device.
* @param rx_queue_num
* The number of RX queues on specified port.
*
@@ -90,46 +36,27 @@ collect_itt_rx_burst_cb(uint16_t port_id, uint16_t queue_id,
* - On failure, a negative value.
*/
static inline int
-itt_profile_rx_init(uint16_t port_id, char *port_name, uint8_t rx_queue_num)
+vtune_profile_rx_init(uint16_t port_id, uint8_t rx_queue_num)
{
uint16_t q_id;
for (q_id = 0; q_id < rx_queue_num; ++q_id) {
- char domain_name[ITT_MAX_NAME_LEN];
-
- snprintf(domain_name, sizeof(domain_name),
- "RXBurst.WastedIterations.Port_%s.Queue_%d",
- port_name, q_id);
- itt_rx_data[port_id].domains[q_id]
- = __itt_domain_create(domain_name);
-
- char task_name[ITT_MAX_NAME_LEN];
-
- snprintf(task_name, sizeof(task_name),
- "port id: %d; queue id: %d",
- port_id, q_id);
- itt_rx_data[port_id].handles[q_id]
- = __itt_string_handle_create(task_name);
-
- itt_rx_data[port_id].queue_state[q_id] = 0;
-
if (!rte_eth_add_rx_callback(
- port_id, q_id, collect_itt_rx_burst_cb, NULL)) {
+ port_id, q_id, profile_hook_rx_burst_cb, NULL)) {
return -rte_errno;
}
}
return 0;
}
-#endif /* RTE_ETHDEV_PROFILE_ITT_WASTED_RX_ITERATIONS */
+#endif /* RTE_ETHDEV_PROFILE_WITH_VTUNE */
int
-__rte_eth_profile_rx_init(__rte_unused uint16_t port_id,
+__rte_eth_dev_profile_init(__rte_unused uint16_t port_id,
__rte_unused struct rte_eth_dev *dev)
{
-#ifdef RTE_ETHDEV_PROFILE_ITT_WASTED_RX_ITERATIONS
- return itt_profile_rx_init(
- port_id, dev->data->name, dev->data->nb_rx_queues);
+#ifdef RTE_ETHDEV_PROFILE_WITH_VTUNE
+ return vtune_profile_rx_init(port_id, dev->data->nb_rx_queues);
#endif
return 0;
}
diff --git a/lib/librte_ethdev/ethdev_profile.h b/lib/librte_ethdev/ethdev_profile.h
index e5ea3682..65031e6f 100644
--- a/lib/librte_ethdev/ethdev_profile.h
+++ b/lib/librte_ethdev/ethdev_profile.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2017 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/
#ifndef _RTE_ETHDEV_PROFILE_H_
@@ -8,7 +8,7 @@
#include "rte_ethdev.h"
/**
- * Initialization of profiling RX queues for the Ethernet device.
+ * Initialization of the Ethernet device profiling.
* Implementation of this function depends on chosen profiling method,
* defined in configs.
*
@@ -22,6 +22,6 @@
* - On failure, a negative value.
*/
int
-__rte_eth_profile_rx_init(uint16_t port_id, struct rte_eth_dev *dev);
+__rte_eth_dev_profile_init(uint16_t port_id, struct rte_eth_dev *dev);
#endif
diff --git a/lib/librte_ethdev/meson.build b/lib/librte_ethdev/meson.build
index 596cd0f3..a4d85026 100644
--- a/lib/librte_ethdev/meson.build
+++ b/lib/librte_ethdev/meson.build
@@ -2,9 +2,11 @@
# Copyright(c) 2017 Intel Corporation
name = 'ethdev'
-version = 10
+version = 11
allow_experimental_apis = true
-sources = files('ethdev_profile.c',
+sources = files('ethdev_private.c',
+ 'ethdev_profile.c',
+ 'rte_class_eth.c',
'rte_ethdev.c',
'rte_flow.c',
'rte_mtr.c',
@@ -24,4 +26,4 @@ headers = files('rte_ethdev.h',
'rte_tm.h',
'rte_tm_driver.h')
-deps += ['net', 'kvargs']
+deps += ['net', 'kvargs', 'cmdline']
diff --git a/lib/librte_ethdev/rte_class_eth.c b/lib/librte_ethdev/rte_class_eth.c
new file mode 100644
index 00000000..cb99c92e
--- /dev/null
+++ b/lib/librte_ethdev/rte_class_eth.c
@@ -0,0 +1,173 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Gaëtan Rivet
+ */
+
+#include <string.h>
+
+#include <cmdline_parse_etheraddr.h>
+#include <rte_class.h>
+#include <rte_compat.h>
+#include <rte_errno.h>
+#include <rte_kvargs.h>
+#include <rte_log.h>
+
+#include "rte_ethdev.h"
+#include "rte_ethdev_core.h"
+#include "rte_ethdev_driver.h"
+#include "ethdev_private.h"
+
+enum eth_params {
+ RTE_ETH_PARAM_MAC,
+ RTE_ETH_PARAM_REPRESENTOR,
+ RTE_ETH_PARAM_MAX,
+};
+
+static const char * const eth_params_keys[] = {
+ [RTE_ETH_PARAM_MAC] = "mac",
+ [RTE_ETH_PARAM_REPRESENTOR] = "representor",
+ [RTE_ETH_PARAM_MAX] = NULL,
+};
+
+struct eth_dev_match_arg {
+ struct rte_device *device;
+ struct rte_kvargs *kvlist;
+};
+
+#define eth_dev_match_arg(d, k) \
+ (&(const struct eth_dev_match_arg) { \
+ .device = (d), \
+ .kvlist = (k), \
+ })
+
+static int
+eth_mac_cmp(const char *key __rte_unused,
+ const char *value, void *opaque)
+{
+ int ret;
+ struct ether_addr mac;
+ const struct rte_eth_dev_data *data = opaque;
+ struct rte_eth_dev_info dev_info;
+ uint32_t index;
+
+ /* Parse devargs MAC address. */
+ /*
+ * cannot use ether_aton_r(value, &mac)
+ * because of include conflict with rte_ether.h
+ */
+ ret = cmdline_parse_etheraddr(NULL, value, &mac, sizeof(mac));
+ if (ret < 0)
+ return -1; /* invalid devargs value */
+
+ /* Return 0 if devargs MAC is matching one of the device MACs. */
+ rte_eth_dev_info_get(data->port_id, &dev_info);
+ for (index = 0; index < dev_info.max_mac_addrs; index++)
+ if (is_same_ether_addr(&mac, &data->mac_addrs[index]))
+ return 0;
+ return -1; /* no match */
+}
+
+static int
+eth_representor_cmp(const char *key __rte_unused,
+ const char *value, void *opaque)
+{
+ int ret;
+ char *values;
+ const struct rte_eth_dev_data *data = opaque;
+ struct rte_eth_devargs representors;
+ uint16_t index;
+
+ if ((data->dev_flags & RTE_ETH_DEV_REPRESENTOR) == 0)
+ return -1; /* not a representor port */
+
+ /* Parse devargs representor values. */
+ values = strdup(value);
+ if (values == NULL)
+ return -1;
+ memset(&representors, 0, sizeof(representors));
+ ret = rte_eth_devargs_parse_list(values,
+ rte_eth_devargs_parse_representor_ports,
+ &representors);
+ free(values);
+ if (ret != 0)
+ return -1; /* invalid devargs value */
+
+ /* Return 0 if representor id is matching one of the values. */
+ for (index = 0; index < representors.nb_representor_ports; index++)
+ if (data->representor_id ==
+ representors.representor_ports[index])
+ return 0;
+ return -1; /* no match */
+}
+
+static int
+eth_dev_match(const struct rte_eth_dev *edev,
+ const void *_arg)
+{
+ int ret;
+ const struct eth_dev_match_arg *arg = _arg;
+ const struct rte_kvargs *kvlist = arg->kvlist;
+ unsigned int pair;
+
+ if (edev->state == RTE_ETH_DEV_UNUSED)
+ return -1;
+ if (arg->device != NULL && arg->device != edev->device)
+ return -1;
+
+ ret = rte_kvargs_process(kvlist,
+ eth_params_keys[RTE_ETH_PARAM_MAC],
+ eth_mac_cmp, edev->data);
+ if (ret != 0)
+ return -1;
+
+ ret = rte_kvargs_process(kvlist,
+ eth_params_keys[RTE_ETH_PARAM_REPRESENTOR],
+ eth_representor_cmp, edev->data);
+ if (ret != 0)
+ return -1;
+ /* search for representor key */
+ for (pair = 0; pair < kvlist->count; pair++) {
+ ret = strcmp(kvlist->pairs[pair].key,
+ eth_params_keys[RTE_ETH_PARAM_REPRESENTOR]);
+ if (ret == 0)
+ break; /* there is a representor key */
+ }
+ /* if no representor key, default is to not match representor ports */
+ if (ret != 0)
+ if ((edev->data->dev_flags & RTE_ETH_DEV_REPRESENTOR) != 0)
+ return -1; /* do not match any representor */
+
+ return 0;
+}
+
+static void *
+eth_dev_iterate(const void *start,
+ const char *str,
+ const struct rte_dev_iterator *it)
+{
+ struct rte_kvargs *kvargs = NULL;
+ struct rte_eth_dev *edev = NULL;
+ const char * const *valid_keys = NULL;
+
+ if (str != NULL) {
+ if (str[0] == '+') /* no validation of keys */
+ str++;
+ else
+ valid_keys = eth_params_keys;
+ kvargs = rte_kvargs_parse(str, valid_keys);
+ if (kvargs == NULL) {
+ RTE_LOG(ERR, EAL, "cannot parse argument list\n");
+ rte_errno = EINVAL;
+ return NULL;
+ }
+ }
+ edev = eth_find_device(start, eth_dev_match,
+ eth_dev_match_arg(it->device, kvargs));
+ rte_kvargs_free(kvargs);
+ return edev;
+}
+
+static struct rte_class rte_class_eth = {
+ .dev_iterate = eth_dev_iterate,
+};
+
+RTE_REGISTER_CLASS(eth, rte_class_eth);
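
Once the class is registered, an ethdev-level filter can be expressed directly in devargs. For illustration, strings the matcher above should accept (values are examples):

    mac=00:11:22:33:44:55              match the port owning this MAC address
    representor=[0,2-4]                match representor ports 0, 2, 3 and 4
    +anykey=x,mac=00:11:22:33:44:55    a leading '+' disables key validation,
                                       so unknown keys are ignored by this class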
diff --git a/lib/librte_ethdev/rte_ethdev.c b/lib/librte_ethdev/rte_ethdev.c
index 4c320250..9d348138 100644
--- a/lib/librte_ethdev/rte_ethdev.c
+++ b/lib/librte_ethdev/rte_ethdev.c
@@ -36,11 +36,13 @@
#include <rte_spinlock.h>
#include <rte_string_fns.h>
#include <rte_kvargs.h>
+#include <rte_class.h>
#include "rte_ether.h"
#include "rte_ethdev.h"
#include "rte_ethdev_driver.h"
#include "ethdev_profile.h"
+#include "ethdev_private.h"
int rte_eth_dev_logtype;
@@ -122,11 +124,12 @@ static const struct {
RTE_RX_OFFLOAD_BIT2STR(VLAN_FILTER),
RTE_RX_OFFLOAD_BIT2STR(VLAN_EXTEND),
RTE_RX_OFFLOAD_BIT2STR(JUMBO_FRAME),
- RTE_RX_OFFLOAD_BIT2STR(CRC_STRIP),
RTE_RX_OFFLOAD_BIT2STR(SCATTER),
RTE_RX_OFFLOAD_BIT2STR(TIMESTAMP),
RTE_RX_OFFLOAD_BIT2STR(SECURITY),
RTE_RX_OFFLOAD_BIT2STR(KEEP_CRC),
+ RTE_RX_OFFLOAD_BIT2STR(SCTP_CKSUM),
+ RTE_RX_OFFLOAD_BIT2STR(OUTER_UDP_CKSUM),
};
#undef RTE_RX_OFFLOAD_BIT2STR
@@ -156,6 +159,10 @@ static const struct {
RTE_TX_OFFLOAD_BIT2STR(MULTI_SEGS),
RTE_TX_OFFLOAD_BIT2STR(MBUF_FAST_FREE),
RTE_TX_OFFLOAD_BIT2STR(SECURITY),
+ RTE_TX_OFFLOAD_BIT2STR(UDP_TNL_TSO),
+ RTE_TX_OFFLOAD_BIT2STR(IP_TNL_TSO),
+ RTE_TX_OFFLOAD_BIT2STR(OUTER_UDP_CKSUM),
+ RTE_TX_OFFLOAD_BIT2STR(MATCH_METADATA),
};
#undef RTE_TX_OFFLOAD_BIT2STR
@@ -180,6 +187,146 @@ enum {
STAT_QMAP_RX
};
+int __rte_experimental
+rte_eth_iterator_init(struct rte_dev_iterator *iter, const char *devargs_str)
+{
+ int ret;
+ struct rte_devargs devargs = {.args = NULL};
+ const char *bus_param_key;
+ char *bus_str = NULL;
+ char *cls_str = NULL;
+ int str_size;
+
+ memset(iter, 0, sizeof(*iter));
+
+ /*
+ * The devargs string may use various syntaxes:
+ * - 0000:08:00.0,representor=[1-3]
+ * - pci:0000:06:00.0,representor=[0,5]
+ * - class=eth,mac=00:11:22:33:44:55
+ * A new syntax is in development (not yet supported):
+ * - bus=X,paramX=x/class=Y,paramY=y/driver=Z,paramZ=z
+ */
+
+ /*
+ * Handle pure class filter (i.e. without any bus-level argument),
+ * from future new syntax.
+ * rte_devargs_parse() is not yet supporting the new syntax,
+ * that's why this simple case is temporarily parsed here.
+ */
+#define iter_anybus_str "class=eth,"
+ if (strncmp(devargs_str, iter_anybus_str,
+ strlen(iter_anybus_str)) == 0) {
+ iter->cls_str = devargs_str + strlen(iter_anybus_str);
+ goto end;
+ }
+
+ /* Split bus, device and parameters. */
+ ret = rte_devargs_parse(&devargs, devargs_str);
+ if (ret != 0)
+ goto error;
+
+ /*
+ * Assume parameters of old syntax can match only at ethdev level.
+ * Extra parameters will be ignored, thanks to "+" prefix.
+ */
+ str_size = strlen(devargs.args) + 2;
+ cls_str = malloc(str_size);
+ if (cls_str == NULL) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ ret = snprintf(cls_str, str_size, "+%s", devargs.args);
+ if (ret != str_size - 1) {
+ ret = -EINVAL;
+ goto error;
+ }
+ iter->cls_str = cls_str;
+ free(devargs.args); /* allocated by rte_devargs_parse() */
+ devargs.args = NULL;
+
+ iter->bus = devargs.bus;
+ if (iter->bus->dev_iterate == NULL) {
+ ret = -ENOTSUP;
+ goto error;
+ }
+
+ /* Convert bus args to new syntax for use with new API dev_iterate. */
+ if (strcmp(iter->bus->name, "vdev") == 0) {
+ bus_param_key = "name";
+ } else if (strcmp(iter->bus->name, "pci") == 0) {
+ bus_param_key = "addr";
+ } else {
+ ret = -ENOTSUP;
+ goto error;
+ }
+ str_size = strlen(bus_param_key) + strlen(devargs.name) + 2;
+ bus_str = malloc(str_size);
+ if (bus_str == NULL) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ ret = snprintf(bus_str, str_size, "%s=%s",
+ bus_param_key, devargs.name);
+ if (ret != str_size - 1) {
+ ret = -EINVAL;
+ goto error;
+ }
+ iter->bus_str = bus_str;
+
+end:
+ iter->cls = rte_class_find_by_name("eth");
+ return 0;
+
+error:
+ if (ret == -ENOTSUP)
+ RTE_LOG(ERR, EAL, "Bus %s does not support iterating.\n",
+ iter->bus->name);
+ free(devargs.args);
+ free(bus_str);
+ free(cls_str);
+ return ret;
+}
+
+uint16_t __rte_experimental
+rte_eth_iterator_next(struct rte_dev_iterator *iter)
+{
+ if (iter->cls == NULL) /* invalid ethdev iterator */
+ return RTE_MAX_ETHPORTS;
+
+ do { /* loop to try all matching rte_device */
+ /* If not pure ethdev filter and */
+ if (iter->bus != NULL &&
+ /* not in middle of rte_eth_dev iteration, */
+ iter->class_device == NULL) {
+ /* get next rte_device to try. */
+ iter->device = iter->bus->dev_iterate(
+ iter->device, iter->bus_str, iter);
+ if (iter->device == NULL)
+ break; /* no more rte_device candidate */
+ }
+ /* A device is matching bus part, need to check ethdev part. */
+ iter->class_device = iter->cls->dev_iterate(
+ iter->class_device, iter->cls_str, iter);
+ if (iter->class_device != NULL)
+ return eth_dev_to_id(iter->class_device); /* match */
+ } while (iter->bus != NULL); /* need to try next rte_device */
+
+ /* No more ethdev port to iterate. */
+ rte_eth_iterator_cleanup(iter);
+ return RTE_MAX_ETHPORTS;
+}
+
+void __rte_experimental
+rte_eth_iterator_cleanup(struct rte_dev_iterator *iter)
+{
+ if (iter->bus_str == NULL)
+ return; /* nothing to free in pure class filter */
+ free(RTE_CAST_FIELD(iter, bus_str, char *)); /* workaround const */
+ free(RTE_CAST_FIELD(iter, cls_str, char *)); /* workaround const */
+ memset(iter, 0, sizeof(*iter));
+}
+
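
A hedged usage sketch of the three functions added above; the devargs string and the printf are illustrative only (assumes <stdio.h>):

    static void
    iterate_example(void)
    {
    	struct rte_dev_iterator it;
    	uint16_t port_id;

    	if (rte_eth_iterator_init(&it, "0000:08:00.0,representor=[1-3]") != 0)
    		return;
    	for (port_id = rte_eth_iterator_next(&it);
    	     port_id != RTE_MAX_ETHPORTS;
    	     port_id = rte_eth_iterator_next(&it))
    		printf("matched port %u\n", port_id);
    	/* already done by the last _next() call; explicit cleanup is only
    	 * required when the loop is left before _next() returns
    	 * RTE_MAX_ETHPORTS, and calling it twice is harmless. */
    	rte_eth_iterator_cleanup(&it);
    }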
uint16_t
rte_eth_find_next(uint16_t port_id)
{
@@ -366,13 +513,22 @@ rte_eth_dev_release_port(struct rte_eth_dev *eth_dev)
rte_eth_dev_shared_data_prepare();
- _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_DESTROY, NULL);
+ if (eth_dev->state != RTE_ETH_DEV_UNUSED)
+ _rte_eth_dev_callback_process(eth_dev,
+ RTE_ETH_EVENT_DESTROY, NULL);
rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
eth_dev->state = RTE_ETH_DEV_UNUSED;
- memset(eth_dev->data, 0, sizeof(struct rte_eth_dev_data));
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ rte_free(eth_dev->data->rx_queues);
+ rte_free(eth_dev->data->tx_queues);
+ rte_free(eth_dev->data->mac_addrs);
+ rte_free(eth_dev->data->hash_mac_addrs);
+ rte_free(eth_dev->data->dev_private);
+ memset(eth_dev->data, 0, sizeof(struct rte_eth_dev_data));
+ }
rte_spinlock_unlock(&rte_eth_dev_shared_data->ownership_lock);
@@ -393,11 +549,8 @@ static int
rte_eth_is_valid_owner_id(uint64_t owner_id)
{
if (owner_id == RTE_ETH_DEV_NO_OWNER ||
- rte_eth_dev_shared_data->next_owner_id <= owner_id) {
- RTE_ETHDEV_LOG(ERR, "Invalid owner_id=%016"PRIx64"\n",
- owner_id);
+ rte_eth_dev_shared_data->next_owner_id <= owner_id)
return 0;
- }
return 1;
}
@@ -444,8 +597,12 @@ _rte_eth_dev_owner_set(const uint16_t port_id, const uint64_t old_owner_id,
}
if (!rte_eth_is_valid_owner_id(new_owner->id) &&
- !rte_eth_is_valid_owner_id(old_owner_id))
+ !rte_eth_is_valid_owner_id(old_owner_id)) {
+ RTE_ETHDEV_LOG(ERR,
+ "Invalid owner old_id=%016"PRIx64" new_id=%016"PRIx64"\n",
+ old_owner_id, new_owner->id);
return -EINVAL;
+ }
port_owner = &rte_eth_devices[port_id].data->owner;
if (port_owner->id != old_owner_id) {
@@ -516,9 +673,13 @@ rte_eth_dev_owner_delete(const uint64_t owner_id)
if (rte_eth_devices[port_id].data->owner.id == owner_id)
memset(&rte_eth_devices[port_id].data->owner, 0,
sizeof(struct rte_eth_dev_owner));
- RTE_ETHDEV_LOG(ERR,
+ RTE_ETHDEV_LOG(NOTICE,
"All port owners owned by %016"PRIx64" identifier have removed\n",
owner_id);
+ } else {
+ RTE_ETHDEV_LOG(ERR,
+ "Invalid owner id=%016"PRIx64"\n",
+ owner_id);
}
rte_spinlock_unlock(&rte_eth_dev_shared_data->ownership_lock);
@@ -642,87 +803,6 @@ eth_err(uint16_t port_id, int ret)
return ret;
}
-/* attach the new device, then store port_id of the device */
-int
-rte_eth_dev_attach(const char *devargs, uint16_t *port_id)
-{
- int current = rte_eth_dev_count_total();
- struct rte_devargs da;
- int ret = -1;
-
- memset(&da, 0, sizeof(da));
-
- if ((devargs == NULL) || (port_id == NULL)) {
- ret = -EINVAL;
- goto err;
- }
-
- /* parse devargs */
- if (rte_devargs_parse(&da, devargs))
- goto err;
-
- ret = rte_eal_hotplug_add(da.bus->name, da.name, da.args);
- if (ret < 0)
- goto err;
-
- /* no point looking at the port count if no port exists */
- if (!rte_eth_dev_count_total()) {
- RTE_ETHDEV_LOG(ERR, "No port found for device (%s)\n", da.name);
- ret = -1;
- goto err;
- }
-
- /* if nothing happened, there is a bug here, since some driver told us
- * it did attach a device, but did not create a port.
- * FIXME: race condition in case of plug-out of another device
- */
- if (current == rte_eth_dev_count_total()) {
- ret = -1;
- goto err;
- }
-
- *port_id = eth_dev_last_created_port;
- ret = 0;
-
-err:
- free(da.args);
- return ret;
-}
-
-/* detach the device, then store the name of the device */
-int
-rte_eth_dev_detach(uint16_t port_id, char *name __rte_unused)
-{
- struct rte_device *dev;
- struct rte_bus *bus;
- uint32_t dev_flags;
- int ret = -1;
-
- RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
-
- dev_flags = rte_eth_devices[port_id].data->dev_flags;
- if (dev_flags & RTE_ETH_DEV_BONDED_SLAVE) {
- RTE_ETHDEV_LOG(ERR,
- "Port %"PRIu16" is bonded, cannot detach\n", port_id);
- return -ENOTSUP;
- }
-
- dev = rte_eth_devices[port_id].device;
- if (dev == NULL)
- return -EINVAL;
-
- bus = rte_bus_find_by_device(dev);
- if (bus == NULL)
- return -ENOENT;
-
- ret = rte_eal_hotplug_remove(bus->name, dev->name);
- if (ret < 0)
- return ret;
-
- rte_eth_dev_release_port(&rte_eth_devices[port_id]);
- return 0;
-}
-
static int
rte_eth_dev_rx_queue_config(struct rte_eth_dev *dev, uint16_t nb_queues)
{
@@ -974,7 +1054,7 @@ rte_eth_speed_bitflag(uint32_t speed, int duplex)
}
}
-const char * __rte_experimental
+const char *
rte_eth_dev_rx_offload_name(uint64_t offload)
{
const char *name = "UNKNOWN";
@@ -990,7 +1070,7 @@ rte_eth_dev_rx_offload_name(uint64_t offload)
return name;
}
-const char * __rte_experimental
+const char *
rte_eth_dev_tx_offload_name(uint64_t offload)
{
const char *name = "UNKNOWN";
@@ -1142,14 +1222,6 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
return -EINVAL;
}
- if ((local_conf.rxmode.offloads & DEV_RX_OFFLOAD_CRC_STRIP) &&
- (local_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)) {
- RTE_ETHDEV_LOG(ERR,
- "Port id=%u not allowed to set both CRC STRIP and KEEP CRC offload flags\n",
- port_id);
- return -EINVAL;
- }
-
/* Check that device supports requested rss hash functions. */
if ((dev_info.flow_type_rss_offloads |
dev_conf->rx_adv_conf.rss_conf.rss_hf) !=
@@ -1191,9 +1263,9 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
}
/* Initialize Rx profiling if enabled at compilation time. */
- diag = __rte_eth_profile_rx_init(port_id, dev);
+ diag = __rte_eth_dev_profile_init(port_id, dev);
if (diag != 0) {
- RTE_ETHDEV_LOG(ERR, "Port%u __rte_eth_profile_rx_init = %d\n",
+ RTE_ETHDEV_LOG(ERR, "Port%u __rte_eth_dev_profile_init = %d\n",
port_id, diag);
rte_eth_dev_rx_queue_config(dev, 0);
rte_eth_dev_tx_queue_config(dev, 0);
@@ -1219,19 +1291,14 @@ _rte_eth_dev_reset(struct rte_eth_dev *dev)
}
static void
-rte_eth_dev_config_restore(uint16_t port_id)
+rte_eth_dev_mac_restore(struct rte_eth_dev *dev,
+ struct rte_eth_dev_info *dev_info)
{
- struct rte_eth_dev *dev;
- struct rte_eth_dev_info dev_info;
struct ether_addr *addr;
uint16_t i;
uint32_t pool = 0;
uint64_t pool_mask;
- dev = &rte_eth_devices[port_id];
-
- rte_eth_dev_info_get(port_id, &dev_info);
-
/* replay MAC address configuration including default MAC */
addr = &dev->data->mac_addrs[0];
if (*dev->dev_ops->mac_addr_set != NULL)
@@ -1240,7 +1307,7 @@ rte_eth_dev_config_restore(uint16_t port_id)
(*dev->dev_ops->mac_addr_add)(dev, addr, 0, pool);
if (*dev->dev_ops->mac_addr_add != NULL) {
- for (i = 1; i < dev_info.max_mac_addrs; i++) {
+ for (i = 1; i < dev_info->max_mac_addrs; i++) {
addr = &dev->data->mac_addrs[i];
/* skip zero address */
@@ -1259,6 +1326,14 @@ rte_eth_dev_config_restore(uint16_t port_id)
} while (pool_mask);
}
}
+}
+
+static void
+rte_eth_dev_config_restore(struct rte_eth_dev *dev,
+ struct rte_eth_dev_info *dev_info, uint16_t port_id)
+{
+ if (!(*dev_info->dev_flags & RTE_ETH_DEV_NOLIVE_MAC_ADDR))
+ rte_eth_dev_mac_restore(dev, dev_info);
/* replay promiscuous configuration */
if (rte_eth_promiscuous_get(port_id) == 1)
@@ -1277,6 +1352,7 @@ int
rte_eth_dev_start(uint16_t port_id)
{
struct rte_eth_dev *dev;
+ struct rte_eth_dev_info dev_info;
int diag;
RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
@@ -1292,13 +1368,19 @@ rte_eth_dev_start(uint16_t port_id)
return 0;
}
+ rte_eth_dev_info_get(port_id, &dev_info);
+
+ /* Lets restore MAC now if device does not support live change */
+ if (*dev_info.dev_flags & RTE_ETH_DEV_NOLIVE_MAC_ADDR)
+ rte_eth_dev_mac_restore(dev, &dev_info);
+
diag = (*dev->dev_ops->dev_start)(dev);
if (diag == 0)
dev->data->dev_started = 1;
else
return eth_err(port_id, diag);
- rte_eth_dev_config_restore(port_id);
+ rte_eth_dev_config_restore(dev, &dev_info, port_id);
if (dev->data->dev_conf.intr_conf.lsc == 0) {
RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->link_update, -ENOTSUP);
@@ -1366,6 +1448,16 @@ rte_eth_dev_close(uint16_t port_id)
dev->data->dev_started = 0;
(*dev->dev_ops->dev_close)(dev);
+ /* check behaviour flag - temporary for PMD migration */
+ if ((dev->data->dev_flags & RTE_ETH_DEV_CLOSE_REMOVE) != 0) {
+ /* new behaviour: send event + reset state + free all data */
+ rte_eth_dev_release_port(dev);
+ return;
+ }
+ RTE_ETHDEV_LOG(DEBUG, "Port closing is using an old behaviour.\n"
+ "The driver %s should migrate to the new behaviour.\n",
+ dev->device->driver->name);
+ /* old behaviour: only free queue arrays */
dev->data->nb_rx_queues = 0;
rte_free(dev->data->rx_queues);
dev->data->rx_queues = NULL;
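
Opting into the new close behaviour is a one-line change in the PMD. A hypothetical probe fragment (the function name is illustrative only):

    static int
    example_pmd_eth_dev_init(struct rte_eth_dev *eth_dev)
    {
    	/* opt in: rte_eth_dev_close() fully releases the port */
    	eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE;
    	/* ... usual init: dev_ops, MAC address allocation, etc. */
    	return 0;
    }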
@@ -3425,6 +3517,43 @@ rte_eth_dev_rx_intr_ctl(uint16_t port_id, int epfd, int op, void *data)
return 0;
}
+int __rte_experimental
+rte_eth_dev_rx_intr_ctl_q_get_fd(uint16_t port_id, uint16_t queue_id)
+{
+ struct rte_intr_handle *intr_handle;
+ struct rte_eth_dev *dev;
+ unsigned int efd_idx;
+ uint32_t vec;
+ int fd;
+
+ RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -1);
+
+ dev = &rte_eth_devices[port_id];
+
+ if (queue_id >= dev->data->nb_rx_queues) {
+ RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", queue_id);
+ return -1;
+ }
+
+ if (!dev->intr_handle) {
+ RTE_ETHDEV_LOG(ERR, "RX Intr handle unset\n");
+ return -1;
+ }
+
+ intr_handle = dev->intr_handle;
+ if (!intr_handle->intr_vec) {
+ RTE_ETHDEV_LOG(ERR, "RX Intr vector unset\n");
+ return -1;
+ }
+
+ vec = intr_handle->intr_vec[queue_id];
+ efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
+ (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
+ fd = intr_handle->efds[efd_idx];
+
+ return fd;
+}
+
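
The returned eventfd lets an application watch a Rx queue from its own event loop instead of the rte_epoll helpers. An illustrative sketch (assumes <sys/epoll.h>; error handling is minimal):

    static int
    watch_rxq_example(uint16_t port_id, uint16_t queue_id)
    {
    	struct epoll_event ev = { .events = EPOLLIN };
    	int fd = rte_eth_dev_rx_intr_ctl_q_get_fd(port_id, queue_id);
    	int epfd;

    	if (fd < 0)
    		return -1;
    	epfd = epoll_create1(0);
    	if (epfd < 0)
    		return -1;
    	ev.data.u32 = queue_id;
    	return epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
    }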
const struct rte_memzone *
rte_eth_dma_zone_reserve(const struct rte_eth_dev *dev, const char *ring_name,
uint16_t queue_id, size_t size, unsigned align,
@@ -3433,9 +3562,8 @@ rte_eth_dma_zone_reserve(const struct rte_eth_dev *dev, const char *ring_name,
char z_name[RTE_MEMZONE_NAMESIZE];
const struct rte_memzone *mz;
- snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
- dev->device->driver->name, ring_name,
- dev->data->port_id, queue_id);
+ snprintf(z_name, sizeof(z_name), "eth_p%d_q%d_%s",
+ dev->data->port_id, queue_id, ring_name);
mz = rte_memzone_lookup(z_name);
if (mz)
@@ -3459,10 +3587,8 @@ rte_eth_dev_create(struct rte_device *device, const char *name,
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
ethdev = rte_eth_dev_allocate(name);
- if (!ethdev) {
- retval = -ENODEV;
- goto probe_failed;
- }
+ if (!ethdev)
+ return -ENODEV;
if (priv_data_size) {
ethdev->data->dev_private = rte_zmalloc_socket(
@@ -3480,8 +3606,7 @@ rte_eth_dev_create(struct rte_device *device, const char *name,
if (!ethdev) {
RTE_LOG(ERR, EAL, "secondary process attach failed, "
"ethdev doesn't exist");
- retval = -ENODEV;
- goto probe_failed;
+ return -ENODEV;
}
}
@@ -3505,13 +3630,9 @@ rte_eth_dev_create(struct rte_device *device, const char *name,
rte_eth_dev_probing_finish(ethdev);
return retval;
-probe_failed:
- /* free ports private data if primary process */
- if (rte_eal_process_type() == RTE_PROC_PRIMARY)
- rte_free(ethdev->data->dev_private);
+probe_failed:
rte_eth_dev_release_port(ethdev);
-
return retval;
}
@@ -3532,11 +3653,6 @@ rte_eth_dev_destroy(struct rte_eth_dev *ethdev,
return ret;
}
- if (rte_eal_process_type() == RTE_PROC_PRIMARY)
- rte_free(ethdev->data->dev_private);
-
- ethdev->data->dev_private = NULL;
-
return rte_eth_dev_release_port(ethdev);
}
@@ -4195,7 +4311,7 @@ enum rte_eth_switch_domain_state {
* RTE_MAX_ETHPORTS elements as there cannot be more active switch domains than
* ethdev ports in a single process.
*/
-struct rte_eth_dev_switch {
+static struct rte_eth_dev_switch {
enum rte_eth_switch_domain_state state;
} rte_eth_switch_domains[RTE_MAX_ETHPORTS];
@@ -4236,8 +4352,6 @@ rte_eth_switch_domain_free(uint16_t domain_id)
return 0;
}
-typedef int (*rte_eth_devargs_callback_t)(char *str, void *data);
-
static int
rte_eth_devargs_tokenise(struct rte_kvargs *arglist, const char *str_in)
{
@@ -4302,89 +4416,6 @@ rte_eth_devargs_tokenise(struct rte_kvargs *arglist, const char *str_in)
}
}
-static int
-rte_eth_devargs_parse_list(char *str, rte_eth_devargs_callback_t callback,
- void *data)
-{
- char *str_start;
- int state;
- int result;
-
- if (*str != '[')
- /* Single element, not a list */
- return callback(str, data);
-
- /* Sanity check, then strip the brackets */
- str_start = &str[strlen(str) - 1];
- if (*str_start != ']') {
- RTE_LOG(ERR, EAL, "(%s): List does not end with ']'", str);
- return -EINVAL;
- }
- str++;
- *str_start = '\0';
-
- /* Process list elements */
- state = 0;
- while (1) {
- if (state == 0) {
- if (*str == '\0')
- break;
- if (*str != ',') {
- str_start = str;
- state = 1;
- }
- } else if (state == 1) {
- if (*str == ',' || *str == '\0') {
- if (str > str_start) {
- /* Non-empty string fragment */
- *str = '\0';
- result = callback(str_start, data);
- if (result < 0)
- return result;
- }
- state = 0;
- }
- }
- str++;
- }
- return 0;
-}
-
-static int
-rte_eth_devargs_process_range(char *str, uint16_t *list, uint16_t *len_list,
- const uint16_t max_list)
-{
- uint16_t lo, hi, val;
- int result;
-
- result = sscanf(str, "%hu-%hu", &lo, &hi);
- if (result == 1) {
- if (*len_list >= max_list)
- return -ENOMEM;
- list[(*len_list)++] = lo;
- } else if (result == 2) {
- if (lo >= hi || lo > RTE_MAX_ETHPORTS || hi > RTE_MAX_ETHPORTS)
- return -EINVAL;
- for (val = lo; val <= hi; val++) {
- if (*len_list >= max_list)
- return -ENOMEM;
- list[(*len_list)++] = val;
- }
- } else
- return -EINVAL;
- return 0;
-}
-
-
-static int
-rte_eth_devargs_parse_representor_ports(char *str, void *data)
-{
- struct rte_eth_devargs *eth_da = data;
-
- return rte_eth_devargs_process_range(str, eth_da->representor_ports,
- &eth_da->nb_representor_ports, RTE_MAX_ETHPORTS);
-}
-
int __rte_experimental
rte_eth_devargs_parse(const char *dargs, struct rte_eth_devargs *eth_da)
{
diff --git a/lib/librte_ethdev/rte_ethdev.h b/lib/librte_ethdev/rte_ethdev.h
index 7070e9ab..769a6943 100644
--- a/lib/librte_ethdev/rte_ethdev.h
+++ b/lib/librte_ethdev/rte_ethdev.h
@@ -167,6 +167,85 @@ extern int rte_eth_dev_logtype;
struct rte_mbuf;
/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Initializes a device iterator.
+ *
+ * This iterator allows accessing a list of devices matching some devargs.
+ *
+ * @param iter
+ * Device iterator handle initialized by the function.
+ * The fields bus_str and cls_str might be dynamically allocated,
+ * and could be freed by calling rte_eth_iterator_cleanup().
+ *
+ * @param devargs
+ * Device description string.
+ *
+ * @return
+ * 0 on successful initialization, negative otherwise.
+ */
+__rte_experimental
+int rte_eth_iterator_init(struct rte_dev_iterator *iter, const char *devargs);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Iterates on devices with devargs filter.
+ * The ownership is not checked.
+ *
+ * The next port id is returned, and the iterator is updated.
+ *
+ * @param iter
+ * Device iterator handle initialized by rte_eth_iterator_init().
+ * Some fields bus_str and cls_str might be freed when no more port is found,
+ * by calling rte_eth_iterator_cleanup().
+ *
+ * @return
+ * A port id if found, RTE_MAX_ETHPORTS otherwise.
+ */
+__rte_experimental
+uint16_t rte_eth_iterator_next(struct rte_dev_iterator *iter);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Free some allocated fields of the iterator.
+ *
+ * This function is automatically called by rte_eth_iterator_next()
+ * on the last iteration (i.e. when no more matching port is found).
+ *
+ * It is safe to call this function twice; it will do nothing more.
+ *
+ * @param iter
+ * Device iterator handle initialized by rte_eth_iterator_init().
+ * The fields bus_str and cls_str are freed if needed.
+ */
+__rte_experimental
+void rte_eth_iterator_cleanup(struct rte_dev_iterator *iter);
+
+/**
+ * Macro to iterate over all ethdev ports matching some devargs.
+ *
+ * If a break is done before the end of the loop,
+ * the function rte_eth_iterator_cleanup() must be called.
+ *
+ * @param id
+ * Iterated port id of type uint16_t.
+ * @param devargs
+ * Device parameters input as string of type char*.
+ * @param iter
+ * Iterator handle of type struct rte_dev_iterator, used internally.
+ */
+#define RTE_ETH_FOREACH_MATCHING_DEV(id, devargs, iter) \
+ for (rte_eth_iterator_init(iter, devargs), \
+ id = rte_eth_iterator_next(iter); \
+ id != RTE_MAX_ETHPORTS; \
+ id = rte_eth_iterator_next(iter))
+
+/**
* A structure used to retrieve statistics for an Ethernet port.
* Not all statistics fields in struct rte_eth_stats are supported
* by any type of network interface card (NIC). If any statistics
@@ -870,12 +949,6 @@ struct rte_eth_conf {
};
/**
- * A structure used to retrieve the contextual information of
- * an Ethernet device, such as the controlling driver of the device,
- * its PCI context, etc...
- */
-
-/**
* RX offload capabilities of a device.
*/
#define DEV_RX_OFFLOAD_VLAN_STRIP 0x00000001
@@ -890,16 +963,13 @@ struct rte_eth_conf {
#define DEV_RX_OFFLOAD_VLAN_FILTER 0x00000200
#define DEV_RX_OFFLOAD_VLAN_EXTEND 0x00000400
#define DEV_RX_OFFLOAD_JUMBO_FRAME 0x00000800
-#define DEV_RX_OFFLOAD_CRC_STRIP 0x00001000
#define DEV_RX_OFFLOAD_SCATTER 0x00002000
#define DEV_RX_OFFLOAD_TIMESTAMP 0x00004000
#define DEV_RX_OFFLOAD_SECURITY 0x00008000
-
-/**
- * Invalid to set both DEV_RX_OFFLOAD_CRC_STRIP and DEV_RX_OFFLOAD_KEEP_CRC
- * No DEV_RX_OFFLOAD_CRC_STRIP flag means keep CRC
- */
#define DEV_RX_OFFLOAD_KEEP_CRC 0x00010000
+#define DEV_RX_OFFLOAD_SCTP_CKSUM 0x00020000
+#define DEV_RX_OFFLOAD_OUTER_UDP_CKSUM 0x00040000
+
#define DEV_RX_OFFLOAD_CHECKSUM (DEV_RX_OFFLOAD_IPV4_CKSUM | \
DEV_RX_OFFLOAD_UDP_CKSUM | \
DEV_RX_OFFLOAD_TCP_CKSUM)
@@ -953,6 +1023,13 @@ struct rte_eth_conf {
* for tunnel TSO.
*/
#define DEV_TX_OFFLOAD_IP_TNL_TSO 0x00080000
+/** Device supports outer UDP checksum */
+#define DEV_TX_OFFLOAD_OUTER_UDP_CKSUM 0x00100000
+/**
+ * Device supports match on metadata Tx offload.
+ * Application must set PKT_TX_METADATA and mbuf metadata field.
+ */
+#define DEV_TX_OFFLOAD_MATCH_METADATA 0x00200000
#define RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP 0x00000001
/**< Device supports Rx queue setup after device started*/
@@ -1010,6 +1087,12 @@ struct rte_eth_switch_info {
/**
* Ethernet device information
*/
+
+/**
+ * A structure used to retrieve the contextual information of
+ * an Ethernet device, such as the controlling driver of the
+ * device, etc...
+ */
struct rte_eth_dev_info {
struct rte_device *device; /** Generic device information */
const char *driver_name; /**< Device Driver name. */
@@ -1260,6 +1343,11 @@ struct rte_eth_dev_owner {
char name[RTE_ETH_MAX_OWNER_NAME_LEN]; /**< The owner name. */
};
+/**
+ * Port is released (i.e. totally freed and data erased) on close.
+ * Temporary flag for PMD migration to new rte_eth_dev_close() behaviour.
+ */
+#define RTE_ETH_DEV_CLOSE_REMOVE 0x0001
/** Device supports link state interrupt */
#define RTE_ETH_DEV_INTR_LSC 0x0002
/** Device is a bonded slave */
@@ -1268,6 +1356,8 @@ struct rte_eth_dev_owner {
#define RTE_ETH_DEV_INTR_RMV 0x0008
/** Device is port representor */
#define RTE_ETH_DEV_REPRESENTOR 0x0010
+/** Device does not support MAC change after started */
+#define RTE_ETH_DEV_NOLIVE_MAC_ADDR 0x0020
/**
* Iterates over valid ethdev ports owned by a specific owner.
@@ -1420,37 +1510,6 @@ uint16_t rte_eth_dev_count_avail(void);
uint16_t __rte_experimental rte_eth_dev_count_total(void);
/**
- * Attach a new Ethernet device specified by arguments.
- *
- * @param devargs
- * A pointer to a strings array describing the new device
- * to be attached. The strings should be a pci address like
- * '0000:01:00.0' or virtual device name like 'net_pcap0'.
- * @param port_id
- * A pointer to a port identifier actually attached.
- * @return
- * 0 on success and port_id is filled, negative on error
- */
-__rte_deprecated
-int rte_eth_dev_attach(const char *devargs, uint16_t *port_id);
-
-/**
- * Detach a Ethernet device specified by port identifier.
- * This function must be called when the device is in the
- * closed state.
- *
- * @param port_id
- * The port identifier of the device to detach.
- * @param devname
- * A pointer to a buffer that will be filled with the device name.
- * This buffer must be at least RTE_DEV_NAME_MAX_LEN long.
- * @return
- * 0 on success and devname is filled, negative on error
- */
-__rte_deprecated
-int rte_eth_dev_detach(uint16_t port_id, char *devname);
-
-/**
* Convert a numerical speed in Mbps to a bitmap flag that can be used in
* the bitmap link_speeds of the struct rte_eth_conf
*
@@ -1464,9 +1523,6 @@ int rte_eth_dev_detach(uint16_t port_id, char *devname);
uint32_t rte_eth_speed_bitflag(uint32_t speed, int duplex);
/**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
* Get DEV_RX_OFFLOAD_* flag name.
*
* @param offload
@@ -1474,12 +1530,9 @@ uint32_t rte_eth_speed_bitflag(uint32_t speed, int duplex);
* @return
* Offload name or 'UNKNOWN' if the flag cannot be recognised.
*/
-const char * __rte_experimental rte_eth_dev_rx_offload_name(uint64_t offload);
+const char *rte_eth_dev_rx_offload_name(uint64_t offload);
/**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
* Get DEV_TX_OFFLOAD_* flag name.
*
* @param offload
@@ -1487,7 +1540,7 @@ const char * __rte_experimental rte_eth_dev_rx_offload_name(uint64_t offload);
* @return
* Offload name or 'UNKNOWN' if the flag cannot be recognised.
*/
-const char * __rte_experimental rte_eth_dev_tx_offload_name(uint64_t offload);
+const char *rte_eth_dev_tx_offload_name(uint64_t offload);
/**
* Configure an Ethernet device.
@@ -1750,6 +1803,10 @@ int rte_eth_dev_tx_queue_stop(uint16_t port_id, uint16_t tx_queue_id);
* The device start step is the last one and consists of setting the configured
* offload features and in starting the transmit and the receive units of the
* device.
+ *
+ * Device RTE_ETH_DEV_NOLIVE_MAC_ADDR flag causes MAC address to be set before
+ * PMD port start callback function is invoked.
+ *
* On success, all basic functions exported by the Ethernet API (link status,
* receive/transmit, and so on) can be invoked.
*
@@ -1797,8 +1854,8 @@ int rte_eth_dev_set_link_down(uint16_t port_id);
/**
* Close a stopped Ethernet device. The device cannot be restarted!
- * The function frees all resources except for needed by the
- * closed state. To free these resources, call rte_eth_dev_detach().
+ * The function frees all port resources if the driver supports
+ * the flag RTE_ETH_DEV_CLOSE_REMOVE.
*
* @param port_id
* The port identifier of the Ethernet device.
@@ -2719,6 +2776,26 @@ int rte_eth_dev_rx_intr_ctl_q(uint16_t port_id, uint16_t queue_id,
int epfd, int op, void *data);
/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get interrupt fd per Rx queue.
+ *
+ * @param port_id
+ * The port identifier of the Ethernet device.
+ * @param queue_id
+ * The index of the receive queue from which to retrieve input packets.
+ * The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ * to rte_eth_dev_configure().
+ * @return
+ * - (>=0) the interrupt fd associated to the requested Rx queue if
+ * successful.
+ * - (-1) on error.
+ */
+int __rte_experimental
+rte_eth_dev_rx_intr_ctl_q_get_fd(uint16_t port_id, uint16_t queue_id);
+
+/**
* Turn on the LED on the Ethernet device.
* This function turns on the LED on the Ethernet device.
*
diff --git a/lib/librte_ethdev/rte_ethdev_core.h b/lib/librte_ethdev/rte_ethdev_core.h
index 33d12b3a..8f03f83f 100644
--- a/lib/librte_ethdev/rte_ethdev_core.h
+++ b/lib/librte_ethdev/rte_ethdev_core.h
@@ -539,7 +539,13 @@ struct rte_eth_dev {
eth_rx_burst_t rx_pkt_burst; /**< Pointer to PMD receive function. */
eth_tx_burst_t tx_pkt_burst; /**< Pointer to PMD transmit function. */
eth_tx_prep_t tx_pkt_prepare; /**< Pointer to PMD transmit prepare function. */
- struct rte_eth_dev_data *data; /**< Pointer to device data */
+ /**
+ * Next two fields are per-device data but *data is shared between
+ * primary and secondary processes and *process_private is per-process
+ * private. The second one is managed by PMDs if necessary.
+ */
+ struct rte_eth_dev_data *data; /**< Pointer to device data. */
+ void *process_private; /**< Pointer to per-process device data. */
const struct eth_dev_ops *dev_ops; /**< Functions exported by PMD */
struct rte_device *device; /**< Backing device */
struct rte_intr_handle *intr_handle; /**< Device interrupt handle */
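
The process_private pointer gives multi-process PMDs a home for state that cannot live in shared memory, typically file descriptors that are only valid in the calling process. A hypothetical layout (struct and field names are illustrative only):

    struct pmd_process_private {
    	int rxq_fds[RTE_MAX_QUEUES_PER_PORT];	/* valid in this process only */
    	int txq_fds[RTE_MAX_QUEUES_PER_PORT];
    };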
@@ -579,24 +585,30 @@ struct rte_eth_dev_data {
struct rte_eth_dev_sriov sriov; /**< SRIOV data */
- void *dev_private; /**< PMD-specific private data */
-
- struct rte_eth_link dev_link;
- /**< Link-level information & status */
+ void *dev_private;
+ /**< PMD-specific private data.
+ * @see rte_eth_dev_release_port()
+ */
+ struct rte_eth_link dev_link; /**< Link-level information & status. */
struct rte_eth_conf dev_conf; /**< Configuration applied to device. */
uint16_t mtu; /**< Maximum Transmission Unit. */
-
uint32_t min_rx_buf_size;
- /**< Common rx buffer size handled by all queues */
+ /**< Common RX buffer size handled by all queues. */
uint64_t rx_mbuf_alloc_failed; /**< RX ring mbuf allocation failures. */
- struct ether_addr* mac_addrs;/**< Device Ethernet Link address. */
+ struct ether_addr *mac_addrs;
+ /**< Device Ethernet link address.
+ * @see rte_eth_dev_release_port()
+ */
uint64_t mac_pool_sel[ETH_NUM_RECEIVE_MAC_ADDR];
- /** bitmap array of associating Ethernet MAC addresses to pools */
- struct ether_addr* hash_mac_addrs;
- /** Device Ethernet MAC addresses of hash filtering. */
+ /**< Bitmap associating MAC addresses to pools. */
+ struct ether_addr *hash_mac_addrs;
+ /**< Device Ethernet MAC addresses of hash filtering.
+ * @see rte_eth_dev_release_port()
+ */
uint16_t port_id; /**< Device [external] port identifier. */
+
__extension__
uint8_t promiscuous : 1, /**< RX promiscuous mode ON(1) / OFF(0). */
scattered_rx : 1, /**< RX of scattered packets is ON(1) / OFF(0) */
@@ -604,15 +616,19 @@ struct rte_eth_dev_data {
dev_started : 1, /**< Device state: STARTED(1) / STOPPED(0). */
lro : 1; /**< RX LRO is ON(1) / OFF(0) */
uint8_t rx_queue_state[RTE_MAX_QUEUES_PER_PORT];
- /** Queues state: STARTED(1) / STOPPED(0) */
+ /**< Queues state: STARTED(1) / STOPPED(0). */
uint8_t tx_queue_state[RTE_MAX_QUEUES_PER_PORT];
- /** Queues state: STARTED(1) / STOPPED(0) */
- uint32_t dev_flags; /**< Capabilities */
- enum rte_kernel_driver kdrv; /**< Kernel driver passthrough */
- int numa_node; /**< NUMA node connection */
+ /**< Queues state: STARTED(1) / STOPPED(0). */
+ uint32_t dev_flags; /**< Capabilities. */
+ enum rte_kernel_driver kdrv; /**< Kernel driver passthrough. */
+ int numa_node; /**< NUMA node connection. */
struct rte_vlan_filter_conf vlan_filter_conf;
- /**< VLAN filter configuration. */
+ /**< VLAN filter configuration. */
struct rte_eth_dev_owner owner; /**< The port owner. */
+ uint16_t representor_id;
+ /**< Switch-specific identifier.
+ * Valid if RTE_ETH_DEV_REPRESENTOR in dev_flags.
+ */
} __rte_cache_aligned;
/**
diff --git a/lib/librte_ethdev/rte_ethdev_driver.h b/lib/librte_ethdev/rte_ethdev_driver.h
index c6d9bc1a..c2ac2632 100644
--- a/lib/librte_ethdev/rte_ethdev_driver.h
+++ b/lib/librte_ethdev/rte_ethdev_driver.h
@@ -58,10 +58,17 @@ struct rte_eth_dev *rte_eth_dev_attach_secondary(const char *name);
/**
* @internal
- * Release the specified ethdev port.
+ * Notify RTE_ETH_EVENT_DESTROY and release the specified ethdev port.
+ *
+ * The following PMD-managed data fields will be freed:
+ * - dev_private
+ * - mac_addrs
+ * - hash_mac_addrs
+ * If one of these fields should not be freed,
+ * it must be reset to NULL by the PMD, typically in dev_close method.
*
* @param eth_dev
- * The *eth_dev* pointer is the address of the *rte_eth_dev* structure.
+ * Device to be detached.
* @return
* - 0 on success, negative on error
*/
@@ -324,32 +331,6 @@ typedef int (*ethdev_uninit_t)(struct rte_eth_dev *ethdev);
int __rte_experimental
rte_eth_dev_destroy(struct rte_eth_dev *ethdev, ethdev_uninit_t ethdev_uninit);
-/**
- * PMD helper function to check if keeping CRC is requested
- *
- * @note
- * When CRC_STRIP offload flag is removed and default behavior switch to
- * strip CRC, as planned, this helper function is not that useful and will be
- * removed. In PMDs this function will be replaced with check:
- * if (offloads & DEV_RX_OFFLOAD_KEEP_CRC)
- *
- * @param rx_offloads
- * offload bits to be applied
- *
- * @return
- * Return positive if keeping CRC is requested,
- * zero if stripping CRC is requested
- */
-static inline int
-rte_eth_dev_must_keep_crc(uint64_t rx_offloads)
-{
- if (rx_offloads & DEV_RX_OFFLOAD_CRC_STRIP)
- return 0;
-
- /* no KEEP_CRC or CRC_STRIP offload flags means keep CRC */
- return 1;
-}
-
#ifdef __cplusplus
}
#endif
diff --git a/lib/librte_ethdev/rte_ethdev_pci.h b/lib/librte_ethdev/rte_ethdev_pci.h
index f652596f..23257e98 100644
--- a/lib/librte_ethdev/rte_ethdev_pci.h
+++ b/lib/librte_ethdev/rte_ethdev_pci.h
@@ -135,17 +135,6 @@ rte_eth_dev_pci_allocate(struct rte_pci_device *dev, size_t private_data_size)
static inline void
rte_eth_dev_pci_release(struct rte_eth_dev *eth_dev)
{
- if (rte_eal_process_type() == RTE_PROC_PRIMARY)
- rte_free(eth_dev->data->dev_private);
-
- eth_dev->data->dev_private = NULL;
-
- /*
- * Secondary process will check the name to attach.
- * Clear this field to avoid attaching a released ports.
- */
- eth_dev->data->name[0] = '\0';
-
eth_dev->device = NULL;
eth_dev->intr_handle = NULL;
diff --git a/lib/librte_ethdev/rte_ethdev_version.map b/lib/librte_ethdev/rte_ethdev_version.map
index 38f117f0..3560c288 100644
--- a/lib/librte_ethdev/rte_ethdev_version.map
+++ b/lib/librte_ethdev/rte_ethdev_version.map
@@ -8,14 +8,12 @@ DPDK_2.2 {
rte_eth_allmulticast_get;
rte_eth_dev_allocate;
rte_eth_dev_allocated;
- rte_eth_dev_attach;
rte_eth_dev_callback_register;
rte_eth_dev_callback_unregister;
rte_eth_dev_close;
rte_eth_dev_configure;
rte_eth_dev_count;
rte_eth_dev_default_mac_addr_set;
- rte_eth_dev_detach;
rte_eth_dev_filter_supported;
rte_eth_dev_flow_ctrl_get;
rte_eth_dev_flow_ctrl_set;
@@ -220,6 +218,14 @@ DPDK_18.08 {
} DPDK_18.05;
+DPDK_18.11 {
+ global:
+
+ rte_eth_dev_rx_offload_name;
+ rte_eth_dev_tx_offload_name;
+
+} DPDK_18.08;
+
EXPERIMENTAL {
global:
@@ -235,10 +241,13 @@ EXPERIMENTAL {
rte_eth_dev_owner_new;
rte_eth_dev_owner_set;
rte_eth_dev_owner_unset;
- rte_eth_dev_rx_offload_name;
- rte_eth_dev_tx_offload_name;
+ rte_eth_dev_rx_intr_ctl_q_get_fd;
+ rte_eth_iterator_cleanup;
+ rte_eth_iterator_init;
+ rte_eth_iterator_next;
rte_eth_switch_domain_alloc;
rte_eth_switch_domain_free;
+ rte_flow_conv;
rte_flow_expand_rss;
rte_mtr_capabilities_get;
rte_mtr_create;
diff --git a/lib/librte_ethdev/rte_flow.c b/lib/librte_ethdev/rte_flow.c
index cff4b520..3277be1e 100644
--- a/lib/librte_ethdev/rte_flow.c
+++ b/lib/librte_ethdev/rte_flow.c
@@ -11,6 +11,7 @@
#include <rte_common.h>
#include <rte_errno.h>
#include <rte_branch_prediction.h>
+#include <rte_string_fns.h>
#include "rte_ethdev.h"
#include "rte_flow_driver.h"
#include "rte_flow.h"
@@ -50,10 +51,15 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
MK_FLOW_ITEM(TCP, sizeof(struct rte_flow_item_tcp)),
MK_FLOW_ITEM(SCTP, sizeof(struct rte_flow_item_sctp)),
MK_FLOW_ITEM(VXLAN, sizeof(struct rte_flow_item_vxlan)),
- MK_FLOW_ITEM(MPLS, sizeof(struct rte_flow_item_mpls)),
- MK_FLOW_ITEM(GRE, sizeof(struct rte_flow_item_gre)),
MK_FLOW_ITEM(E_TAG, sizeof(struct rte_flow_item_e_tag)),
MK_FLOW_ITEM(NVGRE, sizeof(struct rte_flow_item_nvgre)),
+ MK_FLOW_ITEM(MPLS, sizeof(struct rte_flow_item_mpls)),
+ MK_FLOW_ITEM(GRE, sizeof(struct rte_flow_item_gre)),
+ MK_FLOW_ITEM(FUZZY, sizeof(struct rte_flow_item_fuzzy)),
+ MK_FLOW_ITEM(GTP, sizeof(struct rte_flow_item_gtp)),
+ MK_FLOW_ITEM(GTPC, sizeof(struct rte_flow_item_gtp)),
+ MK_FLOW_ITEM(GTPU, sizeof(struct rte_flow_item_gtp)),
+ MK_FLOW_ITEM(ESP, sizeof(struct rte_flow_item_esp)),
MK_FLOW_ITEM(GENEVE, sizeof(struct rte_flow_item_geneve)),
MK_FLOW_ITEM(VXLAN_GPE, sizeof(struct rte_flow_item_vxlan_gpe)),
MK_FLOW_ITEM(ARP_ETH_IPV4, sizeof(struct rte_flow_item_arp_eth_ipv4)),
@@ -66,6 +72,8 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
sizeof(struct rte_flow_item_icmp6_nd_opt_sla_eth)),
MK_FLOW_ITEM(ICMP6_ND_OPT_TLA_ETH,
sizeof(struct rte_flow_item_icmp6_nd_opt_tla_eth)),
+ MK_FLOW_ITEM(MARK, sizeof(struct rte_flow_item_mark)),
+ MK_FLOW_ITEM(META, sizeof(struct rte_flow_item_meta)),
};
/** Generate flow_action[] entry. */
@@ -80,6 +88,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
MK_FLOW_ACTION(END, 0),
MK_FLOW_ACTION(VOID, 0),
MK_FLOW_ACTION(PASSTHRU, 0),
+ MK_FLOW_ACTION(JUMP, sizeof(struct rte_flow_action_jump)),
MK_FLOW_ACTION(MARK, sizeof(struct rte_flow_action_mark)),
MK_FLOW_ACTION(FLAG, 0),
MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
@@ -90,6 +99,8 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
MK_FLOW_ACTION(PORT_ID, sizeof(struct rte_flow_action_port_id)),
+ MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
+ MK_FLOW_ACTION(SECURITY, sizeof(struct rte_flow_action_security)),
MK_FLOW_ACTION(OF_SET_MPLS_TTL,
sizeof(struct rte_flow_action_of_set_mpls_ttl)),
MK_FLOW_ACTION(OF_DEC_MPLS_TTL, 0),
@@ -109,6 +120,29 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
sizeof(struct rte_flow_action_of_pop_mpls)),
MK_FLOW_ACTION(OF_PUSH_MPLS,
sizeof(struct rte_flow_action_of_push_mpls)),
+ MK_FLOW_ACTION(VXLAN_ENCAP, sizeof(struct rte_flow_action_vxlan_encap)),
+ MK_FLOW_ACTION(VXLAN_DECAP, 0),
+ MK_FLOW_ACTION(NVGRE_ENCAP, sizeof(struct rte_flow_action_vxlan_encap)),
+ MK_FLOW_ACTION(NVGRE_DECAP, 0),
+ MK_FLOW_ACTION(RAW_ENCAP, sizeof(struct rte_flow_action_raw_encap)),
+ MK_FLOW_ACTION(RAW_DECAP, sizeof(struct rte_flow_action_raw_decap)),
+ MK_FLOW_ACTION(SET_IPV4_SRC,
+ sizeof(struct rte_flow_action_set_ipv4)),
+ MK_FLOW_ACTION(SET_IPV4_DST,
+ sizeof(struct rte_flow_action_set_ipv4)),
+ MK_FLOW_ACTION(SET_IPV6_SRC,
+ sizeof(struct rte_flow_action_set_ipv6)),
+ MK_FLOW_ACTION(SET_IPV6_DST,
+ sizeof(struct rte_flow_action_set_ipv6)),
+ MK_FLOW_ACTION(SET_TP_SRC,
+ sizeof(struct rte_flow_action_set_tp)),
+ MK_FLOW_ACTION(SET_TP_DST,
+ sizeof(struct rte_flow_action_set_tp)),
+ MK_FLOW_ACTION(MAC_SWAP, 0),
+ MK_FLOW_ACTION(DEC_TTL, 0),
+ MK_FLOW_ACTION(SET_TTL, sizeof(struct rte_flow_action_set_ttl)),
+ MK_FLOW_ACTION(SET_MAC_SRC, sizeof(struct rte_flow_action_set_mac)),
+ MK_FLOW_ACTION(SET_MAC_DST, sizeof(struct rte_flow_action_set_mac)),
};
static int
@@ -288,26 +322,41 @@ rte_flow_error_set(struct rte_flow_error *error,
}
/** Pattern item specification types. */
-enum item_spec_type {
- ITEM_SPEC,
- ITEM_LAST,
- ITEM_MASK,
+enum rte_flow_conv_item_spec_type {
+ RTE_FLOW_CONV_ITEM_SPEC,
+ RTE_FLOW_CONV_ITEM_LAST,
+ RTE_FLOW_CONV_ITEM_MASK,
};
-/** Compute storage space needed by item specification and copy it. */
+/**
+ * Copy pattern item specification.
+ *
+ * @param[out] buf
+ * Output buffer. Can be NULL if @p size is zero.
+ * @param size
+ * Size of @p buf in bytes.
+ * @param[in] item
+ * Pattern item to copy specification from.
+ * @param type
+ * Specification selector for either @p spec, @p last or @p mask.
+ *
+ * @return
+ * Number of bytes needed to store pattern item specification regardless
+ * of @p size. @p buf contents are truncated to @p size if not large
+ * enough.
+ */
static size_t
-flow_item_spec_copy(void *buf, const struct rte_flow_item *item,
- enum item_spec_type type)
+rte_flow_conv_item_spec(void *buf, const size_t size,
+ const struct rte_flow_item *item,
+ enum rte_flow_conv_item_spec_type type)
{
- size_t size = 0;
+ size_t off;
const void *data =
- type == ITEM_SPEC ? item->spec :
- type == ITEM_LAST ? item->last :
- type == ITEM_MASK ? item->mask :
+ type == RTE_FLOW_CONV_ITEM_SPEC ? item->spec :
+ type == RTE_FLOW_CONV_ITEM_LAST ? item->last :
+ type == RTE_FLOW_CONV_ITEM_MASK ? item->mask :
NULL;
- if (!item->spec || !data)
- goto empty;
switch (item->type) {
union {
const struct rte_flow_item_raw *raw;
@@ -324,7 +373,7 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item,
union {
struct rte_flow_item_raw *raw;
} dst;
- size_t off;
+ size_t tmp;
case RTE_FLOW_ITEM_TYPE_RAW:
spec.raw = item->spec;
@@ -332,91 +381,466 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item,
mask.raw = item->mask ? item->mask : &rte_flow_item_raw_mask;
src.raw = data;
dst.raw = buf;
- off = RTE_ALIGN_CEIL(sizeof(struct rte_flow_item_raw),
- sizeof(*src.raw->pattern));
- if (type == ITEM_SPEC ||
- (type == ITEM_MASK &&
+ rte_memcpy(dst.raw,
+ (&(struct rte_flow_item_raw){
+ .relative = src.raw->relative,
+ .search = src.raw->search,
+ .reserved = src.raw->reserved,
+ .offset = src.raw->offset,
+ .limit = src.raw->limit,
+ .length = src.raw->length,
+ }),
+ size > sizeof(*dst.raw) ? sizeof(*dst.raw) : size);
+ off = sizeof(*dst.raw);
+ if (type == RTE_FLOW_CONV_ITEM_SPEC ||
+ (type == RTE_FLOW_CONV_ITEM_MASK &&
((spec.raw->length & mask.raw->length) >=
(last.raw->length & mask.raw->length))))
- size = spec.raw->length & mask.raw->length;
+ tmp = spec.raw->length & mask.raw->length;
else
- size = last.raw->length & mask.raw->length;
- size = off + size * sizeof(*src.raw->pattern);
- if (dst.raw) {
- memcpy(dst.raw, src.raw, sizeof(*src.raw));
- dst.raw->pattern = memcpy((uint8_t *)dst.raw + off,
- src.raw->pattern,
- size - off);
+ tmp = last.raw->length & mask.raw->length;
+ if (tmp) {
+ off = RTE_ALIGN_CEIL(off, sizeof(*dst.raw->pattern));
+ if (size >= off + tmp)
+ dst.raw->pattern = rte_memcpy
+ ((void *)((uintptr_t)dst.raw + off),
+ src.raw->pattern, tmp);
+ off += tmp;
}
break;
default:
- size = rte_flow_desc_item[item->type].size;
- if (buf)
- memcpy(buf, data, size);
+ off = rte_flow_desc_item[item->type].size;
+ rte_memcpy(buf, data, (size > off ? off : size));
break;
}
-empty:
- return RTE_ALIGN_CEIL(size, sizeof(double));
+ return off;
}
-/** Compute storage space needed by action configuration and copy it. */
+/**
+ * Copy action configuration.
+ *
+ * @param[out] buf
+ * Output buffer. Can be NULL if @p size is zero.
+ * @param size
+ * Size of @p buf in bytes.
+ * @param[in] action
+ * Action to copy configuration from.
+ *
+ * @return
+ *   Number of bytes needed to store the action configuration regardless
+ * of @p size. @p buf contents are truncated to @p size if not large
+ * enough.
+ */
static size_t
-flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
+rte_flow_conv_action_conf(void *buf, const size_t size,
+ const struct rte_flow_action *action)
{
- size_t size = 0;
+ size_t off;
- if (!action->conf)
- goto empty;
switch (action->type) {
union {
const struct rte_flow_action_rss *rss;
+ const struct rte_flow_action_vxlan_encap *vxlan_encap;
+ const struct rte_flow_action_nvgre_encap *nvgre_encap;
} src;
union {
struct rte_flow_action_rss *rss;
+ struct rte_flow_action_vxlan_encap *vxlan_encap;
+ struct rte_flow_action_nvgre_encap *nvgre_encap;
} dst;
- size_t off;
+ size_t tmp;
+ int ret;
case RTE_FLOW_ACTION_TYPE_RSS:
src.rss = action->conf;
dst.rss = buf;
- off = 0;
- if (dst.rss)
- *dst.rss = (struct rte_flow_action_rss){
+ rte_memcpy(dst.rss,
+ (&(struct rte_flow_action_rss){
.func = src.rss->func,
.level = src.rss->level,
.types = src.rss->types,
.key_len = src.rss->key_len,
.queue_num = src.rss->queue_num,
- };
- off += sizeof(*src.rss);
+ }),
+ size > sizeof(*dst.rss) ? sizeof(*dst.rss) : size);
+ off = sizeof(*dst.rss);
if (src.rss->key_len) {
- off = RTE_ALIGN_CEIL(off, sizeof(double));
- size = sizeof(*src.rss->key) * src.rss->key_len;
- if (dst.rss)
- dst.rss->key = memcpy
+ off = RTE_ALIGN_CEIL(off, sizeof(*dst.rss->key));
+ tmp = sizeof(*src.rss->key) * src.rss->key_len;
+ if (size >= off + tmp)
+ dst.rss->key = rte_memcpy
((void *)((uintptr_t)dst.rss + off),
- src.rss->key, size);
- off += size;
+ src.rss->key, tmp);
+ off += tmp;
}
if (src.rss->queue_num) {
- off = RTE_ALIGN_CEIL(off, sizeof(double));
- size = sizeof(*src.rss->queue) * src.rss->queue_num;
- if (dst.rss)
- dst.rss->queue = memcpy
+ off = RTE_ALIGN_CEIL(off, sizeof(*dst.rss->queue));
+ tmp = sizeof(*src.rss->queue) * src.rss->queue_num;
+ if (size >= off + tmp)
+ dst.rss->queue = rte_memcpy
((void *)((uintptr_t)dst.rss + off),
- src.rss->queue, size);
- off += size;
+ src.rss->queue, tmp);
+ off += tmp;
+ }
+ break;
+ case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
+ case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP:
+ src.vxlan_encap = action->conf;
+ dst.vxlan_encap = buf;
+ RTE_BUILD_BUG_ON(sizeof(*src.vxlan_encap) !=
+ sizeof(*src.nvgre_encap) ||
+ offsetof(struct rte_flow_action_vxlan_encap,
+ definition) !=
+ offsetof(struct rte_flow_action_nvgre_encap,
+ definition));
+ off = sizeof(*dst.vxlan_encap);
+ if (src.vxlan_encap->definition) {
+ off = RTE_ALIGN_CEIL
+ (off, sizeof(*dst.vxlan_encap->definition));
+ ret = rte_flow_conv
+ (RTE_FLOW_CONV_OP_PATTERN,
+ (void *)((uintptr_t)dst.vxlan_encap + off),
+ size > off ? size - off : 0,
+ src.vxlan_encap->definition, NULL);
+ if (ret < 0)
+ return 0;
+ if (size >= off + ret)
+ dst.vxlan_encap->definition =
+ (void *)((uintptr_t)dst.vxlan_encap +
+ off);
+ off += ret;
}
- size = off;
break;
default:
- size = rte_flow_desc_action[action->type].size;
- if (buf)
- memcpy(buf, action->conf, size);
+ off = rte_flow_desc_action[action->type].size;
+ rte_memcpy(buf, action->conf, (size > off ? off : size));
break;
}
-empty:
- return RTE_ALIGN_CEIL(size, sizeof(double));
+ return off;
+}
+
+/**
+ * Copy a list of pattern items.
+ *
+ * @param[out] dst
+ * Destination buffer. Can be NULL if @p size is zero.
+ * @param size
+ * Size of @p dst in bytes.
+ * @param[in] src
+ * Source pattern items.
+ * @param num
+ * Maximum number of pattern items to process from @p src or 0 to process
+ * the entire list. In both cases, processing stops after
+ * RTE_FLOW_ITEM_TYPE_END is encountered.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * A positive value representing the number of bytes needed to store
+ * pattern items regardless of @p size on success (@p buf contents are
+ * truncated to @p size if not large enough), a negative errno value
+ * otherwise and rte_errno is set.
+ */
+static int
+rte_flow_conv_pattern(struct rte_flow_item *dst,
+ const size_t size,
+ const struct rte_flow_item *src,
+ unsigned int num,
+ struct rte_flow_error *error)
+{
+ uintptr_t data = (uintptr_t)dst;
+ size_t off;
+ size_t ret;
+ unsigned int i;
+
+ for (i = 0, off = 0; !num || i != num; ++i, ++src, ++dst) {
+ if ((size_t)src->type >= RTE_DIM(rte_flow_desc_item) ||
+ !rte_flow_desc_item[src->type].name)
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, src,
+ "cannot convert unknown item type");
+ if (size >= off + sizeof(*dst))
+ *dst = (struct rte_flow_item){
+ .type = src->type,
+ };
+ off += sizeof(*dst);
+ if (!src->type)
+ num = i + 1;
+ }
+ num = i;
+ src -= num;
+ dst -= num;
+ do {
+ if (src->spec) {
+ off = RTE_ALIGN_CEIL(off, sizeof(double));
+ ret = rte_flow_conv_item_spec
+ ((void *)(data + off),
+ size > off ? size - off : 0, src,
+ RTE_FLOW_CONV_ITEM_SPEC);
+ if (size && size >= off + ret)
+ dst->spec = (void *)(data + off);
+ off += ret;
+
+ }
+ if (src->last) {
+ off = RTE_ALIGN_CEIL(off, sizeof(double));
+ ret = rte_flow_conv_item_spec
+ ((void *)(data + off),
+ size > off ? size - off : 0, src,
+ RTE_FLOW_CONV_ITEM_LAST);
+ if (size && size >= off + ret)
+ dst->last = (void *)(data + off);
+ off += ret;
+ }
+ if (src->mask) {
+ off = RTE_ALIGN_CEIL(off, sizeof(double));
+ ret = rte_flow_conv_item_spec
+ ((void *)(data + off),
+ size > off ? size - off : 0, src,
+ RTE_FLOW_CONV_ITEM_MASK);
+ if (size && size >= off + ret)
+ dst->mask = (void *)(data + off);
+ off += ret;
+ }
+ ++src;
+ ++dst;
+ } while (--num);
+ return off;
+}
+
+/**
+ * Copy a list of actions.
+ *
+ * @param[out] dst
+ * Destination buffer. Can be NULL if @p size is zero.
+ * @param size
+ * Size of @p dst in bytes.
+ * @param[in] src
+ * Source actions.
+ * @param num
+ * Maximum number of actions to process from @p src or 0 to process the
+ * entire list. In both cases, processing stops after
+ * RTE_FLOW_ACTION_TYPE_END is encountered.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * A positive value representing the number of bytes needed to store
+ * actions regardless of @p size on success (@p buf contents are truncated
+ * to @p size if not large enough), a negative errno value otherwise and
+ * rte_errno is set.
+ */
+static int
+rte_flow_conv_actions(struct rte_flow_action *dst,
+ const size_t size,
+ const struct rte_flow_action *src,
+ unsigned int num,
+ struct rte_flow_error *error)
+{
+ uintptr_t data = (uintptr_t)dst;
+ size_t off;
+ size_t ret;
+ unsigned int i;
+
+ for (i = 0, off = 0; !num || i != num; ++i, ++src, ++dst) {
+ if ((size_t)src->type >= RTE_DIM(rte_flow_desc_action) ||
+ !rte_flow_desc_action[src->type].name)
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+ src, "cannot convert unknown action type");
+ if (size >= off + sizeof(*dst))
+ *dst = (struct rte_flow_action){
+ .type = src->type,
+ };
+ off += sizeof(*dst);
+ if (!src->type)
+ num = i + 1;
+ }
+ num = i;
+ src -= num;
+ dst -= num;
+ do {
+ if (src->conf) {
+ off = RTE_ALIGN_CEIL(off, sizeof(double));
+ ret = rte_flow_conv_action_conf
+ ((void *)(data + off),
+ size > off ? size - off : 0, src);
+ if (size && size >= off + ret)
+ dst->conf = (void *)(data + off);
+ off += ret;
+ }
+ ++src;
+ ++dst;
+ } while (--num);
+ return off;
+}
+
+/**
+ * Copy flow rule components.
+ *
+ * This comprises the flow rule descriptor itself, attributes, pattern and
+ * actions list. NULL components in @p src are skipped.
+ *
+ * @param[out] dst
+ * Destination buffer. Can be NULL if @p size is zero.
+ * @param size
+ * Size of @p dst in bytes.
+ * @param[in] src
+ * Source flow rule descriptor.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * A positive value representing the number of bytes needed to store all
+ * components including the descriptor regardless of @p size on success
+ * (@p buf contents are truncated to @p size if not large enough), a
+ * negative errno value otherwise and rte_errno is set.
+ */
+static int
+rte_flow_conv_rule(struct rte_flow_conv_rule *dst,
+ const size_t size,
+ const struct rte_flow_conv_rule *src,
+ struct rte_flow_error *error)
+{
+ size_t off;
+ int ret;
+
+ rte_memcpy(dst,
+ (&(struct rte_flow_conv_rule){
+ .attr = NULL,
+ .pattern = NULL,
+ .actions = NULL,
+ }),
+ size > sizeof(*dst) ? sizeof(*dst) : size);
+ off = sizeof(*dst);
+ if (src->attr_ro) {
+ off = RTE_ALIGN_CEIL(off, sizeof(double));
+ if (size && size >= off + sizeof(*dst->attr))
+ dst->attr = rte_memcpy
+ ((void *)((uintptr_t)dst + off),
+ src->attr_ro, sizeof(*dst->attr));
+ off += sizeof(*dst->attr);
+ }
+ if (src->pattern_ro) {
+ off = RTE_ALIGN_CEIL(off, sizeof(double));
+ ret = rte_flow_conv_pattern((void *)((uintptr_t)dst + off),
+ size > off ? size - off : 0,
+ src->pattern_ro, 0, error);
+ if (ret < 0)
+ return ret;
+ if (size && size >= off + (size_t)ret)
+ dst->pattern = (void *)((uintptr_t)dst + off);
+ off += ret;
+ }
+ if (src->actions_ro) {
+ off = RTE_ALIGN_CEIL(off, sizeof(double));
+ ret = rte_flow_conv_actions((void *)((uintptr_t)dst + off),
+ size > off ? size - off : 0,
+ src->actions_ro, 0, error);
+ if (ret < 0)
+ return ret;
+ if (size >= off + (size_t)ret)
+ dst->actions = (void *)((uintptr_t)dst + off);
+ off += ret;
+ }
+ return off;
+}
+
+/**
+ * Retrieve the name of a pattern item/action type.
+ *
+ * @param is_action
+ * Nonzero when @p src represents an action type instead of a pattern item
+ * type.
+ * @param is_ptr
+ * Nonzero to write string address instead of contents into @p dst.
+ * @param[out] dst
+ * Destination buffer. Can be NULL if @p size is zero.
+ * @param size
+ * Size of @p dst in bytes.
+ * @param[in] src
+ * Depending on @p is_action, source pattern item or action type cast as a
+ * pointer.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * A positive value representing the number of bytes needed to store the
+ * name or its address regardless of @p size on success (@p buf contents
+ * are truncated to @p size if not large enough), a negative errno value
+ * otherwise and rte_errno is set.
+ */
+static int
+rte_flow_conv_name(int is_action,
+ int is_ptr,
+ char *dst,
+ const size_t size,
+ const void *src,
+ struct rte_flow_error *error)
+{
+ struct desc_info {
+ const struct rte_flow_desc_data *data;
+ size_t num;
+ };
+ static const struct desc_info info_rep[2] = {
+ { rte_flow_desc_item, RTE_DIM(rte_flow_desc_item), },
+ { rte_flow_desc_action, RTE_DIM(rte_flow_desc_action), },
+ };
+ const struct desc_info *const info = &info_rep[!!is_action];
+ unsigned int type = (uintptr_t)src;
+
+ if (type >= info->num)
+ return rte_flow_error_set
+ (error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "unknown object type to retrieve the name of");
+ if (!is_ptr)
+ return strlcpy(dst, info->data[type].name, size);
+ if (size >= sizeof(const char **))
+ *((const char **)dst) = info->data[type].name;
+ return sizeof(const char **);
+}
+
+/** Helper function to convert flow API objects. */
+int
+rte_flow_conv(enum rte_flow_conv_op op,
+ void *dst,
+ size_t size,
+ const void *src,
+ struct rte_flow_error *error)
+{
+ switch (op) {
+ const struct rte_flow_attr *attr;
+
+ case RTE_FLOW_CONV_OP_NONE:
+ return 0;
+ case RTE_FLOW_CONV_OP_ATTR:
+ attr = src;
+ if (size > sizeof(*attr))
+ size = sizeof(*attr);
+ rte_memcpy(dst, attr, size);
+ return sizeof(*attr);
+ case RTE_FLOW_CONV_OP_ITEM:
+ return rte_flow_conv_pattern(dst, size, src, 1, error);
+ case RTE_FLOW_CONV_OP_ACTION:
+ return rte_flow_conv_actions(dst, size, src, 1, error);
+ case RTE_FLOW_CONV_OP_PATTERN:
+ return rte_flow_conv_pattern(dst, size, src, 0, error);
+ case RTE_FLOW_CONV_OP_ACTIONS:
+ return rte_flow_conv_actions(dst, size, src, 0, error);
+ case RTE_FLOW_CONV_OP_RULE:
+ return rte_flow_conv_rule(dst, size, src, error);
+ case RTE_FLOW_CONV_OP_ITEM_NAME:
+ return rte_flow_conv_name(0, 0, dst, size, src, error);
+ case RTE_FLOW_CONV_OP_ACTION_NAME:
+ return rte_flow_conv_name(1, 0, dst, size, src, error);
+ case RTE_FLOW_CONV_OP_ITEM_NAME_PTR:
+ return rte_flow_conv_name(0, 1, dst, size, src, error);
+ case RTE_FLOW_CONV_OP_ACTION_NAME_PTR:
+ return rte_flow_conv_name(1, 1, dst, size, src, error);
+ }
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "unknown object conversion operation");
}
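
Usage sketch (not part of the patch): the *_NAME operations take the enum value cast as the source pointer, so retrieving an item type name into a caller buffer could look like the following; the buffer size is arbitrary:

	char name[32];
	int ret;

	/* Fetch the name of RTE_FLOW_ITEM_TYPE_VXLAN; a negative return means
	 * failure, a return value >= sizeof(name) means the name was truncated.
	 */
	ret = rte_flow_conv(RTE_FLOW_CONV_OP_ITEM_NAME, name, sizeof(name),
			    (void *)(uintptr_t)RTE_FLOW_ITEM_TYPE_VXLAN, NULL);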
/** Store a full rte_flow description. */
@@ -426,105 +850,49 @@ rte_flow_copy(struct rte_flow_desc *desc, size_t len,
const struct rte_flow_item *items,
const struct rte_flow_action *actions)
{
- struct rte_flow_desc *fd = NULL;
- size_t tmp;
- size_t off1 = 0;
- size_t off2 = 0;
- size_t size = 0;
-
-store:
- if (items) {
- const struct rte_flow_item *item;
-
- item = items;
- if (fd)
- fd->items = (void *)&fd->data[off1];
- do {
- struct rte_flow_item *dst = NULL;
-
- if ((size_t)item->type >=
- RTE_DIM(rte_flow_desc_item) ||
- !rte_flow_desc_item[item->type].name) {
- rte_errno = ENOTSUP;
- return 0;
- }
- if (fd)
- dst = memcpy(fd->data + off1, item,
- sizeof(*item));
- off1 += sizeof(*item);
- if (item->spec) {
- if (fd)
- dst->spec = fd->data + off2;
- off2 += flow_item_spec_copy
- (fd ? fd->data + off2 : NULL, item,
- ITEM_SPEC);
- }
- if (item->last) {
- if (fd)
- dst->last = fd->data + off2;
- off2 += flow_item_spec_copy
- (fd ? fd->data + off2 : NULL, item,
- ITEM_LAST);
- }
- if (item->mask) {
- if (fd)
- dst->mask = fd->data + off2;
- off2 += flow_item_spec_copy
- (fd ? fd->data + off2 : NULL, item,
- ITEM_MASK);
- }
- off2 = RTE_ALIGN_CEIL(off2, sizeof(double));
- } while ((item++)->type != RTE_FLOW_ITEM_TYPE_END);
- off1 = RTE_ALIGN_CEIL(off1, sizeof(double));
- }
- if (actions) {
- const struct rte_flow_action *action;
-
- action = actions;
- if (fd)
- fd->actions = (void *)&fd->data[off1];
- do {
- struct rte_flow_action *dst = NULL;
-
- if ((size_t)action->type >=
- RTE_DIM(rte_flow_desc_action) ||
- !rte_flow_desc_action[action->type].name) {
- rte_errno = ENOTSUP;
- return 0;
- }
- if (fd)
- dst = memcpy(fd->data + off1, action,
- sizeof(*action));
- off1 += sizeof(*action);
- if (action->conf) {
- if (fd)
- dst->conf = fd->data + off2;
- off2 += flow_action_conf_copy
- (fd ? fd->data + off2 : NULL, action);
- }
- off2 = RTE_ALIGN_CEIL(off2, sizeof(double));
- } while ((action++)->type != RTE_FLOW_ACTION_TYPE_END);
+ /*
+ * Overlap struct rte_flow_conv with struct rte_flow_desc in order
+ * to convert the former to the latter without wasting space.
+ */
+ struct rte_flow_conv_rule *dst =
+ len ?
+ (void *)((uintptr_t)desc +
+ (offsetof(struct rte_flow_desc, actions) -
+ offsetof(struct rte_flow_conv_rule, actions))) :
+ NULL;
+ size_t dst_size =
+ len > sizeof(*desc) - sizeof(*dst) ?
+ len - (sizeof(*desc) - sizeof(*dst)) :
+ 0;
+ struct rte_flow_conv_rule src = {
+ .attr_ro = NULL,
+ .pattern_ro = items,
+ .actions_ro = actions,
+ };
+ int ret;
+
+ RTE_BUILD_BUG_ON(sizeof(struct rte_flow_desc) <
+ sizeof(struct rte_flow_conv_rule));
+ if (dst_size &&
+ (&dst->pattern != &desc->items ||
+ &dst->actions != &desc->actions ||
+ (uintptr_t)(dst + 1) != (uintptr_t)(desc + 1))) {
+ rte_errno = EINVAL;
+ return 0;
}
- if (fd != NULL)
- return size;
- off1 = RTE_ALIGN_CEIL(off1, sizeof(double));
- tmp = RTE_ALIGN_CEIL(offsetof(struct rte_flow_desc, data),
- sizeof(double));
- size = tmp + off1 + off2;
- if (size > len)
- return size;
- fd = desc;
- if (fd != NULL) {
- *fd = (const struct rte_flow_desc) {
- .size = size,
+ ret = rte_flow_conv(RTE_FLOW_CONV_OP_RULE, dst, dst_size, &src, NULL);
+ if (ret < 0)
+ return 0;
+ ret += sizeof(*desc) - sizeof(*dst);
+ rte_memcpy(desc,
+ (&(struct rte_flow_desc){
+ .size = ret,
.attr = *attr,
- };
- tmp -= offsetof(struct rte_flow_desc, data);
- off2 = tmp + off1;
- off1 = tmp;
- goto store;
- }
- return 0;
+ .items = dst_size ? dst->pattern : NULL,
+ .actions = dst_size ? dst->actions : NULL,
+ }),
+ len > sizeof(*desc) ? sizeof(*desc) : len);
+ return ret;
}
/**
diff --git a/lib/librte_ethdev/rte_flow.h b/lib/librte_ethdev/rte_flow.h
index f8ba71cd..c0fe8792 100644
--- a/lib/librte_ethdev/rte_flow.h
+++ b/lib/librte_ethdev/rte_flow.h
@@ -18,6 +18,7 @@
#include <stdint.h>
#include <rte_arp.h>
+#include <rte_common.h>
#include <rte_ether.h>
#include <rte_eth_ctrl.h>
#include <rte_icmp.h>
@@ -413,6 +414,14 @@ enum rte_flow_item_type {
* See struct rte_flow_item_mark.
*/
RTE_FLOW_ITEM_TYPE_MARK,
+
+ /**
+ * [META]
+ *
+	 * Matches a metadata value specified in the mbuf metadata field.
+ * See struct rte_flow_item_meta.
+ */
+ RTE_FLOW_ITEM_TYPE_META,
};
/**
@@ -1156,6 +1165,22 @@ rte_flow_item_icmp6_nd_opt_tla_eth_mask = {
#endif
/**
+ * RTE_FLOW_ITEM_TYPE_META.
+ *
+ * Matches a specified metadata value.
+ */
+struct rte_flow_item_meta {
+ rte_be32_t data;
+};
+
+/** Default mask for RTE_FLOW_ITEM_TYPE_META. */
+#ifndef __cplusplus
+static const struct rte_flow_item_meta rte_flow_item_meta_mask = {
+ .data = RTE_BE32(UINT32_MAX),
+};
+#endif
+
+/**
* @warning
* @b EXPERIMENTAL: this structure may change without prior notice
*
@@ -1505,6 +1530,127 @@ enum rte_flow_action_type {
* error.
*/
RTE_FLOW_ACTION_TYPE_NVGRE_DECAP,
+
+ /**
+ * Add outer header whose template is provided in its data buffer
+ *
+ * See struct rte_flow_action_raw_encap.
+ */
+ RTE_FLOW_ACTION_TYPE_RAW_ENCAP,
+
+ /**
+ * Remove outer header whose template is provided in its data buffer.
+ *
+ * See struct rte_flow_action_raw_decap
+ */
+ RTE_FLOW_ACTION_TYPE_RAW_DECAP,
+
+ /**
+ * Modify IPv4 source address in the outermost IPv4 header.
+ *
+ * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_IPV4,
+ * then the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION error.
+ *
+ * See struct rte_flow_action_set_ipv4.
+ */
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC,
+
+ /**
+ * Modify IPv4 destination address in the outermost IPv4 header.
+ *
+ * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_IPV4,
+ * then the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION error.
+ *
+ * See struct rte_flow_action_set_ipv4.
+ */
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_DST,
+
+ /**
+ * Modify IPv6 source address in the outermost IPv6 header.
+ *
+ * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_IPV6,
+ * then the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION error.
+ *
+ * See struct rte_flow_action_set_ipv6.
+ */
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC,
+
+ /**
+ * Modify IPv6 destination address in the outermost IPv6 header.
+ *
+ * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_IPV6,
+ * then the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION error.
+ *
+ * See struct rte_flow_action_set_ipv6.
+ */
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_DST,
+
+ /**
+ * Modify source port number in the outermost TCP/UDP header.
+ *
+ * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_TCP
+ * or RTE_FLOW_ITEM_TYPE_UDP, then the PMD should return a
+ * RTE_FLOW_ERROR_TYPE_ACTION error.
+ *
+ * See struct rte_flow_action_set_tp.
+ */
+ RTE_FLOW_ACTION_TYPE_SET_TP_SRC,
+
+ /**
+ * Modify destination port number in the outermost TCP/UDP header.
+ *
+ * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_TCP
+ * or RTE_FLOW_ITEM_TYPE_UDP, then the PMD should return a
+ * RTE_FLOW_ERROR_TYPE_ACTION error.
+ *
+ * See struct rte_flow_action_set_tp.
+ */
+ RTE_FLOW_ACTION_TYPE_SET_TP_DST,
+
+ /**
+ * Swap the source and destination MAC addresses in the outermost
+ * Ethernet header.
+ *
+ * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_ETH,
+ * then the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION error.
+ *
+ * No associated configuration structure.
+ */
+ RTE_FLOW_ACTION_TYPE_MAC_SWAP,
+
+ /**
+ * Decrease TTL value directly
+ *
+ * No associated configuration structure.
+ */
+ RTE_FLOW_ACTION_TYPE_DEC_TTL,
+
+ /**
+ * Set TTL value
+ *
+ * See struct rte_flow_action_set_ttl
+ */
+ RTE_FLOW_ACTION_TYPE_SET_TTL,
+
+ /**
+ * Set source MAC address from matched flow.
+ *
+ * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_ETH,
+ * the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION error.
+ *
+ * See struct rte_flow_action_set_mac.
+ */
+ RTE_FLOW_ACTION_TYPE_SET_MAC_SRC,
+
+ /**
+ * Set destination MAC address from matched flow.
+ *
+ * If flow pattern does not define a valid RTE_FLOW_ITEM_TYPE_ETH,
+ * the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION error.
+ *
+ * See struct rte_flow_action_set_mac.
+ */
+ RTE_FLOW_ACTION_TYPE_SET_MAC_DST,
};
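
For instance, a minimal sketch (assuming a PMD that supports the new actions; the queue index and address are arbitrary) pairs SET_IPV4_SRC with the IPv4 pattern item it requires before calling rte_flow_validate()/rte_flow_create():

	struct rte_flow_action_set_ipv4 set_src = {
		.ipv4_addr = RTE_BE32(0x0a000001), /* 10.0.0.1 */
	};
	struct rte_flow_item pattern[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH },
		{ .type = RTE_FLOW_ITEM_TYPE_IPV4 }, /* required by SET_IPV4_SRC */
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC, .conf = &set_src },
		{ .type = RTE_FLOW_ACTION_TYPE_QUEUE,
		  .conf = &(struct rte_flow_action_queue){ .index = 0 } },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};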
/**
@@ -1868,6 +2014,114 @@ struct rte_flow_action_nvgre_encap {
struct rte_flow_item *definition;
};
+/**
+ * @warning
+ * @b EXPERIMENTAL: this structure may change without prior notice
+ *
+ * RTE_FLOW_ACTION_TYPE_RAW_ENCAP
+ *
+ * Raw tunnel end-point encapsulation data definition.
+ *
+ * The data holds the header definitions to be applied to the packet.
+ * The data must start with the ETH header and go up to the tunnel item
+ * header itself. When used right after RAW_DECAP (for decapsulating an L3
+ * tunnel type, for example MPLSoGRE), the data holds only the layer 2
+ * header.
+ *
+ * The preserve parameter indicates which bits in the packet the PMD is not
+ * allowed to change; it can be NULL, in which case the PMD may update any
+ * field.
+ *
+ * size holds the number of bytes in @p data and @p preserve.
+ */
+struct rte_flow_action_raw_encap {
+ uint8_t *data; /**< Encapsulation data. */
+ uint8_t *preserve; /**< Bit-mask of @p data to preserve on output. */
+ size_t size; /**< Size of @p data and @p preserve. */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this structure may change without prior notice
+ *
+ * RTE_FLOW_ACTION_TYPE_RAW_DECAP
+ *
+ * Raw tunnel end-point decapsulation data definition.
+ *
+ * The data holds the header definitions to be removed from the packet.
+ * The data must start with the ETH header and go up to the tunnel item
+ * header itself. When used right before RAW_ENCAP (for encapsulating an L3
+ * tunnel type, for example MPLSoGRE), the data holds only the layer 2
+ * header.
+ *
+ * size holds the number of bytes in @p data.
+ */
+struct rte_flow_action_raw_decap {
+	uint8_t *data; /**< Header data to be removed. */
+	size_t size; /**< Size of @p data. */
+};
+
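
As an illustrative sketch only (the template buffer below is a zeroed placeholder; a real application must fill it with valid ETH plus tunnel headers), a RAW_ENCAP action is configured by pointing data at a caller-built header template:

	uint8_t encap_hdr[64] = { 0 }; /* placeholder ETH + tunnel template */
	struct rte_flow_action_raw_encap raw_encap = {
		.data = encap_hdr,
		.preserve = NULL, /* NULL: the PMD may rewrite any field */
		.size = sizeof(encap_hdr),
	};
	struct rte_flow_action action = {
		.type = RTE_FLOW_ACTION_TYPE_RAW_ENCAP,
		.conf = &raw_encap,
	};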
+/**
+ * @warning
+ * @b EXPERIMENTAL: this structure may change without prior notice
+ *
+ * RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC
+ * RTE_FLOW_ACTION_TYPE_SET_IPV4_DST
+ *
+ * Allows modification of IPv4 source (RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC)
+ * and destination address (RTE_FLOW_ACTION_TYPE_SET_IPV4_DST) in the
+ * specified outermost IPv4 header.
+ */
+struct rte_flow_action_set_ipv4 {
+ rte_be32_t ipv4_addr;
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this structure may change without prior notice
+ *
+ * RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC
+ * RTE_FLOW_ACTION_TYPE_SET_IPV6_DST
+ *
+ * Allows modification of IPv6 source (RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC)
+ * and destination address (RTE_FLOW_ACTION_TYPE_SET_IPV6_DST) in the
+ * specified outermost IPv6 header.
+ */
+struct rte_flow_action_set_ipv6 {
+ uint8_t ipv6_addr[16];
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this structure may change without prior notice
+ *
+ * RTE_FLOW_ACTION_TYPE_SET_TP_SRC
+ * RTE_FLOW_ACTION_TYPE_SET_TP_DST
+ *
+ * Allows modification of source (RTE_FLOW_ACTION_TYPE_SET_TP_SRC)
+ * and destination (RTE_FLOW_ACTION_TYPE_SET_TP_DST) port numbers
+ * in the specified outermost TCP/UDP header.
+ */
+struct rte_flow_action_set_tp {
+ rte_be16_t port;
+};
+
+/**
+ * RTE_FLOW_ACTION_TYPE_SET_TTL
+ *
+ * Set the TTL value directly for IPv4 or IPv6
+ */
+struct rte_flow_action_set_ttl {
+ uint8_t ttl_value;
+};
+
+/**
+ * RTE_FLOW_ACTION_TYPE_SET_MAC
+ *
+ * Set MAC address from the matched flow
+ */
+struct rte_flow_action_set_mac {
+ uint8_t mac_addr[ETHER_ADDR_LEN];
+};
+
/*
* Definition of a single action.
*
@@ -1932,6 +2186,175 @@ struct rte_flow_error {
};
/**
+ * Complete flow rule description.
+ *
+ * This object type is used when converting a flow rule description.
+ *
+ * @see RTE_FLOW_CONV_OP_RULE
+ * @see rte_flow_conv()
+ */
+RTE_STD_C11
+struct rte_flow_conv_rule {
+ union {
+ const struct rte_flow_attr *attr_ro; /**< RO attributes. */
+ struct rte_flow_attr *attr; /**< Attributes. */
+ };
+ union {
+ const struct rte_flow_item *pattern_ro; /**< RO pattern. */
+ struct rte_flow_item *pattern; /**< Pattern items. */
+ };
+ union {
+ const struct rte_flow_action *actions_ro; /**< RO actions. */
+ struct rte_flow_action *actions; /**< List of actions. */
+ };
+};
+
+/**
+ * Conversion operations for flow API objects.
+ *
+ * @see rte_flow_conv()
+ */
+enum rte_flow_conv_op {
+ /**
+ * No operation to perform.
+ *
+ * rte_flow_conv() simply returns 0.
+ */
+ RTE_FLOW_CONV_OP_NONE,
+
+ /**
+ * Convert attributes structure.
+ *
+ * This is a basic copy of an attributes structure.
+ *
+ * - @p src type:
+ * @code const struct rte_flow_attr * @endcode
+ * - @p dst type:
+ * @code struct rte_flow_attr * @endcode
+ */
+ RTE_FLOW_CONV_OP_ATTR,
+
+ /**
+ * Convert a single item.
+ *
+ * Duplicates @p spec, @p last and @p mask but not outside objects.
+ *
+ * - @p src type:
+ * @code const struct rte_flow_item * @endcode
+ * - @p dst type:
+ * @code struct rte_flow_item * @endcode
+ */
+ RTE_FLOW_CONV_OP_ITEM,
+
+ /**
+ * Convert a single action.
+ *
+ * Duplicates @p conf but not outside objects.
+ *
+ * - @p src type:
+ * @code const struct rte_flow_action * @endcode
+ * - @p dst type:
+ * @code struct rte_flow_action * @endcode
+ */
+ RTE_FLOW_CONV_OP_ACTION,
+
+ /**
+ * Convert an entire pattern.
+ *
+ * Duplicates all pattern items at once with the same constraints as
+ * RTE_FLOW_CONV_OP_ITEM.
+ *
+ * - @p src type:
+ * @code const struct rte_flow_item * @endcode
+ * - @p dst type:
+ * @code struct rte_flow_item * @endcode
+ */
+ RTE_FLOW_CONV_OP_PATTERN,
+
+ /**
+ * Convert a list of actions.
+ *
+ * Duplicates the entire list of actions at once with the same
+ * constraints as RTE_FLOW_CONV_OP_ACTION.
+ *
+ * - @p src type:
+ * @code const struct rte_flow_action * @endcode
+ * - @p dst type:
+ * @code struct rte_flow_action * @endcode
+ */
+ RTE_FLOW_CONV_OP_ACTIONS,
+
+ /**
+ * Convert a complete flow rule description.
+ *
+ * Comprises attributes, pattern and actions together at once with
+ * the usual constraints.
+ *
+ * - @p src type:
+ * @code const struct rte_flow_conv_rule * @endcode
+ * - @p dst type:
+ * @code struct rte_flow_conv_rule * @endcode
+ */
+ RTE_FLOW_CONV_OP_RULE,
+
+ /**
+ * Convert item type to its name string.
+ *
+ * Writes a NUL-terminated string to @p dst. Like snprintf(), the
+ * returned value excludes the terminator which is always written
+ * nonetheless.
+ *
+ * - @p src type:
+ * @code (const void *)enum rte_flow_item_type @endcode
+ * - @p dst type:
+ * @code char * @endcode
+ **/
+ RTE_FLOW_CONV_OP_ITEM_NAME,
+
+ /**
+ * Convert action type to its name string.
+ *
+ * Writes a NUL-terminated string to @p dst. Like snprintf(), the
+ * returned value excludes the terminator which is always written
+ * nonetheless.
+ *
+ * - @p src type:
+ * @code (const void *)enum rte_flow_action_type @endcode
+ * - @p dst type:
+ * @code char * @endcode
+ **/
+ RTE_FLOW_CONV_OP_ACTION_NAME,
+
+ /**
+ * Convert item type to pointer to item name.
+ *
+ * Retrieves item name pointer from its type. The string itself is
+ * not copied; instead, a unique pointer to an internal static
+ * constant storage is written to @p dst.
+ *
+ * - @p src type:
+ * @code (const void *)enum rte_flow_item_type @endcode
+ * - @p dst type:
+ * @code const char ** @endcode
+ */
+ RTE_FLOW_CONV_OP_ITEM_NAME_PTR,
+
+ /**
+ * Convert action type to pointer to action name.
+ *
+ * Retrieves action name pointer from its type. The string itself is
+ * not copied; instead, a unique pointer to an internal static
+ * constant storage is written to @p dst.
+ *
+ * - @p src type:
+ * @code (const void *)enum rte_flow_action_type @endcode
+ * - @p dst type:
+ * @code const char ** @endcode
+ */
+ RTE_FLOW_CONV_OP_ACTION_NAME_PTR,
+};
+
+/**
* Check whether a flow rule can be created on a given port.
*
* The flow rule is validated for correctness and whether it could be accepted
@@ -2162,10 +2585,8 @@ rte_flow_error_set(struct rte_flow_error *error,
const char *message);
/**
- * Generic flow representation.
- *
- * This form is sufficient to describe an rte_flow independently from any
- * PMD implementation and allows for replayability and identification.
+ * @deprecated
+ * @see rte_flow_copy()
*/
struct rte_flow_desc {
size_t size; /**< Allocated space including data[]. */
@@ -2176,8 +2597,14 @@ struct rte_flow_desc {
};
/**
+ * @deprecated
* Copy an rte_flow rule description.
*
+ * This interface is kept for compatibility with older applications but is
+ * implemented as a wrapper to rte_flow_conv(). It is deprecated due to its
+ * lack of flexibility and reliance on a type unusable with C++ programs
+ * (struct rte_flow_desc).
+ *
* @param[in] fd
* Flow rule description.
* @param[in] len
@@ -2195,12 +2622,61 @@ struct rte_flow_desc {
* If len is lower than the size of the flow, the number of bytes that would
* have been written to desc had it been sufficient. Nothing is written.
*/
+__rte_deprecated
size_t
rte_flow_copy(struct rte_flow_desc *fd, size_t len,
const struct rte_flow_attr *attr,
const struct rte_flow_item *items,
const struct rte_flow_action *actions);
+/**
+ * Flow object conversion helper.
+ *
+ * This function performs conversion of various flow API objects to a
+ * pre-allocated destination buffer. See enum rte_flow_conv_op for possible
+ * operations and details about each of them.
+ *
+ * Since the destination buffer must be large enough, it works in a manner
+ * reminiscent of snprintf():
+ *
+ * - If @p size is 0, @p dst may be a NULL pointer, otherwise @p dst must be
+ * non-NULL.
+ * - If positive, the returned value represents the number of bytes needed
+ * to store the conversion of @p src to @p dst according to @p op
+ * regardless of the @p size parameter.
+ * - Since no more than @p size bytes can be written to @p dst, output is
+ * truncated and may be inconsistent when the returned value is larger
+ * than that.
+ * - In case of conversion error, a negative error code is returned and
+ * @p dst contents are unspecified.
+ *
+ * @param op
+ * Operation to perform, related to the object type of @p dst.
+ * @param[out] dst
+ * Destination buffer address. Must be suitably aligned by the caller.
+ * @param size
+ * Destination buffer size in bytes.
+ * @param[in] src
+ * Source object to copy. Depending on @p op, its type may differ from
+ * that of @p dst.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL. Initialized in case of
+ * error only.
+ *
+ * @return
+ * The number of bytes required to convert @p src to @p dst on success, a
+ * negative errno value otherwise and rte_errno is set.
+ *
+ * @see rte_flow_conv_op
+ */
+__rte_experimental
+int
+rte_flow_conv(enum rte_flow_conv_op op,
+ void *dst,
+ size_t size,
+ const void *src,
+ struct rte_flow_error *error);
+
#ifdef __cplusplus
}
#endif
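
A hedged usage sketch, assuming attr, pattern and actions are valid caller-owned objects and that this code lives inside a helper returning a negative errno on failure: because rte_flow_conv() behaves like snprintf(), a rule is typically converted in two passes, first querying the required size, then filling an allocated buffer:

	struct rte_flow_conv_rule src = {
		.attr_ro = attr,
		.pattern_ro = pattern,
		.actions_ro = actions,
	};
	struct rte_flow_conv_rule *dup;
	struct rte_flow_error error;
	int len;

	/* First pass: query the number of bytes needed. */
	len = rte_flow_conv(RTE_FLOW_CONV_OP_RULE, NULL, 0, &src, &error);
	if (len < 0)
		return len; /* rte_errno is already set */
	dup = rte_malloc(NULL, len, 0);
	if (dup == NULL)
		return -ENOMEM;
	/* Second pass: perform the actual conversion. */
	if (rte_flow_conv(RTE_FLOW_CONV_OP_RULE, dup, len, &src, &error) < 0) {
		rte_free(dup);
		return -rte_errno;
	}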
diff --git a/lib/librte_ethdev/rte_tm.h b/lib/librte_ethdev/rte_tm.h
index 955f02ff..646ef388 100644
--- a/lib/librte_ethdev/rte_tm.h
+++ b/lib/librte_ethdev/rte_tm.h
@@ -831,10 +831,10 @@ enum rte_tm_cman_mode {
*/
struct rte_tm_red_params {
/** Minimum queue threshold */
- uint32_t min_th;
+ uint64_t min_th;
/** Maximum queue threshold */
- uint32_t max_th;
+ uint64_t max_th;
/** Inverse of packet marking probability maximum value (maxp), i.e.
* maxp_inv = 1 / maxp
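
A short sketch of why the wider type matters (values are arbitrary): with byte-based WRED thresholds on a deep queue, min_th/max_th no longer have to fit in 32 bits:

	struct rte_tm_red_params red = {
		.min_th = 2ULL * 1024 * 1024 * 1024, /* 2 GiB */
		.max_th = 6ULL * 1024 * 1024 * 1024, /* 6 GiB, exceeds UINT32_MAX */
		.maxp_inv = 10,
		.wq_log2 = 9,
	};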
diff --git a/lib/librte_eventdev/Makefile b/lib/librte_eventdev/Makefile
index 47f599a6..94961870 100644
--- a/lib/librte_eventdev/Makefile
+++ b/lib/librte_eventdev/Makefile
@@ -8,7 +8,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
LIB = librte_eventdev.a
# library version
-LIBABIVER := 5
+LIBABIVER := 6
# build flags
CFLAGS += -DALLOW_EXPERIMENTAL_API
@@ -28,6 +28,7 @@ SRCS-y += rte_event_ring.c
SRCS-y += rte_event_eth_rx_adapter.c
SRCS-y += rte_event_timer_adapter.c
SRCS-y += rte_event_crypto_adapter.c
+SRCS-y += rte_event_eth_tx_adapter.c
# export include files
SYMLINK-y-include += rte_eventdev.h
@@ -39,6 +40,7 @@ SYMLINK-y-include += rte_event_eth_rx_adapter.h
SYMLINK-y-include += rte_event_timer_adapter.h
SYMLINK-y-include += rte_event_timer_adapter_pmd.h
SYMLINK-y-include += rte_event_crypto_adapter.h
+SYMLINK-y-include += rte_event_eth_tx_adapter.h
# versioning export map
EXPORT_MAP := rte_eventdev_version.map
diff --git a/lib/librte_eventdev/meson.build b/lib/librte_eventdev/meson.build
index 3cbaf298..6becfe86 100644
--- a/lib/librte_eventdev/meson.build
+++ b/lib/librte_eventdev/meson.build
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright(c) 2017 Intel Corporation
-version = 5
+version = 6
allow_experimental_apis = true
if host_machine.system() == 'linux'
@@ -14,7 +14,8 @@ sources = files('rte_eventdev.c',
'rte_event_ring.c',
'rte_event_eth_rx_adapter.c',
'rte_event_timer_adapter.c',
- 'rte_event_crypto_adapter.c')
+ 'rte_event_crypto_adapter.c',
+ 'rte_event_eth_tx_adapter.c')
headers = files('rte_eventdev.h',
'rte_eventdev_pmd.h',
'rte_eventdev_pmd_pci.h',
@@ -23,5 +24,6 @@ headers = files('rte_eventdev.h',
'rte_event_eth_rx_adapter.h',
'rte_event_timer_adapter.h',
'rte_event_timer_adapter_pmd.h',
- 'rte_event_crypto_adapter.h')
+ 'rte_event_crypto_adapter.h',
+ 'rte_event_eth_tx_adapter.h')
deps += ['ring', 'ethdev', 'hash', 'mempool', 'mbuf', 'timer', 'cryptodev']
diff --git a/lib/librte_eventdev/rte_event_eth_rx_adapter.c b/lib/librte_eventdev/rte_event_eth_rx_adapter.c
index f5e5a0b5..71d008cd 100644
--- a/lib/librte_eventdev/rte_event_eth_rx_adapter.c
+++ b/lib/librte_eventdev/rte_event_eth_rx_adapter.c
@@ -1125,7 +1125,6 @@ rxa_poll(struct rte_event_eth_rx_adapter *rx_adapter)
wrr_pos = rx_adapter->wrr_pos;
max_nb_rx = rx_adapter->max_nb_rx;
buf = &rx_adapter->event_enqueue_buffer;
- stats = &rx_adapter->stats;
/* Iterate through a WRR sequence */
for (num_queue = 0; num_queue < rx_adapter->wrr_len; num_queue++) {
@@ -1998,8 +1997,7 @@ rte_event_eth_rx_adapter_create_ext(uint8_t id, uint8_t dev_id,
rx_adapter->id = id;
strcpy(rx_adapter->mem_name, mem_name);
rx_adapter->eth_devices = rte_zmalloc_socket(rx_adapter->mem_name,
- /* FIXME: incompatible with hotplug */
- rte_eth_dev_count_total() *
+ RTE_MAX_ETHPORTS *
sizeof(struct eth_device_info), 0,
socket_id);
rte_convert_rss_key((const uint32_t *)default_rss_key,
@@ -2012,7 +2010,7 @@ rte_event_eth_rx_adapter_create_ext(uint8_t id, uint8_t dev_id,
return -ENOMEM;
}
rte_spinlock_init(&rx_adapter->rx_lock);
- RTE_ETH_FOREACH_DEV(i)
+ for (i = 0; i < RTE_MAX_ETHPORTS; i++)
rx_adapter->eth_devices[i].dev = &rte_eth_devices[i];
event_eth_rx_adapter[id] = rx_adapter;
diff --git a/lib/librte_eventdev/rte_event_eth_rx_adapter.h b/lib/librte_eventdev/rte_event_eth_rx_adapter.h
index 332ee216..863b72a1 100644
--- a/lib/librte_eventdev/rte_event_eth_rx_adapter.h
+++ b/lib/librte_eventdev/rte_event_eth_rx_adapter.h
@@ -76,10 +76,6 @@
* rte_event_eth_rx_adapter_cb_register() function allows the
* application to register a callback that selects which packets to enqueue
* to the event device.
- *
- * Note:
- * 1) Devices created after an instance of rte_event_eth_rx_adapter_create
- * should be added to a new instance of the rx adapter.
*/
#ifdef __cplusplus
diff --git a/lib/librte_eventdev/rte_event_eth_tx_adapter.c b/lib/librte_eventdev/rte_event_eth_tx_adapter.c
new file mode 100644
index 00000000..3a21defb
--- /dev/null
+++ b/lib/librte_eventdev/rte_event_eth_tx_adapter.c
@@ -0,0 +1,1138 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation.
+ */
+#include <rte_spinlock.h>
+#include <rte_service_component.h>
+#include <rte_ethdev.h>
+
+#include "rte_eventdev_pmd.h"
+#include "rte_event_eth_tx_adapter.h"
+
+#define TXA_BATCH_SIZE 32
+#define TXA_SERVICE_NAME_LEN 32
+#define TXA_MEM_NAME_LEN 32
+#define TXA_FLUSH_THRESHOLD 1024
+#define TXA_RETRY_CNT 100
+#define TXA_MAX_NB_TX 128
+#define TXA_INVALID_DEV_ID INT32_C(-1)
+#define TXA_INVALID_SERVICE_ID INT64_C(-1)
+
+#define txa_evdev(id) (&rte_eventdevs[txa_dev_id_array[(id)]])
+
+#define txa_dev_caps_get(id) txa_evdev((id))->dev_ops->eth_tx_adapter_caps_get
+
+#define txa_dev_adapter_create(t) txa_evdev(t)->dev_ops->eth_tx_adapter_create
+
+#define txa_dev_adapter_create_ext(t) \
+ txa_evdev(t)->dev_ops->eth_tx_adapter_create
+
+#define txa_dev_adapter_free(t) txa_evdev(t)->dev_ops->eth_tx_adapter_free
+
+#define txa_dev_queue_add(id) txa_evdev(id)->dev_ops->eth_tx_adapter_queue_add
+
+#define txa_dev_queue_del(t) txa_evdev(t)->dev_ops->eth_tx_adapter_queue_del
+
+#define txa_dev_start(t) txa_evdev(t)->dev_ops->eth_tx_adapter_start
+
+#define txa_dev_stop(t) txa_evdev(t)->dev_ops->eth_tx_adapter_stop
+
+#define txa_dev_stats_reset(t) txa_evdev(t)->dev_ops->eth_tx_adapter_stats_reset
+
+#define txa_dev_stats_get(t) txa_evdev(t)->dev_ops->eth_tx_adapter_stats_get
+
+#define RTE_EVENT_ETH_TX_ADAPTER_ID_VALID_OR_ERR_RET(id, retval) \
+do { \
+ if (!txa_valid_id(id)) { \
+		RTE_EDEV_LOG_ERR("Invalid eth Tx adapter id = %d", id); \
+ return retval; \
+ } \
+} while (0)
+
+#define TXA_CHECK_OR_ERR_RET(id) \
+do {\
+ int ret; \
+ RTE_EVENT_ETH_TX_ADAPTER_ID_VALID_OR_ERR_RET((id), -EINVAL); \
+ ret = txa_init(); \
+ if (ret != 0) \
+ return ret; \
+ if (!txa_adapter_exist((id))) \
+ return -EINVAL; \
+} while (0)
+
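+
The service code below implements the public API declared in rte_event_eth_tx_adapter.h; as a hedged application-level sketch (the device ids, port configuration values and the mbuf variable are placeholders):

	struct rte_event_port_conf port_conf = {
		.new_event_threshold = 4096,
		.dequeue_depth = 32,
		.enqueue_depth = 32,
	};
	uint8_t txa_id = 0, dev_id = 0;

	/* Create the adapter, add every Tx queue of ethdev port 0 (-1 means
	 * all queues) and start the underlying service.
	 */
	if (rte_event_eth_tx_adapter_create(txa_id, dev_id, &port_conf) ||
	    rte_event_eth_tx_adapter_queue_add(txa_id, 0, -1) ||
	    rte_event_eth_tx_adapter_start(txa_id))
		rte_panic("Tx adapter setup failed\n");

	/* Before enqueueing an mbuf to the adapter's event port, record its
	 * destination Tx queue; txa_service_tx() reads it back with
	 * rte_event_eth_tx_adapter_txq_get().
	 */
	rte_event_eth_tx_adapter_txq_set(mbuf, 0);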
+/* Tx retry callback structure */
+struct txa_retry {
+ /* Ethernet port id */
+ uint16_t port_id;
+ /* Tx queue */
+ uint16_t tx_queue;
+ /* Adapter ID */
+ uint8_t id;
+};
+
+/* Per queue structure */
+struct txa_service_queue_info {
+ /* Queue has been added */
+ uint8_t added;
+ /* Retry callback argument */
+ struct txa_retry txa_retry;
+ /* Tx buffer */
+ struct rte_eth_dev_tx_buffer *tx_buf;
+};
+
+/* PMD private structure */
+struct txa_service_data {
+ /* Max mbufs processed in any service function invocation */
+ uint32_t max_nb_tx;
+ /* Number of Tx queues in adapter */
+ uint32_t nb_queues;
+ /* Synchronization with data path */
+ rte_spinlock_t tx_lock;
+ /* Event port ID */
+ uint8_t port_id;
+ /* Event device identifier */
+ uint8_t eventdev_id;
+ /* Highest port id supported + 1 */
+ uint16_t dev_count;
+ /* Loop count to flush Tx buffers */
+ int loop_cnt;
+ /* Per ethernet device structure */
+ struct txa_service_ethdev *txa_ethdev;
+ /* Statistics */
+ struct rte_event_eth_tx_adapter_stats stats;
+ /* Adapter Identifier */
+ uint8_t id;
+ /* Conf arg must be freed */
+ uint8_t conf_free;
+ /* Configuration callback */
+ rte_event_eth_tx_adapter_conf_cb conf_cb;
+ /* Configuration callback argument */
+ void *conf_arg;
+ /* socket id */
+ int socket_id;
+ /* Per adapter EAL service */
+ int64_t service_id;
+ /* Memory allocation name */
+ char mem_name[TXA_MEM_NAME_LEN];
+} __rte_cache_aligned;
+
+/* Per eth device structure */
+struct txa_service_ethdev {
+ /* Pointer to ethernet device */
+ struct rte_eth_dev *dev;
+ /* Number of queues added */
+ uint16_t nb_queues;
+ /* PMD specific queue data */
+ void *queues;
+};
+
+/* Array of adapter instances, initialized with event device id
+ * when adapter is created
+ */
+static int *txa_dev_id_array;
+
+/* Array of pointers to service implementation data */
+static struct txa_service_data **txa_service_data_array;
+
+static int32_t txa_service_func(void *args);
+static int txa_service_adapter_create_ext(uint8_t id,
+ struct rte_eventdev *dev,
+ rte_event_eth_tx_adapter_conf_cb conf_cb,
+ void *conf_arg);
+static int txa_service_queue_del(uint8_t id,
+ const struct rte_eth_dev *dev,
+ int32_t tx_queue_id);
+
+static int
+txa_adapter_exist(uint8_t id)
+{
+ return txa_dev_id_array[id] != TXA_INVALID_DEV_ID;
+}
+
+static inline int
+txa_valid_id(uint8_t id)
+{
+ return id < RTE_EVENT_ETH_TX_ADAPTER_MAX_INSTANCE;
+}
+
+static void *
+txa_memzone_array_get(const char *name, unsigned int elt_size, int nb_elems)
+{
+ const struct rte_memzone *mz;
+ unsigned int sz;
+
+ sz = elt_size * nb_elems;
+ sz = RTE_ALIGN(sz, RTE_CACHE_LINE_SIZE);
+
+ mz = rte_memzone_lookup(name);
+ if (mz == NULL) {
+ mz = rte_memzone_reserve_aligned(name, sz, rte_socket_id(), 0,
+ RTE_CACHE_LINE_SIZE);
+ if (mz == NULL) {
+ RTE_EDEV_LOG_ERR("failed to reserve memzone"
+ " name = %s err = %"
+ PRId32, name, rte_errno);
+ return NULL;
+ }
+ }
+
+ return mz->addr;
+}
+
+static int
+txa_dev_id_array_init(void)
+{
+ if (txa_dev_id_array == NULL) {
+ int i;
+
+ txa_dev_id_array = txa_memzone_array_get("txa_adapter_array",
+ sizeof(int),
+ RTE_EVENT_ETH_TX_ADAPTER_MAX_INSTANCE);
+ if (txa_dev_id_array == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < RTE_EVENT_ETH_TX_ADAPTER_MAX_INSTANCE; i++)
+ txa_dev_id_array[i] = TXA_INVALID_DEV_ID;
+ }
+
+ return 0;
+}
+
+static int
+txa_init(void)
+{
+ return txa_dev_id_array_init();
+}
+
+static int
+txa_service_data_init(void)
+{
+ if (txa_service_data_array == NULL) {
+ txa_service_data_array =
+ txa_memzone_array_get("txa_service_data_array",
+					sizeof(*txa_service_data_array),
+ RTE_EVENT_ETH_TX_ADAPTER_MAX_INSTANCE);
+ if (txa_service_data_array == NULL)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static inline struct txa_service_data *
+txa_service_id_to_data(uint8_t id)
+{
+ return txa_service_data_array[id];
+}
+
+static inline struct txa_service_queue_info *
+txa_service_queue(struct txa_service_data *txa, uint16_t port_id,
+ uint16_t tx_queue_id)
+{
+ struct txa_service_queue_info *tqi;
+
+ if (unlikely(txa->txa_ethdev == NULL || txa->dev_count < port_id + 1))
+ return NULL;
+
+ tqi = txa->txa_ethdev[port_id].queues;
+
+ return likely(tqi != NULL) ? tqi + tx_queue_id : NULL;
+}
+
+static int
+txa_service_conf_cb(uint8_t __rte_unused id, uint8_t dev_id,
+ struct rte_event_eth_tx_adapter_conf *conf, void *arg)
+{
+ int ret;
+ struct rte_eventdev *dev;
+ struct rte_event_port_conf *pc;
+ struct rte_event_dev_config dev_conf;
+ int started;
+ uint8_t port_id;
+
+ pc = arg;
+ dev = &rte_eventdevs[dev_id];
+ dev_conf = dev->data->dev_conf;
+
+ started = dev->data->dev_started;
+ if (started)
+ rte_event_dev_stop(dev_id);
+
+ port_id = dev_conf.nb_event_ports;
+ dev_conf.nb_event_ports += 1;
+
+ ret = rte_event_dev_configure(dev_id, &dev_conf);
+ if (ret) {
+ RTE_EDEV_LOG_ERR("failed to configure event dev %u",
+ dev_id);
+ if (started) {
+ if (rte_event_dev_start(dev_id))
+ return -EIO;
+ }
+ return ret;
+ }
+
+ pc->disable_implicit_release = 0;
+ ret = rte_event_port_setup(dev_id, port_id, pc);
+ if (ret) {
+ RTE_EDEV_LOG_ERR("failed to setup event port %u\n",
+ port_id);
+ if (started) {
+ if (rte_event_dev_start(dev_id))
+ return -EIO;
+ }
+ return ret;
+ }
+
+ conf->event_port_id = port_id;
+ conf->max_nb_tx = TXA_MAX_NB_TX;
+ if (started)
+ ret = rte_event_dev_start(dev_id);
+ return ret;
+}
+
+static int
+txa_service_ethdev_alloc(struct txa_service_data *txa)
+{
+ struct txa_service_ethdev *txa_ethdev;
+ uint16_t i, dev_count;
+
+ dev_count = rte_eth_dev_count_avail();
+ if (txa->txa_ethdev && dev_count == txa->dev_count)
+ return 0;
+
+ txa_ethdev = rte_zmalloc_socket(txa->mem_name,
+ dev_count * sizeof(*txa_ethdev),
+ 0,
+ txa->socket_id);
+ if (txa_ethdev == NULL) {
+ RTE_EDEV_LOG_ERR("Failed to alloc txa::txa_ethdev ");
+ return -ENOMEM;
+ }
+
+ if (txa->dev_count)
+ memcpy(txa_ethdev, txa->txa_ethdev,
+ txa->dev_count * sizeof(*txa_ethdev));
+
+ RTE_ETH_FOREACH_DEV(i) {
+ if (i == dev_count)
+ break;
+ txa_ethdev[i].dev = &rte_eth_devices[i];
+ }
+
+ txa->txa_ethdev = txa_ethdev;
+ txa->dev_count = dev_count;
+ return 0;
+}
+
+static int
+txa_service_queue_array_alloc(struct txa_service_data *txa,
+ uint16_t port_id)
+{
+ struct txa_service_queue_info *tqi;
+ uint16_t nb_queue;
+ int ret;
+
+ ret = txa_service_ethdev_alloc(txa);
+ if (ret != 0)
+ return ret;
+
+ if (txa->txa_ethdev[port_id].queues)
+ return 0;
+
+ nb_queue = txa->txa_ethdev[port_id].dev->data->nb_tx_queues;
+ tqi = rte_zmalloc_socket(txa->mem_name,
+ nb_queue *
+ sizeof(struct txa_service_queue_info), 0,
+ txa->socket_id);
+ if (tqi == NULL)
+ return -ENOMEM;
+ txa->txa_ethdev[port_id].queues = tqi;
+ return 0;
+}
+
+static void
+txa_service_queue_array_free(struct txa_service_data *txa,
+ uint16_t port_id)
+{
+ struct txa_service_ethdev *txa_ethdev;
+ struct txa_service_queue_info *tqi;
+
+ txa_ethdev = &txa->txa_ethdev[port_id];
+ if (txa->txa_ethdev == NULL || txa_ethdev->nb_queues != 0)
+ return;
+
+ tqi = txa_ethdev->queues;
+ txa_ethdev->queues = NULL;
+ rte_free(tqi);
+
+ if (txa->nb_queues == 0) {
+ rte_free(txa->txa_ethdev);
+ txa->txa_ethdev = NULL;
+ }
+}
+
+static void
+txa_service_unregister(struct txa_service_data *txa)
+{
+ if (txa->service_id != TXA_INVALID_SERVICE_ID) {
+ rte_service_component_runstate_set(txa->service_id, 0);
+ while (rte_service_may_be_active(txa->service_id))
+ rte_pause();
+ rte_service_component_unregister(txa->service_id);
+ }
+ txa->service_id = TXA_INVALID_SERVICE_ID;
+}
+
+static int
+txa_service_register(struct txa_service_data *txa)
+{
+ int ret;
+ struct rte_service_spec service;
+ struct rte_event_eth_tx_adapter_conf conf;
+
+ if (txa->service_id != TXA_INVALID_SERVICE_ID)
+ return 0;
+
+ memset(&service, 0, sizeof(service));
+ snprintf(service.name, TXA_SERVICE_NAME_LEN, "txa_%d", txa->id);
+ service.socket_id = txa->socket_id;
+ service.callback = txa_service_func;
+ service.callback_userdata = txa;
+ service.capabilities = RTE_SERVICE_CAP_MT_SAFE;
+ ret = rte_service_component_register(&service,
+ (uint32_t *)&txa->service_id);
+ if (ret) {
+ RTE_EDEV_LOG_ERR("failed to register service %s err = %"
+ PRId32, service.name, ret);
+ return ret;
+ }
+
+ ret = txa->conf_cb(txa->id, txa->eventdev_id, &conf, txa->conf_arg);
+ if (ret) {
+ txa_service_unregister(txa);
+ return ret;
+ }
+
+ rte_service_component_runstate_set(txa->service_id, 1);
+ txa->port_id = conf.event_port_id;
+ txa->max_nb_tx = conf.max_nb_tx;
+ return 0;
+}
+
+static struct rte_eth_dev_tx_buffer *
+txa_service_tx_buf_alloc(struct txa_service_data *txa,
+ const struct rte_eth_dev *dev)
+{
+ struct rte_eth_dev_tx_buffer *tb;
+ uint16_t port_id;
+
+ port_id = dev->data->port_id;
+ tb = rte_zmalloc_socket(txa->mem_name,
+ RTE_ETH_TX_BUFFER_SIZE(TXA_BATCH_SIZE),
+ 0,
+ rte_eth_dev_socket_id(port_id));
+ if (tb == NULL)
+ RTE_EDEV_LOG_ERR("Failed to allocate memory for tx buffer");
+ return tb;
+}
+
+static int
+txa_service_is_queue_added(struct txa_service_data *txa,
+ const struct rte_eth_dev *dev,
+ uint16_t tx_queue_id)
+{
+ struct txa_service_queue_info *tqi;
+
+ tqi = txa_service_queue(txa, dev->data->port_id, tx_queue_id);
+ return tqi && tqi->added;
+}
+
+static int
+txa_service_ctrl(uint8_t id, int start)
+{
+ int ret;
+ struct txa_service_data *txa;
+
+ txa = txa_service_id_to_data(id);
+ if (txa->service_id == TXA_INVALID_SERVICE_ID)
+ return 0;
+
+ ret = rte_service_runstate_set(txa->service_id, start);
+ if (ret == 0 && !start) {
+ while (rte_service_may_be_active(txa->service_id))
+ rte_pause();
+ }
+ return ret;
+}
+
+static void
+txa_service_buffer_retry(struct rte_mbuf **pkts, uint16_t unsent,
+ void *userdata)
+{
+ struct txa_retry *tr;
+ struct txa_service_data *data;
+ struct rte_event_eth_tx_adapter_stats *stats;
+ uint16_t sent = 0;
+ unsigned int retry = 0;
+ uint16_t i, n;
+
+ tr = (struct txa_retry *)(uintptr_t)userdata;
+ data = txa_service_id_to_data(tr->id);
+ stats = &data->stats;
+
+ do {
+ n = rte_eth_tx_burst(tr->port_id, tr->tx_queue,
+ &pkts[sent], unsent - sent);
+
+ sent += n;
+ } while (sent != unsent && retry++ < TXA_RETRY_CNT);
+
+ for (i = sent; i < unsent; i++)
+ rte_pktmbuf_free(pkts[i]);
+
+ stats->tx_retry += retry;
+ stats->tx_packets += sent;
+ stats->tx_dropped += unsent - sent;
+}
+
+static void
+txa_service_tx(struct txa_service_data *txa, struct rte_event *ev,
+ uint32_t n)
+{
+ uint32_t i;
+ uint16_t nb_tx;
+ struct rte_event_eth_tx_adapter_stats *stats;
+
+ stats = &txa->stats;
+
+ nb_tx = 0;
+ for (i = 0; i < n; i++) {
+ struct rte_mbuf *m;
+ uint16_t port;
+ uint16_t queue;
+ struct txa_service_queue_info *tqi;
+
+ m = ev[i].mbuf;
+ port = m->port;
+ queue = rte_event_eth_tx_adapter_txq_get(m);
+
+ tqi = txa_service_queue(txa, port, queue);
+ if (unlikely(tqi == NULL || !tqi->added)) {
+ rte_pktmbuf_free(m);
+ continue;
+ }
+
+ nb_tx += rte_eth_tx_buffer(port, queue, tqi->tx_buf, m);
+ }
+
+ stats->tx_packets += nb_tx;
+}
+
+static int32_t
+txa_service_func(void *args)
+{
+ struct txa_service_data *txa = args;
+ uint8_t dev_id;
+ uint8_t port;
+ uint16_t n;
+ uint32_t nb_tx, max_nb_tx;
+ struct rte_event ev[TXA_BATCH_SIZE];
+
+ dev_id = txa->eventdev_id;
+ max_nb_tx = txa->max_nb_tx;
+ port = txa->port_id;
+
+ if (txa->nb_queues == 0)
+ return 0;
+
+ if (!rte_spinlock_trylock(&txa->tx_lock))
+ return 0;
+
+ for (nb_tx = 0; nb_tx < max_nb_tx; nb_tx += n) {
+
+ n = rte_event_dequeue_burst(dev_id, port, ev, RTE_DIM(ev), 0);
+ if (!n)
+ break;
+ txa_service_tx(txa, ev, n);
+ }
+
+ if ((txa->loop_cnt++ & (TXA_FLUSH_THRESHOLD - 1)) == 0) {
+
+ struct txa_service_ethdev *tdi;
+ struct txa_service_queue_info *tqi;
+ struct rte_eth_dev *dev;
+ uint16_t i;
+
+ tdi = txa->txa_ethdev;
+ nb_tx = 0;
+
+ RTE_ETH_FOREACH_DEV(i) {
+ uint16_t q;
+
+ if (i == txa->dev_count)
+ break;
+
+ dev = tdi[i].dev;
+ if (tdi[i].nb_queues == 0)
+ continue;
+ for (q = 0; q < dev->data->nb_tx_queues; q++) {
+
+ tqi = txa_service_queue(txa, i, q);
+ if (unlikely(tqi == NULL || !tqi->added))
+ continue;
+
+ nb_tx += rte_eth_tx_buffer_flush(i, q,
+ tqi->tx_buf);
+ }
+ }
+
+ txa->stats.tx_packets += nb_tx;
+ }
+ rte_spinlock_unlock(&txa->tx_lock);
+ return 0;
+}
+
+static int
+txa_service_adapter_create(uint8_t id, struct rte_eventdev *dev,
+ struct rte_event_port_conf *port_conf)
+{
+ struct txa_service_data *txa;
+ struct rte_event_port_conf *cb_conf;
+ int ret;
+
+ cb_conf = rte_malloc(NULL, sizeof(*cb_conf), 0);
+ if (cb_conf == NULL)
+ return -ENOMEM;
+
+ *cb_conf = *port_conf;
+ ret = txa_service_adapter_create_ext(id, dev, txa_service_conf_cb,
+ cb_conf);
+ if (ret) {
+ rte_free(cb_conf);
+ return ret;
+ }
+
+ txa = txa_service_id_to_data(id);
+ txa->conf_free = 1;
+ return ret;
+}
+
+static int
+txa_service_adapter_create_ext(uint8_t id, struct rte_eventdev *dev,
+ rte_event_eth_tx_adapter_conf_cb conf_cb,
+ void *conf_arg)
+{
+ struct txa_service_data *txa;
+ int socket_id;
+ char mem_name[TXA_SERVICE_NAME_LEN];
+ int ret;
+
+ if (conf_cb == NULL)
+ return -EINVAL;
+
+ socket_id = dev->data->socket_id;
+ snprintf(mem_name, TXA_MEM_NAME_LEN,
+ "rte_event_eth_txa_%d",
+ id);
+
+ ret = txa_service_data_init();
+ if (ret != 0)
+ return ret;
+
+ txa = rte_zmalloc_socket(mem_name,
+ sizeof(*txa),
+ RTE_CACHE_LINE_SIZE, socket_id);
+ if (txa == NULL) {
+ RTE_EDEV_LOG_ERR("failed to get mem for tx adapter");
+ return -ENOMEM;
+ }
+
+ txa->id = id;
+ txa->eventdev_id = dev->data->dev_id;
+ txa->socket_id = socket_id;
+ strncpy(txa->mem_name, mem_name, TXA_SERVICE_NAME_LEN);
+ txa->conf_cb = conf_cb;
+ txa->conf_arg = conf_arg;
+ txa->service_id = TXA_INVALID_SERVICE_ID;
+ rte_spinlock_init(&txa->tx_lock);
+ txa_service_data_array[id] = txa;
+
+ return 0;
+}
+
+static int
+txa_service_event_port_get(uint8_t id, uint8_t *port)
+{
+ struct txa_service_data *txa;
+
+ txa = txa_service_id_to_data(id);
+ if (txa->service_id == TXA_INVALID_SERVICE_ID)
+ return -ENODEV;
+
+ *port = txa->port_id;
+ return 0;
+}
+
+static int
+txa_service_adapter_free(uint8_t id)
+{
+ struct txa_service_data *txa;
+
+ txa = txa_service_id_to_data(id);
+ if (txa->nb_queues) {
+ RTE_EDEV_LOG_ERR("%" PRIu16 " Tx queues not deleted",
+ txa->nb_queues);
+ return -EBUSY;
+ }
+
+ if (txa->conf_free)
+ rte_free(txa->conf_arg);
+ rte_free(txa);
+ return 0;
+}
+
+static int
+txa_service_queue_add(uint8_t id,
+ __rte_unused struct rte_eventdev *dev,
+ const struct rte_eth_dev *eth_dev,
+ int32_t tx_queue_id)
+{
+ struct txa_service_data *txa;
+ struct txa_service_ethdev *tdi;
+ struct txa_service_queue_info *tqi;
+ struct rte_eth_dev_tx_buffer *tb;
+ struct txa_retry *txa_retry;
+ int ret;
+
+ txa = txa_service_id_to_data(id);
+
+ if (tx_queue_id == -1) {
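+ /* Add all Tx queues of the device; queues added by this call
+ * are rolled back if a later queue fails to be added.
+ */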
+ int nb_queues;
+ uint16_t i, j;
+ uint16_t *qdone;
+
+ nb_queues = eth_dev->data->nb_tx_queues;
+ if (txa->dev_count > eth_dev->data->port_id) {
+ tdi = &txa->txa_ethdev[eth_dev->data->port_id];
+ nb_queues -= tdi->nb_queues;
+ }
+
+ qdone = rte_zmalloc(txa->mem_name,
+ nb_queues * sizeof(*qdone), 0);
+ if (qdone == NULL)
+ return -ENOMEM;
+ j = 0;
+ for (i = 0; i < nb_queues; i++) {
+ if (txa_service_is_queue_added(txa, eth_dev, i))
+ continue;
+ ret = txa_service_queue_add(id, dev, eth_dev, i);
+ if (ret == 0)
+ qdone[j++] = i;
+ else
+ break;
+ }
+
+ if (i != nb_queues) {
+ for (i = 0; i < j; i++)
+ txa_service_queue_del(id, eth_dev, qdone[i]);
+ }
+ rte_free(qdone);
+ return ret;
+ }
+
+ ret = txa_service_register(txa);
+ if (ret)
+ return ret;
+
+ rte_spinlock_lock(&txa->tx_lock);
+
+ if (txa_service_is_queue_added(txa, eth_dev, tx_queue_id)) {
+ rte_spinlock_unlock(&txa->tx_lock);
+ return 0;
+ }
+
+ ret = txa_service_queue_array_alloc(txa, eth_dev->data->port_id);
+ if (ret)
+ goto err_unlock;
+
+ tb = txa_service_tx_buf_alloc(txa, eth_dev);
+ if (tb == NULL)
+ goto err_unlock;
+
+ tdi = &txa->txa_ethdev[eth_dev->data->port_id];
+ tqi = txa_service_queue(txa, eth_dev->data->port_id, tx_queue_id);
+
+ txa_retry = &tqi->txa_retry;
+ txa_retry->id = txa->id;
+ txa_retry->port_id = eth_dev->data->port_id;
+ txa_retry->tx_queue = tx_queue_id;
+
+ rte_eth_tx_buffer_init(tb, TXA_BATCH_SIZE);
+ rte_eth_tx_buffer_set_err_callback(tb,
+ txa_service_buffer_retry, txa_retry);
+
+ tqi->tx_buf = tb;
+ tqi->added = 1;
+ tdi->nb_queues++;
+ txa->nb_queues++;
+
+err_unlock:
+ if (txa->nb_queues == 0) {
+ txa_service_queue_array_free(txa,
+ eth_dev->data->port_id);
+ txa_service_unregister(txa);
+ }
+
+ rte_spinlock_unlock(&txa->tx_lock);
+ return 0;
+}
+
+static int
+txa_service_queue_del(uint8_t id,
+ const struct rte_eth_dev *dev,
+ int32_t tx_queue_id)
+{
+ struct txa_service_data *txa;
+ struct txa_service_queue_info *tqi;
+ struct rte_eth_dev_tx_buffer *tb;
+ uint16_t port_id;
+
+ if (tx_queue_id == -1) {
+ uint16_t i;
+ int ret = -1;
+
+ for (i = 0; i < dev->data->nb_tx_queues; i++) {
+ ret = txa_service_queue_del(id, dev, i);
+ if (ret != 0)
+ break;
+ }
+ return ret;
+ }
+
+ txa = txa_service_id_to_data(id);
+ port_id = dev->data->port_id;
+
+ tqi = txa_service_queue(txa, port_id, tx_queue_id);
+ if (tqi == NULL || !tqi->added)
+ return 0;
+
+ tb = tqi->tx_buf;
+ tqi->added = 0;
+ tqi->tx_buf = NULL;
+ rte_free(tb);
+ txa->nb_queues--;
+ txa->txa_ethdev[port_id].nb_queues--;
+
+ txa_service_queue_array_free(txa, port_id);
+ return 0;
+}
+
+static int
+txa_service_id_get(uint8_t id, uint32_t *service_id)
+{
+ struct txa_service_data *txa;
+
+ txa = txa_service_id_to_data(id);
+ if (txa->service_id == TXA_INVALID_SERVICE_ID)
+ return -ESRCH;
+
+ if (service_id == NULL)
+ return -EINVAL;
+
+ *service_id = txa->service_id;
+ return 0;
+}
+
+static int
+txa_service_start(uint8_t id)
+{
+ return txa_service_ctrl(id, 1);
+}
+
+static int
+txa_service_stats_get(uint8_t id,
+ struct rte_event_eth_tx_adapter_stats *stats)
+{
+ struct txa_service_data *txa;
+
+ txa = txa_service_id_to_data(id);
+ *stats = txa->stats;
+ return 0;
+}
+
+static int
+txa_service_stats_reset(uint8_t id)
+{
+ struct txa_service_data *txa;
+
+ txa = txa_service_id_to_data(id);
+ memset(&txa->stats, 0, sizeof(txa->stats));
+ return 0;
+}
+
+static int
+txa_service_stop(uint8_t id)
+{
+ return txa_service_ctrl(id, 0);
+}
+
+
+int __rte_experimental
+rte_event_eth_tx_adapter_create(uint8_t id, uint8_t dev_id,
+ struct rte_event_port_conf *port_conf)
+{
+ struct rte_eventdev *dev;
+ int ret;
+
+ if (port_conf == NULL)
+ return -EINVAL;
+
+ RTE_EVENT_ETH_TX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL);
+ RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+ dev = &rte_eventdevs[dev_id];
+
+ ret = txa_init();
+ if (ret != 0)
+ return ret;
+
+ if (txa_adapter_exist(id))
+ return -EEXIST;
+
+ txa_dev_id_array[id] = dev_id;
+ if (txa_dev_adapter_create(id))
+ ret = txa_dev_adapter_create(id)(id, dev);
+
+ if (ret != 0) {
+ txa_dev_id_array[id] = TXA_INVALID_DEV_ID;
+ return ret;
+ }
+
+ ret = txa_service_adapter_create(id, dev, port_conf);
+ if (ret != 0) {
+ if (txa_dev_adapter_free(id))
+ txa_dev_adapter_free(id)(id, dev);
+ txa_dev_id_array[id] = TXA_INVALID_DEV_ID;
+ return ret;
+ }
+
+ txa_dev_id_array[id] = dev_id;
+ return 0;
+}
+
+int __rte_experimental
+rte_event_eth_tx_adapter_create_ext(uint8_t id, uint8_t dev_id,
+ rte_event_eth_tx_adapter_conf_cb conf_cb,
+ void *conf_arg)
+{
+ struct rte_eventdev *dev;
+ int ret;
+
+ RTE_EVENT_ETH_TX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL);
+ RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+ ret = txa_init();
+ if (ret != 0)
+ return ret;
+
+ if (txa_adapter_exist(id))
+ return -EINVAL;
+
+ dev = &rte_eventdevs[dev_id];
+
+ txa_dev_id_array[id] = dev_id;
+ if (txa_dev_adapter_create_ext(id))
+ ret = txa_dev_adapter_create_ext(id)(id, dev);
+
+ if (ret != 0) {
+ txa_dev_id_array[id] = TXA_INVALID_DEV_ID;
+ return ret;
+ }
+
+ ret = txa_service_adapter_create_ext(id, dev, conf_cb, conf_arg);
+ if (ret != 0) {
+ if (txa_dev_adapter_free(id))
+ txa_dev_adapter_free(id)(id, dev);
+ txa_dev_id_array[id] = TXA_INVALID_DEV_ID;
+ return ret;
+ }
+
+ txa_dev_id_array[id] = dev_id;
+ return 0;
+}
+
+
+int __rte_experimental
+rte_event_eth_tx_adapter_event_port_get(uint8_t id, uint8_t *event_port_id)
+{
+ TXA_CHECK_OR_ERR_RET(id);
+
+ return txa_service_event_port_get(id, event_port_id);
+}
+
+int __rte_experimental
+rte_event_eth_tx_adapter_free(uint8_t id)
+{
+ int ret;
+
+ TXA_CHECK_OR_ERR_RET(id);
+
+ ret = txa_dev_adapter_free(id) ?
+ txa_dev_adapter_free(id)(id, txa_evdev(id)) :
+ 0;
+
+ if (ret == 0)
+ ret = txa_service_adapter_free(id);
+ txa_dev_id_array[id] = TXA_INVALID_DEV_ID;
+
+ return ret;
+}
+
+int __rte_experimental
+rte_event_eth_tx_adapter_queue_add(uint8_t id,
+ uint16_t eth_dev_id,
+ int32_t queue)
+{
+ struct rte_eth_dev *eth_dev;
+ int ret;
+ uint32_t caps;
+
+ RTE_ETH_VALID_PORTID_OR_ERR_RET(eth_dev_id, -EINVAL);
+ TXA_CHECK_OR_ERR_RET(id);
+
+ eth_dev = &rte_eth_devices[eth_dev_id];
+ if (queue != -1 && (uint16_t)queue >= eth_dev->data->nb_tx_queues) {
+ RTE_EDEV_LOG_ERR("Invalid tx queue_id %" PRIu16,
+ (uint16_t)queue);
+ return -EINVAL;
+ }
+
+ caps = 0;
+ if (txa_dev_caps_get(id))
+ txa_dev_caps_get(id)(txa_evdev(id), eth_dev, &caps);
+
+ if (caps & RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT)
+ ret = txa_dev_queue_add(id) ?
+ txa_dev_queue_add(id)(id,
+ txa_evdev(id),
+ eth_dev,
+ queue) : 0;
+ else
+ ret = txa_service_queue_add(id, txa_evdev(id), eth_dev, queue);
+
+ return ret;
+}
+
+int __rte_experimental
+rte_event_eth_tx_adapter_queue_del(uint8_t id,
+ uint16_t eth_dev_id,
+ int32_t queue)
+{
+ struct rte_eth_dev *eth_dev;
+ int ret;
+ uint32_t caps;
+
+ RTE_ETH_VALID_PORTID_OR_ERR_RET(eth_dev_id, -EINVAL);
+ TXA_CHECK_OR_ERR_RET(id);
+
+ eth_dev = &rte_eth_devices[eth_dev_id];
+ if (queue != -1 && (uint16_t)queue >= eth_dev->data->nb_tx_queues) {
+ RTE_EDEV_LOG_ERR("Invalid tx queue_id %" PRIu16,
+ (uint16_t)queue);
+ return -EINVAL;
+ }
+
+ caps = 0;
+
+ if (txa_dev_caps_get(id))
+ txa_dev_caps_get(id)(txa_evdev(id), eth_dev, &caps);
+
+ if (caps & RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT)
+ ret = txa_dev_queue_del(id) ?
+ txa_dev_queue_del(id)(id, txa_evdev(id),
+ eth_dev,
+ queue) : 0;
+ else
+ ret = txa_service_queue_del(id, eth_dev, queue);
+
+ return ret;
+}
+
+int __rte_experimental
+rte_event_eth_tx_adapter_service_id_get(uint8_t id, uint32_t *service_id)
+{
+ TXA_CHECK_OR_ERR_RET(id);
+
+ return txa_service_id_get(id, service_id);
+}
+
+int __rte_experimental
+rte_event_eth_tx_adapter_start(uint8_t id)
+{
+ int ret;
+
+ TXA_CHECK_OR_ERR_RET(id);
+
+ ret = txa_dev_start(id) ? txa_dev_start(id)(id, txa_evdev(id)) : 0;
+ if (ret == 0)
+ ret = txa_service_start(id);
+ return ret;
+}
+
+int __rte_experimental
+rte_event_eth_tx_adapter_stats_get(uint8_t id,
+ struct rte_event_eth_tx_adapter_stats *stats)
+{
+ int ret;
+
+ TXA_CHECK_OR_ERR_RET(id);
+
+ if (stats == NULL)
+ return -EINVAL;
+
+ *stats = (struct rte_event_eth_tx_adapter_stats){0};
+
+ ret = txa_dev_stats_get(id) ?
+ txa_dev_stats_get(id)(id, txa_evdev(id), stats) : 0;
+
+ if (ret == 0 && txa_service_id_get(id, NULL) != -ESRCH) {
+ if (txa_dev_stats_get(id)) {
+ struct rte_event_eth_tx_adapter_stats service_stats;
+
+ ret = txa_service_stats_get(id, &service_stats);
+ if (ret == 0) {
+ stats->tx_retry += service_stats.tx_retry;
+ stats->tx_packets += service_stats.tx_packets;
+ stats->tx_dropped += service_stats.tx_dropped;
+ }
+ } else
+ ret = txa_service_stats_get(id, stats);
+ }
+
+ return ret;
+}
+
+int __rte_experimental
+rte_event_eth_tx_adapter_stats_reset(uint8_t id)
+{
+ int ret;
+
+ TXA_CHECK_OR_ERR_RET(id);
+
+ ret = txa_dev_stats_reset(id) ?
+ txa_dev_stats_reset(id)(id, txa_evdev(id)) : 0;
+ if (ret == 0)
+ ret = txa_service_stats_reset(id);
+ return ret;
+}
+
+int __rte_experimental
+rte_event_eth_tx_adapter_stop(uint8_t id)
+{
+ int ret;
+
+ TXA_CHECK_OR_ERR_RET(id);
+
+ ret = txa_dev_stop(id) ? txa_dev_stop(id)(id, txa_evdev(id)) : 0;
+ if (ret == 0)
+ ret = txa_service_stop(id);
+ return ret;
+}
diff --git a/lib/librte_eventdev/rte_event_eth_tx_adapter.h b/lib/librte_eventdev/rte_event_eth_tx_adapter.h
new file mode 100644
index 00000000..81456d4a
--- /dev/null
+++ b/lib/librte_eventdev/rte_event_eth_tx_adapter.h
@@ -0,0 +1,462 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#ifndef _RTE_EVENT_ETH_TX_ADAPTER_
+#define _RTE_EVENT_ETH_TX_ADAPTER_
+
+/**
+ * @file
+ *
+ * RTE Event Ethernet Tx Adapter
+ *
+ * The event ethernet Tx adapter provides configuration and data path APIs
+ * for the ethernet transmit stage of an event driven packet processing
+ * application. These APIs abstract the implementation of the transmit stage
+ * and allow the application to use eventdev PMD support or a common
+ * implementation.
+ *
+ * In the common implementation, the application enqueues mbufs to the adapter
+ * which runs as a rte_service function. The service function dequeues events
+ * from its event port and transmits the mbufs referenced by these events.
+ *
+ * The ethernet Tx event adapter APIs are:
+ *
+ * - rte_event_eth_tx_adapter_create()
+ * - rte_event_eth_tx_adapter_create_ext()
+ * - rte_event_eth_tx_adapter_free()
+ * - rte_event_eth_tx_adapter_start()
+ * - rte_event_eth_tx_adapter_stop()
+ * - rte_event_eth_tx_adapter_queue_add()
+ * - rte_event_eth_tx_adapter_queue_del()
+ * - rte_event_eth_tx_adapter_stats_get()
+ * - rte_event_eth_tx_adapter_stats_reset()
+ * - rte_event_eth_tx_adapter_enqueue()
+ * - rte_event_eth_tx_adapter_event_port_get()
+ * - rte_event_eth_tx_adapter_service_id_get()
+ *
+ * The application creates the adapter using
+ * rte_event_eth_tx_adapter_create() or rte_event_eth_tx_adapter_create_ext().
+ *
+ * The adapter will use the common implementation when the eventdev PMD
+ * does not have the #RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT capability.
+ * The common implementation uses an event port that is created using the port
+ * configuration parameter passed to rte_event_eth_tx_adapter_create(). The
+ * application can get the port identifier using
+ * rte_event_eth_tx_adapter_event_port_get() and must link an event queue to
+ * this port.
+ *
+ * If the eventdev PMD has the #RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT
+ * flag set, Tx adapter events should be enqueued using the
+ * rte_event_eth_tx_adapter_enqueue() function, else the application should
+ * use rte_event_enqueue_burst().
+ *
+ * Transmit queues can be added and deleted from the adapter using
+ * rte_event_eth_tx_adapter_queue_add()/del() APIs respectively.
+ *
+ * The application can start and stop the adapter using the
+ * rte_event_eth_tx_adapter_start/stop() calls.
+ *
+ * The common adapter implementation uses an EAL service function as described
+ * before and its execution is controlled using the rte_service APIs. The
+ * rte_event_eth_tx_adapter_service_id_get()
+ * function can be used to retrieve the adapter's service function ID.
+ *
+ * The ethernet port and transmit queue index to transmit the mbuf on are
+ * specified using the mbuf port field and the higher 16 bits of
+ * struct rte_mbuf::hash::sched::hi. The application should use the
+ * rte_event_eth_tx_adapter_txq_set() and rte_event_eth_tx_adapter_txq_get()
+ * functions to access the transmit queue index, since the transmit queue
+ * is expected to eventually be defined within struct rte_mbuf and using
+ * these functions will minimize the application impact of a change in how
+ * the transmit queue index is specified. An illustrative setup sequence
+ * for the service function based implementation follows this comment.
+ */
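+
+/*
+ * Illustrative setup sequence for the common (service function based)
+ * implementation. The adapter, event device and ethernet port identifiers
+ * (all 0 below), the port_conf values, TX_EVENT_QUEUE_ID and
+ * SERVICE_LCORE_ID are application choices, not defaults. It is assumed
+ * that the event device is configured with an event queue dedicated to Tx
+ * events and that SERVICE_LCORE_ID has been added as a service core; error
+ * checks are omitted for brevity:
+ *
+ *    struct rte_event_port_conf port_conf = {
+ *        .new_event_threshold = 4096,
+ *        .dequeue_depth = 32,
+ *        .enqueue_depth = 32,
+ *    };
+ *    uint8_t q = TX_EVENT_QUEUE_ID;
+ *    uint8_t tx_port;
+ *    uint32_t service_id;
+ *
+ *    rte_event_eth_tx_adapter_create(0, 0, &port_conf);
+ *    rte_event_eth_tx_adapter_event_port_get(0, &tx_port);
+ *    // Link the queue carrying Tx events to the adapter's event port
+ *    rte_event_port_link(0, tx_port, &q, NULL, 1);
+ *    // -1 adds all Tx queues of ethernet port 0
+ *    rte_event_eth_tx_adapter_queue_add(0, 0, -1);
+ *    rte_event_eth_tx_adapter_service_id_get(0, &service_id);
+ *    rte_service_map_lcore_set(service_id, SERVICE_LCORE_ID, 1);
+ *    rte_event_eth_tx_adapter_start(0);
+ */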
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#include <rte_mbuf.h>
+
+#include "rte_eventdev.h"
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Adapter configuration structure
+ *
+ * @see rte_event_eth_tx_adapter_create_ext
+ * @see rte_event_eth_tx_adapter_conf_cb
+ */
+struct rte_event_eth_tx_adapter_conf {
+ uint8_t event_port_id;
+ /**< Event port identifier, the adapter service function dequeues mbuf
+ * events from this port.
+ * @see RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT
+ */
+ uint32_t max_nb_tx;
+ /**< The adapter can return early if it has processed at least
+ * max_nb_tx mbufs. This isn't treated as a requirement; batching may
+ * cause the adapter to process more than max_nb_tx mbufs.
+ */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Function type used for the adapter configuration callback. The callback
+ * is used to fill in members of struct rte_event_eth_tx_adapter_conf; it is
+ * invoked when creating an RTE service function based adapter
+ * implementation.
+ *
+ * @param id
+ * Adapter identifier.
+ * @param dev_id
+ * Event device identifier.
+ * @param [out] conf
+ * Structure that needs to be populated by this callback.
+ * @param arg
+ * Argument to the callback. This is the same as the conf_arg passed to the
+ * rte_event_eth_tx_adapter_create_ext().
+ *
+ * @return
+ * - 0: Success
+ * - <0: Error code on failure
+ */
+typedef int (*rte_event_eth_tx_adapter_conf_cb) (uint8_t id, uint8_t dev_id,
+ struct rte_event_eth_tx_adapter_conf *conf,
+ void *arg);
+
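+/*
+ * A minimal callback sketch. The application context type (struct app_ctx)
+ * and the values assigned below are illustrative, not defaults; a real
+ * callback could instead reconfigure the event device to create the port
+ * it returns:
+ *
+ *    static int
+ *    app_txa_conf_cb(uint8_t id, uint8_t dev_id,
+ *            struct rte_event_eth_tx_adapter_conf *conf, void *arg)
+ *    {
+ *        struct app_ctx *ctx = arg;    // conf_arg from create_ext()
+ *
+ *        RTE_SET_USED(id);
+ *        RTE_SET_USED(dev_id);
+ *        conf->event_port_id = ctx->tx_port_id;    // port set up beforehand
+ *        conf->max_nb_tx = 128;
+ *        return 0;
+ *    }
+ */
+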
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * A structure used to retrieve statistics for an ethernet Tx adapter instance.
+ */
+struct rte_event_eth_tx_adapter_stats {
+ uint64_t tx_retry;
+ /**< Number of transmit retries */
+ uint64_t tx_packets;
+ /**< Number of packets transmitted */
+ uint64_t tx_dropped;
+ /**< Number of packets dropped */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Create a new ethernet Tx adapter with the specified identifier.
+ *
+ * @param id
+ * The identifier of the ethernet Tx adapter.
+ * @param dev_id
+ * The event device identifier.
+ * @param port_config
+ * Event port configuration, the adapter uses this configuration to
+ * create an event port if needed.
+ * @return
+ * - 0: Success
+ * - <0: Error code on failure
+ */
+int __rte_experimental
+rte_event_eth_tx_adapter_create(uint8_t id, uint8_t dev_id,
+ struct rte_event_port_conf *port_config);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Create a new ethernet Tx adapter with the specified identifier.
+ *
+ * @param id
+ * The identifier of the ethernet Tx adapter.
+ * @param dev_id
+ * The event device identifier.
+ * @param conf_cb
+ * Callback function that initializes members of the
+ * struct rte_event_eth_tx_adapter_conf struct passed into
+ * it.
+ * @param conf_arg
+ * Argument that is passed to the conf_cb function.
+ * @return
+ * - 0: Success
+ * - <0: Error code on failure
+ */
+int __rte_experimental
+rte_event_eth_tx_adapter_create_ext(uint8_t id, uint8_t dev_id,
+ rte_event_eth_tx_adapter_conf_cb conf_cb,
+ void *conf_arg);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Free an ethernet Tx adapter
+ *
+ * @param id
+ * Adapter identifier.
+ * @return
+ * - 0: Success
+ * - <0: Error code on failure. If the adapter still has Tx queues
+ * added to it, the function returns -EBUSY.
+ */
+int __rte_experimental
+rte_event_eth_tx_adapter_free(uint8_t id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Start ethernet Tx adapter
+ *
+ * @param id
+ * Adapter identifier.
+ * @return
+ * - 0: Success, Adapter started correctly.
+ * - <0: Error code on failure.
+ */
+int __rte_experimental
+rte_event_eth_tx_adapter_start(uint8_t id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Stop ethernet Tx adapter
+ *
+ * @param id
+ * Adapter identifier.
+ * @return
+ * - 0: Success.
+ * - <0: Error code on failure.
+ */
+int __rte_experimental
+rte_event_eth_tx_adapter_stop(uint8_t id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Add a Tx queue to the adapter.
+ * A queue value of -1 is used to indicate all
+ * queues within the device.
+ *
+ * @param id
+ * Adapter identifier.
+ * @param eth_dev_id
+ * Ethernet Port Identifier.
+ * @param queue
+ * Tx queue index.
+ * @return
+ * - 0: Success, Queues added successfully.
+ * - <0: Error code on failure.
+ */
+int __rte_experimental
+rte_event_eth_tx_adapter_queue_add(uint8_t id,
+ uint16_t eth_dev_id,
+ int32_t queue);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Delete a Tx queue from the adapter.
+ * A queue value of -1 is used to indicate all
+ * queues within the device that have been added to this
+ * adapter.
+ *
+ * @param id
+ * Adapter identifier.
+ * @param eth_dev_id
+ * Ethernet Port Identifier.
+ * @param queue
+ * Tx queue index.
+ * @return
+ * - 0: Success, Queues deleted successfully.
+ * - <0: Error code on failure.
+ */
+int __rte_experimental
+rte_event_eth_tx_adapter_queue_del(uint8_t id,
+ uint16_t eth_dev_id,
+ int32_t queue);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Set Tx queue in the mbuf. This queue is used by the adapter
+ * to transmit the mbuf.
+ *
+ * @param pkt
+ * Pointer to the mbuf.
+ * @param queue
+ * Tx queue index.
+ */
+static __rte_always_inline void __rte_experimental
+rte_event_eth_tx_adapter_txq_set(struct rte_mbuf *pkt, uint16_t queue)
+{
+ uint16_t *p = (uint16_t *)&pkt->hash.sched.hi;
+ p[1] = queue;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Retrieve Tx queue from the mbuf.
+ *
+ * @param pkt
+ * Pointer to the mbuf.
+ * @return
+ * Tx queue identifier.
+ *
+ * @see rte_event_eth_tx_adapter_txq_set()
+ */
+static __rte_always_inline uint16_t __rte_experimental
+rte_event_eth_tx_adapter_txq_get(struct rte_mbuf *pkt)
+{
+ uint16_t *p = (uint16_t *)&pkt->hash.sched.hi;
+ return p[1];
+}
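+
+/*
+ * Illustrative use in a worker: the egress port (out_port) and the choice
+ * of Tx queue 0 are application values, and the event is assumed to have
+ * been dequeued from a worker port and to reference an mbuf:
+ *
+ *    struct rte_mbuf *m = ev.mbuf;
+ *
+ *    m->port = out_port;
+ *    rte_event_eth_tx_adapter_txq_set(m, 0);
+ *    ev.op = RTE_EVENT_OP_FORWARD;
+ *    // enqueue ev towards the adapter, e.g. with
+ *    // rte_event_eth_tx_adapter_enqueue() or rte_event_enqueue_burst()
+ */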
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Retrieve the adapter event port. The adapter creates an event port if
+ * the #RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT is not set in the
+ * ethernet Tx capabilities of the event device.
+ *
+ * @param id
+ * Adapter Identifier.
+ * @param[out] event_port_id
+ * Event port pointer.
+ * @return
+ * - 0: Success.
+ * - <0: Error code on failure.
+ */
+int __rte_experimental
+rte_event_eth_tx_adapter_event_port_get(uint8_t id, uint8_t *event_port_id);
+
+/**
+ * Enqueue a burst of event objects or an event object supplied in a *rte_event*
+ * structure on an event device designated by its *dev_id* through the event
+ * port specified by *port_id*. This function is supported if the eventdev PMD
+ * has the #RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT capability flag set.
+ *
+ * The *nb_events* parameter is the number of event objects to enqueue which are
+ * supplied in the *ev* array of *rte_event* structure.
+ *
+ * The rte_event_eth_tx_adapter_enqueue() function returns the number of
+ * event objects it actually enqueued. A return value equal to *nb_events*
+ * means that all event objects have been enqueued.
+ *
+ * @param dev_id
+ * The identifier of the device.
+ * @param port_id
+ * The identifier of the event port.
+ * @param ev
+ * Points to an array of *nb_events* objects of type *rte_event* structure
+ * which contain the event object enqueue operations to be processed.
+ * @param nb_events
+ * The number of event objects to enqueue, typically number of
+ * rte_event_port_enqueue_depth() available for this port.
+ *
+ * @return
+ * The number of event objects actually enqueued on the event device. The
+ * return value can be less than the value of the *nb_events* parameter when
+ * the event device's queue is full or if invalid parameters are specified in a
+ * *rte_event*. If the return value is less than *nb_events*, the remaining
+ * events at the end of ev[] are not consumed and the caller has to take care
+ * of them, and rte_errno is set accordingly. Possible errno values include:
+ * - -EINVAL The port ID is invalid, device ID is invalid, an event's queue
+ * ID is invalid, or an event's sched type doesn't match the
+ * capabilities of the destination queue.
+ * - -ENOSPC The event port was backpressured and unable to enqueue
+ * one or more events. This error code is only applicable to
+ * closed systems.
+ */
+static inline uint16_t __rte_experimental
+rte_event_eth_tx_adapter_enqueue(uint8_t dev_id,
+ uint8_t port_id,
+ struct rte_event ev[],
+ uint16_t nb_events)
+{
+ const struct rte_eventdev *dev = &rte_eventdevs[dev_id];
+
+#ifdef RTE_LIBRTE_EVENTDEV_DEBUG
+ if (dev_id >= RTE_EVENT_MAX_DEVS ||
+ !rte_eventdevs[dev_id].attached) {
+ rte_errno = -EINVAL;
+ return 0;
+ }
+
+ if (port_id >= dev->data->nb_ports) {
+ rte_errno = -EINVAL;
+ return 0;
+ }
+#endif
+ return dev->txa_enqueue(dev->data->ports[port_id], ev, nb_events);
+}
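+
+/*
+ * Illustrative call (dev_id 0 and worker_port are application values; ev
+ * holds nb events already populated with mbufs):
+ *
+ *    uint16_t n = rte_event_eth_tx_adapter_enqueue(0, worker_port, ev, nb);
+ *    if (n < nb) {
+ *        // ev[n] .. ev[nb - 1] were not consumed; the caller must retry
+ *        // or free them, rte_errno indicates the cause
+ *    }
+ */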
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Retrieve statistics for an adapter
+ *
+ * @param id
+ * Adapter identifier.
+ * @param [out] stats
+ * A pointer to structure used to retrieve statistics for an adapter.
+ * @return
+ * - 0: Success, statistics retrieved successfully.
+ * - <0: Error code on failure.
+ */
+int __rte_experimental
+rte_event_eth_tx_adapter_stats_get(uint8_t id,
+ struct rte_event_eth_tx_adapter_stats *stats);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Reset statistics for an adapter.
+ *
+ * @param id
+ * Adapter identifier.
+ * @return
+ * - 0: Success, statistics reset successfully.
+ * - <0: Error code on failure.
+ */
+int __rte_experimental
+rte_event_eth_tx_adapter_stats_reset(uint8_t id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Retrieve the service ID of an adapter. If the adapter doesn't use
+ * a rte_service function, this function returns -ESRCH.
+ *
+ * @param id
+ * Adapter identifier.
+ * @param [out] service_id
+ * A pointer to a uint32_t, to be filled in with the service id.
+ * @return
+ * - 0: Success
+ * - <0: Error code on failure; if the adapter doesn't use a rte_service
+ * function, this function returns -ESRCH.
+ */
+int __rte_experimental
+rte_event_eth_tx_adapter_service_id_get(uint8_t id, uint32_t *service_id);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* _RTE_EVENT_ETH_TX_ADAPTER_ */
diff --git a/lib/librte_eventdev/rte_eventdev.c b/lib/librte_eventdev/rte_eventdev.c
index 801810ed..ebaf3087 100644
--- a/lib/librte_eventdev/rte_eventdev.c
+++ b/lib/librte_eventdev/rte_eventdev.c
@@ -35,22 +35,20 @@
#include "rte_eventdev.h"
#include "rte_eventdev_pmd.h"
-struct rte_eventdev rte_event_devices[RTE_EVENT_MAX_DEVS];
+static struct rte_eventdev rte_event_devices[RTE_EVENT_MAX_DEVS];
-struct rte_eventdev *rte_eventdevs = &rte_event_devices[0];
+struct rte_eventdev *rte_eventdevs = rte_event_devices;
static struct rte_eventdev_global eventdev_globals = {
.nb_devs = 0
};
-struct rte_eventdev_global *rte_eventdev_globals = &eventdev_globals;
-
/* Event dev north bound API implementation */
uint8_t
rte_event_dev_count(void)
{
- return rte_eventdev_globals->nb_devs;
+ return eventdev_globals.nb_devs;
}
int
@@ -62,7 +60,7 @@ rte_event_dev_get_dev_id(const char *name)
if (!name)
return -EINVAL;
- for (i = 0; i < rte_eventdev_globals->nb_devs; i++) {
+ for (i = 0; i < eventdev_globals.nb_devs; i++) {
cmp = (strncmp(rte_event_devices[i].data->name, name,
RTE_EVENTDEV_NAME_MAX_LEN) == 0) ||
(rte_event_devices[i].dev ? (strncmp(
@@ -109,7 +107,7 @@ rte_event_dev_info_get(uint8_t dev_id, struct rte_event_dev_info *dev_info)
}
int
-rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint8_t eth_port_id,
+rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint16_t eth_port_id,
uint32_t *caps)
{
struct rte_eventdev *dev;
@@ -175,6 +173,31 @@ rte_event_crypto_adapter_caps_get(uint8_t dev_id, uint8_t cdev_id,
(dev, cdev, caps) : -ENOTSUP;
}
+int __rte_experimental
+rte_event_eth_tx_adapter_caps_get(uint8_t dev_id, uint16_t eth_port_id,
+ uint32_t *caps)
+{
+ struct rte_eventdev *dev;
+ struct rte_eth_dev *eth_dev;
+
+ RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+ RTE_ETH_VALID_PORTID_OR_ERR_RET(eth_port_id, -EINVAL);
+
+ dev = &rte_eventdevs[dev_id];
+ eth_dev = &rte_eth_devices[eth_port_id];
+
+ if (caps == NULL)
+ return -EINVAL;
+
+ *caps = 0;
+
+ return dev->dev_ops->eth_tx_adapter_caps_get ?
+ (*dev->dev_ops->eth_tx_adapter_caps_get)(dev,
+ eth_dev,
+ caps)
+ : 0;
+}
+
static inline int
rte_event_dev_queue_config(struct rte_eventdev *dev, uint8_t nb_queues)
{
@@ -980,6 +1003,28 @@ rte_event_port_unlink(uint8_t dev_id, uint8_t port_id,
return diag;
}
+int __rte_experimental
+rte_event_port_unlinks_in_progress(uint8_t dev_id, uint8_t port_id)
+{
+ struct rte_eventdev *dev;
+
+ RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+ dev = &rte_eventdevs[dev_id];
+ if (!is_valid_port(dev, port_id)) {
+ RTE_EDEV_LOG_ERR("Invalid port_id=%" PRIu8, port_id);
+ return -EINVAL;
+ }
+
+ /* Return 0 if the PMD does not implement unlinks in progress.
+ * This allows PMDs which handle unlink synchronously to not implement
+ * this function at all.
+ */
+ RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->port_unlinks_in_progress, 0);
+
+ return (*dev->dev_ops->port_unlinks_in_progress)(dev,
+ dev->data->ports[port_id]);
+}
+
int
rte_event_port_links_get(uint8_t dev_id, uint8_t port_id,
uint8_t queues[], uint8_t priorities[])
@@ -1275,6 +1320,15 @@ rte_eventdev_find_free_device_index(void)
return RTE_EVENT_MAX_DEVS;
}
+static uint16_t
+rte_event_tx_adapter_enqueue(__rte_unused void *port,
+ __rte_unused struct rte_event ev[],
+ __rte_unused uint16_t nb_events)
+{
+ rte_errno = ENOTSUP;
+ return 0;
+}
+
struct rte_eventdev *
rte_event_pmd_allocate(const char *name, int socket_id)
{
@@ -1295,6 +1349,8 @@ rte_event_pmd_allocate(const char *name, int socket_id)
eventdev = &rte_eventdevs[dev_id];
+ eventdev->txa_enqueue = rte_event_tx_adapter_enqueue;
+
if (eventdev->data == NULL) {
struct rte_eventdev_data *eventdev_data = NULL;
diff --git a/lib/librte_eventdev/rte_eventdev.h b/lib/librte_eventdev/rte_eventdev.h
index b6fd6ee7..d7eb69d1 100644
--- a/lib/librte_eventdev/rte_eventdev.h
+++ b/lib/librte_eventdev/rte_eventdev.h
@@ -1112,7 +1112,7 @@ struct rte_event {
*
*/
int
-rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint8_t eth_port_id,
+rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint16_t eth_port_id,
uint32_t *caps);
#define RTE_EVENT_TIMER_ADAPTER_CAP_INTERNAL_PORT (1ULL << 0)
@@ -1186,6 +1186,32 @@ int __rte_experimental
rte_event_crypto_adapter_caps_get(uint8_t dev_id, uint8_t cdev_id,
uint32_t *caps);
+/* Ethdev Tx adapter capability bitmap flags */
+#define RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT 0x1
+/**< This flag is set when the PMD supports a packet transmit callback
+ */
+
+/**
+ * Retrieve the event device's eth Tx adapter capabilities
+ *
+ * @param dev_id
+ * The identifier of the device.
+ *
+ * @param eth_port_id
+ * The identifier of the ethernet device.
+ *
+ * @param[out] caps
+ * A pointer to memory filled with eth Tx adapter capabilities.
+ *
+ * @return
+ * - 0: Success, driver provides eth Tx adapter capabilities.
+ * - <0: Error code returned by the driver function.
+ *
+ */
+int __rte_experimental
+rte_event_eth_tx_adapter_caps_get(uint8_t dev_id, uint16_t eth_port_id,
+ uint32_t *caps);
+
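+/*
+ * Illustrative use (dev_id and eth_port_id are application values):
+ *
+ *    uint32_t caps = 0;
+ *
+ *    rte_event_eth_tx_adapter_caps_get(dev_id, eth_port_id, &caps);
+ *    if (caps & RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT) {
+ *        // enqueue Tx events with rte_event_eth_tx_adapter_enqueue()
+ *    } else {
+ *        // enqueue Tx events with rte_event_enqueue_burst()
+ *    }
+ */
+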
struct rte_eventdev_ops;
struct rte_eventdev;
@@ -1204,6 +1230,10 @@ typedef uint16_t (*event_dequeue_burst_t)(void *port, struct rte_event ev[],
uint16_t nb_events, uint64_t timeout_ticks);
/**< @internal Dequeue burst of events from port of a device */
+typedef uint16_t (*event_tx_adapter_enqueue)(void *port,
+ struct rte_event ev[], uint16_t nb_events);
+/**< @internal Enqueue burst of events on port of a device */
+
#define RTE_EVENTDEV_NAME_MAX_LEN (64)
/**< @internal Max length of name of event PMD */
@@ -1266,7 +1296,8 @@ struct rte_eventdev {
/**< Pointer to PMD dequeue function. */
event_dequeue_burst_t dequeue_burst;
/**< Pointer to PMD dequeue burst function. */
-
+ event_tx_adapter_enqueue txa_enqueue;
+ /**< Pointer to PMD eth Tx adapter enqueue function. */
struct rte_eventdev_data *data;
/**< Pointer to device data */
struct rte_eventdev_ops *dev_ops;
@@ -1656,12 +1687,13 @@ rte_event_port_link(uint8_t dev_id, uint8_t port_id,
* event port designated by its *port_id* on the event device designated
* by its *dev_id*.
*
- * The unlink establishment shall disable the event port *port_id* from
- * receiving events from the specified event queue *queue_id*
- *
+ * The unlink call issues an async request to disable the event port *port_id*
+ * from receiving events from the specified event queue *queue_id*.
* Event queue(s) to event port unlink establishment can be changed at runtime
* without re-configuring the device.
*
+ * @see rte_event_port_unlinks_in_progress() to poll for completed unlinks.
+ *
* @param dev_id
* The identifier of the device.
*
@@ -1679,22 +1711,48 @@ rte_event_port_link(uint8_t dev_id, uint8_t port_id,
* NULL.
*
* @return
- * The number of unlinks actually established. The return value can be less
+ * The number of unlinks successfully requested. The return value can be less
* than the value of the *nb_unlinks* parameter when the implementation has the
* limitation on specific queue to port unlink establishment or
* if invalid parameters are specified.
* If the return value is less than *nb_unlinks*, the remaining queues at the
- * end of queues[] are not established, and the caller has to take care of them.
+ * end of queues[] are not unlinked, and the caller has to take care of them.
* If return value is less than *nb_unlinks* then implementation shall update
* the rte_errno accordingly, Possible rte_errno values are
* (-EINVAL) Invalid parameter
- *
*/
int
rte_event_port_unlink(uint8_t dev_id, uint8_t port_id,
uint8_t queues[], uint16_t nb_unlinks);
/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Returns the number of unlinks in progress.
+ *
+ * This function provides the application with a method to detect when an
+ * unlink has been completed by the implementation.
+ *
+ * @see rte_event_port_unlink() to issue unlink requests.
+ *
+ * @param dev_id
+ * The identifier of the device.
+ *
+ * @param port_id
+ * Event port identifier to select port to check for unlinks in progress.
+ *
+ * @return
+ * The number of unlinks that are in progress. A return of zero indicates that
+ * there are no outstanding unlink requests. A positive return value indicates
+ * the number of unlinks that are in progress, but are not yet complete.
+ * A negative return value indicates an error; -EINVAL indicates an invalid
+ * parameter passed for *dev_id* or *port_id*.
+ */
+int __rte_experimental
+rte_event_port_unlinks_in_progress(uint8_t dev_id, uint8_t port_id);
+
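+/*
+ * Illustrative drain sequence (dev_id, port_id and the queue value are
+ * application values):
+ *
+ *    uint8_t q = 0;    // queue to unlink; an application value
+ *
+ *    rte_event_port_unlink(dev_id, port_id, &q, 1);
+ *    while (rte_event_port_unlinks_in_progress(dev_id, port_id) > 0)
+ *        rte_pause();
+ *    // port_id no longer receives events scheduled from queue q
+ */
+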
+/**
* Retrieve the list of source event queues and its associated service priority
* linked to the destination event port designated by its *port_id*
* on the event device designated by its *dev_id*.
diff --git a/lib/librte_eventdev/rte_eventdev_pmd.h b/lib/librte_eventdev/rte_eventdev_pmd.h
index 3fbb4d2b..1a01326b 100644
--- a/lib/librte_eventdev/rte_eventdev_pmd.h
+++ b/lib/librte_eventdev/rte_eventdev_pmd.h
@@ -87,8 +87,6 @@ struct rte_eventdev_global {
uint8_t nb_devs; /**< Number of devices found */
};
-extern struct rte_eventdev_global *rte_eventdev_globals;
-/** Pointer to global event devices data structure. */
extern struct rte_eventdev *rte_eventdevs;
/** The pool of rte_eventdev structures. */
@@ -333,6 +331,23 @@ typedef int (*eventdev_port_unlink_t)(struct rte_eventdev *dev, void *port,
uint8_t queues[], uint16_t nb_unlinks);
/**
+ * Unlinks in progress. Returns the number of unlinks that the PMD is
+ * currently performing but has not yet completed.
+ *
+ * @param dev
+ * Event device pointer
+ *
+ * @param port
+ * Event port pointer
+ *
+ * @return
+ * Returns the number of in-progress unlinks. Zero is returned if none are
+ * in progress.
+ */
+typedef int (*eventdev_port_unlinks_in_progress_t)(struct rte_eventdev *dev,
+ void *port);
+
+/**
* Converts nanoseconds to *timeout_ticks* value for rte_event_dequeue()
*
* @param dev
@@ -450,7 +465,7 @@ typedef int (*eventdev_eth_rx_adapter_caps_get_t)
const struct rte_eth_dev *eth_dev,
uint32_t *caps);
-struct rte_event_eth_rx_adapter_queue_conf *queue_conf;
+struct rte_event_eth_rx_adapter_queue_conf;
/**
* Retrieve the event device's timer adapter capabilities, as well as the ops
@@ -575,7 +590,7 @@ typedef int (*eventdev_eth_rx_adapter_stop_t)
(const struct rte_eventdev *dev,
const struct rte_eth_dev *eth_dev);
-struct rte_event_eth_rx_adapter_stats *stats;
+struct rte_event_eth_rx_adapter_stats;
/**
* Retrieve ethernet Rx adapter statistics.
@@ -789,6 +804,186 @@ typedef int (*eventdev_crypto_adapter_stats_reset)
(const struct rte_eventdev *dev,
const struct rte_cryptodev *cdev);
+/**
+ * Retrieve the event device's eth Tx adapter capabilities.
+ *
+ * @param dev
+ * Event device pointer
+ *
+ * @param eth_dev
+ * Ethernet device pointer
+ *
+ * @param[out] caps
+ * A pointer to memory filled with eth Tx adapter capabilities.
+ *
+ * @return
+ * - 0: Success, driver provides eth Tx adapter capabilities
+ * - <0: Error code returned by the driver function.
+ *
+ */
+typedef int (*eventdev_eth_tx_adapter_caps_get_t)
+ (const struct rte_eventdev *dev,
+ const struct rte_eth_dev *eth_dev,
+ uint32_t *caps);
+
+/**
+ * Create adapter callback.
+ *
+ * @param id
+ * Adapter identifier
+ *
+ * @param dev
+ * Event device pointer
+ *
+ * @return
+ * - 0: Success.
+ * - <0: Error code on failure.
+ */
+typedef int (*eventdev_eth_tx_adapter_create_t)(uint8_t id,
+ const struct rte_eventdev *dev);
+
+/**
+ * Free adapter callback.
+ *
+ * @param id
+ * Adapter identifier
+ *
+ * @param dev
+ * Event device pointer
+ *
+ * @return
+ * - 0: Success.
+ * - <0: Error code on failure.
+ */
+typedef int (*eventdev_eth_tx_adapter_free_t)(uint8_t id,
+ const struct rte_eventdev *dev);
+
+/**
+ * Add a Tx queue to the adapter.
+ * A queue value of -1 is used to indicate all
+ * queues within the device.
+ *
+ * @param id
+ * Adapter identifier
+ *
+ * @param dev
+ * Event device pointer
+ *
+ * @param eth_dev
+ * Ethernet device pointer
+ *
+ * @param tx_queue_id
+ * Transmit queue index
+ *
+ * @return
+ * - 0: Success.
+ * - <0: Error code on failure.
+ */
+typedef int (*eventdev_eth_tx_adapter_queue_add_t)(
+ uint8_t id,
+ const struct rte_eventdev *dev,
+ const struct rte_eth_dev *eth_dev,
+ int32_t tx_queue_id);
+
+/**
+ * Delete a Tx queue from the adapter.
+ * A queue value of -1 is used to indicate all
+ * queues within the device that have been added to this
+ * adapter.
+ *
+ * @param id
+ * Adapter identifier
+ *
+ * @param dev
+ * Event device pointer
+ *
+ * @param eth_dev
+ * Ethernet device pointer
+ *
+ * @param tx_queue_id
+ * Transmit queue index
+ *
+ * @return
+ * - 0: Success, Queues deleted successfully.
+ * - <0: Error code on failure.
+ */
+typedef int (*eventdev_eth_tx_adapter_queue_del_t)(
+ uint8_t id,
+ const struct rte_eventdev *dev,
+ const struct rte_eth_dev *eth_dev,
+ int32_t tx_queue_id);
+
+/**
+ * Start the adapter.
+ *
+ * @param id
+ * Adapter identifier
+ *
+ * @param dev
+ * Event device pointer
+ *
+ * @return
+ * - 0: Success, Adapter started correctly.
+ * - <0: Error code on failure.
+ */
+typedef int (*eventdev_eth_tx_adapter_start_t)(uint8_t id,
+ const struct rte_eventdev *dev);
+
+/**
+ * Stop the adapter.
+ *
+ * @param id
+ * Adapter identifier
+ *
+ * @param dev
+ * Event device pointer
+ *
+ * @return
+ * - 0: Success.
+ * - <0: Error code on failure.
+ */
+typedef int (*eventdev_eth_tx_adapter_stop_t)(uint8_t id,
+ const struct rte_eventdev *dev);
+
+struct rte_event_eth_tx_adapter_stats;
+
+/**
+ * Retrieve statistics for an adapter
+ *
+ * @param id
+ * Adapter identifier
+ *
+ * @param dev
+ * Event device pointer
+ *
+ * @param [out] stats
+ * A pointer to structure used to retrieve statistics for an adapter
+ *
+ * @return
+ * - 0: Success, statistics retrieved successfully.
+ * - <0: Error code on failure.
+ */
+typedef int (*eventdev_eth_tx_adapter_stats_get_t)(
+ uint8_t id,
+ const struct rte_eventdev *dev,
+ struct rte_event_eth_tx_adapter_stats *stats);
+
+/**
+ * Reset statistics for an adapter
+ *
+ * @param id
+ * Adapter identifier
+ *
+ * @param dev
+ * Event device pointer
+ *
+ * @return
+ * - 0: Success, statistics reset successfully.
+ * - <0: Error code on failure.
+ */
+typedef int (*eventdev_eth_tx_adapter_stats_reset_t)(uint8_t id,
+ const struct rte_eventdev *dev);
+
/** Event device operations function pointer table */
struct rte_eventdev_ops {
eventdev_info_get_t dev_infos_get; /**< Get device info. */
@@ -815,6 +1010,8 @@ struct rte_eventdev_ops {
/**< Link event queues to an event port. */
eventdev_port_unlink_t port_unlink;
/**< Unlink event queues from an event port. */
+ eventdev_port_unlinks_in_progress_t port_unlinks_in_progress;
+ /**< Unlinks in progress on an event port. */
eventdev_dequeue_timeout_ticks_t timeout_ticks;
/**< Converts ns to *timeout_ticks* value for rte_event_dequeue() */
eventdev_dump_t dump;
@@ -862,6 +1059,26 @@ struct rte_eventdev_ops {
eventdev_crypto_adapter_stats_reset crypto_adapter_stats_reset;
/**< Reset crypto stats */
+ eventdev_eth_tx_adapter_caps_get_t eth_tx_adapter_caps_get;
+ /**< Get ethernet Tx adapter capabilities */
+
+ eventdev_eth_tx_adapter_create_t eth_tx_adapter_create;
+ /**< Create adapter callback */
+ eventdev_eth_tx_adapter_free_t eth_tx_adapter_free;
+ /**< Free adapter callback */
+ eventdev_eth_tx_adapter_queue_add_t eth_tx_adapter_queue_add;
+ /**< Add Tx queues to the eth Tx adapter */
+ eventdev_eth_tx_adapter_queue_del_t eth_tx_adapter_queue_del;
+ /**< Delete Tx queues from the eth Tx adapter */
+ eventdev_eth_tx_adapter_start_t eth_tx_adapter_start;
+ /**< Start eth Tx adapter */
+ eventdev_eth_tx_adapter_stop_t eth_tx_adapter_stop;
+ /**< Stop eth Tx adapter */
+ eventdev_eth_tx_adapter_stats_get_t eth_tx_adapter_stats_get;
+ /**< Get eth Tx adapter statistics */
+ eventdev_eth_tx_adapter_stats_reset_t eth_tx_adapter_stats_reset;
+ /**< Reset eth Tx adapter statistics */
+
eventdev_selftest dev_selftest;
/**< Start eventdev Selftest */
diff --git a/lib/librte_eventdev/rte_eventdev_version.map b/lib/librte_eventdev/rte_eventdev_version.map
index 12835e9f..d558d7d5 100644
--- a/lib/librte_eventdev/rte_eventdev_version.map
+++ b/lib/librte_eventdev/rte_eventdev_version.map
@@ -96,6 +96,19 @@ EXPERIMENTAL {
rte_event_crypto_adapter_stats_reset;
rte_event_crypto_adapter_stop;
rte_event_eth_rx_adapter_cb_register;
+ rte_event_port_unlinks_in_progress;
+ rte_event_eth_tx_adapter_caps_get;
+ rte_event_eth_tx_adapter_create;
+ rte_event_eth_tx_adapter_create_ext;
+ rte_event_eth_tx_adapter_event_port_get;
+ rte_event_eth_tx_adapter_free;
+ rte_event_eth_tx_adapter_queue_add;
+ rte_event_eth_tx_adapter_queue_del;
+ rte_event_eth_tx_adapter_service_id_get;
+ rte_event_eth_tx_adapter_start;
+ rte_event_eth_tx_adapter_stats_get;
+ rte_event_eth_tx_adapter_stats_reset;
+ rte_event_eth_tx_adapter_stop;
rte_event_timer_adapter_caps_get;
rte_event_timer_adapter_create;
rte_event_timer_adapter_create_ext;
diff --git a/lib/librte_flow_classify/rte_flow_classify.c b/lib/librte_flow_classify/rte_flow_classify.c
index 4c3469da..fb652a2b 100644
--- a/lib/librte_flow_classify/rte_flow_classify.c
+++ b/lib/librte_flow_classify/rte_flow_classify.c
@@ -247,8 +247,7 @@ rte_flow_classifier_check_params(struct rte_flow_classifier_params *params)
}
/* socket */
- if ((params->socket_id < 0) ||
- (params->socket_id >= RTE_MAX_NUMA_NODES)) {
+ if (params->socket_id < 0) {
RTE_FLOW_CLASSIFY_LOG(ERR,
"%s: Incorrect value for parameter socket_id\n",
__func__);
diff --git a/lib/librte_hash/rte_cuckoo_hash.c b/lib/librte_hash/rte_cuckoo_hash.c
index f7b86c8c..5ddcccd8 100644
--- a/lib/librte_hash/rte_cuckoo_hash.c
+++ b/lib/librte_hash/rte_cuckoo_hash.c
@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018 Arm Limited
*/
#include <string.h>
@@ -26,11 +27,14 @@
#include <rte_spinlock.h>
#include <rte_ring.h>
#include <rte_compat.h>
-#include <rte_pause.h>
#include "rte_hash.h"
#include "rte_cuckoo_hash.h"
+#define FOR_EACH_BUCKET(CURRENT_BKT, START_BUCKET) \
+ for (CURRENT_BKT = START_BUCKET; \
+ CURRENT_BKT != NULL; \
+ CURRENT_BKT = CURRENT_BKT->next)
TAILQ_HEAD(rte_hash_list, rte_tailq_entry);
@@ -63,6 +67,14 @@ rte_hash_find_existing(const char *name)
return h;
}
+static inline struct rte_hash_bucket *
+rte_hash_get_last_bkt(struct rte_hash_bucket *lst_bkt)
+{
+ while (lst_bkt->next != NULL)
+ lst_bkt = lst_bkt->next;
+ return lst_bkt;
+}
+
void rte_hash_set_cmp_func(struct rte_hash *h, rte_hash_cmp_eq_t func)
{
h->cmp_jump_table_idx = KEY_CUSTOM;
@@ -78,6 +90,36 @@ rte_hash_cmp_eq(const void *key1, const void *key2, const struct rte_hash *h)
return cmp_jump_table[h->cmp_jump_table_idx](key1, key2, h->key_len);
}
+/*
+ * We use the higher 16 bits of the hash as the signature value stored in
+ * the table. We use the lower bits for the primary bucket location, then
+ * XOR the primary bucket location with the signature to get the secondary
+ * bucket location. This is the same scheme as proposed in Bin Fan et al.'s
+ * paper "MemC3: Compact and Concurrent MemCache with Dumber Caching and
+ * Smarter Hashing". The benefit of using XOR is that the alternative
+ * bucket location can be derived from only the current bucket location
+ * and the signature.
+ */
+static inline uint16_t
+get_short_sig(const hash_sig_t hash)
+{
+ return hash >> 16;
+}
+
+static inline uint32_t
+get_prim_bucket_index(const struct rte_hash *h, const hash_sig_t hash)
+{
+ return hash & h->bucket_bitmask;
+}
+
+static inline uint32_t
+get_alt_bucket_index(const struct rte_hash *h,
+ uint32_t cur_bkt_idx, uint16_t sig)
+{
+ return (cur_bkt_idx ^ sig) & h->bucket_bitmask;
+}
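+
+/*
+ * Worked example of the XOR derivation above (illustrative values,
+ * bucket_bitmask assumed to be 0xff): for a short signature of 0x1234 and
+ * a primary bucket index of 0x21, the alternative bucket index is
+ * (0x21 ^ 0x1234) & 0xff = 0x15; starting from the alternative bucket,
+ * (0x15 ^ 0x1234) & 0xff = 0x21 recovers the primary bucket index.
+ */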
+
struct rte_hash *
rte_hash_create(const struct rte_hash_parameters *params)
{
@@ -85,14 +127,22 @@ rte_hash_create(const struct rte_hash_parameters *params)
struct rte_tailq_entry *te = NULL;
struct rte_hash_list *hash_list;
struct rte_ring *r = NULL;
+ struct rte_ring *r_ext = NULL;
char hash_name[RTE_HASH_NAMESIZE];
void *k = NULL;
void *buckets = NULL;
+ void *buckets_ext = NULL;
char ring_name[RTE_RING_NAMESIZE];
+ char ext_ring_name[RTE_RING_NAMESIZE];
unsigned num_key_slots;
unsigned i;
- unsigned int hw_trans_mem_support = 0, multi_writer_support = 0;
+ unsigned int hw_trans_mem_support = 0, use_local_cache = 0;
+ unsigned int ext_table_support = 0;
unsigned int readwrite_concur_support = 0;
+ unsigned int writer_takes_lock = 0;
+ unsigned int no_free_on_del = 0;
+ uint32_t *tbl_chng_cnt = NULL;
+ unsigned int readwrite_concur_lf_support = 0;
rte_hash_function default_hash_func = (rte_hash_function)rte_jhash;
@@ -112,20 +162,52 @@ rte_hash_create(const struct rte_hash_parameters *params)
return NULL;
}
+ /* Validate correct usage of extra options */
+ if ((params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY) &&
+ (params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF)) {
+ rte_errno = EINVAL;
+ RTE_LOG(ERR, HASH, "rte_hash_create: choose rw concurrency or "
+ "rw concurrency lock free\n");
+ return NULL;
+ }
+
+ if ((params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF) &&
+ (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE)) {
+ rte_errno = EINVAL;
+ RTE_LOG(ERR, HASH, "rte_hash_create: extendable bucket "
+ "feature not supported with rw concurrency "
+ "lock free\n");
+ return NULL;
+ }
+
/* Check extra flags field to check extra options. */
if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT)
hw_trans_mem_support = 1;
- if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD)
- multi_writer_support = 1;
+ if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD) {
+ use_local_cache = 1;
+ writer_takes_lock = 1;
+ }
if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY) {
readwrite_concur_support = 1;
- multi_writer_support = 1;
+ writer_takes_lock = 1;
+ }
+
+ if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE)
+ ext_table_support = 1;
+
+ if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL)
+ no_free_on_del = 1;
+
+ if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF) {
+ readwrite_concur_lf_support = 1;
+ /* Enable not freeing internal memory/index on delete */
+ no_free_on_del = 1;
}
/* Store all keys and leave the first entry as a dummy entry for lookup_bulk */
- if (multi_writer_support)
+ if (use_local_cache)
/*
* Increase number of slots by total number of indices
* that can be stored in the lcore caches
@@ -145,6 +227,24 @@ rte_hash_create(const struct rte_hash_parameters *params)
goto err;
}
+ const uint32_t num_buckets = rte_align32pow2(params->entries) /
+ RTE_HASH_BUCKET_ENTRIES;
+
+ /* Create ring for extendable buckets. */
+ if (ext_table_support) {
+ snprintf(ext_ring_name, sizeof(ext_ring_name), "HT_EXT_%s",
+ params->name);
+ r_ext = rte_ring_create(ext_ring_name,
+ rte_align32pow2(num_buckets + 1),
+ params->socket_id, 0);
+
+ if (r_ext == NULL) {
+ RTE_LOG(ERR, HASH, "ext buckets memory allocation "
+ "failed\n");
+ goto err;
+ }
+ }
+
snprintf(hash_name, sizeof(hash_name), "HT_%s", params->name);
rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
@@ -177,19 +277,37 @@ rte_hash_create(const struct rte_hash_parameters *params)
goto err_unlock;
}
- const uint32_t num_buckets = rte_align32pow2(params->entries)
- / RTE_HASH_BUCKET_ENTRIES;
-
buckets = rte_zmalloc_socket(NULL,
num_buckets * sizeof(struct rte_hash_bucket),
RTE_CACHE_LINE_SIZE, params->socket_id);
if (buckets == NULL) {
- RTE_LOG(ERR, HASH, "memory allocation failed\n");
+ RTE_LOG(ERR, HASH, "buckets memory allocation failed\n");
goto err_unlock;
}
- const uint32_t key_entry_size = sizeof(struct rte_hash_key) + params->key_len;
+ /* Allocate same number of extendable buckets */
+ if (ext_table_support) {
+ buckets_ext = rte_zmalloc_socket(NULL,
+ num_buckets * sizeof(struct rte_hash_bucket),
+ RTE_CACHE_LINE_SIZE, params->socket_id);
+ if (buckets_ext == NULL) {
+ RTE_LOG(ERR, HASH, "ext buckets memory allocation "
+ "failed\n");
+ goto err_unlock;
+ }
+ /* Populate ext bkt ring. We reserve 0 similar to the
+ * key-data slot, just in case in future we want to
+ * use bucket index for the linked list and 0 means NULL
+ * for next bucket
+ */
+ for (i = 1; i <= num_buckets; i++)
+ rte_ring_sp_enqueue(r_ext, (void *)((uintptr_t) i));
+ }
+
+ const uint32_t key_entry_size =
+ RTE_ALIGN(sizeof(struct rte_hash_key) + params->key_len,
+ KEY_ALIGNMENT);
const uint64_t key_tbl_size = (uint64_t) key_entry_size * num_key_slots;
k = rte_zmalloc_socket(NULL, key_tbl_size,
@@ -200,6 +318,14 @@ rte_hash_create(const struct rte_hash_parameters *params)
goto err_unlock;
}
+ tbl_chng_cnt = rte_zmalloc_socket(NULL, sizeof(uint32_t),
+ RTE_CACHE_LINE_SIZE, params->socket_id);
+
+ if (tbl_chng_cnt == NULL) {
+ RTE_LOG(ERR, HASH, "memory allocation failed\n");
+ goto err_unlock;
+ }
+
/*
* If x86 architecture is used, select appropriate compare function,
* which may use x86 intrinsics, otherwise use memcmp
@@ -239,7 +365,7 @@ rte_hash_create(const struct rte_hash_parameters *params)
h->cmp_jump_table_idx = KEY_OTHER_BYTES;
#endif
- if (multi_writer_support) {
+ if (use_local_cache) {
h->local_free_slots = rte_zmalloc_socket(NULL,
sizeof(struct lcore_cache) * RTE_MAX_LCORE,
RTE_CACHE_LINE_SIZE, params->socket_id);
@@ -262,27 +388,34 @@ rte_hash_create(const struct rte_hash_parameters *params)
h->num_buckets = num_buckets;
h->bucket_bitmask = h->num_buckets - 1;
h->buckets = buckets;
+ h->buckets_ext = buckets_ext;
+ h->free_ext_bkts = r_ext;
h->hash_func = (params->hash_func == NULL) ?
default_hash_func : params->hash_func;
h->key_store = k;
h->free_slots = r;
+ h->tbl_chng_cnt = tbl_chng_cnt;
+ *h->tbl_chng_cnt = 0;
h->hw_trans_mem_support = hw_trans_mem_support;
- h->multi_writer_support = multi_writer_support;
+ h->use_local_cache = use_local_cache;
h->readwrite_concur_support = readwrite_concur_support;
+ h->ext_table_support = ext_table_support;
+ h->writer_takes_lock = writer_takes_lock;
+ h->no_free_on_del = no_free_on_del;
+ h->readwrite_concur_lf_support = readwrite_concur_lf_support;
#if defined(RTE_ARCH_X86)
- if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2))
- h->sig_cmp_fn = RTE_HASH_COMPARE_AVX2;
- else if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE2))
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE2))
h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
else
#endif
h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
- /* Turn on multi-writer only with explicit flag from user and TM
- * support.
+ /* Writer threads need to take the lock when:
+ * 1) RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY is enabled OR
+ * 2) RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD is enabled
*/
- if (h->multi_writer_support) {
+ if (h->writer_takes_lock) {
h->readwrite_lock = rte_malloc(NULL, sizeof(rte_rwlock_t),
RTE_CACHE_LINE_SIZE);
if (h->readwrite_lock == NULL)
@@ -304,10 +437,13 @@ err_unlock:
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
err:
rte_ring_free(r);
+ rte_ring_free(r_ext);
rte_free(te);
rte_free(h);
rte_free(buckets);
+ rte_free(buckets_ext);
rte_free(k);
+ rte_free(tbl_chng_cnt);
return NULL;
}
@@ -339,13 +475,16 @@ rte_hash_free(struct rte_hash *h)
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
- if (h->multi_writer_support) {
+ if (h->use_local_cache)
rte_free(h->local_free_slots);
+ if (h->writer_takes_lock)
rte_free(h->readwrite_lock);
- }
rte_ring_free(h->free_slots);
+ rte_ring_free(h->free_ext_bkts);
rte_free(h->key_store);
rte_free(h->buckets);
+ rte_free(h->buckets_ext);
+ rte_free(h->tbl_chng_cnt);
rte_free(h);
rte_free(te);
}
@@ -357,18 +496,6 @@ rte_hash_hash(const struct rte_hash *h, const void *key)
return h->hash_func(key, h->key_len, h->hash_func_init_val);
}
-/* Calc the secondary hash value from the primary hash value of a given key */
-static inline hash_sig_t
-rte_hash_secondary_hash(const hash_sig_t primary_hash)
-{
- static const unsigned all_bits_shift = 12;
- static const unsigned alt_bits_xor = 0x5bd1e995;
-
- uint32_t tag = primary_hash >> all_bits_shift;
-
- return primary_hash ^ ((tag + 1) * alt_bits_xor);
-}
-
int32_t
rte_hash_count(const struct rte_hash *h)
{
@@ -378,7 +505,7 @@ rte_hash_count(const struct rte_hash *h)
if (h == NULL)
return -EINVAL;
- if (h->multi_writer_support) {
+ if (h->use_local_cache) {
tot_ring_cnt = h->entries + (RTE_MAX_LCORE - 1) *
(LCORE_CACHE_SIZE - 1);
for (i = 0; i < RTE_MAX_LCORE; i++)
@@ -397,13 +524,12 @@ rte_hash_count(const struct rte_hash *h)
static inline void
__hash_rw_writer_lock(const struct rte_hash *h)
{
- if (h->multi_writer_support && h->hw_trans_mem_support)
+ if (h->writer_takes_lock && h->hw_trans_mem_support)
rte_rwlock_write_lock_tm(h->readwrite_lock);
- else if (h->multi_writer_support)
+ else if (h->writer_takes_lock)
rte_rwlock_write_lock(h->readwrite_lock);
}
-
static inline void
__hash_rw_reader_lock(const struct rte_hash *h)
{
@@ -416,9 +542,9 @@ __hash_rw_reader_lock(const struct rte_hash *h)
static inline void
__hash_rw_writer_unlock(const struct rte_hash *h)
{
- if (h->multi_writer_support && h->hw_trans_mem_support)
+ if (h->writer_takes_lock && h->hw_trans_mem_support)
rte_rwlock_write_unlock_tm(h->readwrite_lock);
- else if (h->multi_writer_support)
+ else if (h->writer_takes_lock)
rte_rwlock_write_unlock(h->readwrite_lock);
}
@@ -443,13 +569,22 @@ rte_hash_reset(struct rte_hash *h)
__hash_rw_writer_lock(h);
memset(h->buckets, 0, h->num_buckets * sizeof(struct rte_hash_bucket));
memset(h->key_store, 0, h->key_entry_size * (h->entries + 1));
+ *h->tbl_chng_cnt = 0;
/* clear the free ring */
while (rte_ring_dequeue(h->free_slots, &ptr) == 0)
- rte_pause();
+ continue;
+
+ /* clear free extendable bucket ring and memory */
+ if (h->ext_table_support) {
+ memset(h->buckets_ext, 0, h->num_buckets *
+ sizeof(struct rte_hash_bucket));
+ while (rte_ring_dequeue(h->free_ext_bkts, &ptr) == 0)
+ continue;
+ }
/* Repopulate the free slots ring. Entry zero is reserved for key misses */
- if (h->multi_writer_support)
+ if (h->use_local_cache)
tot_ring_cnt = h->entries + (RTE_MAX_LCORE - 1) *
(LCORE_CACHE_SIZE - 1);
else
@@ -458,7 +593,14 @@ rte_hash_reset(struct rte_hash *h)
for (i = 1; i < tot_ring_cnt + 1; i++)
rte_ring_sp_enqueue(h->free_slots, (void *)((uintptr_t) i));
- if (h->multi_writer_support) {
+ /* Repopulate the free ext bkt ring. */
+ if (h->ext_table_support) {
+ for (i = 1; i <= h->num_buckets; i++)
+ rte_ring_sp_enqueue(h->free_ext_bkts,
+ (void *)((uintptr_t) i));
+ }
+
+ if (h->use_local_cache) {
/* Reset local caches per lcore */
for (i = 0; i < RTE_MAX_LCORE; i++)
h->local_free_slots[i].len = 0;
@@ -476,29 +618,35 @@ enqueue_slot_back(const struct rte_hash *h,
struct lcore_cache *cached_free_slots,
void *slot_id)
{
- if (h->multi_writer_support) {
+ if (h->use_local_cache) {
cached_free_slots->objs[cached_free_slots->len] = slot_id;
cached_free_slots->len++;
} else
rte_ring_sp_enqueue(h->free_slots, slot_id);
}
-/* Search a key from bucket and update its data */
+/* Search for a key in the bucket and update its data.
+ * The writer holds the lock while calling this function.
+ */
static inline int32_t
search_and_update(const struct rte_hash *h, void *data, const void *key,
- struct rte_hash_bucket *bkt, hash_sig_t sig, hash_sig_t alt_hash)
+ struct rte_hash_bucket *bkt, uint16_t sig)
{
int i;
struct rte_hash_key *k, *keys = h->key_store;
for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
- if (bkt->sig_current[i] == sig &&
- bkt->sig_alt[i] == alt_hash) {
+ if (bkt->sig_current[i] == sig) {
k = (struct rte_hash_key *) ((char *)keys +
bkt->key_idx[i] * h->key_entry_size);
if (rte_hash_cmp_eq(key, k->key, h) == 0) {
- /* Update data */
- k->pdata = data;
+ /* 'pdata' acts as the synchronization point
+ * when an existing hash entry is updated.
+ * Key is not updated in this case.
+ */
+ __atomic_store_n(&k->pdata,
+ data,
+ __ATOMIC_RELEASE);
/*
* Return index where key is stored,
* subtracting the first dummy index
@@ -520,28 +668,31 @@ rte_hash_cuckoo_insert_mw(const struct rte_hash *h,
struct rte_hash_bucket *prim_bkt,
struct rte_hash_bucket *sec_bkt,
const struct rte_hash_key *key, void *data,
- hash_sig_t sig, hash_sig_t alt_hash, uint32_t new_idx,
+ uint16_t sig, uint32_t new_idx,
int32_t *ret_val)
{
unsigned int i;
- struct rte_hash_bucket *cur_bkt = prim_bkt;
+ struct rte_hash_bucket *cur_bkt;
int32_t ret;
__hash_rw_writer_lock(h);
/* Check if key was inserted after last check but before this
* protected region in case of inserting duplicated keys.
*/
- ret = search_and_update(h, data, key, cur_bkt, sig, alt_hash);
+ ret = search_and_update(h, data, key, prim_bkt, sig);
if (ret != -1) {
__hash_rw_writer_unlock(h);
*ret_val = ret;
return 1;
}
- ret = search_and_update(h, data, key, sec_bkt, alt_hash, sig);
- if (ret != -1) {
- __hash_rw_writer_unlock(h);
- *ret_val = ret;
- return 1;
+
+ FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
+ ret = search_and_update(h, data, key, cur_bkt, sig);
+ if (ret != -1) {
+ __hash_rw_writer_unlock(h);
+ *ret_val = ret;
+ return 1;
+ }
}
/* Insert new entry if there is room in the primary
@@ -551,8 +702,15 @@ rte_hash_cuckoo_insert_mw(const struct rte_hash *h,
/* Check if slot is available */
if (likely(prim_bkt->key_idx[i] == EMPTY_SLOT)) {
prim_bkt->sig_current[i] = sig;
- prim_bkt->sig_alt[i] = alt_hash;
- prim_bkt->key_idx[i] = new_idx;
+ /* Key can be of arbitrary length, so it is
+ * not possible to store it atomically.
+ * Hence the new key element's memory stores
+ * (key as well as data) should be complete
+ * before it is referenced.
+ */
+ __atomic_store_n(&prim_bkt->key_idx[i],
+ new_idx,
+ __ATOMIC_RELEASE);
break;
}
}
@@ -576,11 +734,11 @@ rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h,
struct rte_hash_bucket *alt_bkt,
const struct rte_hash_key *key, void *data,
struct queue_node *leaf, uint32_t leaf_slot,
- hash_sig_t sig, hash_sig_t alt_hash, uint32_t new_idx,
+ uint16_t sig, uint32_t new_idx,
int32_t *ret_val)
{
uint32_t prev_alt_bkt_idx;
- struct rte_hash_bucket *cur_bkt = bkt;
+ struct rte_hash_bucket *cur_bkt;
struct queue_node *prev_node, *curr_node = leaf;
struct rte_hash_bucket *prev_bkt, *curr_bkt = leaf->bkt;
uint32_t prev_slot, curr_slot = leaf_slot;
@@ -597,18 +755,20 @@ rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h,
/* Check if key was inserted after last check but before this
* protected region.
*/
- ret = search_and_update(h, data, key, cur_bkt, sig, alt_hash);
+ ret = search_and_update(h, data, key, bkt, sig);
if (ret != -1) {
__hash_rw_writer_unlock(h);
*ret_val = ret;
return 1;
}
- ret = search_and_update(h, data, key, alt_bkt, alt_hash, sig);
- if (ret != -1) {
- __hash_rw_writer_unlock(h);
- *ret_val = ret;
- return 1;
+ FOR_EACH_BUCKET(cur_bkt, alt_bkt) {
+ ret = search_and_update(h, data, key, cur_bkt, sig);
+ if (ret != -1) {
+ __hash_rw_writer_unlock(h);
+ *ret_val = ret;
+ return 1;
+ }
}
while (likely(curr_node->prev != NULL)) {
@@ -616,36 +776,73 @@ rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h,
prev_bkt = prev_node->bkt;
prev_slot = curr_node->prev_slot;
- prev_alt_bkt_idx =
- prev_bkt->sig_alt[prev_slot] & h->bucket_bitmask;
+ prev_alt_bkt_idx = get_alt_bucket_index(h,
+ prev_node->cur_bkt_idx,
+ prev_bkt->sig_current[prev_slot]);
if (unlikely(&h->buckets[prev_alt_bkt_idx]
!= curr_bkt)) {
/* revert it to empty, otherwise duplicated keys */
- curr_bkt->key_idx[curr_slot] = EMPTY_SLOT;
+ __atomic_store_n(&curr_bkt->key_idx[curr_slot],
+ EMPTY_SLOT,
+ __ATOMIC_RELEASE);
__hash_rw_writer_unlock(h);
return -1;
}
+ if (h->readwrite_concur_lf_support) {
+ /* Inform the previous move. The current move need
+ * not be informed now as the current bucket entry
+ * is present in both primary and secondary.
+ * Since there is one writer, load acquires on
+ * tbl_chng_cnt are not required.
+ */
+ __atomic_store_n(h->tbl_chng_cnt,
+ *h->tbl_chng_cnt + 1,
+ __ATOMIC_RELEASE);
+ /* The store to sig_current should not
+ * move above the store to tbl_chng_cnt.
+ */
+ __atomic_thread_fence(__ATOMIC_RELEASE);
+ }
+
/* Need to swap current/alt sig to allow later
* Cuckoo insert to move elements back to its
* primary bucket if available
*/
- curr_bkt->sig_alt[curr_slot] =
- prev_bkt->sig_current[prev_slot];
curr_bkt->sig_current[curr_slot] =
- prev_bkt->sig_alt[prev_slot];
- curr_bkt->key_idx[curr_slot] =
- prev_bkt->key_idx[prev_slot];
+ prev_bkt->sig_current[prev_slot];
+ /* Release the updated bucket entry */
+ __atomic_store_n(&curr_bkt->key_idx[curr_slot],
+ prev_bkt->key_idx[prev_slot],
+ __ATOMIC_RELEASE);
curr_slot = prev_slot;
curr_node = prev_node;
curr_bkt = curr_node->bkt;
}
+ if (h->readwrite_concur_lf_support) {
+ /* Inform the previous move. The current move need
+ * not be informed now as the current bucket entry
+ * is present in both primary and secondary.
+ * Since there is one writer, load acquires on
+ * tbl_chng_cnt are not required.
+ */
+ __atomic_store_n(h->tbl_chng_cnt,
+ *h->tbl_chng_cnt + 1,
+ __ATOMIC_RELEASE);
+ /* The store to sig_current should not
+ * move above the store to tbl_chng_cnt.
+ */
+ __atomic_thread_fence(__ATOMIC_RELEASE);
+ }
+
curr_bkt->sig_current[curr_slot] = sig;
- curr_bkt->sig_alt[curr_slot] = alt_hash;
- curr_bkt->key_idx[curr_slot] = new_idx;
+ /* Release the new bucket entry */
+ __atomic_store_n(&curr_bkt->key_idx[curr_slot],
+ new_idx,
+ __ATOMIC_RELEASE);
__hash_rw_writer_unlock(h);
@@ -662,39 +859,44 @@ rte_hash_cuckoo_make_space_mw(const struct rte_hash *h,
struct rte_hash_bucket *bkt,
struct rte_hash_bucket *sec_bkt,
const struct rte_hash_key *key, void *data,
- hash_sig_t sig, hash_sig_t alt_hash,
+ uint16_t sig, uint32_t bucket_idx,
uint32_t new_idx, int32_t *ret_val)
{
unsigned int i;
struct queue_node queue[RTE_HASH_BFS_QUEUE_MAX_LEN];
struct queue_node *tail, *head;
struct rte_hash_bucket *curr_bkt, *alt_bkt;
+ uint32_t cur_idx, alt_idx;
tail = queue;
head = queue + 1;
tail->bkt = bkt;
tail->prev = NULL;
tail->prev_slot = -1;
+ tail->cur_bkt_idx = bucket_idx;
/* Cuckoo bfs Search */
while (likely(tail != head && head <
queue + RTE_HASH_BFS_QUEUE_MAX_LEN -
RTE_HASH_BUCKET_ENTRIES)) {
curr_bkt = tail->bkt;
+ cur_idx = tail->cur_bkt_idx;
for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
if (curr_bkt->key_idx[i] == EMPTY_SLOT) {
int32_t ret = rte_hash_cuckoo_move_insert_mw(h,
bkt, sec_bkt, key, data,
- tail, i, sig, alt_hash,
+ tail, i, sig,
new_idx, ret_val);
if (likely(ret != -1))
return ret;
}
/* Enqueue new node and keep prev node info */
- alt_bkt = &(h->buckets[curr_bkt->sig_alt[i]
- & h->bucket_bitmask]);
+ alt_idx = get_alt_bucket_index(h, cur_idx,
+ curr_bkt->sig_current[i]);
+ alt_bkt = &(h->buckets[alt_idx]);
head->bkt = alt_bkt;
+ head->cur_bkt_idx = alt_idx;
head->prev = tail;
head->prev_slot = i;
head++;
@@ -709,45 +911,50 @@ static inline int32_t
__rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
hash_sig_t sig, void *data)
{
- hash_sig_t alt_hash;
+ uint16_t short_sig;
uint32_t prim_bucket_idx, sec_bucket_idx;
- struct rte_hash_bucket *prim_bkt, *sec_bkt;
+ struct rte_hash_bucket *prim_bkt, *sec_bkt, *cur_bkt;
struct rte_hash_key *new_k, *keys = h->key_store;
void *slot_id = NULL;
- uint32_t new_idx;
+ void *ext_bkt_id = NULL;
+ uint32_t new_idx, bkt_id;
int ret;
unsigned n_slots;
unsigned lcore_id;
+ unsigned int i;
struct lcore_cache *cached_free_slots = NULL;
int32_t ret_val;
+ struct rte_hash_bucket *last;
- prim_bucket_idx = sig & h->bucket_bitmask;
+ short_sig = get_short_sig(sig);
+ prim_bucket_idx = get_prim_bucket_index(h, sig);
+ sec_bucket_idx = get_alt_bucket_index(h, prim_bucket_idx, short_sig);
prim_bkt = &h->buckets[prim_bucket_idx];
- rte_prefetch0(prim_bkt);
-
- alt_hash = rte_hash_secondary_hash(sig);
- sec_bucket_idx = alt_hash & h->bucket_bitmask;
sec_bkt = &h->buckets[sec_bucket_idx];
+ rte_prefetch0(prim_bkt);
rte_prefetch0(sec_bkt);
/* Check if key is already inserted in primary location */
__hash_rw_writer_lock(h);
- ret = search_and_update(h, data, key, prim_bkt, sig, alt_hash);
+ ret = search_and_update(h, data, key, prim_bkt, short_sig);
if (ret != -1) {
__hash_rw_writer_unlock(h);
return ret;
}
/* Check if key is already inserted in secondary location */
- ret = search_and_update(h, data, key, sec_bkt, alt_hash, sig);
- if (ret != -1) {
- __hash_rw_writer_unlock(h);
- return ret;
+ FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
+ ret = search_and_update(h, data, key, cur_bkt, short_sig);
+ if (ret != -1) {
+ __hash_rw_writer_unlock(h);
+ return ret;
+ }
}
+
__hash_rw_writer_unlock(h);
/* Did not find a match, so get a new slot for storing the new key */
- if (h->multi_writer_support) {
+ if (h->use_local_cache) {
lcore_id = rte_lcore_id();
cached_free_slots = &h->local_free_slots[lcore_id];
/* Try to get a free slot from the local cache */
@@ -776,12 +983,19 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
new_idx = (uint32_t)((uintptr_t) slot_id);
/* Copy key */
rte_memcpy(new_k->key, key, h->key_len);
- new_k->pdata = data;
-
+ /* Key can be of arbitrary length, so it is not possible to store
+ * it atomically. Hence the new key element's memory stores
+ * (key as well as data) should be complete before it is referenced.
+ * 'pdata' acts as the synchronization point when an existing hash
+ * entry is updated.
+ */
+ __atomic_store_n(&new_k->pdata,
+ data,
+ __ATOMIC_RELEASE);
/* Find an empty slot and insert */
ret = rte_hash_cuckoo_insert_mw(h, prim_bkt, sec_bkt, key, data,
- sig, alt_hash, new_idx, &ret_val);
+ short_sig, new_idx, &ret_val);
if (ret == 0)
return new_idx - 1;
else if (ret == 1) {
@@ -791,7 +1005,7 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
/* Primary bucket full, need to make space for new entry */
ret = rte_hash_cuckoo_make_space_mw(h, prim_bkt, sec_bkt, key, data,
- sig, alt_hash, new_idx, &ret_val);
+ short_sig, prim_bucket_idx, new_idx, &ret_val);
if (ret == 0)
return new_idx - 1;
else if (ret == 1) {
@@ -801,17 +1015,75 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
/* Also search secondary bucket to get better occupancy */
ret = rte_hash_cuckoo_make_space_mw(h, sec_bkt, prim_bkt, key, data,
- alt_hash, sig, new_idx, &ret_val);
+ short_sig, sec_bucket_idx, new_idx, &ret_val);
if (ret == 0)
return new_idx - 1;
else if (ret == 1) {
enqueue_slot_back(h, cached_free_slots, slot_id);
return ret_val;
- } else {
+ }
+
+ /* if the ext table is not enabled, the insertion has failed */
+ if (!h->ext_table_support) {
enqueue_slot_back(h, cached_free_slots, slot_id);
return ret;
}
+
+ /* Now we need to go through the extendable buckets. Locking is
+ * needed to cover all operations on the extendable bucket chain.
+ */
+ __hash_rw_writer_lock(h);
+ /* Check for duplicates again, since the key could have been inserted before the lock was taken */
+ ret = search_and_update(h, data, key, prim_bkt, short_sig);
+ if (ret != -1) {
+ enqueue_slot_back(h, cached_free_slots, slot_id);
+ goto failure;
+ }
+
+ FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
+ ret = search_and_update(h, data, key, cur_bkt, short_sig);
+ if (ret != -1) {
+ enqueue_slot_back(h, cached_free_slots, slot_id);
+ goto failure;
+ }
+ }
+
+ /* Search sec and ext buckets to find an empty entry to insert. */
+ FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
+ for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+ /* Check if slot is available */
+ if (likely(cur_bkt->key_idx[i] == EMPTY_SLOT)) {
+ cur_bkt->sig_current[i] = short_sig;
+ cur_bkt->key_idx[i] = new_idx;
+ __hash_rw_writer_unlock(h);
+ return new_idx - 1;
+ }
+ }
+ }
+
+ /* Failed to get an empty entry from extendable buckets. Link a new
+ * extendable bucket. We first get a free bucket from the ring.
+ */
+ if (rte_ring_sc_dequeue(h->free_ext_bkts, &ext_bkt_id) != 0) {
+ ret = -ENOSPC;
+ goto failure;
+ }
+
+ bkt_id = (uint32_t)((uintptr_t)ext_bkt_id) - 1;
+ /* Use the first location of the new bucket */
+ (h->buckets_ext[bkt_id]).sig_current[0] = short_sig;
+ (h->buckets_ext[bkt_id]).key_idx[0] = new_idx;
+ /* Link the new bucket to sec bucket linked list */
+ last = rte_hash_get_last_bkt(sec_bkt);
+ last->next = &h->buckets_ext[bkt_id];
+ __hash_rw_writer_unlock(h);
+ return new_idx - 1;
+
+failure:
+ __hash_rw_writer_unlock(h);
+ return ret;
+
}
int32_t
@@ -859,25 +1131,31 @@ rte_hash_add_key_data(const struct rte_hash *h, const void *key, void *data)
/* Search one bucket to find the match key */
static inline int32_t
-search_one_bucket(const struct rte_hash *h, const void *key, hash_sig_t sig,
+search_one_bucket(const struct rte_hash *h, const void *key, uint16_t sig,
void **data, const struct rte_hash_bucket *bkt)
{
int i;
+ uint32_t key_idx;
+ void *pdata;
struct rte_hash_key *k, *keys = h->key_store;
for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
- if (bkt->sig_current[i] == sig &&
- bkt->key_idx[i] != EMPTY_SLOT) {
+ key_idx = __atomic_load_n(&bkt->key_idx[i],
+ __ATOMIC_ACQUIRE);
+ if (bkt->sig_current[i] == sig && key_idx != EMPTY_SLOT) {
k = (struct rte_hash_key *) ((char *)keys +
- bkt->key_idx[i] * h->key_entry_size);
+ key_idx * h->key_entry_size);
+ pdata = __atomic_load_n(&k->pdata,
+ __ATOMIC_ACQUIRE);
+
if (rte_hash_cmp_eq(key, k->key, h) == 0) {
if (data != NULL)
- *data = k->pdata;
+ *data = pdata;
/*
* Return index where key is stored,
* subtracting the first dummy index
*/
- return bkt->key_idx[i] - 1;
+ return key_idx - 1;
}
}
}
@@ -888,34 +1166,64 @@ static inline int32_t
__rte_hash_lookup_with_hash(const struct rte_hash *h, const void *key,
hash_sig_t sig, void **data)
{
- uint32_t bucket_idx;
- hash_sig_t alt_hash;
- struct rte_hash_bucket *bkt;
+ uint32_t prim_bucket_idx, sec_bucket_idx;
+ struct rte_hash_bucket *bkt, *cur_bkt;
+ uint32_t cnt_b, cnt_a;
int ret;
+ uint16_t short_sig;
- bucket_idx = sig & h->bucket_bitmask;
- bkt = &h->buckets[bucket_idx];
+ short_sig = get_short_sig(sig);
+ prim_bucket_idx = get_prim_bucket_index(h, sig);
+ sec_bucket_idx = get_alt_bucket_index(h, prim_bucket_idx, short_sig);
__hash_rw_reader_lock(h);
- /* Check if key is in primary location */
- ret = search_one_bucket(h, key, sig, data, bkt);
- if (ret != -1) {
- __hash_rw_reader_unlock(h);
- return ret;
- }
- /* Calculate secondary hash */
- alt_hash = rte_hash_secondary_hash(sig);
- bucket_idx = alt_hash & h->bucket_bitmask;
- bkt = &h->buckets[bucket_idx];
+ do {
+ /* Load the table change counter before the lookup
+ * starts. Acquire semantics will make sure that
+ * loads in search_one_bucket are not hoisted.
+ */
+ cnt_b = __atomic_load_n(h->tbl_chng_cnt,
+ __ATOMIC_ACQUIRE);
+
+ /* Check if key is in primary location */
+ bkt = &h->buckets[prim_bucket_idx];
+ ret = search_one_bucket(h, key, short_sig, data, bkt);
+ if (ret != -1) {
+ __hash_rw_reader_unlock(h);
+ return ret;
+ }
+ /* Calculate secondary hash */
+ bkt = &h->buckets[sec_bucket_idx];
+
+ /* Check if key is in secondary location */
+ FOR_EACH_BUCKET(cur_bkt, bkt) {
+ ret = search_one_bucket(h, key, short_sig,
+ data, cur_bkt);
+ if (ret != -1) {
+ __hash_rw_reader_unlock(h);
+ return ret;
+ }
+ }
+
+ /* The loads of sig_current in search_one_bucket
+ * should not move below the load from tbl_chng_cnt.
+ */
+ __atomic_thread_fence(__ATOMIC_ACQUIRE);
+ /* Re-read the table change counter to check if the
+ * table has changed during search. If yes, re-do
+ * the search.
+ * This load should not get hoisted. The load
+ * acquires on cnt_b, key index in primary bucket
+ * and key index in secondary bucket will make sure
+ * that it does not get hoisted.
+ */
+ cnt_a = __atomic_load_n(h->tbl_chng_cnt,
+ __ATOMIC_ACQUIRE);
+ } while (cnt_b != cnt_a);
- /* Check if key is in secondary location */
- ret = search_one_bucket(h, key, alt_hash, data, bkt);
- if (ret != -1) {
- __hash_rw_reader_unlock(h);
- return ret;
- }
__hash_rw_reader_unlock(h);
+
return -ENOENT;
}
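The lookup path above re-runs its search whenever tbl_chng_cnt changes between the acquire-load taken before the search and the one taken after it. The following is a minimal, self-contained sketch of that counter-based retry pattern; the names used ('tbl', 'search_buckets', 'lookup_with_retry') are hypothetical stand-ins, not rte_hash symbols, and the sketch only illustrates the memory ordering, not the real bucket walk.

#include <stdint.h>
#include <stddef.h>

struct tbl {
	uint32_t chng_cnt;  /* bumped by the writer (release) on every cuckoo move */
};

/* Placeholder for the actual bucket search; returns >= 0 on hit, -1 on miss */
static int
search_buckets(const struct tbl *t, const void *key, void **data)
{
	(void)t; (void)key; (void)data;
	return -1;
}

static int
lookup_with_retry(const struct tbl *t, const void *key, void **data)
{
	uint32_t cnt_b, cnt_a;
	int ret;

	do {
		/* Acquire: the searches below cannot be hoisted above this load */
		cnt_b = __atomic_load_n(&t->chng_cnt, __ATOMIC_ACQUIRE);

		ret = search_buckets(t, key, data);
		if (ret != -1)
			return ret;          /* hit: no retry needed */

		/* Keep the signature loads above the counter re-read */
		__atomic_thread_fence(__ATOMIC_ACQUIRE);
		cnt_a = __atomic_load_n(&t->chng_cnt, __ATOMIC_ACQUIRE);
	} while (cnt_b != cnt_a);            /* a cuckoo move raced with us: retry */

	return -1;                           /* definite miss */
}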
@@ -955,9 +1263,7 @@ remove_entry(const struct rte_hash *h, struct rte_hash_bucket *bkt, unsigned i)
unsigned lcore_id, n_slots;
struct lcore_cache *cached_free_slots;
- bkt->sig_current[i] = NULL_SIGNATURE;
- bkt->sig_alt[i] = NULL_SIGNATURE;
- if (h->multi_writer_support) {
+ if (h->use_local_cache) {
lcore_id = rte_lcore_id();
cached_free_slots = &h->local_free_slots[lcore_id];
/* Cache full, need to free it. */
@@ -978,31 +1284,67 @@ remove_entry(const struct rte_hash *h, struct rte_hash_bucket *bkt, unsigned i)
}
}
-/* Search one bucket and remove the matched key */
+/* Compact the linked list by moving the key from the last entry in the
+ * linked list into the empty slot.
+ */
+static inline void
+__rte_hash_compact_ll(struct rte_hash_bucket *cur_bkt, int pos) {
+ int i;
+ struct rte_hash_bucket *last_bkt;
+
+ if (!cur_bkt->next)
+ return;
+
+ last_bkt = rte_hash_get_last_bkt(cur_bkt);
+
+ for (i = RTE_HASH_BUCKET_ENTRIES - 1; i >= 0; i--) {
+ if (last_bkt->key_idx[i] != EMPTY_SLOT) {
+ cur_bkt->key_idx[pos] = last_bkt->key_idx[i];
+ cur_bkt->sig_current[pos] = last_bkt->sig_current[i];
+ last_bkt->sig_current[i] = NULL_SIGNATURE;
+ last_bkt->key_idx[i] = EMPTY_SLOT;
+ return;
+ }
+ }
+}
+
+/* Search one bucket and remove the matched key.
+ * Writer is expected to hold the lock while calling this
+ * function.
+ */
static inline int32_t
search_and_remove(const struct rte_hash *h, const void *key,
- struct rte_hash_bucket *bkt, hash_sig_t sig)
+ struct rte_hash_bucket *bkt, uint16_t sig, int *pos)
{
struct rte_hash_key *k, *keys = h->key_store;
unsigned int i;
- int32_t ret;
+ uint32_t key_idx;
- /* Check if key is in primary location */
+ /* Check if key is in bucket */
for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
- if (bkt->sig_current[i] == sig &&
- bkt->key_idx[i] != EMPTY_SLOT) {
+ key_idx = __atomic_load_n(&bkt->key_idx[i],
+ __ATOMIC_ACQUIRE);
+ if (bkt->sig_current[i] == sig && key_idx != EMPTY_SLOT) {
k = (struct rte_hash_key *) ((char *)keys +
- bkt->key_idx[i] * h->key_entry_size);
+ key_idx * h->key_entry_size);
if (rte_hash_cmp_eq(key, k->key, h) == 0) {
- remove_entry(h, bkt, i);
+ bkt->sig_current[i] = NULL_SIGNATURE;
+ /* Free the key store index if
+ * no_free_on_del is disabled.
+ */
+ if (!h->no_free_on_del)
+ remove_entry(h, bkt, i);
+
+ __atomic_store_n(&bkt->key_idx[i],
+ EMPTY_SLOT,
+ __ATOMIC_RELEASE);
+ *pos = i;
/*
* Return index where key is stored,
* subtracting the first dummy index
*/
- ret = bkt->key_idx[i] - 1;
- bkt->key_idx[i] = EMPTY_SLOT;
- return ret;
+ return key_idx - 1;
}
}
}
@@ -1013,36 +1355,68 @@ static inline int32_t
__rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key,
hash_sig_t sig)
{
- uint32_t bucket_idx;
- hash_sig_t alt_hash;
- struct rte_hash_bucket *bkt;
- int32_t ret;
-
- bucket_idx = sig & h->bucket_bitmask;
- bkt = &h->buckets[bucket_idx];
+ uint32_t prim_bucket_idx, sec_bucket_idx;
+ struct rte_hash_bucket *prim_bkt, *sec_bkt, *prev_bkt, *last_bkt;
+ struct rte_hash_bucket *cur_bkt;
+ int pos;
+ int32_t ret, i;
+ uint16_t short_sig;
+
+ short_sig = get_short_sig(sig);
+ prim_bucket_idx = get_prim_bucket_index(h, sig);
+ sec_bucket_idx = get_alt_bucket_index(h, prim_bucket_idx, short_sig);
+ prim_bkt = &h->buckets[prim_bucket_idx];
__hash_rw_writer_lock(h);
/* look for key in primary bucket */
- ret = search_and_remove(h, key, bkt, sig);
+ ret = search_and_remove(h, key, prim_bkt, short_sig, &pos);
if (ret != -1) {
- __hash_rw_writer_unlock(h);
- return ret;
+ __rte_hash_compact_ll(prim_bkt, pos);
+ last_bkt = prim_bkt->next;
+ prev_bkt = prim_bkt;
+ goto return_bkt;
}
/* Calculate secondary hash */
- alt_hash = rte_hash_secondary_hash(sig);
- bucket_idx = alt_hash & h->bucket_bitmask;
- bkt = &h->buckets[bucket_idx];
+ sec_bkt = &h->buckets[sec_bucket_idx];
- /* look for key in secondary bucket */
- ret = search_and_remove(h, key, bkt, alt_hash);
- if (ret != -1) {
+ FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
+ ret = search_and_remove(h, key, cur_bkt, short_sig, &pos);
+ if (ret != -1) {
+ __rte_hash_compact_ll(cur_bkt, pos);
+ last_bkt = sec_bkt->next;
+ prev_bkt = sec_bkt;
+ goto return_bkt;
+ }
+ }
+
+ __hash_rw_writer_unlock(h);
+ return -ENOENT;
+
+/* Walk to the last bucket in the chain and check whether it is empty and can be recycled */
+return_bkt:
+ if (!last_bkt) {
__hash_rw_writer_unlock(h);
return ret;
}
+ while (last_bkt->next) {
+ prev_bkt = last_bkt;
+ last_bkt = last_bkt->next;
+ }
+
+ for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+ if (last_bkt->key_idx[i] != EMPTY_SLOT)
+ break;
+ }
+ /* last bucket is empty: unlink it and recycle it */
+ if (i == RTE_HASH_BUCKET_ENTRIES) {
+ prev_bkt->next = last_bkt->next = NULL;
+ uint32_t index = last_bkt - h->buckets_ext + 1;
+ rte_ring_sp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index);
+ }
__hash_rw_writer_unlock(h);
- return -ENOENT;
+ return ret;
}
int32_t
@@ -1080,59 +1454,76 @@ rte_hash_get_key_with_position(const struct rte_hash *h, const int32_t position,
return 0;
}
+int __rte_experimental
+rte_hash_free_key_with_position(const struct rte_hash *h,
+ const int32_t position)
+{
+ RETURN_IF_TRUE(((h == NULL) || (position == EMPTY_SLOT)), -EINVAL);
+
+ unsigned int lcore_id, n_slots;
+ struct lcore_cache *cached_free_slots;
+ const int32_t total_entries = h->num_buckets * RTE_HASH_BUCKET_ENTRIES;
+
+ /* Out of bounds */
+ if (position >= total_entries)
+ return -EINVAL;
+
+ if (h->use_local_cache) {
+ lcore_id = rte_lcore_id();
+ cached_free_slots = &h->local_free_slots[lcore_id];
+ /* Cache full, need to free it. */
+ if (cached_free_slots->len == LCORE_CACHE_SIZE) {
+ /* Need to enqueue the free slots in global ring. */
+ n_slots = rte_ring_mp_enqueue_burst(h->free_slots,
+ cached_free_slots->objs,
+ LCORE_CACHE_SIZE, NULL);
+ cached_free_slots->len -= n_slots;
+ }
+ /* Put index of new free slot in cache. */
+ cached_free_slots->objs[cached_free_slots->len] =
+ (void *)((uintptr_t)position);
+ cached_free_slots->len++;
+ } else {
+ rte_ring_sp_enqueue(h->free_slots,
+ (void *)((uintptr_t)position));
+ }
+
+ return 0;
+}
+
static inline void
compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
const struct rte_hash_bucket *prim_bkt,
const struct rte_hash_bucket *sec_bkt,
- hash_sig_t prim_hash, hash_sig_t sec_hash,
+ uint16_t sig,
enum rte_hash_sig_compare_function sig_cmp_fn)
{
unsigned int i;
+ /* In the match mask, the first bit of each two-bit pair indicates a match */
switch (sig_cmp_fn) {
-#ifdef RTE_MACHINE_CPUFLAG_AVX2
- case RTE_HASH_COMPARE_AVX2:
- *prim_hash_matches = _mm256_movemask_ps((__m256)_mm256_cmpeq_epi32(
- _mm256_load_si256(
- (__m256i const *)prim_bkt->sig_current),
- _mm256_set1_epi32(prim_hash)));
- *sec_hash_matches = _mm256_movemask_ps((__m256)_mm256_cmpeq_epi32(
- _mm256_load_si256(
- (__m256i const *)sec_bkt->sig_current),
- _mm256_set1_epi32(sec_hash)));
- break;
-#endif
#ifdef RTE_MACHINE_CPUFLAG_SSE2
case RTE_HASH_COMPARE_SSE:
- /* Compare the first 4 signatures in the bucket */
- *prim_hash_matches = _mm_movemask_ps((__m128)_mm_cmpeq_epi16(
+ /* Compare all signatures in the bucket */
+ *prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
_mm_load_si128(
(__m128i const *)prim_bkt->sig_current),
- _mm_set1_epi32(prim_hash)));
- *prim_hash_matches |= (_mm_movemask_ps((__m128)_mm_cmpeq_epi16(
- _mm_load_si128(
- (__m128i const *)&prim_bkt->sig_current[4]),
- _mm_set1_epi32(prim_hash)))) << 4;
- /* Compare the first 4 signatures in the bucket */
- *sec_hash_matches = _mm_movemask_ps((__m128)_mm_cmpeq_epi16(
+ _mm_set1_epi16(sig)));
+ /* Compare all signatures in the bucket */
+ *sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
_mm_load_si128(
(__m128i const *)sec_bkt->sig_current),
- _mm_set1_epi32(sec_hash)));
- *sec_hash_matches |= (_mm_movemask_ps((__m128)_mm_cmpeq_epi16(
- _mm_load_si128(
- (__m128i const *)&sec_bkt->sig_current[4]),
- _mm_set1_epi32(sec_hash)))) << 4;
+ _mm_set1_epi16(sig)));
break;
#endif
default:
for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
*prim_hash_matches |=
- ((prim_hash == prim_bkt->sig_current[i]) << i);
+ ((sig == prim_bkt->sig_current[i]) << (i << 1));
*sec_hash_matches |=
- ((sec_hash == sec_bkt->sig_current[i]) << i);
+ ((sig == sec_bkt->sig_current[i]) << (i << 1));
}
}
-
}
#define PREFETCH_OFFSET 4
@@ -1143,12 +1534,18 @@ __rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys,
{
uint64_t hits = 0;
int32_t i;
+ int32_t ret;
uint32_t prim_hash[RTE_HASH_LOOKUP_BULK_MAX];
- uint32_t sec_hash[RTE_HASH_LOOKUP_BULK_MAX];
+ uint32_t prim_index[RTE_HASH_LOOKUP_BULK_MAX];
+ uint32_t sec_index[RTE_HASH_LOOKUP_BULK_MAX];
+ uint16_t sig[RTE_HASH_LOOKUP_BULK_MAX];
const struct rte_hash_bucket *primary_bkt[RTE_HASH_LOOKUP_BULK_MAX];
const struct rte_hash_bucket *secondary_bkt[RTE_HASH_LOOKUP_BULK_MAX];
uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+ struct rte_hash_bucket *cur_bkt, *next_bkt;
+ void *pdata[RTE_HASH_LOOKUP_BULK_MAX];
+ uint32_t cnt_b, cnt_a;
/* Prefetch first keys */
for (i = 0; i < PREFETCH_OFFSET && i < num_keys; i++)
@@ -1162,10 +1559,13 @@ __rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys,
rte_prefetch0(keys[i + PREFETCH_OFFSET]);
prim_hash[i] = rte_hash_hash(h, keys[i]);
- sec_hash[i] = rte_hash_secondary_hash(prim_hash[i]);
- primary_bkt[i] = &h->buckets[prim_hash[i] & h->bucket_bitmask];
- secondary_bkt[i] = &h->buckets[sec_hash[i] & h->bucket_bitmask];
+ sig[i] = get_short_sig(prim_hash[i]);
+ prim_index[i] = get_prim_bucket_index(h, prim_hash[i]);
+ sec_index[i] = get_alt_bucket_index(h, prim_index[i], sig[i]);
+
+ primary_bkt[i] = &h->buckets[prim_index[i]];
+ secondary_bkt[i] = &h->buckets[sec_index[i]];
rte_prefetch0(primary_bkt[i]);
rte_prefetch0(secondary_bkt[i]);
@@ -1174,96 +1574,178 @@ __rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys,
/* Calculate and prefetch rest of the buckets */
for (; i < num_keys; i++) {
prim_hash[i] = rte_hash_hash(h, keys[i]);
- sec_hash[i] = rte_hash_secondary_hash(prim_hash[i]);
- primary_bkt[i] = &h->buckets[prim_hash[i] & h->bucket_bitmask];
- secondary_bkt[i] = &h->buckets[sec_hash[i] & h->bucket_bitmask];
+ sig[i] = get_short_sig(prim_hash[i]);
+ prim_index[i] = get_prim_bucket_index(h, prim_hash[i]);
+ sec_index[i] = get_alt_bucket_index(h, prim_index[i], sig[i]);
+
+ primary_bkt[i] = &h->buckets[prim_index[i]];
+ secondary_bkt[i] = &h->buckets[sec_index[i]];
rte_prefetch0(primary_bkt[i]);
rte_prefetch0(secondary_bkt[i]);
}
__hash_rw_reader_lock(h);
- /* Compare signatures and prefetch key slot of first hit */
- for (i = 0; i < num_keys; i++) {
- compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+ do {
+ /* Load the table change counter before the lookup
+ * starts. Acquire semantics will make sure that
+ * loads in compare_signatures are not hoisted.
+ */
+ cnt_b = __atomic_load_n(h->tbl_chng_cnt,
+ __ATOMIC_ACQUIRE);
+
+ /* Compare signatures and prefetch key slot of first hit */
+ for (i = 0; i < num_keys; i++) {
+ compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
primary_bkt[i], secondary_bkt[i],
- prim_hash[i], sec_hash[i], h->sig_cmp_fn);
-
- if (prim_hitmask[i]) {
- uint32_t first_hit = __builtin_ctzl(prim_hitmask[i]);
- uint32_t key_idx = primary_bkt[i]->key_idx[first_hit];
- const struct rte_hash_key *key_slot =
- (const struct rte_hash_key *)(
- (const char *)h->key_store +
- key_idx * h->key_entry_size);
- rte_prefetch0(key_slot);
- continue;
- }
+ sig[i], h->sig_cmp_fn);
+
+ if (prim_hitmask[i]) {
+ uint32_t first_hit =
+ __builtin_ctzl(prim_hitmask[i])
+ >> 1;
+ uint32_t key_idx =
+ primary_bkt[i]->key_idx[first_hit];
+ const struct rte_hash_key *key_slot =
+ (const struct rte_hash_key *)(
+ (const char *)h->key_store +
+ key_idx * h->key_entry_size);
+ rte_prefetch0(key_slot);
+ continue;
+ }
- if (sec_hitmask[i]) {
- uint32_t first_hit = __builtin_ctzl(sec_hitmask[i]);
- uint32_t key_idx = secondary_bkt[i]->key_idx[first_hit];
- const struct rte_hash_key *key_slot =
- (const struct rte_hash_key *)(
- (const char *)h->key_store +
- key_idx * h->key_entry_size);
- rte_prefetch0(key_slot);
+ if (sec_hitmask[i]) {
+ uint32_t first_hit =
+ __builtin_ctzl(sec_hitmask[i])
+ >> 1;
+ uint32_t key_idx =
+ secondary_bkt[i]->key_idx[first_hit];
+ const struct rte_hash_key *key_slot =
+ (const struct rte_hash_key *)(
+ (const char *)h->key_store +
+ key_idx * h->key_entry_size);
+ rte_prefetch0(key_slot);
+ }
}
- }
- /* Compare keys, first hits in primary first */
- for (i = 0; i < num_keys; i++) {
- positions[i] = -ENOENT;
- while (prim_hitmask[i]) {
- uint32_t hit_index = __builtin_ctzl(prim_hitmask[i]);
-
- uint32_t key_idx = primary_bkt[i]->key_idx[hit_index];
- const struct rte_hash_key *key_slot =
- (const struct rte_hash_key *)(
- (const char *)h->key_store +
- key_idx * h->key_entry_size);
- /*
- * If key index is 0, do not compare key,
- * as it is checking the dummy slot
- */
- if (!!key_idx & !rte_hash_cmp_eq(key_slot->key, keys[i], h)) {
- if (data != NULL)
- data[i] = key_slot->pdata;
+ /* Compare keys, first hits in primary first */
+ for (i = 0; i < num_keys; i++) {
+ positions[i] = -ENOENT;
+ while (prim_hitmask[i]) {
+ uint32_t hit_index =
+ __builtin_ctzl(prim_hitmask[i])
+ >> 1;
+ uint32_t key_idx =
+ __atomic_load_n(
+ &primary_bkt[i]->key_idx[hit_index],
+ __ATOMIC_ACQUIRE);
+ const struct rte_hash_key *key_slot =
+ (const struct rte_hash_key *)(
+ (const char *)h->key_store +
+ key_idx * h->key_entry_size);
+
+ if (key_idx != EMPTY_SLOT)
+ pdata[i] = __atomic_load_n(
+ &key_slot->pdata,
+ __ATOMIC_ACQUIRE);
+ /*
+ * If key index is 0, do not compare key,
+ * as it is checking the dummy slot
+ */
+ if (!!key_idx &
+ !rte_hash_cmp_eq(
+ key_slot->key, keys[i], h)) {
+ if (data != NULL)
+ data[i] = pdata[i];
+
+ hits |= 1ULL << i;
+ positions[i] = key_idx - 1;
+ goto next_key;
+ }
+ prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+ }
- hits |= 1ULL << i;
- positions[i] = key_idx - 1;
- goto next_key;
+ while (sec_hitmask[i]) {
+ uint32_t hit_index =
+ __builtin_ctzl(sec_hitmask[i])
+ >> 1;
+ uint32_t key_idx =
+ __atomic_load_n(
+ &secondary_bkt[i]->key_idx[hit_index],
+ __ATOMIC_ACQUIRE);
+ const struct rte_hash_key *key_slot =
+ (const struct rte_hash_key *)(
+ (const char *)h->key_store +
+ key_idx * h->key_entry_size);
+
+ if (key_idx != EMPTY_SLOT)
+ pdata[i] = __atomic_load_n(
+ &key_slot->pdata,
+ __ATOMIC_ACQUIRE);
+ /*
+ * If key index is 0, do not compare key,
+ * as it is checking the dummy slot
+ */
+
+ if (!!key_idx &
+ !rte_hash_cmp_eq(
+ key_slot->key, keys[i], h)) {
+ if (data != NULL)
+ data[i] = pdata[i];
+
+ hits |= 1ULL << i;
+ positions[i] = key_idx - 1;
+ goto next_key;
+ }
+ sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
}
- prim_hitmask[i] &= ~(1 << (hit_index));
+next_key:
+ continue;
}
- while (sec_hitmask[i]) {
- uint32_t hit_index = __builtin_ctzl(sec_hitmask[i]);
-
- uint32_t key_idx = secondary_bkt[i]->key_idx[hit_index];
- const struct rte_hash_key *key_slot =
- (const struct rte_hash_key *)(
- (const char *)h->key_store +
- key_idx * h->key_entry_size);
- /*
- * If key index is 0, do not compare key,
- * as it is checking the dummy slot
- */
-
- if (!!key_idx & !rte_hash_cmp_eq(key_slot->key, keys[i], h)) {
- if (data != NULL)
- data[i] = key_slot->pdata;
+ /* The loads of sig_current in compare_signatures
+ * should not move below the load from tbl_chng_cnt.
+ */
+ __atomic_thread_fence(__ATOMIC_ACQUIRE);
+ /* Re-read the table change counter to check if the
+ * table has changed during search. If yes, re-do
+ * the search.
+ * This load should not get hoisted. The load
+ * acquires on cnt_b, primary key index and secondary
+ * key index will make sure that it does not get
+ * hoisted.
+ */
+ cnt_a = __atomic_load_n(h->tbl_chng_cnt,
+ __ATOMIC_ACQUIRE);
+ } while (cnt_b != cnt_a);
+
+ /* all found, do not need to go through ext bkt */
+ if ((hits == ((1ULL << num_keys) - 1)) || !h->ext_table_support) {
+ if (hit_mask != NULL)
+ *hit_mask = hits;
+ __hash_rw_reader_unlock(h);
+ return;
+ }
+ /* need to check ext buckets for match */
+ for (i = 0; i < num_keys; i++) {
+ if ((hits & (1ULL << i)) != 0)
+ continue;
+ next_bkt = secondary_bkt[i]->next;
+ FOR_EACH_BUCKET(cur_bkt, next_bkt) {
+ if (data != NULL)
+ ret = search_one_bucket(h, keys[i],
+ sig[i], &data[i], cur_bkt);
+ else
+ ret = search_one_bucket(h, keys[i],
+ sig[i], NULL, cur_bkt);
+ if (ret != -1) {
+ positions[i] = ret;
hits |= 1ULL << i;
- positions[i] = key_idx - 1;
- goto next_key;
+ break;
}
- sec_hitmask[i] &= ~(1 << (hit_index));
}
-
-next_key:
- continue;
}
__hash_rw_reader_unlock(h);
@@ -1308,27 +1790,30 @@ rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32
RETURN_IF_TRUE(((h == NULL) || (next == NULL)), -EINVAL);
- const uint32_t total_entries = h->num_buckets * RTE_HASH_BUCKET_ENTRIES;
- /* Out of bounds */
- if (*next >= total_entries)
- return -ENOENT;
+ const uint32_t total_entries_main = h->num_buckets *
+ RTE_HASH_BUCKET_ENTRIES;
+ const uint32_t total_entries = total_entries_main << 1;
+
+ /* Out of bounds of all buckets (both main table and ext table) */
+ if (*next >= total_entries_main)
+ goto extend_table;
/* Calculate bucket and index of current iterator */
bucket_idx = *next / RTE_HASH_BUCKET_ENTRIES;
idx = *next % RTE_HASH_BUCKET_ENTRIES;
/* If current position is empty, go to the next one */
- while (h->buckets[bucket_idx].key_idx[idx] == EMPTY_SLOT) {
+ while ((position = __atomic_load_n(&h->buckets[bucket_idx].key_idx[idx],
+ __ATOMIC_ACQUIRE)) == EMPTY_SLOT) {
(*next)++;
/* End of table */
- if (*next == total_entries)
- return -ENOENT;
+ if (*next == total_entries_main)
+ goto extend_table;
bucket_idx = *next / RTE_HASH_BUCKET_ENTRIES;
idx = *next % RTE_HASH_BUCKET_ENTRIES;
}
+
__hash_rw_reader_lock(h);
- /* Get position of entry in key table */
- position = h->buckets[bucket_idx].key_idx[idx];
next_key = (struct rte_hash_key *) ((char *)h->key_store +
position * h->key_entry_size);
/* Return key and data */
@@ -1341,4 +1826,34 @@ rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32
(*next)++;
return position - 1;
+
+/* Begin to iterate extendable buckets */
+extend_table:
+ /* Out of bounds of all entries, or the ext bucket feature is not enabled */
+ if (*next >= total_entries || !h->ext_table_support)
+ return -ENOENT;
+
+ bucket_idx = (*next - total_entries_main) / RTE_HASH_BUCKET_ENTRIES;
+ idx = (*next - total_entries_main) % RTE_HASH_BUCKET_ENTRIES;
+
+ while ((position = h->buckets_ext[bucket_idx].key_idx[idx]) == EMPTY_SLOT) {
+ (*next)++;
+ if (*next == total_entries)
+ return -ENOENT;
+ bucket_idx = (*next - total_entries_main) /
+ RTE_HASH_BUCKET_ENTRIES;
+ idx = (*next - total_entries_main) % RTE_HASH_BUCKET_ENTRIES;
+ }
+ __hash_rw_reader_lock(h);
+ next_key = (struct rte_hash_key *) ((char *)h->key_store +
+ position * h->key_entry_size);
+ /* Return key and data */
+ *key = next_key->key;
+ *data = next_key->pdata;
+
+ __hash_rw_reader_unlock(h);
+
+ /* Increment iterator */
+ (*next)++;
+ return position - 1;
}
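The code above relies on get_short_sig(), get_prim_bucket_index() and get_alt_bucket_index(), which are defined earlier in rte_cuckoo_hash.c and fall outside this excerpt. A sketch of what these helpers presumably compute, inferred only from their call sites here (the stored signature is a 16-bit value taken from the 32-bit hash, and the alternative bucket is reachable from either bucket by XOR-ing that signature into the current bucket index), is given below; treat it as an assumption rather than the authoritative definitions. It assumes the surrounding rte_cuckoo_hash.h declarations (hash_sig_t, struct rte_hash, bucket_bitmask) are in scope.

static inline uint16_t
get_short_sig(const hash_sig_t hash)
{
	/* assumed: the short signature is the upper half of the 32-bit hash */
	return hash >> 16;
}

static inline uint32_t
get_prim_bucket_index(const struct rte_hash *h, const hash_sig_t hash)
{
	/* assumed: the primary bucket comes from the low bits of the full hash */
	return hash & h->bucket_bitmask;
}

static inline uint32_t
get_alt_bucket_index(const struct rte_hash *h,
			uint32_t cur_bkt_idx, uint16_t sig)
{
	/* assumed: XOR-ing the short signature maps primary <-> secondary,
	 * so the same computation works from either bucket
	 */
	return (cur_bkt_idx ^ sig) & h->bucket_bitmask;
}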
diff --git a/lib/librte_hash/rte_cuckoo_hash.h b/lib/librte_hash/rte_cuckoo_hash.h
index b43f467d..5dfbbc48 100644
--- a/lib/librte_hash/rte_cuckoo_hash.h
+++ b/lib/librte_hash/rte_cuckoo_hash.h
@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2016 Intel Corporation
+ * Copyright(c) 2018 Arm Limited
*/
/* rte_cuckoo_hash.h
@@ -104,8 +105,6 @@ const rte_hash_cmp_eq_t cmp_jump_table[NUM_KEY_CMP_CASES] = {
#define LCORE_CACHE_SIZE 64
-#define RTE_HASH_MAX_PUSHES 100
-
#define RTE_HASH_BFS_QUEUE_MAX_LEN 1000
#define RTE_XABORT_CUCKOO_PATH_INVALIDED 0x4
@@ -125,25 +124,24 @@ struct rte_hash_key {
};
/* Variable key size */
char key[0];
-} __attribute__((aligned(KEY_ALIGNMENT)));
+};
/* All different signature compare functions */
enum rte_hash_sig_compare_function {
RTE_HASH_COMPARE_SCALAR = 0,
RTE_HASH_COMPARE_SSE,
- RTE_HASH_COMPARE_AVX2,
RTE_HASH_COMPARE_NUM
};
/** Bucket structure */
struct rte_hash_bucket {
- hash_sig_t sig_current[RTE_HASH_BUCKET_ENTRIES];
+ uint16_t sig_current[RTE_HASH_BUCKET_ENTRIES];
uint32_t key_idx[RTE_HASH_BUCKET_ENTRIES];
- hash_sig_t sig_alt[RTE_HASH_BUCKET_ENTRIES];
-
uint8_t flag[RTE_HASH_BUCKET_ENTRIES];
+
+ void *next;
} __rte_cache_aligned;
/** A hash table structure. */
@@ -164,10 +162,23 @@ struct rte_hash {
/**< Length of hash key. */
uint8_t hw_trans_mem_support;
/**< If hardware transactional memory is used. */
- uint8_t multi_writer_support;
- /**< If multi-writer support is enabled. */
+ uint8_t use_local_cache;
+ /**< If multi-writer support is enabled, use local cache
+ * to allocate key-store slots.
+ */
uint8_t readwrite_concur_support;
/**< If read-write concurrency support is enabled */
+ uint8_t ext_table_support; /**< Enable extendable bucket table */
+ uint8_t no_free_on_del;
+ /**< If the key index should not be freed on calling rte_hash_del_xxx APIs.
+ * If this is set, rte_hash_free_key_with_position must be called to
+ * free the key index associated with the deleted entry.
+ * This flag is enabled by default when lock-free read-write concurrency is enabled.
+ */
+ uint8_t readwrite_concur_lf_support;
+ /**< If lock-free read-write concurrency support is enabled */
+ uint8_t writer_takes_lock;
+ /**< Indicates if the writer threads need to take lock */
rte_hash_function hash_func; /**< Function used to calculate hash. */
uint32_t hash_func_init_val; /**< Init value used by hash_func. */
rte_hash_cmp_eq_t rte_hash_custom_cmp_eq;
@@ -186,10 +197,15 @@ struct rte_hash {
* to the key table.
*/
rte_rwlock_t *readwrite_lock; /**< Read-write lock thread-safety. */
+ struct rte_hash_bucket *buckets_ext; /**< Extra buckets array */
+ struct rte_ring *free_ext_bkts; /**< Ring of indexes of free buckets */
+ uint32_t *tbl_chng_cnt;
+ /**< Indicates if the hash table changed from last read. */
} __rte_cache_aligned;
struct queue_node {
struct rte_hash_bucket *bkt; /* Current bucket on the bfs search */
+ uint32_t cur_bkt_idx;
struct queue_node *prev; /* Parent(bucket) in search path */
int prev_slot; /* Parent(slot) in search path */
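The new 'next' member turns each bucket into the head of a singly linked chain of extendable buckets. The FOR_EACH_BUCKET() macro and the rte_hash_get_last_bkt() helper used throughout the rte_cuckoo_hash.c changes are not shown in this excerpt; a sketch consistent with how they are used, offered as an assumption rather than the authoritative definitions, follows.

/* Assumed helpers for walking an extendable-bucket chain, inferred from
 * their uses in this patch.
 */
#define FOR_EACH_BUCKET(CURRENT_BKT, START_BUCKET)			\
	for (CURRENT_BKT = START_BUCKET;				\
		CURRENT_BKT != NULL;					\
		CURRENT_BKT = CURRENT_BKT->next)

static inline struct rte_hash_bucket *
rte_hash_get_last_bkt(struct rte_hash_bucket *lst_bkt)
{
	/* follow the chain until the bucket with no successor */
	while (lst_bkt->next != NULL)
		lst_bkt = lst_bkt->next;
	return lst_bkt;
}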
diff --git a/lib/librte_hash/rte_hash.h b/lib/librte_hash/rte_hash.h
index 9e7d9315..c93d1a13 100644
--- a/lib/librte_hash/rte_hash.h
+++ b/lib/librte_hash/rte_hash.h
@@ -14,6 +14,8 @@
#include <stdint.h>
#include <stddef.h>
+#include <rte_compat.h>
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -37,7 +39,27 @@ extern "C" {
/** Flag to support reader writer concurrency */
#define RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY 0x04
-/** Signature of key that is stored internally. */
+/** Flag to indicate the extendable bucket table feature should be used */
+#define RTE_HASH_EXTRA_FLAGS_EXT_TABLE 0x08
+
+/** Flag to disable freeing of key index on hash delete.
+ * Refer to rte_hash_del_xxx APIs for more details.
+ * This is enabled by default when RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF
+ * is enabled.
+ */
+#define RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL 0x10
+
+/** Flag to support lock-free reader-writer concurrency. Both single-writer
+ * and multi-writer use cases are supported.
+ * Currently, the extendable bucket table feature is not supported with
+ * this flag.
+ */
+#define RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF 0x20
+
+/**
+ * The type of the hash value of a key.
+ * It should be at least 32 bits wide with a fully random bit pattern.
+ */
typedef uint32_t hash_sig_t;
/** Type of function that can be used for calculating the hash value. */
@@ -119,7 +141,12 @@ void
rte_hash_free(struct rte_hash *h);
/**
- * Reset all hash structure, by zeroing all entries
+ * Reset the entire hash structure by zeroing all entries.
+ * When RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF is enabled,
+ * it is the application's responsibility to make sure that
+ * none of the readers are referencing the hash table
+ * while calling this API.
+ *
* @param h
* Hash table to reset
*/
@@ -143,6 +170,11 @@ rte_hash_count(const struct rte_hash *h);
* and should only be called from one thread by default.
* Thread safety can be enabled by setting flag during
* table creation.
+ * If the key already exists in the table, this API updates its value
+ * with the 'data' passed in. It is the responsibility of
+ * the application to manage any memory associated with the old value.
+ * Readers might still be using the old value even after this API
+ * has returned.
*
* @param h
* Hash table to add the key to.
@@ -165,6 +197,11 @@ rte_hash_add_key_data(const struct rte_hash *h, const void *key, void *data);
* and should only be called from one thread by default.
* Thread safety can be enabled by setting flag during
* table creation.
+ * If the key already exists in the table, this API updates its value
+ * with the 'data' passed in. It is the responsibility of
+ * the application to manage any memory associated with the old value.
+ * Readers might still be using the old value even after this API
+ * has returned.
*
* @param h
* Hash table to add the key to.
@@ -230,6 +267,14 @@ rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, hash_sig_t
* and should only be called from one thread by default.
* Thread safety can be enabled by setting flag during
* table creation.
+ * If RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL or
+ * RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF is enabled,
+ * the key index returned by rte_hash_add_key_xxx APIs will not be
+ * freed by this API. rte_hash_free_key_with_position API must be called
+ * additionally to free the index associated with the key.
+ * rte_hash_free_key_with_position API should be called after all
+ * the readers have stopped referencing the entry corresponding to
+ * this key. RCU mechanisms could be used to determine such a state.
*
* @param h
* Hash table to remove the key from.
@@ -251,6 +296,14 @@ rte_hash_del_key(const struct rte_hash *h, const void *key);
* and should only be called from one thread by default.
* Thread safety can be enabled by setting flag during
* table creation.
+ * If RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL or
+ * RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF is enabled,
+ * the key index returned by rte_hash_add_key_xxx APIs will not be
+ * freed by this API. rte_hash_free_key_with_position API must be called
+ * additionally to free the index associated with the key.
+ * rte_hash_free_key_with_position API should be called after all
+ * the readers have stopped referencing the entry corresponding to
+ * this key. RCU mechanisms could be used to determine such a state.
*
* @param h
* Hash table to remove the key from.
@@ -290,6 +343,34 @@ rte_hash_get_key_with_position(const struct rte_hash *h, const int32_t position,
void **key);
/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Free a hash key in the hash table given the position
+ * of the key. This operation is not multi-thread safe and should
+ * only be called from one thread by default. Thread safety
+ * can be enabled by setting flag during table creation.
+ * If RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL or
+ * RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF is enabled,
+ * the key index returned by rte_hash_del_key_xxx APIs must be freed
+ * using this API. This API should be called after all the readers
+ * have stopped referencing the entry corresponding to this key.
+ * RCU mechanisms could be used to determine such a state.
+ * This API does not validate if the key is already freed.
+ *
+ * @param h
+ * Hash table to free the key from.
+ * @param position
+ * Position returned when the key was deleted.
+ * @return
+ * - 0 if freed successfully
+ * - -EINVAL if the parameters are invalid.
+ */
+int __rte_experimental
+rte_hash_free_key_with_position(const struct rte_hash *h,
+ const int32_t position);
+
+/**
* Find a key-value pair in the hash table.
* This operation is multi-thread safe with regarding to other lookup threads.
* Read-write concurrency can be enabled by setting flag during
diff --git a/lib/librte_hash/rte_hash_version.map b/lib/librte_hash/rte_hash_version.map
index e216ac8e..734ae28b 100644
--- a/lib/librte_hash/rte_hash_version.map
+++ b/lib/librte_hash/rte_hash_version.map
@@ -53,3 +53,10 @@ DPDK_18.08 {
rte_hash_count;
} DPDK_16.07;
+
+EXPERIMENTAL {
+ global:
+
+ rte_hash_free_key_with_position;
+
+};
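With RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF (or RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL), deleting a key no longer recycles its key-store slot; the application must call the newly exported rte_hash_free_key_with_position() once no reader can still be referencing the entry. A hedged application-side sketch follows; wait_for_readers_to_quiesce() is a placeholder for an application-provided RCU-like mechanism, not a DPDK API.

#include <rte_hash.h>

/* hypothetical, application-defined reader-quiescence barrier */
extern void wait_for_readers_to_quiesce(void);

/* 'h' is assumed to have been created with
 * .extra_flag = RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF
 */
static void
delete_flow(struct rte_hash *h, const void *key)
{
	int32_t pos = rte_hash_del_key(h, key);

	if (pos < 0)
		return;		/* key not present or error */

	/* Readers may still be dereferencing the old entry; wait them out. */
	wait_for_readers_to_quiesce();

	/* Only now can the key-store index be returned to the free list. */
	rte_hash_free_key_with_position(h, pos);
}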
diff --git a/lib/librte_ip_frag/ip_frag_common.h b/lib/librte_ip_frag/ip_frag_common.h
index 197acf8d..0f62e2e1 100644
--- a/lib/librte_ip_frag/ip_frag_common.h
+++ b/lib/librte_ip_frag/ip_frag_common.h
@@ -25,6 +25,12 @@
#define IPv6_KEY_BYTES_FMT \
"%08" PRIx64 "%08" PRIx64 "%08" PRIx64 "%08" PRIx64
+#ifdef RTE_LIBRTE_IP_FRAG_TBL_STAT
+#define IP_FRAG_TBL_STAT_UPDATE(s, f, v) ((s)->f += (v))
+#else
+#define IP_FRAG_TBL_STAT_UPDATE(s, f, v) do {} while (0)
+#endif /* RTE_LIBRTE_IP_FRAG_TBL_STAT */
+
/* internal functions declarations */
struct rte_mbuf * ip_frag_process(struct ip_frag_pkt *fp,
struct rte_ip_frag_death_row *dr, struct rte_mbuf *mb,
@@ -69,10 +75,11 @@ ip_frag_key_invalidate(struct ip_frag_key * key)
}
/* compare two keys */
-static inline int
+static inline uint64_t
ip_frag_key_cmp(const struct ip_frag_key * k1, const struct ip_frag_key * k2)
{
- uint32_t i, val;
+ uint32_t i;
+ uint64_t val;
val = k1->id ^ k2->id;
for (i = 0; i < k1->key_len; i++)
val |= k1->src_dst[i] ^ k2->src_dst[i];
@@ -149,4 +156,16 @@ ip_frag_reset(struct ip_frag_pkt *fp, uint64_t tms)
fp->frags[IP_FIRST_FRAG_IDX] = zero_frag;
}
+/* local frag table helper functions */
+static inline void
+ip_frag_tbl_del(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr,
+ struct ip_frag_pkt *fp)
+{
+ ip_frag_free(fp, dr);
+ ip_frag_key_invalidate(&fp->key);
+ TAILQ_REMOVE(&tbl->lru, fp, lru);
+ tbl->use_entries--;
+ IP_FRAG_TBL_STAT_UPDATE(&tbl->stat, del_num, 1);
+}
+
#endif /* _IP_FRAG_COMMON_H_ */
diff --git a/lib/librte_ip_frag/ip_frag_internal.c b/lib/librte_ip_frag/ip_frag_internal.c
index 2560c771..97470a87 100644
--- a/lib/librte_ip_frag/ip_frag_internal.c
+++ b/lib/librte_ip_frag/ip_frag_internal.c
@@ -14,24 +14,6 @@
#define IP_FRAG_TBL_POS(tbl, sig) \
((tbl)->pkt + ((sig) & (tbl)->entry_mask))
-#ifdef RTE_LIBRTE_IP_FRAG_TBL_STAT
-#define IP_FRAG_TBL_STAT_UPDATE(s, f, v) ((s)->f += (v))
-#else
-#define IP_FRAG_TBL_STAT_UPDATE(s, f, v) do {} while (0)
-#endif /* IP_FRAG_TBL_STAT */
-
-/* local frag table helper functions */
-static inline void
-ip_frag_tbl_del(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr,
- struct ip_frag_pkt *fp)
-{
- ip_frag_free(fp, dr);
- ip_frag_key_invalidate(&fp->key);
- TAILQ_REMOVE(&tbl->lru, fp, lru);
- tbl->use_entries--;
- IP_FRAG_TBL_STAT_UPDATE(&tbl->stat, del_num, 1);
-}
-
static inline void
ip_frag_tbl_add(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt *fp,
const struct ip_frag_key *key, uint64_t tms)
diff --git a/lib/librte_ip_frag/rte_ip_frag.h b/lib/librte_ip_frag/rte_ip_frag.h
index b3f3f78d..7f425f61 100644
--- a/lib/librte_ip_frag/rte_ip_frag.h
+++ b/lib/librte_ip_frag/rte_ip_frag.h
@@ -65,10 +65,13 @@ struct ip_frag_pkt {
#define IP_FRAG_DEATH_ROW_LEN 32 /**< death row size (in packets) */
+/* death row size in mbufs */
+#define IP_FRAG_DEATH_ROW_MBUF_LEN (IP_FRAG_DEATH_ROW_LEN * (IP_MAX_FRAG_NUM + 1))
+
/** mbuf death row (packets to be freed) */
struct rte_ip_frag_death_row {
uint32_t cnt; /**< number of mbufs currently on death row */
- struct rte_mbuf *row[IP_FRAG_DEATH_ROW_LEN * (IP_MAX_FRAG_NUM + 1)];
+ struct rte_mbuf *row[IP_FRAG_DEATH_ROW_MBUF_LEN];
/**< mbufs to be freed */
};
@@ -325,6 +328,20 @@ void rte_ip_frag_free_death_row(struct rte_ip_frag_death_row *dr,
void
rte_ip_frag_table_statistics_dump(FILE * f, const struct rte_ip_frag_tbl *tbl);
+/**
+ * Delete expired fragments
+ *
+ * @param tbl
+ * Table to delete expired fragments from
+ * @param dr
+ * Death row to free buffers to
+ * @param tms
+ * Current timestamp
+ */
+void __rte_experimental
+rte_frag_table_del_expired_entries(struct rte_ip_frag_tbl *tbl,
+ struct rte_ip_frag_death_row *dr, uint64_t tms);
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/librte_ip_frag/rte_ip_frag_common.c b/lib/librte_ip_frag/rte_ip_frag_common.c
index 659a1795..a23f6f24 100644
--- a/lib/librte_ip_frag/rte_ip_frag_common.c
+++ b/lib/librte_ip_frag/rte_ip_frag_common.c
@@ -121,3 +121,24 @@ rte_ip_frag_table_statistics_dump(FILE *f, const struct rte_ip_frag_tbl *tbl)
fail_nospace,
fail_total - fail_nospace);
}
+
+/* Delete expired fragments */
+void __rte_experimental
+rte_frag_table_del_expired_entries(struct rte_ip_frag_tbl *tbl,
+ struct rte_ip_frag_death_row *dr, uint64_t tms)
+{
+ uint64_t max_cycles;
+ struct ip_frag_pkt *fp;
+
+ max_cycles = tbl->max_cycles;
+
+ TAILQ_FOREACH(fp, &tbl->lru, lru)
+ if (max_cycles + fp->start < tms) {
+ /* check that death row has enough space */
+ if (IP_FRAG_DEATH_ROW_MBUF_LEN - dr->cnt >= fp->last_idx)
+ ip_frag_tbl_del(tbl, dr, fp);
+ else
+ return;
+ } else
+ return;
+}
diff --git a/lib/librte_ip_frag/rte_ip_frag_version.map b/lib/librte_ip_frag/rte_ip_frag_version.map
index d1acf07c..d40d5515 100644
--- a/lib/librte_ip_frag/rte_ip_frag_version.map
+++ b/lib/librte_ip_frag/rte_ip_frag_version.map
@@ -18,3 +18,9 @@ DPDK_17.08 {
rte_ip_frag_table_destroy;
} DPDK_2.0;
+
+EXPERIMENTAL {
+ global:
+
+ rte_frag_table_del_expired_entries;
+};
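The new rte_frag_table_del_expired_entries() lets an application purge timed-out reassembly contexts outside the normal per-packet path. A hedged sketch of periodic use from a slow-path or service loop is shown below; 'tbl' and 'dr' are assumed to have been set up with rte_ip_frag_table_create() and zero-initialised, respectively.

#include <rte_cycles.h>
#include <rte_ip_frag.h>

static void
purge_expired_fragments(struct rte_ip_frag_tbl *tbl,
		struct rte_ip_frag_death_row *dr)
{
	/* use the same TSC timestamp base as the fragment processing path */
	uint64_t now = rte_rdtsc();

	rte_frag_table_del_expired_entries(tbl, dr, now);

	/* release the mbufs of the deleted (incomplete) packets */
	rte_ip_frag_free_death_row(dr, 0);
}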
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 65f6a2b0..c9726d4f 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -18,6 +18,9 @@
#include <rte_log.h>
#include <rte_kni.h>
#include <rte_memzone.h>
+#include <rte_tailq.h>
+#include <rte_rwlock.h>
+#include <rte_eal_memconfig.h>
#include <exec-env/rte_kni_common.h>
#include "rte_kni_fifo.h"
@@ -30,7 +33,23 @@
#define KNI_REQUEST_MBUF_NUM_MAX 32
-#define KNI_MEM_CHECK(cond) do { if (cond) goto kni_fail; } while (0)
+#define KNI_MEM_CHECK(cond, fail) do { if (cond) goto fail; } while (0)
+
+#define KNI_MZ_NAME_FMT "kni_info_%s"
+#define KNI_TX_Q_MZ_NAME_FMT "kni_tx_%s"
+#define KNI_RX_Q_MZ_NAME_FMT "kni_rx_%s"
+#define KNI_ALLOC_Q_MZ_NAME_FMT "kni_alloc_%s"
+#define KNI_FREE_Q_MZ_NAME_FMT "kni_free_%s"
+#define KNI_REQ_Q_MZ_NAME_FMT "kni_req_%s"
+#define KNI_RESP_Q_MZ_NAME_FMT "kni_resp_%s"
+#define KNI_SYNC_ADDR_MZ_NAME_FMT "kni_sync_%s"
+
+TAILQ_HEAD(rte_kni_list, rte_tailq_entry);
+
+static struct rte_tailq_elem rte_kni_tailq = {
+ .name = "RTE_KNI",
+};
+EAL_REGISTER_TAILQ(rte_kni_tailq)
/**
* KNI context
@@ -42,18 +61,26 @@ struct rte_kni {
struct rte_mempool *pktmbuf_pool; /**< pkt mbuf mempool */
unsigned mbuf_size; /**< mbuf size */
+ const struct rte_memzone *m_tx_q; /**< TX queue memzone */
+ const struct rte_memzone *m_rx_q; /**< RX queue memzone */
+ const struct rte_memzone *m_alloc_q;/**< Alloc queue memzone */
+ const struct rte_memzone *m_free_q; /**< Free queue memzone */
+
struct rte_kni_fifo *tx_q; /**< TX queue */
struct rte_kni_fifo *rx_q; /**< RX queue */
struct rte_kni_fifo *alloc_q; /**< Allocated mbufs queue */
struct rte_kni_fifo *free_q; /**< To be freed mbufs queue */
+ const struct rte_memzone *m_req_q; /**< Request queue memzone */
+ const struct rte_memzone *m_resp_q; /**< Response queue memzone */
+ const struct rte_memzone *m_sync_addr;/**< Sync addr memzone */
+
/* For request & response */
struct rte_kni_fifo *req_q; /**< Request queue */
struct rte_kni_fifo *resp_q; /**< Response queue */
void * sync_addr; /**< Req/Resp Mem address */
struct rte_kni_ops ops; /**< operations for request */
- uint8_t in_use : 1; /**< kni in use */
};
enum kni_ops_status {
@@ -61,231 +88,111 @@ enum kni_ops_status {
KNI_REQ_REGISTERED,
};
-/**
- * KNI memzone pool slot
- */
-struct rte_kni_memzone_slot {
- uint32_t id;
- uint8_t in_use : 1; /**< slot in use */
-
- /* Memzones */
- const struct rte_memzone *m_ctx; /**< KNI ctx */
- const struct rte_memzone *m_tx_q; /**< TX queue */
- const struct rte_memzone *m_rx_q; /**< RX queue */
- const struct rte_memzone *m_alloc_q; /**< Allocated mbufs queue */
- const struct rte_memzone *m_free_q; /**< To be freed mbufs queue */
- const struct rte_memzone *m_req_q; /**< Request queue */
- const struct rte_memzone *m_resp_q; /**< Response queue */
- const struct rte_memzone *m_sync_addr;
-
- /* Free linked list */
- struct rte_kni_memzone_slot *next; /**< Next slot link.list */
-};
-
-/**
- * KNI memzone pool
- */
-struct rte_kni_memzone_pool {
- uint8_t initialized : 1; /**< Global KNI pool init flag */
-
- uint32_t max_ifaces; /**< Max. num of KNI ifaces */
- struct rte_kni_memzone_slot *slots; /**< Pool slots */
- rte_spinlock_t mutex; /**< alloc/release mutex */
-
- /* Free memzone slots linked-list */
- struct rte_kni_memzone_slot *free; /**< First empty slot */
- struct rte_kni_memzone_slot *free_tail; /**< Last empty slot */
-};
-
-
static void kni_free_mbufs(struct rte_kni *kni);
static void kni_allocate_mbufs(struct rte_kni *kni);
static volatile int kni_fd = -1;
-static struct rte_kni_memzone_pool kni_memzone_pool = {
- .initialized = 0,
-};
-static const struct rte_memzone *
-kni_memzone_reserve(const char *name, size_t len, int socket_id,
- unsigned flags)
+/* Shall be called before any allocation happens */
+int
+rte_kni_init(unsigned int max_kni_ifaces __rte_unused)
{
- const struct rte_memzone *mz = rte_memzone_lookup(name);
-
- if (mz == NULL)
- mz = rte_memzone_reserve(name, len, socket_id, flags);
+ /* Check FD and open */
+ if (kni_fd < 0) {
+ kni_fd = open("/dev/" KNI_DEVICE, O_RDWR);
+ if (kni_fd < 0) {
+ RTE_LOG(ERR, KNI,
+ "Can not open /dev/%s\n", KNI_DEVICE);
+ return -1;
+ }
+ }
- return mz;
+ return 0;
}
-/* Pool mgmt */
-static struct rte_kni_memzone_slot*
-kni_memzone_pool_alloc(void)
+static struct rte_kni *
+__rte_kni_get(const char *name)
{
- struct rte_kni_memzone_slot *slot;
+ struct rte_kni *kni;
+ struct rte_tailq_entry *te;
+ struct rte_kni_list *kni_list;
- rte_spinlock_lock(&kni_memzone_pool.mutex);
+ kni_list = RTE_TAILQ_CAST(rte_kni_tailq.head, rte_kni_list);
- if (!kni_memzone_pool.free) {
- rte_spinlock_unlock(&kni_memzone_pool.mutex);
- return NULL;
+ TAILQ_FOREACH(te, kni_list, next) {
+ kni = te->data;
+ if (strncmp(name, kni->name, RTE_KNI_NAMESIZE) == 0)
+ break;
}
- slot = kni_memzone_pool.free;
- kni_memzone_pool.free = slot->next;
- slot->in_use = 1;
+ if (te == NULL)
+ kni = NULL;
- if (!kni_memzone_pool.free)
- kni_memzone_pool.free_tail = NULL;
-
- rte_spinlock_unlock(&kni_memzone_pool.mutex);
-
- return slot;
+ return kni;
}
-static void
-kni_memzone_pool_release(struct rte_kni_memzone_slot *slot)
+static int
+kni_reserve_mz(struct rte_kni *kni)
{
- rte_spinlock_lock(&kni_memzone_pool.mutex);
+ char mz_name[RTE_MEMZONE_NAMESIZE];
- if (kni_memzone_pool.free)
- kni_memzone_pool.free_tail->next = slot;
- else
- kni_memzone_pool.free = slot;
+ snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_TX_Q_MZ_NAME_FMT, kni->name);
+ kni->m_tx_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0);
+ KNI_MEM_CHECK(kni->m_tx_q == NULL, tx_q_fail);
- kni_memzone_pool.free_tail = slot;
- slot->next = NULL;
- slot->in_use = 0;
+ snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_RX_Q_MZ_NAME_FMT, kni->name);
+ kni->m_rx_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0);
+ KNI_MEM_CHECK(kni->m_rx_q == NULL, rx_q_fail);
- rte_spinlock_unlock(&kni_memzone_pool.mutex);
-}
+ snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_ALLOC_Q_MZ_NAME_FMT, kni->name);
+ kni->m_alloc_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0);
+ KNI_MEM_CHECK(kni->m_alloc_q == NULL, alloc_q_fail);
+ snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_FREE_Q_MZ_NAME_FMT, kni->name);
+ kni->m_free_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0);
+ KNI_MEM_CHECK(kni->m_free_q == NULL, free_q_fail);
-/* Shall be called before any allocation happens */
-void
-rte_kni_init(unsigned int max_kni_ifaces)
-{
- uint32_t i;
- struct rte_kni_memzone_slot *it;
- const struct rte_memzone *mz;
-#define OBJNAMSIZ 32
- char obj_name[OBJNAMSIZ];
- char mz_name[RTE_MEMZONE_NAMESIZE];
-
- /* Immediately return if KNI is already initialized */
- if (kni_memzone_pool.initialized) {
- RTE_LOG(WARNING, KNI, "Double call to rte_kni_init()");
- return;
- }
+ snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_REQ_Q_MZ_NAME_FMT, kni->name);
+ kni->m_req_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0);
+ KNI_MEM_CHECK(kni->m_req_q == NULL, req_q_fail);
- if (max_kni_ifaces == 0) {
- RTE_LOG(ERR, KNI, "Invalid number of max_kni_ifaces %d\n",
- max_kni_ifaces);
- RTE_LOG(ERR, KNI, "Unable to initialize KNI\n");
- return;
- }
+ snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_RESP_Q_MZ_NAME_FMT, kni->name);
+ kni->m_resp_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0);
+ KNI_MEM_CHECK(kni->m_resp_q == NULL, resp_q_fail);
- /* Check FD and open */
- if (kni_fd < 0) {
- kni_fd = open("/dev/" KNI_DEVICE, O_RDWR);
- if (kni_fd < 0) {
- RTE_LOG(ERR, KNI,
- "Can not open /dev/%s\n", KNI_DEVICE);
- return;
- }
- }
+ snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_SYNC_ADDR_MZ_NAME_FMT, kni->name);
+ kni->m_sync_addr = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0);
+ KNI_MEM_CHECK(kni->m_sync_addr == NULL, sync_addr_fail);
- /* Allocate slot objects */
- kni_memzone_pool.slots = (struct rte_kni_memzone_slot *)
- rte_malloc(NULL,
- sizeof(struct rte_kni_memzone_slot) *
- max_kni_ifaces,
- 0);
- KNI_MEM_CHECK(kni_memzone_pool.slots == NULL);
-
- /* Initialize general pool variables */
- kni_memzone_pool.initialized = 1;
- kni_memzone_pool.max_ifaces = max_kni_ifaces;
- kni_memzone_pool.free = &kni_memzone_pool.slots[0];
- rte_spinlock_init(&kni_memzone_pool.mutex);
-
- /* Pre-allocate all memzones of all the slots; panic on error */
- for (i = 0; i < max_kni_ifaces; i++) {
-
- /* Recover current slot */
- it = &kni_memzone_pool.slots[i];
- it->id = i;
-
- /* Allocate KNI context */
- snprintf(mz_name, RTE_MEMZONE_NAMESIZE, "KNI_INFO_%d", i);
- mz = kni_memzone_reserve(mz_name, sizeof(struct rte_kni),
- SOCKET_ID_ANY, 0);
- KNI_MEM_CHECK(mz == NULL);
- it->m_ctx = mz;
-
- /* TX RING */
- snprintf(obj_name, OBJNAMSIZ, "kni_tx_%d", i);
- mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE,
- SOCKET_ID_ANY, 0);
- KNI_MEM_CHECK(mz == NULL);
- it->m_tx_q = mz;
-
- /* RX RING */
- snprintf(obj_name, OBJNAMSIZ, "kni_rx_%d", i);
- mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE,
- SOCKET_ID_ANY, 0);
- KNI_MEM_CHECK(mz == NULL);
- it->m_rx_q = mz;
-
- /* ALLOC RING */
- snprintf(obj_name, OBJNAMSIZ, "kni_alloc_%d", i);
- mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE,
- SOCKET_ID_ANY, 0);
- KNI_MEM_CHECK(mz == NULL);
- it->m_alloc_q = mz;
-
- /* FREE RING */
- snprintf(obj_name, OBJNAMSIZ, "kni_free_%d", i);
- mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE,
- SOCKET_ID_ANY, 0);
- KNI_MEM_CHECK(mz == NULL);
- it->m_free_q = mz;
-
- /* Request RING */
- snprintf(obj_name, OBJNAMSIZ, "kni_req_%d", i);
- mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE,
- SOCKET_ID_ANY, 0);
- KNI_MEM_CHECK(mz == NULL);
- it->m_req_q = mz;
-
- /* Response RING */
- snprintf(obj_name, OBJNAMSIZ, "kni_resp_%d", i);
- mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE,
- SOCKET_ID_ANY, 0);
- KNI_MEM_CHECK(mz == NULL);
- it->m_resp_q = mz;
-
- /* Req/Resp sync mem area */
- snprintf(obj_name, OBJNAMSIZ, "kni_sync_%d", i);
- mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE,
- SOCKET_ID_ANY, 0);
- KNI_MEM_CHECK(mz == NULL);
- it->m_sync_addr = mz;
-
- if ((i+1) == max_kni_ifaces) {
- it->next = NULL;
- kni_memzone_pool.free_tail = it;
- } else
- it->next = &kni_memzone_pool.slots[i+1];
- }
-
- return;
+ return 0;
-kni_fail:
- RTE_LOG(ERR, KNI, "Unable to allocate memory for max_kni_ifaces:%d."
- "Increase the amount of hugepages memory\n", max_kni_ifaces);
+sync_addr_fail:
+ rte_memzone_free(kni->m_resp_q);
+resp_q_fail:
+ rte_memzone_free(kni->m_req_q);
+req_q_fail:
+ rte_memzone_free(kni->m_free_q);
+free_q_fail:
+ rte_memzone_free(kni->m_alloc_q);
+alloc_q_fail:
+ rte_memzone_free(kni->m_rx_q);
+rx_q_fail:
+ rte_memzone_free(kni->m_tx_q);
+tx_q_fail:
+ return -1;
}
+static void
+kni_release_mz(struct rte_kni *kni)
+{
+ rte_memzone_free(kni->m_tx_q);
+ rte_memzone_free(kni->m_rx_q);
+ rte_memzone_free(kni->m_alloc_q);
+ rte_memzone_free(kni->m_free_q);
+ rte_memzone_free(kni->m_req_q);
+ rte_memzone_free(kni->m_resp_q);
+ rte_memzone_free(kni->m_sync_addr);
+}
struct rte_kni *
rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
@@ -294,41 +201,45 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
{
int ret;
struct rte_kni_device_info dev_info;
- struct rte_kni *ctx;
- char intf_name[RTE_KNI_NAMESIZE];
- const struct rte_memzone *mz;
- struct rte_kni_memzone_slot *slot = NULL;
+ struct rte_kni *kni;
+ struct rte_tailq_entry *te;
+ struct rte_kni_list *kni_list;
if (!pktmbuf_pool || !conf || !conf->name[0])
return NULL;
/* Check if KNI subsystem has been initialized */
- if (kni_memzone_pool.initialized != 1) {
+ if (kni_fd < 0) {
RTE_LOG(ERR, KNI, "KNI subsystem has not been initialized. Invoke rte_kni_init() first\n");
return NULL;
}
- /* Get an available slot from the pool */
- slot = kni_memzone_pool_alloc();
- if (!slot) {
- RTE_LOG(ERR, KNI, "Cannot allocate more KNI interfaces; increase the number of max_kni_ifaces(current %d) or release unused ones.\n",
- kni_memzone_pool.max_ifaces);
- return NULL;
+ rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
+
+ kni = __rte_kni_get(conf->name);
+ if (kni != NULL) {
+ RTE_LOG(ERR, KNI, "KNI already exists\n");
+ goto unlock;
}
- /* Recover ctx */
- ctx = slot->m_ctx->addr;
- snprintf(intf_name, RTE_KNI_NAMESIZE, "%s", conf->name);
+ te = rte_zmalloc("KNI_TAILQ_ENTRY", sizeof(*te), 0);
+ if (te == NULL) {
+ RTE_LOG(ERR, KNI, "Failed to allocate tailq entry\n");
+ goto unlock;
+ }
- if (ctx->in_use) {
- RTE_LOG(ERR, KNI, "KNI %s is in use\n", ctx->name);
- return NULL;
+ kni = rte_zmalloc("KNI", sizeof(struct rte_kni), RTE_CACHE_LINE_SIZE);
+ if (kni == NULL) {
+ RTE_LOG(ERR, KNI, "KNI memory allocation failed\n");
+ goto kni_fail;
}
- memset(ctx, 0, sizeof(struct rte_kni));
+
+ snprintf(kni->name, RTE_KNI_NAMESIZE, "%s", conf->name);
+
if (ops)
- memcpy(&ctx->ops, ops, sizeof(struct rte_kni_ops));
+ memcpy(&kni->ops, ops, sizeof(struct rte_kni_ops));
else
- ctx->ops.port_id = UINT16_MAX;
+ kni->ops.port_id = UINT16_MAX;
memset(&dev_info, 0, sizeof(dev_info));
dev_info.bus = conf->addr.bus;
@@ -344,72 +255,79 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
memcpy(dev_info.mac_addr, conf->mac_addr, ETHER_ADDR_LEN);
- snprintf(ctx->name, RTE_KNI_NAMESIZE, "%s", intf_name);
- snprintf(dev_info.name, RTE_KNI_NAMESIZE, "%s", intf_name);
+ snprintf(dev_info.name, RTE_KNI_NAMESIZE, "%s", conf->name);
RTE_LOG(INFO, KNI, "pci: %02x:%02x:%02x \t %02x:%02x\n",
dev_info.bus, dev_info.devid, dev_info.function,
dev_info.vendor_id, dev_info.device_id);
+
+ ret = kni_reserve_mz(kni);
+ if (ret < 0)
+ goto mz_fail;
+
/* TX RING */
- mz = slot->m_tx_q;
- ctx->tx_q = mz->addr;
- kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX);
- dev_info.tx_phys = mz->phys_addr;
+ kni->tx_q = kni->m_tx_q->addr;
+ kni_fifo_init(kni->tx_q, KNI_FIFO_COUNT_MAX);
+ dev_info.tx_phys = kni->m_tx_q->phys_addr;
/* RX RING */
- mz = slot->m_rx_q;
- ctx->rx_q = mz->addr;
- kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX);
- dev_info.rx_phys = mz->phys_addr;
+ kni->rx_q = kni->m_rx_q->addr;
+ kni_fifo_init(kni->rx_q, KNI_FIFO_COUNT_MAX);
+ dev_info.rx_phys = kni->m_rx_q->phys_addr;
/* ALLOC RING */
- mz = slot->m_alloc_q;
- ctx->alloc_q = mz->addr;
- kni_fifo_init(ctx->alloc_q, KNI_FIFO_COUNT_MAX);
- dev_info.alloc_phys = mz->phys_addr;
+ kni->alloc_q = kni->m_alloc_q->addr;
+ kni_fifo_init(kni->alloc_q, KNI_FIFO_COUNT_MAX);
+ dev_info.alloc_phys = kni->m_alloc_q->phys_addr;
/* FREE RING */
- mz = slot->m_free_q;
- ctx->free_q = mz->addr;
- kni_fifo_init(ctx->free_q, KNI_FIFO_COUNT_MAX);
- dev_info.free_phys = mz->phys_addr;
+ kni->free_q = kni->m_free_q->addr;
+ kni_fifo_init(kni->free_q, KNI_FIFO_COUNT_MAX);
+ dev_info.free_phys = kni->m_free_q->phys_addr;
/* Request RING */
- mz = slot->m_req_q;
- ctx->req_q = mz->addr;
- kni_fifo_init(ctx->req_q, KNI_FIFO_COUNT_MAX);
- dev_info.req_phys = mz->phys_addr;
+ kni->req_q = kni->m_req_q->addr;
+ kni_fifo_init(kni->req_q, KNI_FIFO_COUNT_MAX);
+ dev_info.req_phys = kni->m_req_q->phys_addr;
/* Response RING */
- mz = slot->m_resp_q;
- ctx->resp_q = mz->addr;
- kni_fifo_init(ctx->resp_q, KNI_FIFO_COUNT_MAX);
- dev_info.resp_phys = mz->phys_addr;
+ kni->resp_q = kni->m_resp_q->addr;
+ kni_fifo_init(kni->resp_q, KNI_FIFO_COUNT_MAX);
+ dev_info.resp_phys = kni->m_resp_q->phys_addr;
/* Req/Resp sync mem area */
- mz = slot->m_sync_addr;
- ctx->sync_addr = mz->addr;
- dev_info.sync_va = mz->addr;
- dev_info.sync_phys = mz->phys_addr;
+ kni->sync_addr = kni->m_sync_addr->addr;
+ dev_info.sync_va = kni->m_sync_addr->addr;
+ dev_info.sync_phys = kni->m_sync_addr->phys_addr;
- ctx->pktmbuf_pool = pktmbuf_pool;
- ctx->group_id = conf->group_id;
- ctx->slot_id = slot->id;
- ctx->mbuf_size = conf->mbuf_size;
+ kni->pktmbuf_pool = pktmbuf_pool;
+ kni->group_id = conf->group_id;
+ kni->mbuf_size = conf->mbuf_size;
ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
- KNI_MEM_CHECK(ret < 0);
+ if (ret < 0)
+ goto ioctl_fail;
+
+ te->data = kni;
+
+ kni_list = RTE_TAILQ_CAST(rte_kni_tailq.head, rte_kni_list);
+ TAILQ_INSERT_TAIL(kni_list, te, next);
- ctx->in_use = 1;
+ rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
/* Allocate mbufs and then put them into alloc_q */
- kni_allocate_mbufs(ctx);
+ kni_allocate_mbufs(kni);
- return ctx;
+ return kni;
+ioctl_fail:
+ kni_release_mz(kni);
+mz_fail:
+ rte_free(kni);
kni_fail:
- if (slot)
- kni_memzone_pool_release(&kni_memzone_pool.slots[slot->id]);
+ rte_free(te);
+unlock:
+ rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
return NULL;
}
@@ -462,19 +380,36 @@ kni_free_fifo_phy(struct rte_mempool *mp, struct rte_kni_fifo *fifo)
int
rte_kni_release(struct rte_kni *kni)
{
+ struct rte_tailq_entry *te;
+ struct rte_kni_list *kni_list;
struct rte_kni_device_info dev_info;
- uint32_t slot_id;
uint32_t retry = 5;
- if (!kni || !kni->in_use)
+ if (!kni)
return -1;
+ kni_list = RTE_TAILQ_CAST(rte_kni_tailq.head, rte_kni_list);
+
+ rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
+
+ TAILQ_FOREACH(te, kni_list, next) {
+ if (te->data == kni)
+ break;
+ }
+
+ if (te == NULL)
+ goto unlock;
+
snprintf(dev_info.name, sizeof(dev_info.name), "%s", kni->name);
if (ioctl(kni_fd, RTE_KNI_IOCTL_RELEASE, &dev_info) < 0) {
RTE_LOG(ERR, KNI, "Fail to release kni device\n");
- return -1;
+ goto unlock;
}
+ TAILQ_REMOVE(kni_list, te, next);
+
+ rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
+
/* mbufs in all fifo should be released, except request/response */
/* wait until all rxq packets processed by kernel */
@@ -488,20 +423,18 @@ rte_kni_release(struct rte_kni *kni)
kni_free_fifo(kni->tx_q);
kni_free_fifo(kni->free_q);
- slot_id = kni->slot_id;
+ kni_release_mz(kni);
- /* Memset the KNI struct */
- memset(kni, 0, sizeof(struct rte_kni));
+ rte_free(kni);
- /* Release memzone */
- if (slot_id > kni_memzone_pool.max_ifaces) {
- RTE_LOG(ERR, KNI, "KNI pool: corrupted slot ID: %d, max: %d\n",
- slot_id, kni_memzone_pool.max_ifaces);
- return -1;
- }
- kni_memzone_pool_release(&kni_memzone_pool.slots[slot_id]);
+ rte_free(te);
return 0;
+
+unlock:
+ rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
+
+ return -1;
}
/* default callback for request of configuring device mac address */
@@ -711,24 +644,18 @@ kni_allocate_mbufs(struct rte_kni *kni)
struct rte_kni *
rte_kni_get(const char *name)
{
- uint32_t i;
- struct rte_kni_memzone_slot *it;
struct rte_kni *kni;
if (name == NULL || name[0] == '\0')
return NULL;
- /* Note: could be improved perf-wise if necessary */
- for (i = 0; i < kni_memzone_pool.max_ifaces; i++) {
- it = &kni_memzone_pool.slots[i];
- if (it->in_use == 0)
- continue;
- kni = it->m_ctx->addr;
- if (strncmp(kni->name, name, RTE_KNI_NAMESIZE) == 0)
- return kni;
- }
+ rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK);
- return NULL;
+ kni = __rte_kni_get(name);
+
+ rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK);
+
+ return kni;
}
const char *
@@ -790,6 +717,47 @@ rte_kni_unregister_handlers(struct rte_kni *kni)
return 0;
}
+
+int __rte_experimental
+rte_kni_update_link(struct rte_kni *kni, unsigned int linkup)
+{
+ char path[64];
+ char old_carrier[2];
+ const char *new_carrier;
+ int old_linkup;
+ int fd, ret;
+
+ if (kni == NULL)
+ return -1;
+
+ snprintf(path, sizeof(path), "/sys/devices/virtual/net/%s/carrier",
+ kni->name);
+
+ fd = open(path, O_RDWR);
+ if (fd == -1) {
+ RTE_LOG(ERR, KNI, "Failed to open file: %s.\n", path);
+ return -1;
+ }
+
+ ret = read(fd, old_carrier, 2);
+ if (ret < 1) {
+ close(fd);
+ return -1;
+ }
+ old_linkup = (old_carrier[0] == '1');
+
+ new_carrier = linkup ? "1" : "0";
+ ret = write(fd, new_carrier, 1);
+ if (ret < 1) {
+ RTE_LOG(ERR, KNI, "Failed to write file: %s.\n", path);
+ close(fd);
+ return -1;
+ }
+
+ close(fd);
+ return old_linkup;
+}
+
void
rte_kni_close(void)
{
diff --git a/lib/librte_kni/rte_kni.h b/lib/librte_kni/rte_kni.h
index 99055e2c..02ca43b4 100644
--- a/lib/librte_kni/rte_kni.h
+++ b/lib/librte_kni/rte_kni.h
@@ -81,8 +81,12 @@ struct rte_kni_conf {
*
* @param max_kni_ifaces
* The maximum number of KNI interfaces that can coexist concurrently
+ *
+ * @return
+ * - 0 indicates success.
+ * - negative value indicates failure.
*/
-void rte_kni_init(unsigned int max_kni_ifaces);
+int rte_kni_init(unsigned int max_kni_ifaces);
/**
@@ -229,6 +233,26 @@ int rte_kni_register_handlers(struct rte_kni *kni, struct rte_kni_ops *ops);
int rte_kni_unregister_handlers(struct rte_kni *kni);
/**
+ * Update link carrier state for KNI port.
+ *
+ * Update the linkup/linkdown state of a KNI interface in the kernel.
+ *
+ * @param kni
+ * pointer to struct rte_kni.
+ * @param linkup
+ * New link state:
+ * 0 for linkdown.
+ * > 0 for linkup.
+ *
+ * @return
+ * On failure: -1
+ * Previous link state == linkdown: 0
+ * Previous link state == linkup: 1
+ */
+int __rte_experimental
+rte_kni_update_link(struct rte_kni *kni, unsigned int linkup);
+
+/**
* Close KNI device.
*/
void rte_kni_close(void);
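
For reference, a minimal usage sketch of the reworked API: rte_kni_init() now signals failure through its return value, and the experimental rte_kni_update_link() returns the previous carrier state (an application must be built with ALLOW_EXPERIMENTAL_API to call it). The interface name, iface count and mbuf pool below are illustrative, not part of the patch:

    #include <stdio.h>
    #include <string.h>
    #include <rte_kni.h>
    #include <rte_log.h>

    /* illustrative helper: bring up one KNI interface from an existing mbuf pool */
    static struct rte_kni *
    setup_kni(struct rte_mempool *pktmbuf_pool)
    {
        struct rte_kni_conf conf;
        struct rte_kni *kni;

        if (rte_kni_init(4) < 0)          /* was void, now reports open() failure */
            return NULL;

        memset(&conf, 0, sizeof(conf));
        snprintf(conf.name, RTE_KNI_NAMESIZE, "vEth0");
        conf.group_id = 0;
        conf.mbuf_size = 2048;

        kni = rte_kni_alloc(pktmbuf_pool, &conf, NULL);
        if (kni == NULL)
            return NULL;

        /* carrier up; returns the previous state (0/1) or -1 on error */
        if (rte_kni_update_link(kni, 1) < 0)
            RTE_LOG(WARNING, USER1, "carrier update failed for %s\n", conf.name);

        return kni;
    }

Teardown is unchanged from the caller's point of view: rte_kni_release(kni) followed by rte_kni_close().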
diff --git a/lib/librte_kni/rte_kni_fifo.h b/lib/librte_kni/rte_kni_fifo.h
index ac26a8c0..287d7deb 100644
--- a/lib/librte_kni/rte_kni_fifo.h
+++ b/lib/librte_kni/rte_kni_fifo.h
@@ -5,6 +5,36 @@
/**
+ * @internal Use C11 atomic memory barriers when the C11 memory model is
+ * enabled; otherwise fall back to the rte_smp_* memory barriers.
+ *
+ * @param src
+ * Pointer to the source data.
+ * @param dst
+ * Pointer to the destination data.
+ * @param value
+ * Data value.
+ */
+#ifdef RTE_USE_C11_MEM_MODEL
+#define __KNI_LOAD_ACQUIRE(src) ({ \
+ __atomic_load_n((src), __ATOMIC_ACQUIRE); \
+ })
+#define __KNI_STORE_RELEASE(dst, value) do { \
+ __atomic_store_n((dst), value, __ATOMIC_RELEASE); \
+ } while(0)
+#else
+#define __KNI_LOAD_ACQUIRE(src) ({ \
+ typeof (*(src)) val = *(src); \
+ rte_smp_rmb(); \
+ val; \
+ })
+#define __KNI_STORE_RELEASE(dst, value) do { \
+ *(dst) = value; \
+ rte_smp_wmb(); \
+ } while(0)
+#endif
+
+/**
* Initializes the kni fifo structure
*/
static void
@@ -28,8 +58,8 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
{
unsigned i = 0;
unsigned fifo_write = fifo->write;
- unsigned fifo_read = fifo->read;
unsigned new_write = fifo_write;
+ unsigned fifo_read = __KNI_LOAD_ACQUIRE(&fifo->read);
for (i = 0; i < num; i++) {
new_write = (new_write + 1) & (fifo->len - 1);
@@ -39,7 +69,7 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
fifo->buffer[fifo_write] = data[i];
fifo_write = new_write;
}
- fifo->write = fifo_write;
+ __KNI_STORE_RELEASE(&fifo->write, fifo_write);
return i;
}
@@ -51,7 +81,8 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num)
{
unsigned i = 0;
unsigned new_read = fifo->read;
- unsigned fifo_write = fifo->write;
+ unsigned fifo_write = __KNI_LOAD_ACQUIRE(&fifo->write);
+
for (i = 0; i < num; i++) {
if (new_read == fifo_write)
break;
@@ -59,7 +90,7 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num)
data[i] = fifo->buffer[new_read];
new_read = (new_read + 1) & (fifo->len - 1);
}
- fifo->read = new_read;
+ __KNI_STORE_RELEASE(&fifo->read, new_read);
return i;
}
@@ -69,5 +100,7 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num)
static inline uint32_t
kni_fifo_count(struct rte_kni_fifo *fifo)
{
- return (fifo->len + fifo->write - fifo->read) & (fifo->len - 1);
+ unsigned fifo_write = __KNI_LOAD_ACQUIRE(&fifo->write);
+ unsigned fifo_read = __KNI_LOAD_ACQUIRE(&fifo->read);
+ return (fifo->len + fifo_write - fifo_read) & (fifo->len - 1);
}
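
The macros above pair up like a textbook single-producer/single-consumer ring: each side loads the other side's index with acquire semantics and publishes its own index with release semantics, so slot contents become visible before the index that advertises them. A standalone sketch of that pairing with the same __atomic builtins (the struct and function names are illustrative, not part of the KNI API):

    #include <stdint.h>

    #define RING_SZ 8                       /* power of two, like the KNI FIFO length */

    struct spsc_ring {
        uint32_t write;                     /* only the producer writes this */
        uint32_t read;                      /* only the consumer writes this */
        void *slots[RING_SZ];
    };

    /* producer side: mirrors kni_fifo_put() */
    static int ring_put(struct spsc_ring *r, void *obj)
    {
        uint32_t write = r->write;
        uint32_t read = __atomic_load_n(&r->read, __ATOMIC_ACQUIRE);
        uint32_t next = (write + 1) & (RING_SZ - 1);

        if (next == read)
            return 0;                       /* full */
        r->slots[write] = obj;              /* fill the slot first ... */
        __atomic_store_n(&r->write, next, __ATOMIC_RELEASE); /* ... then publish it */
        return 1;
    }

    /* consumer side: mirrors kni_fifo_get() */
    static int ring_get(struct spsc_ring *r, void **obj)
    {
        uint32_t read = r->read;
        uint32_t write = __atomic_load_n(&r->write, __ATOMIC_ACQUIRE);

        if (read == write)
            return 0;                       /* empty */
        *obj = r->slots[read];              /* read the slot before releasing it */
        __atomic_store_n(&r->read, (read + 1) & (RING_SZ - 1), __ATOMIC_RELEASE);
        return 1;
    }

On strongly ordered CPUs the acquire/release pair compiles to plain loads and stores, so it costs about the same as the rte_smp_rmb()/rte_smp_wmb() path; on weaker memory models the C11 form lets the compiler pick the cheapest sufficient barrier, which is the point of the optional RTE_USE_C11_MEM_MODEL path.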
diff --git a/lib/librte_kni/rte_kni_version.map b/lib/librte_kni/rte_kni_version.map
index acd515eb..c877dc6a 100644
--- a/lib/librte_kni/rte_kni_version.map
+++ b/lib/librte_kni/rte_kni_version.map
@@ -15,3 +15,9 @@ DPDK_2.0 {
local: *;
};
+
+EXPERIMENTAL {
+ global:
+
+ rte_kni_update_link;
+};
diff --git a/lib/librte_kvargs/rte_kvargs.c b/lib/librte_kvargs/rte_kvargs.c
index a28f7694..f7030c63 100644
--- a/lib/librte_kvargs/rte_kvargs.c
+++ b/lib/librte_kvargs/rte_kvargs.c
@@ -44,6 +44,20 @@ rte_kvargs_tokenize(struct rte_kvargs *kvlist, const char *params)
kvlist->pairs[i].value == NULL)
return -1;
+ /* Detect list [a,b] to skip comma delimiter in list. */
+ str = kvlist->pairs[i].value;
+ if (str[0] == '[') {
+ /* Find the end of the list. */
+ while (str[strlen(str) - 1] != ']') {
+ /* Restore the comma erased by strtok_r(). */
+ str[strlen(str)] = ',';
+ /* Parse until next comma. */
+ str = strtok_r(NULL, RTE_KVARGS_PAIRS_DELIM, &ctx1);
+ if (str == NULL)
+ return -1; /* no closing bracket */
+ }
+ }
+
kvlist->count++;
str = NULL;
}
@@ -120,6 +134,9 @@ rte_kvargs_process(const struct rte_kvargs *kvlist,
const struct rte_kvargs_pair *pair;
unsigned i;
+ if (kvlist == NULL)
+ return 0;
+
for (i = 0; i < kvlist->count; i++) {
pair = &kvlist->pairs[i];
if (key_match == NULL || strcmp(pair->key, key_match) == 0) {
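
With the bracket handling above, a value such as [0,1,2] survives tokenization as a single pair instead of being split at its inner commas, and rte_kvargs_process() now tolerates a NULL list. A small sketch (the key names and the handler are made up for illustration):

    #include <stdio.h>
    #include <rte_kvargs.h>

    /* illustrative handler: print each matching key/value pair */
    static int
    dump_kv(const char *key, const char *value, void *opaque)
    {
        (void)opaque;
        printf("%s -> %s\n", key, value);
        return 0;
    }

    static void
    kvargs_demo(void)
    {
        /* "queues" keeps its bracketed list as one value: "[0,1,2]" */
        struct rte_kvargs *kvlist =
            rte_kvargs_parse("iface=eth0,queues=[0,1,2]", NULL);

        rte_kvargs_process(kvlist, "queues", dump_kv, NULL); /* no-op if kvlist == NULL */
        rte_kvargs_free(kvlist);                             /* also NULL-tolerant */
    }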
diff --git a/lib/librte_kvargs/rte_kvargs.h b/lib/librte_kvargs/rte_kvargs.h
index fc041956..1946195d 100644
--- a/lib/librte_kvargs/rte_kvargs.h
+++ b/lib/librte_kvargs/rte_kvargs.h
@@ -110,7 +110,7 @@ struct rte_kvargs *rte_kvargs_parse_delim(const char *args,
* rte_kvargs_parse().
*
* @param kvlist
- * The rte_kvargs structure
+ * The rte_kvargs structure. No error if NULL.
*/
void rte_kvargs_free(struct rte_kvargs *kvlist);
@@ -119,11 +119,10 @@ void rte_kvargs_free(struct rte_kvargs *kvlist);
*
* For each key/value association that matches the given key, calls the
 * handler function, passing the value that is stored in the
- * dictionary for that key and a given extra argument. If *kvlist* is NULL
- * function does nothing.
+ * dictionary for that key and a given extra argument.
*
* @param kvlist
- * The rte_kvargs structure
+ * The rte_kvargs structure. No error if NULL.
* @param key_match
* The key on which the handler should be called, or NULL to process handler
* on all associations
diff --git a/lib/librte_latencystats/rte_latencystats.c b/lib/librte_latencystats/rte_latencystats.c
index 1fdec68e..5715549e 100644
--- a/lib/librte_latencystats/rte_latencystats.c
+++ b/lib/librte_latencystats/rte_latencystats.c
@@ -125,8 +125,11 @@ add_time_stamps(uint16_t pid __rte_unused,
for (i = 0; i < nb_pkts; i++) {
diff_tsc = now - prev_tsc;
timer_tsc += diff_tsc;
- if (timer_tsc >= samp_intvl) {
+
+ if ((pkts[i]->ol_flags & PKT_RX_TIMESTAMP) == 0
+ && (timer_tsc >= samp_intvl)) {
pkts[i]->timestamp = now;
+ pkts[i]->ol_flags |= PKT_RX_TIMESTAMP;
timer_tsc = 0;
}
prev_tsc = now;
@@ -156,7 +159,7 @@ calc_latency(uint16_t pid __rte_unused,
now = rte_rdtsc();
for (i = 0; i < nb_pkts; i++) {
- if (pkts[i]->timestamp)
+ if (pkts[i]->ol_flags & PKT_RX_TIMESTAMP)
latency[cnt++] = now - pkts[i]->timestamp;
}
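
The intent is that any stage which already fills mbuf->timestamp also raises PKT_RX_TIMESTAMP, so the sampling callback neither overwrites that value nor skips packets it should measure. A minimal sketch of that convention, assuming a stage that produces its own timestamps:

    #include <rte_mbuf.h>
    #include <rte_cycles.h>

    /* illustrative: a stage with its own timestamp source marks the mbuf so
     * that latencystats' add_time_stamps() callback leaves it untouched */
    static inline void
    stamp_mbuf(struct rte_mbuf *m)
    {
        m->timestamp = rte_rdtsc();        /* could also be a HW-provided value */
        m->ol_flags |= PKT_RX_TIMESTAMP;
    }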
diff --git a/lib/librte_lpm/Makefile b/lib/librte_lpm/Makefile
index 482bd72e..a7946a1c 100644
--- a/lib/librte_lpm/Makefile
+++ b/lib/librte_lpm/Makefile
@@ -8,7 +8,7 @@ LIB = librte_lpm.a
CFLAGS += -O3
CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
-LDLIBS += -lrte_eal
+LDLIBS += -lrte_eal -lrte_hash
EXPORT_MAP := rte_lpm_version.map
diff --git a/lib/librte_lpm/meson.build b/lib/librte_lpm/meson.build
index 06784942..a5176d8a 100644
--- a/lib/librte_lpm/meson.build
+++ b/lib/librte_lpm/meson.build
@@ -7,3 +7,4 @@ headers = files('rte_lpm.h', 'rte_lpm6.h')
# since header files have different names, we can install all vector headers
# without worrying about which architecture we actually need
headers += files('rte_lpm_altivec.h', 'rte_lpm_neon.h', 'rte_lpm_sse.h')
+deps += ['hash']
diff --git a/lib/librte_lpm/rte_lpm6.c b/lib/librte_lpm/rte_lpm6.c
index 149677eb..6212003f 100644
--- a/lib/librte_lpm/rte_lpm6.c
+++ b/lib/librte_lpm/rte_lpm6.c
@@ -21,6 +21,9 @@
#include <rte_errno.h>
#include <rte_rwlock.h>
#include <rte_spinlock.h>
+#include <rte_hash.h>
+#include <assert.h>
+#include <rte_jhash.h>
#include "rte_lpm6.h"
@@ -37,6 +40,9 @@
#define BYTE_SIZE 8
#define BYTES2_SIZE 16
+#define RULE_HASH_TABLE_EXTRA_SPACE 64
+#define TBL24_IND UINT32_MAX
+
#define lpm6_tbl8_gindex next_hop
/** Flags for setting an entry as valid/invalid. */
@@ -70,6 +76,23 @@ struct rte_lpm6_rule {
uint8_t depth; /**< Rule depth. */
};
+/** Rules tbl entry key. */
+struct rte_lpm6_rule_key {
+ uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE]; /**< Rule IP address. */
+ uint8_t depth; /**< Rule depth. */
+};
+
+/* Header of tbl8 */
+struct rte_lpm_tbl8_hdr {
+ uint32_t owner_tbl_ind; /**< owner table: TBL24_IND if owner is tbl24,
+ * otherwise index of tbl8
+ */
+ uint32_t owner_entry_ind; /**< index of the owner table entry where
+ * pointer to the tbl8 is stored
+ */
+ uint32_t ref_cnt; /**< table reference counter */
+};
+
/** LPM6 structure. */
struct rte_lpm6 {
/* LPM metadata. */
@@ -77,12 +100,17 @@ struct rte_lpm6 {
uint32_t max_rules; /**< Max number of rules. */
uint32_t used_rules; /**< Used rules so far. */
uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */
- uint32_t next_tbl8; /**< Next tbl8 to be used. */
/* LPM Tables. */
- struct rte_lpm6_rule *rules_tbl; /**< LPM rules. */
+ struct rte_hash *rules_tbl; /**< LPM rules. */
struct rte_lpm6_tbl_entry tbl24[RTE_LPM6_TBL24_NUM_ENTRIES]
__rte_cache_aligned; /**< LPM tbl24 table. */
+
+ uint32_t *tbl8_pool; /**< pool of indexes of free tbl8s */
+ uint32_t tbl8_pool_pos; /**< current position in the tbl8 pool */
+
+ struct rte_lpm_tbl8_hdr *tbl8_hdrs; /* array of tbl8 headers */
+
struct rte_lpm6_tbl_entry tbl8[0]
__rte_cache_aligned; /**< LPM tbl8 table. */
};
@@ -93,22 +121,122 @@ struct rte_lpm6 {
* and set the rest to 0.
*/
static inline void
-mask_ip(uint8_t *ip, uint8_t depth)
+ip6_mask_addr(uint8_t *ip, uint8_t depth)
{
- int16_t part_depth, mask;
- int i;
+ int16_t part_depth, mask;
+ int i;
- part_depth = depth;
+ part_depth = depth;
- for (i = 0; i < RTE_LPM6_IPV6_ADDR_SIZE; i++) {
- if (part_depth < BYTE_SIZE && part_depth >= 0) {
- mask = (uint16_t)(~(UINT8_MAX >> part_depth));
- ip[i] = (uint8_t)(ip[i] & mask);
- } else if (part_depth < 0) {
- ip[i] = 0;
- }
- part_depth -= BYTE_SIZE;
- }
+ for (i = 0; i < RTE_LPM6_IPV6_ADDR_SIZE; i++) {
+ if (part_depth < BYTE_SIZE && part_depth >= 0) {
+ mask = (uint16_t)(~(UINT8_MAX >> part_depth));
+ ip[i] = (uint8_t)(ip[i] & mask);
+ } else if (part_depth < 0)
+ ip[i] = 0;
+
+ part_depth -= BYTE_SIZE;
+ }
+}
+
+/* copy ipv6 address */
+static inline void
+ip6_copy_addr(uint8_t *dst, const uint8_t *src)
+{
+ rte_memcpy(dst, src, RTE_LPM6_IPV6_ADDR_SIZE);
+}
+
+/*
+ * LPM6 rule hash function
+ *
+ * It's used as a hash function for the rte_hash
+ * containing rules
+ */
+static inline uint32_t
+rule_hash(const void *data, __rte_unused uint32_t data_len,
+ uint32_t init_val)
+{
+ return rte_jhash(data, sizeof(struct rte_lpm6_rule_key), init_val);
+}
+
+/*
+ * Init pool of free tbl8 indexes
+ */
+static void
+tbl8_pool_init(struct rte_lpm6 *lpm)
+{
+ uint32_t i;
+
+ /* put entire range of indexes to the tbl8 pool */
+ for (i = 0; i < lpm->number_tbl8s; i++)
+ lpm->tbl8_pool[i] = i;
+
+ lpm->tbl8_pool_pos = 0;
+}
+
+/*
+ * Get an index of a free tbl8 from the pool
+ */
+static inline uint32_t
+tbl8_get(struct rte_lpm6 *lpm, uint32_t *tbl8_ind)
+{
+ if (lpm->tbl8_pool_pos == lpm->number_tbl8s)
+ /* no more free tbl8 */
+ return -ENOSPC;
+
+ /* next index */
+ *tbl8_ind = lpm->tbl8_pool[lpm->tbl8_pool_pos++];
+ return 0;
+}
+
+/*
+ * Put an index of a free tbl8 back to the pool
+ */
+static inline uint32_t
+tbl8_put(struct rte_lpm6 *lpm, uint32_t tbl8_ind)
+{
+ if (lpm->tbl8_pool_pos == 0)
+ /* pool is full */
+ return -ENOSPC;
+
+ lpm->tbl8_pool[--lpm->tbl8_pool_pos] = tbl8_ind;
+ return 0;
+}
+
+/*
+ * Returns number of tbl8s available in the pool
+ */
+static inline uint32_t
+tbl8_available(struct rte_lpm6 *lpm)
+{
+ return lpm->number_tbl8s - lpm->tbl8_pool_pos;
+}
+
+/*
+ * Init a rule key.
+ * note that ip must be already masked
+ */
+static inline void
+rule_key_init(struct rte_lpm6_rule_key *key, uint8_t *ip, uint8_t depth)
+{
+ ip6_copy_addr(key->ip, ip);
+ key->depth = depth;
+}
+
+/*
+ * Rebuild the entire LPM tree by reinserting all rules
+ */
+static void
+rebuild_lpm(struct rte_lpm6 *lpm)
+{
+ uint64_t next_hop;
+ struct rte_lpm6_rule_key *rule_key;
+ uint32_t iter = 0;
+
+ while (rte_hash_iterate(lpm->rules_tbl, (void *) &rule_key,
+ (void **) &next_hop, &iter) >= 0)
+ rte_lpm6_add(lpm, rule_key->ip, rule_key->depth,
+ (uint32_t) next_hop);
}
/*
@@ -121,8 +249,11 @@ rte_lpm6_create(const char *name, int socket_id,
char mem_name[RTE_LPM6_NAMESIZE];
struct rte_lpm6 *lpm = NULL;
struct rte_tailq_entry *te;
- uint64_t mem_size, rules_size;
+ uint64_t mem_size;
struct rte_lpm6_list *lpm_list;
+ struct rte_hash *rules_tbl = NULL;
+ uint32_t *tbl8_pool = NULL;
+ struct rte_lpm_tbl8_hdr *tbl8_hdrs = NULL;
lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list);
@@ -136,12 +267,54 @@ rte_lpm6_create(const char *name, int socket_id,
return NULL;
}
+ /* create rules hash table */
+ snprintf(mem_name, sizeof(mem_name), "LRH_%s", name);
+ struct rte_hash_parameters rule_hash_tbl_params = {
+ .entries = config->max_rules * 1.2 +
+ RULE_HASH_TABLE_EXTRA_SPACE,
+ .key_len = sizeof(struct rte_lpm6_rule_key),
+ .hash_func = rule_hash,
+ .hash_func_init_val = 0,
+ .name = mem_name,
+ .reserved = 0,
+ .socket_id = socket_id,
+ .extra_flag = 0
+ };
+
+ rules_tbl = rte_hash_create(&rule_hash_tbl_params);
+ if (rules_tbl == NULL) {
+ RTE_LOG(ERR, LPM, "LPM rules hash table allocation failed: %s (%d)",
+ rte_strerror(rte_errno), rte_errno);
+ goto fail_wo_unlock;
+ }
+
+ /* allocate tbl8 indexes pool */
+ tbl8_pool = rte_malloc(NULL,
+ sizeof(uint32_t) * config->number_tbl8s,
+ RTE_CACHE_LINE_SIZE);
+ if (tbl8_pool == NULL) {
+ RTE_LOG(ERR, LPM, "LPM tbl8 pool allocation failed: %s (%d)",
+ rte_strerror(rte_errno), rte_errno);
+ rte_errno = ENOMEM;
+ goto fail_wo_unlock;
+ }
+
+ /* allocate tbl8 headers */
+ tbl8_hdrs = rte_malloc(NULL,
+ sizeof(struct rte_lpm_tbl8_hdr) * config->number_tbl8s,
+ RTE_CACHE_LINE_SIZE);
+ if (tbl8_hdrs == NULL) {
+ RTE_LOG(ERR, LPM, "LPM tbl8 headers allocation failed: %s (%d)",
+ rte_strerror(rte_errno), rte_errno);
+ rte_errno = ENOMEM;
+ goto fail_wo_unlock;
+ }
+
snprintf(mem_name, sizeof(mem_name), "LPM_%s", name);
/* Determine the amount of memory to allocate. */
mem_size = sizeof(*lpm) + (sizeof(lpm->tbl8[0]) *
RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * config->number_tbl8s);
- rules_size = sizeof(struct rte_lpm6_rule) * config->max_rules;
rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
@@ -154,7 +327,7 @@ rte_lpm6_create(const char *name, int socket_id,
lpm = NULL;
if (te != NULL) {
rte_errno = EEXIST;
- goto exit;
+ goto fail;
}
/* allocate tailq entry */
@@ -162,7 +335,7 @@ rte_lpm6_create(const char *name, int socket_id,
if (te == NULL) {
RTE_LOG(ERR, LPM, "Failed to allocate tailq entry!\n");
rte_errno = ENOMEM;
- goto exit;
+ goto fail;
}
/* Allocate memory to store the LPM data structures. */
@@ -173,34 +346,35 @@ rte_lpm6_create(const char *name, int socket_id,
RTE_LOG(ERR, LPM, "LPM memory allocation failed\n");
rte_free(te);
rte_errno = ENOMEM;
- goto exit;
- }
-
- lpm->rules_tbl = rte_zmalloc_socket(NULL,
- (size_t)rules_size, RTE_CACHE_LINE_SIZE, socket_id);
-
- if (lpm->rules_tbl == NULL) {
- RTE_LOG(ERR, LPM, "LPM rules_tbl allocation failed\n");
- rte_free(lpm);
- lpm = NULL;
- rte_free(te);
- rte_errno = ENOMEM;
- goto exit;
+ goto fail;
}
/* Save user arguments. */
lpm->max_rules = config->max_rules;
lpm->number_tbl8s = config->number_tbl8s;
snprintf(lpm->name, sizeof(lpm->name), "%s", name);
+ lpm->rules_tbl = rules_tbl;
+ lpm->tbl8_pool = tbl8_pool;
+ lpm->tbl8_hdrs = tbl8_hdrs;
+
+ /* init the stack */
+ tbl8_pool_init(lpm);
te->data = (void *) lpm;
TAILQ_INSERT_TAIL(lpm_list, te, next);
+ rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
+ return lpm;
-exit:
+fail:
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
- return lpm;
+fail_wo_unlock:
+ rte_free(tbl8_hdrs);
+ rte_free(tbl8_pool);
+ rte_hash_free(rules_tbl);
+
+ return NULL;
}
/*
@@ -259,50 +433,88 @@ rte_lpm6_free(struct rte_lpm6 *lpm)
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
- rte_free(lpm->rules_tbl);
+ rte_free(lpm->tbl8_hdrs);
+ rte_free(lpm->tbl8_pool);
+ rte_hash_free(lpm->rules_tbl);
rte_free(lpm);
rte_free(te);
}
+/* Find a rule */
+static inline int
+rule_find_with_key(struct rte_lpm6 *lpm,
+ const struct rte_lpm6_rule_key *rule_key,
+ uint32_t *next_hop)
+{
+ uint64_t hash_val;
+ int ret;
+
+ /* lookup for a rule */
+ ret = rte_hash_lookup_data(lpm->rules_tbl, (const void *) rule_key,
+ (void **) &hash_val);
+ if (ret >= 0) {
+ *next_hop = (uint32_t) hash_val;
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Find a rule */
+static int
+rule_find(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
+ uint32_t *next_hop)
+{
+ struct rte_lpm6_rule_key rule_key;
+
+ /* init a rule key */
+ rule_key_init(&rule_key, ip, depth);
+
+ return rule_find_with_key(lpm, &rule_key, next_hop);
+}
+
/*
* Checks if a rule already exists in the rules table and updates
* the nexthop if so. Otherwise it adds a new rule if enough space is available.
+ *
+ * Returns:
+ * 0 - next hop of existing rule updated
+ * 1 - new rule successfully added
+ * <0 - error
*/
-static inline int32_t
-rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint32_t next_hop, uint8_t depth)
+static inline int
+rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, uint32_t next_hop)
{
- uint32_t rule_index;
-
- /* Scan through rule list to see if rule already exists. */
- for (rule_index = 0; rule_index < lpm->used_rules; rule_index++) {
+ int ret, rule_exist;
+ struct rte_lpm6_rule_key rule_key;
+ uint32_t unused;
- /* If rule already exists update its next_hop and return. */
- if ((memcmp (lpm->rules_tbl[rule_index].ip, ip,
- RTE_LPM6_IPV6_ADDR_SIZE) == 0) &&
- lpm->rules_tbl[rule_index].depth == depth) {
- lpm->rules_tbl[rule_index].next_hop = next_hop;
+ /* init a rule key */
+ rule_key_init(&rule_key, ip, depth);
- return rule_index;
- }
- }
+ /* Scan through rule list to see if rule already exists. */
+ rule_exist = rule_find_with_key(lpm, &rule_key, &unused);
/*
* If rule does not exist check if there is space to add a new rule to
* this rule group. If there is no space return error.
*/
- if (lpm->used_rules == lpm->max_rules) {
+ if (!rule_exist && lpm->used_rules == lpm->max_rules)
return -ENOSPC;
- }
- /* If there is space for the new rule add it. */
- rte_memcpy(lpm->rules_tbl[rule_index].ip, ip, RTE_LPM6_IPV6_ADDR_SIZE);
- lpm->rules_tbl[rule_index].next_hop = next_hop;
- lpm->rules_tbl[rule_index].depth = depth;
+ /* add the rule or update rules next hop */
+ ret = rte_hash_add_key_data(lpm->rules_tbl, &rule_key,
+ (void *)(uintptr_t) next_hop);
+ if (ret < 0)
+ return ret;
/* Increment the used rules counter for this rule group. */
- lpm->used_rules++;
+ if (!rule_exist) {
+ lpm->used_rules++;
+ return 1;
+ }
- return rule_index;
+ return 0;
}
/*
@@ -311,24 +523,24 @@ rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint32_t next_hop, uint8_t depth)
* in the IP address returns a match.
*/
static void
-expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t depth,
- uint32_t next_hop)
+expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t old_depth,
+ uint8_t new_depth, uint32_t next_hop, uint8_t valid)
{
uint32_t tbl8_group_end, tbl8_gindex_next, j;
tbl8_group_end = tbl8_gindex + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES;
struct rte_lpm6_tbl_entry new_tbl8_entry = {
- .valid = VALID,
- .valid_group = VALID,
- .depth = depth,
+ .valid = valid,
+ .valid_group = valid,
+ .depth = new_depth,
.next_hop = next_hop,
.ext_entry = 0,
};
for (j = tbl8_gindex; j < tbl8_group_end; j++) {
if (!lpm->tbl8[j].valid || (lpm->tbl8[j].ext_entry == 0
- && lpm->tbl8[j].depth <= depth)) {
+ && lpm->tbl8[j].depth <= old_depth)) {
lpm->tbl8[j] = new_tbl8_entry;
@@ -336,37 +548,123 @@ expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t depth,
tbl8_gindex_next = lpm->tbl8[j].lpm6_tbl8_gindex
* RTE_LPM6_TBL8_GROUP_NUM_ENTRIES;
- expand_rule(lpm, tbl8_gindex_next, depth, next_hop);
+ expand_rule(lpm, tbl8_gindex_next, old_depth, new_depth,
+ next_hop, valid);
}
}
}
/*
+ * Init a tbl8 header
+ */
+static inline void
+init_tbl8_header(struct rte_lpm6 *lpm, uint32_t tbl_ind,
+ uint32_t owner_tbl_ind, uint32_t owner_entry_ind)
+{
+ struct rte_lpm_tbl8_hdr *tbl_hdr = &lpm->tbl8_hdrs[tbl_ind];
+ tbl_hdr->owner_tbl_ind = owner_tbl_ind;
+ tbl_hdr->owner_entry_ind = owner_entry_ind;
+ tbl_hdr->ref_cnt = 0;
+}
+
+/*
+ * Calculate index to the table based on the number and position
+ * of the bytes being inspected in this step.
+ */
+static uint32_t
+get_bitshift(const uint8_t *ip, uint8_t first_byte, uint8_t bytes)
+{
+ uint32_t entry_ind, i;
+ int8_t bitshift;
+
+ entry_ind = 0;
+ for (i = first_byte; i < (uint32_t)(first_byte + bytes); i++) {
+ bitshift = (int8_t)((bytes - i)*BYTE_SIZE);
+
+ if (bitshift < 0)
+ bitshift = 0;
+ entry_ind = entry_ind | ip[i-1] << bitshift;
+ }
+
+ return entry_ind;
+}
+
+/*
+ * Simulate adding a new route to the LPM counting number
+ * of new tables that will be needed
+ *
+ * It returns 0 on success, or 1 if
+ * the process needs to be continued by calling the function again.
+ */
+static inline int
+simulate_add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl,
+ struct rte_lpm6_tbl_entry **next_tbl, const uint8_t *ip,
+ uint8_t bytes, uint8_t first_byte, uint8_t depth,
+ uint32_t *need_tbl_nb)
+{
+ uint32_t entry_ind;
+ uint8_t bits_covered;
+ uint32_t next_tbl_ind;
+
+ /*
+ * Calculate index to the table based on the number and position
+ * of the bytes being inspected in this step.
+ */
+ entry_ind = get_bitshift(ip, first_byte, bytes);
+
+ /* Number of bits covered in this step */
+ bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE);
+
+ if (depth <= bits_covered) {
+ *need_tbl_nb = 0;
+ return 0;
+ }
+
+ if (tbl[entry_ind].valid == 0 || tbl[entry_ind].ext_entry == 0) {
+ /* from this point on a new table is needed on each level
+ * that is not covered yet
+ */
+ depth -= bits_covered;
+ uint32_t cnt = depth >> 3; /* depth / BYTE_SIZE */
+ if (depth & 7) /* 0b00000111 */
+ /* if depth % 8 > 0 then one more table is needed
+ * for those last bits
+ */
+ cnt++;
+
+ *need_tbl_nb = cnt;
+ return 0;
+ }
+
+ next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex;
+ *next_tbl = &(lpm->tbl8[next_tbl_ind *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]);
+ *need_tbl_nb = 0;
+ return 1;
+}
+
+/*
* Partially adds a new route to the data structure (tbl24+tbl8s).
* It returns 0 on success, a negative number on failure, or 1 if
* the process needs to be continued by calling the function again.
*/
static inline int
add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl,
- struct rte_lpm6_tbl_entry **tbl_next, uint8_t *ip, uint8_t bytes,
- uint8_t first_byte, uint8_t depth, uint32_t next_hop)
+ uint32_t tbl_ind, struct rte_lpm6_tbl_entry **next_tbl,
+ uint32_t *next_tbl_ind, uint8_t *ip, uint8_t bytes,
+ uint8_t first_byte, uint8_t depth, uint32_t next_hop,
+ uint8_t is_new_rule)
{
- uint32_t tbl_index, tbl_range, tbl8_group_start, tbl8_group_end, i;
- int32_t tbl8_gindex;
- int8_t bitshift;
+ uint32_t entry_ind, tbl_range, tbl8_group_start, tbl8_group_end, i;
+ uint32_t tbl8_gindex;
uint8_t bits_covered;
+ int ret;
/*
* Calculate index to the table based on the number and position
* of the bytes being inspected in this step.
*/
- tbl_index = 0;
- for (i = first_byte; i < (uint32_t)(first_byte + bytes); i++) {
- bitshift = (int8_t)((bytes - i)*BYTE_SIZE);
-
- if (bitshift < 0) bitshift = 0;
- tbl_index = tbl_index | ip[i-1] << bitshift;
- }
+ entry_ind = get_bitshift(ip, first_byte, bytes);
/* Number of bits covered in this step */
bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE);
@@ -378,7 +676,7 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl,
if (depth <= bits_covered) {
tbl_range = 1 << (bits_covered - depth);
- for (i = tbl_index; i < (tbl_index + tbl_range); i++) {
+ for (i = entry_ind; i < (entry_ind + tbl_range); i++) {
if (!tbl[i].valid || (tbl[i].ext_entry == 0 &&
tbl[i].depth <= depth)) {
@@ -400,10 +698,15 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl,
*/
tbl8_gindex = tbl[i].lpm6_tbl8_gindex *
RTE_LPM6_TBL8_GROUP_NUM_ENTRIES;
- expand_rule(lpm, tbl8_gindex, depth, next_hop);
+ expand_rule(lpm, tbl8_gindex, depth, depth,
+ next_hop, VALID);
}
}
+ /* update tbl8 rule reference counter */
+ if (tbl_ind != TBL24_IND && is_new_rule)
+ lpm->tbl8_hdrs[tbl_ind].ref_cnt++;
+
return 0;
}
/*
@@ -412,12 +715,24 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl,
*/
else {
/* If it's invalid a new tbl8 is needed */
- if (!tbl[tbl_index].valid) {
- if (lpm->next_tbl8 < lpm->number_tbl8s)
- tbl8_gindex = (lpm->next_tbl8)++;
- else
+ if (!tbl[entry_ind].valid) {
+ /* get a new table */
+ ret = tbl8_get(lpm, &tbl8_gindex);
+ if (ret != 0)
return -ENOSPC;
+ /* invalidate all new tbl8 entries */
+ tbl8_group_start = tbl8_gindex *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES;
+ memset(&lpm->tbl8[tbl8_group_start], 0,
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES);
+
+ /* init the new table's header:
+ * save the reference to the owner table
+ */
+ init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind);
+
+ /* reference to a new tbl8 */
struct rte_lpm6_tbl_entry new_tbl_entry = {
.lpm6_tbl8_gindex = tbl8_gindex,
.depth = 0,
@@ -426,17 +741,20 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl,
.ext_entry = 1,
};
- tbl[tbl_index] = new_tbl_entry;
+ tbl[entry_ind] = new_tbl_entry;
+
+ /* update the current table's reference counter */
+ if (tbl_ind != TBL24_IND)
+ lpm->tbl8_hdrs[tbl_ind].ref_cnt++;
}
/*
- * If it's valid but not extended the rule that was stored *
+ * If it's valid but not extended the rule that was stored
* here needs to be moved to the next table.
*/
- else if (tbl[tbl_index].ext_entry == 0) {
- /* Search for free tbl8 group. */
- if (lpm->next_tbl8 < lpm->number_tbl8s)
- tbl8_gindex = (lpm->next_tbl8)++;
- else
+ else if (tbl[entry_ind].ext_entry == 0) {
+ /* get a new tbl8 index */
+ ret = tbl8_get(lpm, &tbl8_gindex);
+ if (ret != 0)
return -ENOSPC;
tbl8_group_start = tbl8_gindex *
@@ -444,13 +762,22 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl,
tbl8_group_end = tbl8_group_start +
RTE_LPM6_TBL8_GROUP_NUM_ENTRIES;
+ struct rte_lpm6_tbl_entry tbl_entry = {
+ .next_hop = tbl[entry_ind].next_hop,
+ .depth = tbl[entry_ind].depth,
+ .valid = VALID,
+ .valid_group = VALID,
+ .ext_entry = 0
+ };
+
/* Populate new tbl8 with tbl value. */
- for (i = tbl8_group_start; i < tbl8_group_end; i++) {
- lpm->tbl8[i].valid = VALID;
- lpm->tbl8[i].depth = tbl[tbl_index].depth;
- lpm->tbl8[i].next_hop = tbl[tbl_index].next_hop;
- lpm->tbl8[i].ext_entry = 0;
- }
+ for (i = tbl8_group_start; i < tbl8_group_end; i++)
+ lpm->tbl8[i] = tbl_entry;
+
+ /* init the new table's header:
+ * save the reference to the owner table
+ */
+ init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind);
/*
* Update tbl entry to point to new tbl8 entry. Note: The
@@ -465,11 +792,16 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl,
.ext_entry = 1,
};
- tbl[tbl_index] = new_tbl_entry;
+ tbl[entry_ind] = new_tbl_entry;
+
+ /* update the current table's reference counter */
+ if (tbl_ind != TBL24_IND)
+ lpm->tbl8_hdrs[tbl_ind].ref_cnt++;
}
- *tbl_next = &(lpm->tbl8[tbl[tbl_index].lpm6_tbl8_gindex *
- RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]);
+ *next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex;
+ *next_tbl = &(lpm->tbl8[*next_tbl_ind *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]);
}
return 1;
@@ -486,13 +818,56 @@ rte_lpm6_add_v20(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
}
VERSION_SYMBOL(rte_lpm6_add, _v20, 2.0);
+
+/*
+ * Simulate adding a route to LPM
+ *
+ * Returns:
+ * 0 on success
+ * -ENOSPC not enough tbl8s left
+ */
+static int
+simulate_add(struct rte_lpm6 *lpm, const uint8_t *masked_ip, uint8_t depth)
+{
+ struct rte_lpm6_tbl_entry *tbl;
+ struct rte_lpm6_tbl_entry *tbl_next = NULL;
+ int ret, i;
+
+ /* number of new tables needed for a step */
+ uint32_t need_tbl_nb;
+ /* total number of new tables needed */
+ uint32_t total_need_tbl_nb;
+
+ /* Inspect the first three bytes through tbl24 on the first step. */
+ ret = simulate_add_step(lpm, lpm->tbl24, &tbl_next, masked_ip,
+ ADD_FIRST_BYTE, 1, depth, &need_tbl_nb);
+ total_need_tbl_nb = need_tbl_nb;
+ /*
+ * Inspect one by one the rest of the bytes until
+ * the process is completed.
+ */
+ for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && ret == 1; i++) {
+ tbl = tbl_next;
+ ret = simulate_add_step(lpm, tbl, &tbl_next, masked_ip, 1,
+ (uint8_t)(i+1), depth, &need_tbl_nb);
+ total_need_tbl_nb += need_tbl_nb;
+ }
+
+ if (tbl8_available(lpm) < total_need_tbl_nb)
+ /* not enough tbl8s to add a rule */
+ return -ENOSPC;
+
+ return 0;
+}
+
int
rte_lpm6_add_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
uint32_t next_hop)
{
struct rte_lpm6_tbl_entry *tbl;
struct rte_lpm6_tbl_entry *tbl_next = NULL;
- int32_t rule_index;
+ /* init to avoid compiler warning */
+ uint32_t tbl_next_num = 123456;
int status;
uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE];
int i;
@@ -502,26 +877,26 @@ rte_lpm6_add_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
return -EINVAL;
/* Copy the IP and mask it to avoid modifying user's input data. */
- memcpy(masked_ip, ip, RTE_LPM6_IPV6_ADDR_SIZE);
- mask_ip(masked_ip, depth);
+ ip6_copy_addr(masked_ip, ip);
+ ip6_mask_addr(masked_ip, depth);
- /* Add the rule to the rule table. */
- rule_index = rule_add(lpm, masked_ip, next_hop, depth);
+ /* Simulate adding a new route */
+ int ret = simulate_add(lpm, masked_ip, depth);
+ if (ret < 0)
+ return ret;
+ /* Add the rule to the rule table. */
+ int is_new_rule = rule_add(lpm, masked_ip, depth, next_hop);
/* If there is no space available for new rule return error. */
- if (rule_index < 0) {
- return rule_index;
- }
+ if (is_new_rule < 0)
+ return is_new_rule;
/* Inspect the first three bytes through tbl24 on the first step. */
tbl = lpm->tbl24;
- status = add_step (lpm, tbl, &tbl_next, masked_ip, ADD_FIRST_BYTE, 1,
- depth, next_hop);
- if (status < 0) {
- rte_lpm6_delete(lpm, masked_ip, depth);
-
- return status;
- }
+ status = add_step(lpm, tbl, TBL24_IND, &tbl_next, &tbl_next_num,
+ masked_ip, ADD_FIRST_BYTE, 1, depth, next_hop,
+ is_new_rule);
+ assert(status >= 0);
/*
* Inspect one by one the rest of the bytes until
@@ -529,13 +904,10 @@ rte_lpm6_add_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
*/
for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && status == 1; i++) {
tbl = tbl_next;
- status = add_step (lpm, tbl, &tbl_next, masked_ip, 1, (uint8_t)(i+1),
- depth, next_hop);
- if (status < 0) {
- rte_lpm6_delete(lpm, masked_ip, depth);
-
- return status;
- }
+ status = add_step(lpm, tbl, tbl_next_num, &tbl_next,
+ &tbl_next_num, masked_ip, 1, (uint8_t)(i+1),
+ depth, next_hop, is_new_rule);
+ assert(status >= 0);
}
return status;
@@ -610,9 +982,8 @@ rte_lpm6_lookup_v1705(const struct rte_lpm6 *lpm, uint8_t *ip,
uint32_t tbl24_index;
/* DEBUG: Check user input arguments. */
- if ((lpm == NULL) || (ip == NULL) || (next_hop == NULL)) {
+ if ((lpm == NULL) || (ip == NULL) || (next_hop == NULL))
return -EINVAL;
- }
first_byte = LOOKUP_FIRST_BYTE;
tbl24_index = (ip[0] << BYTES2_SIZE) | (ip[1] << BYTE_SIZE) | ip[2];
@@ -648,9 +1019,8 @@ rte_lpm6_lookup_bulk_func_v20(const struct rte_lpm6 *lpm,
int status;
/* DEBUG: Check user input arguments. */
- if ((lpm == NULL) || (ips == NULL) || (next_hops == NULL)) {
+ if ((lpm == NULL) || (ips == NULL) || (next_hops == NULL))
return -EINVAL;
- }
for (i = 0; i < n; i++) {
first_byte = LOOKUP_FIRST_BYTE;
@@ -725,30 +1095,6 @@ MAP_STATIC_SYMBOL(int rte_lpm6_lookup_bulk_func(const struct rte_lpm6 *lpm,
rte_lpm6_lookup_bulk_func_v1705);
/*
- * Finds a rule in rule table.
- * NOTE: Valid range for depth parameter is 1 .. 128 inclusive.
- */
-static inline int32_t
-rule_find(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth)
-{
- uint32_t rule_index;
-
- /* Scan used rules at given depth to find rule. */
- for (rule_index = 0; rule_index < lpm->used_rules; rule_index++) {
- /* If rule is found return the rule index. */
- if ((memcmp (lpm->rules_tbl[rule_index].ip, ip,
- RTE_LPM6_IPV6_ADDR_SIZE) == 0) &&
- lpm->rules_tbl[rule_index].depth == depth) {
-
- return rule_index;
- }
- }
-
- /* If rule is not found return -ENOENT. */
- return -ENOENT;
-}
-
-/*
* Look for a rule in the high-level rules table
*/
int
@@ -775,8 +1121,7 @@ int
rte_lpm6_is_rule_present_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
uint32_t *next_hop)
{
- uint8_t ip_masked[RTE_LPM6_IPV6_ADDR_SIZE];
- int32_t rule_index;
+ uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE];
/* Check user arguments. */
if ((lpm == NULL) || next_hop == NULL || ip == NULL ||
@@ -784,19 +1129,10 @@ rte_lpm6_is_rule_present_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
return -EINVAL;
/* Copy the IP and mask it to avoid modifying user's input data. */
- memcpy(ip_masked, ip, RTE_LPM6_IPV6_ADDR_SIZE);
- mask_ip(ip_masked, depth);
-
- /* Look for the rule using rule_find. */
- rule_index = rule_find(lpm, ip_masked, depth);
-
- if (rule_index >= 0) {
- *next_hop = lpm->rules_tbl[rule_index].next_hop;
- return 1;
- }
+ ip6_copy_addr(masked_ip, ip);
+ ip6_mask_addr(masked_ip, depth);
- /* If rule is not found return 0. */
- return 0;
+ return rule_find(lpm, masked_ip, depth, next_hop);
}
BIND_DEFAULT_SYMBOL(rte_lpm6_is_rule_present, _v1705, 17.05);
MAP_STATIC_SYMBOL(int rte_lpm6_is_rule_present(struct rte_lpm6 *lpm,
@@ -806,133 +1142,66 @@ MAP_STATIC_SYMBOL(int rte_lpm6_is_rule_present(struct rte_lpm6 *lpm,
/*
* Delete a rule from the rule table.
* NOTE: Valid range for depth parameter is 1 .. 128 inclusive.
+ * return
+ * 0 on success
+ * <0 on failure
*/
-static inline void
-rule_delete(struct rte_lpm6 *lpm, int32_t rule_index)
-{
- /*
- * Overwrite redundant rule with last rule in group and decrement rule
- * counter.
- */
- lpm->rules_tbl[rule_index] = lpm->rules_tbl[lpm->used_rules-1];
- lpm->used_rules--;
-}
-
-/*
- * Deletes a rule
- */
-int
-rte_lpm6_delete(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth)
+static inline int
+rule_delete(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth)
{
- int32_t rule_to_delete_index;
- uint8_t ip_masked[RTE_LPM6_IPV6_ADDR_SIZE];
- unsigned i;
-
- /*
- * Check input arguments.
- */
- if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH)) {
- return -EINVAL;
- }
-
- /* Copy the IP and mask it to avoid modifying user's input data. */
- memcpy(ip_masked, ip, RTE_LPM6_IPV6_ADDR_SIZE);
- mask_ip(ip_masked, depth);
-
- /*
- * Find the index of the input rule, that needs to be deleted, in the
- * rule table.
- */
- rule_to_delete_index = rule_find(lpm, ip_masked, depth);
-
- /*
- * Check if rule_to_delete_index was found. If no rule was found the
- * function rule_find returns -ENOENT.
- */
- if (rule_to_delete_index < 0)
- return rule_to_delete_index;
-
- /* Delete the rule from the rule table. */
- rule_delete(lpm, rule_to_delete_index);
+ int ret;
+ struct rte_lpm6_rule_key rule_key;
- /*
- * Set all the table entries to 0 (ie delete every rule
- * from the data structure.
- */
- lpm->next_tbl8 = 0;
- memset(lpm->tbl24, 0, sizeof(lpm->tbl24));
- memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0])
- * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s);
+ /* init rule key */
+ rule_key_init(&rule_key, ip, depth);
- /*
- * Add every rule again (except for the one that was removed from
- * the rules table).
- */
- for (i = 0; i < lpm->used_rules; i++) {
- rte_lpm6_add(lpm, lpm->rules_tbl[i].ip, lpm->rules_tbl[i].depth,
- lpm->rules_tbl[i].next_hop);
- }
+ /* delete the rule */
+ ret = rte_hash_del_key(lpm->rules_tbl, (void *) &rule_key);
+ if (ret >= 0)
+ lpm->used_rules--;
- return 0;
+ return ret;
}
/*
* Deletes a group of rules
+ *
+ * Note that the function rebuilds the lpm table,
+ * rather than doing incremental updates like
+ * the regular delete function
*/
int
rte_lpm6_delete_bulk_func(struct rte_lpm6 *lpm,
- uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths, unsigned n)
+ uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths,
+ unsigned n)
{
- int32_t rule_to_delete_index;
- uint8_t ip_masked[RTE_LPM6_IPV6_ADDR_SIZE];
+ uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE];
unsigned i;
- /*
- * Check input arguments.
- */
- if ((lpm == NULL) || (ips == NULL) || (depths == NULL)) {
+ /* Check input arguments. */
+ if ((lpm == NULL) || (ips == NULL) || (depths == NULL))
return -EINVAL;
- }
for (i = 0; i < n; i++) {
- /* Copy the IP and mask it to avoid modifying user's input data. */
- memcpy(ip_masked, ips[i], RTE_LPM6_IPV6_ADDR_SIZE);
- mask_ip(ip_masked, depths[i]);
-
- /*
- * Find the index of the input rule, that needs to be deleted, in the
- * rule table.
- */
- rule_to_delete_index = rule_find(lpm, ip_masked, depths[i]);
-
- /*
- * Check if rule_to_delete_index was found. If no rule was found the
- * function rule_find returns -ENOENT.
- */
- if (rule_to_delete_index < 0)
- continue;
-
- /* Delete the rule from the rule table. */
- rule_delete(lpm, rule_to_delete_index);
+ ip6_copy_addr(masked_ip, ips[i]);
+ ip6_mask_addr(masked_ip, depths[i]);
+ rule_delete(lpm, masked_ip, depths[i]);
}
/*
* Set all the table entries to 0 (ie delete every rule
* from the data structure.
*/
- lpm->next_tbl8 = 0;
memset(lpm->tbl24, 0, sizeof(lpm->tbl24));
memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0])
* RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s);
+ tbl8_pool_init(lpm);
/*
* Add every rule again (except for the ones that were removed from
* the rules table).
*/
- for (i = 0; i < lpm->used_rules; i++) {
- rte_lpm6_add(lpm, lpm->rules_tbl[i].ip, lpm->rules_tbl[i].depth,
- lpm->rules_tbl[i].next_hop);
- }
+ rebuild_lpm(lpm);
return 0;
}
@@ -946,9 +1215,6 @@ rte_lpm6_delete_all(struct rte_lpm6 *lpm)
/* Zero used rules counter. */
lpm->used_rules = 0;
- /* Zero next tbl8 index. */
- lpm->next_tbl8 = 0;
-
/* Zero tbl24. */
memset(lpm->tbl24, 0, sizeof(lpm->tbl24));
@@ -956,6 +1222,268 @@ rte_lpm6_delete_all(struct rte_lpm6 *lpm)
memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) *
RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s);
+ /* init pool of free tbl8 indexes */
+ tbl8_pool_init(lpm);
+
/* Delete all rules form the rules table. */
- memset(lpm->rules_tbl, 0, sizeof(struct rte_lpm6_rule) * lpm->max_rules);
+ rte_hash_reset(lpm->rules_tbl);
+}
+
+/*
+ * Convert a depth to a one byte long mask
+ * Example: 4 will be converted to 0xF0
+ */
+static uint8_t __attribute__((pure))
+depth_to_mask_1b(uint8_t depth)
+{
+ /* To calculate a mask start with a 1 on the left hand side and right
+ * shift while populating the left hand side with 1's
+ */
+ return (signed char)0x80 >> (depth - 1);
+}
+
+/*
+ * Find a less specific rule
+ */
+static int
+rule_find_less_specific(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
+ struct rte_lpm6_rule *rule)
+{
+ int ret;
+ uint32_t next_hop;
+ uint8_t mask;
+ struct rte_lpm6_rule_key rule_key;
+
+ if (depth == 1)
+ return 0;
+
+ rule_key_init(&rule_key, ip, depth);
+
+ while (depth > 1) {
+ depth--;
+
+ /* each iteration zero one more bit of the key */
+ mask = depth & 7; /* depth % BYTE_SIZE */
+ if (mask > 0)
+ mask = depth_to_mask_1b(mask);
+
+ rule_key.depth = depth;
+ rule_key.ip[depth >> 3] &= mask;
+
+ ret = rule_find_with_key(lpm, &rule_key, &next_hop);
+ if (ret) {
+ rule->depth = depth;
+ ip6_copy_addr(rule->ip, rule_key.ip);
+ rule->next_hop = next_hop;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Find range of tbl8 cells occupied by a rule
+ */
+static void
+rule_find_range(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
+ struct rte_lpm6_tbl_entry **from,
+ struct rte_lpm6_tbl_entry **to,
+ uint32_t *out_tbl_ind)
+{
+ uint32_t ind;
+ uint32_t first_3bytes = (uint32_t)ip[0] << 16 | ip[1] << 8 | ip[2];
+
+ if (depth <= 24) {
+ /* rule is within the top level */
+ ind = first_3bytes;
+ *from = &lpm->tbl24[ind];
+ ind += (1 << (24 - depth)) - 1;
+ *to = &lpm->tbl24[ind];
+ *out_tbl_ind = TBL24_IND;
+ } else {
+ /* top level entry */
+ struct rte_lpm6_tbl_entry *tbl = &lpm->tbl24[first_3bytes];
+ assert(tbl->ext_entry == 1);
+ /* first tbl8 */
+ uint32_t tbl_ind = tbl->lpm6_tbl8_gindex;
+ tbl = &lpm->tbl8[tbl_ind *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES];
+ /* current ip byte, the top level is already behind */
+ uint8_t byte = 3;
+ /* minus top level */
+ depth -= 24;
+
+ /* iterate through levels (tbl8s)
+ * until we reach the last one
+ */
+ while (depth > 8) {
+ tbl += ip[byte];
+ assert(tbl->ext_entry == 1);
+ /* go to the next level/tbl8 */
+ tbl_ind = tbl->lpm6_tbl8_gindex;
+ tbl = &lpm->tbl8[tbl_ind *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES];
+ byte += 1;
+ depth -= 8;
+ }
+
+ /* last level/tbl8 */
+ ind = ip[byte] & depth_to_mask_1b(depth);
+ *from = &tbl[ind];
+ ind += (1 << (8 - depth)) - 1;
+ *to = &tbl[ind];
+ *out_tbl_ind = tbl_ind;
+ }
+}
+
+/*
+ * Remove a table from the LPM tree
+ */
+static void
+remove_tbl(struct rte_lpm6 *lpm, struct rte_lpm_tbl8_hdr *tbl_hdr,
+ uint32_t tbl_ind, struct rte_lpm6_rule *lsp_rule)
+{
+ struct rte_lpm6_tbl_entry *owner_entry;
+
+ if (tbl_hdr->owner_tbl_ind == TBL24_IND)
+ owner_entry = &lpm->tbl24[tbl_hdr->owner_entry_ind];
+ else {
+ uint32_t owner_tbl_ind = tbl_hdr->owner_tbl_ind;
+ owner_entry = &lpm->tbl8[
+ owner_tbl_ind * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES +
+ tbl_hdr->owner_entry_ind];
+
+ struct rte_lpm_tbl8_hdr *owner_tbl_hdr =
+ &lpm->tbl8_hdrs[owner_tbl_ind];
+ if (--owner_tbl_hdr->ref_cnt == 0)
+ remove_tbl(lpm, owner_tbl_hdr, owner_tbl_ind, lsp_rule);
+ }
+
+ assert(owner_entry->ext_entry == 1);
+
+ /* unlink the table */
+ if (lsp_rule != NULL) {
+ struct rte_lpm6_tbl_entry new_tbl_entry = {
+ .next_hop = lsp_rule->next_hop,
+ .depth = lsp_rule->depth,
+ .valid = VALID,
+ .valid_group = VALID,
+ .ext_entry = 0
+ };
+
+ *owner_entry = new_tbl_entry;
+ } else {
+ struct rte_lpm6_tbl_entry new_tbl_entry = {
+ .next_hop = 0,
+ .depth = 0,
+ .valid = INVALID,
+ .valid_group = INVALID,
+ .ext_entry = 0
+ };
+
+ *owner_entry = new_tbl_entry;
+ }
+
+ /* return the table to the pool */
+ tbl8_put(lpm, tbl_ind);
+}
+
+/*
+ * Deletes a rule
+ */
+int
+rte_lpm6_delete(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth)
+{
+ uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE];
+ struct rte_lpm6_rule lsp_rule_obj;
+ struct rte_lpm6_rule *lsp_rule;
+ int ret;
+ uint32_t tbl_ind;
+ struct rte_lpm6_tbl_entry *from, *to;
+
+ /* Check input arguments. */
+ if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH))
+ return -EINVAL;
+
+ /* Copy the IP and mask it to avoid modifying user's input data. */
+ ip6_copy_addr(masked_ip, ip);
+ ip6_mask_addr(masked_ip, depth);
+
+ /* Delete the rule from the rule table. */
+ ret = rule_delete(lpm, masked_ip, depth);
+ if (ret < 0)
+ return -ENOENT;
+
+ /* find rule cells */
+ rule_find_range(lpm, masked_ip, depth, &from, &to, &tbl_ind);
+
+ /* find a less specific rule (a rule with smaller depth)
+ * note: masked_ip will be modified, don't use it anymore
+ */
+ ret = rule_find_less_specific(lpm, masked_ip, depth,
+ &lsp_rule_obj);
+ lsp_rule = ret ? &lsp_rule_obj : NULL;
+
+ /* decrement the table rule counter,
+ * note that tbl24 doesn't have a header
+ */
+ if (tbl_ind != TBL24_IND) {
+ struct rte_lpm_tbl8_hdr *tbl_hdr = &lpm->tbl8_hdrs[tbl_ind];
+ if (--tbl_hdr->ref_cnt == 0) {
+ /* remove the table */
+ remove_tbl(lpm, tbl_hdr, tbl_ind, lsp_rule);
+ return 0;
+ }
+ }
+
+ /* iterate rule cells */
+ for (; from <= to; from++)
+ if (from->ext_entry == 1) {
+ /* reference to a more specific space
+ * of the prefix/rule. Entries in a more
+ * specific space that are not used by
+ * a more specific prefix must be occupied
+ * by the prefix
+ */
+ if (lsp_rule != NULL)
+ expand_rule(lpm,
+ from->lpm6_tbl8_gindex *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES,
+ depth, lsp_rule->depth,
+ lsp_rule->next_hop, VALID);
+ else
+ /* since the prefix has no less specific prefix,
+ * its more specific space must be invalidated
+ */
+ expand_rule(lpm,
+ from->lpm6_tbl8_gindex *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES,
+ depth, 0, 0, INVALID);
+ } else if (from->depth == depth) {
+ /* entry is not a reference and belongs to the prefix */
+ if (lsp_rule != NULL) {
+ struct rte_lpm6_tbl_entry new_tbl_entry = {
+ .next_hop = lsp_rule->next_hop,
+ .depth = lsp_rule->depth,
+ .valid = VALID,
+ .valid_group = VALID,
+ .ext_entry = 0
+ };
+
+ *from = new_tbl_entry;
+ } else {
+ struct rte_lpm6_tbl_entry new_tbl_entry = {
+ .next_hop = 0,
+ .depth = 0,
+ .valid = INVALID,
+ .valid_group = INVALID,
+ .ext_entry = 0
+ };
+
+ *from = new_tbl_entry;
+ }
+ }
+
+ return 0;
}
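
A minimal usage sketch (not part of the patch) for the reworked delete path above, assuming an LPM6 table created with rte_lpm6_create() and a previously added /48 prefix; the helper name and prefix bytes are illustrative, only the rte_lpm6_delete() signature comes from the hunk above.

#include <rte_lpm6.h>

static int
remove_route(struct rte_lpm6 *lpm)
{
	/* 2001:db8:1::/48, remaining bytes are zero */
	uint8_t prefix[RTE_LPM6_IPV6_ADDR_SIZE] = {
		0x20, 0x01, 0x0d, 0xb8, 0x00, 0x01,
	};

	/* 0 on success, -ENOENT if the rule was never added */
	return rte_lpm6_delete(lpm, prefix, 48);
}
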
diff --git a/lib/librte_mbuf/meson.build b/lib/librte_mbuf/meson.build
index 45ffb0db..94d9c4c9 100644
--- a/lib/librte_mbuf/meson.build
+++ b/lib/librte_mbuf/meson.build
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright(c) 2017 Intel Corporation
-version = 3
+version = 4
sources = files('rte_mbuf.c', 'rte_mbuf_ptype.c', 'rte_mbuf_pool_ops.c')
headers = files('rte_mbuf.h', 'rte_mbuf_ptype.h', 'rte_mbuf_pool_ops.h')
deps += ['mempool']
diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
index e714c5a5..9790b4fb 100644
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -296,11 +296,19 @@ const char *rte_get_rx_ol_flag_name(uint64_t mask)
case PKT_RX_VLAN_STRIPPED: return "PKT_RX_VLAN_STRIPPED";
case PKT_RX_IEEE1588_PTP: return "PKT_RX_IEEE1588_PTP";
case PKT_RX_IEEE1588_TMST: return "PKT_RX_IEEE1588_TMST";
+ case PKT_RX_FDIR_ID: return "PKT_RX_FDIR_ID";
+ case PKT_RX_FDIR_FLX: return "PKT_RX_FDIR_FLX";
case PKT_RX_QINQ_STRIPPED: return "PKT_RX_QINQ_STRIPPED";
+ case PKT_RX_QINQ: return "PKT_RX_QINQ";
case PKT_RX_LRO: return "PKT_RX_LRO";
case PKT_RX_TIMESTAMP: return "PKT_RX_TIMESTAMP";
case PKT_RX_SEC_OFFLOAD: return "PKT_RX_SEC_OFFLOAD";
case PKT_RX_SEC_OFFLOAD_FAILED: return "PKT_RX_SEC_OFFLOAD_FAILED";
+ case PKT_RX_OUTER_L4_CKSUM_BAD: return "PKT_RX_OUTER_L4_CKSUM_BAD";
+ case PKT_RX_OUTER_L4_CKSUM_GOOD: return "PKT_RX_OUTER_L4_CKSUM_GOOD";
+ case PKT_RX_OUTER_L4_CKSUM_INVALID:
+ return "PKT_RX_OUTER_L4_CKSUM_INVALID";
+
default: return NULL;
}
}
@@ -333,12 +341,21 @@ rte_get_rx_ol_flag_list(uint64_t mask, char *buf, size_t buflen)
{ PKT_RX_VLAN_STRIPPED, PKT_RX_VLAN_STRIPPED, NULL },
{ PKT_RX_IEEE1588_PTP, PKT_RX_IEEE1588_PTP, NULL },
{ PKT_RX_IEEE1588_TMST, PKT_RX_IEEE1588_TMST, NULL },
+ { PKT_RX_FDIR_ID, PKT_RX_FDIR_ID, NULL },
+ { PKT_RX_FDIR_FLX, PKT_RX_FDIR_FLX, NULL },
{ PKT_RX_QINQ_STRIPPED, PKT_RX_QINQ_STRIPPED, NULL },
{ PKT_RX_LRO, PKT_RX_LRO, NULL },
{ PKT_RX_TIMESTAMP, PKT_RX_TIMESTAMP, NULL },
{ PKT_RX_SEC_OFFLOAD, PKT_RX_SEC_OFFLOAD, NULL },
{ PKT_RX_SEC_OFFLOAD_FAILED, PKT_RX_SEC_OFFLOAD_FAILED, NULL },
{ PKT_RX_QINQ, PKT_RX_QINQ, NULL },
+ { PKT_RX_OUTER_L4_CKSUM_BAD, PKT_RX_OUTER_L4_CKSUM_MASK, NULL },
+ { PKT_RX_OUTER_L4_CKSUM_GOOD, PKT_RX_OUTER_L4_CKSUM_MASK,
+ NULL },
+ { PKT_RX_OUTER_L4_CKSUM_INVALID, PKT_RX_OUTER_L4_CKSUM_MASK,
+ NULL },
+ { PKT_RX_OUTER_L4_CKSUM_UNKNOWN, PKT_RX_OUTER_L4_CKSUM_MASK,
+ "PKT_RX_OUTER_L4_CKSUM_UNKNOWN" },
};
const char *name;
unsigned int i;
@@ -373,7 +390,7 @@ rte_get_rx_ol_flag_list(uint64_t mask, char *buf, size_t buflen)
const char *rte_get_tx_ol_flag_name(uint64_t mask)
{
switch (mask) {
- case PKT_TX_VLAN_PKT: return "PKT_TX_VLAN_PKT";
+ case PKT_TX_VLAN: return "PKT_TX_VLAN";
case PKT_TX_IP_CKSUM: return "PKT_TX_IP_CKSUM";
case PKT_TX_TCP_CKSUM: return "PKT_TX_TCP_CKSUM";
case PKT_TX_SCTP_CKSUM: return "PKT_TX_SCTP_CKSUM";
@@ -393,8 +410,12 @@ const char *rte_get_tx_ol_flag_name(uint64_t mask)
case PKT_TX_TUNNEL_VXLAN_GPE: return "PKT_TX_TUNNEL_VXLAN_GPE";
case PKT_TX_TUNNEL_IP: return "PKT_TX_TUNNEL_IP";
case PKT_TX_TUNNEL_UDP: return "PKT_TX_TUNNEL_UDP";
+ case PKT_TX_QINQ: return "PKT_TX_QINQ";
case PKT_TX_MACSEC: return "PKT_TX_MACSEC";
case PKT_TX_SEC_OFFLOAD: return "PKT_TX_SEC_OFFLOAD";
+ case PKT_TX_UDP_SEG: return "PKT_TX_UDP_SEG";
+ case PKT_TX_OUTER_UDP_CKSUM: return "PKT_TX_OUTER_UDP_CKSUM";
+ case PKT_TX_METADATA: return "PKT_TX_METADATA";
default: return NULL;
}
}
@@ -404,7 +425,7 @@ int
rte_get_tx_ol_flag_list(uint64_t mask, char *buf, size_t buflen)
{
const struct flag_mask tx_flags[] = {
- { PKT_TX_VLAN_PKT, PKT_TX_VLAN_PKT, NULL },
+ { PKT_TX_VLAN, PKT_TX_VLAN, NULL },
{ PKT_TX_IP_CKSUM, PKT_TX_IP_CKSUM, NULL },
{ PKT_TX_TCP_CKSUM, PKT_TX_L4_MASK, NULL },
{ PKT_TX_SCTP_CKSUM, PKT_TX_L4_MASK, NULL },
@@ -417,24 +438,20 @@ rte_get_tx_ol_flag_list(uint64_t mask, char *buf, size_t buflen)
{ PKT_TX_OUTER_IP_CKSUM, PKT_TX_OUTER_IP_CKSUM, NULL },
{ PKT_TX_OUTER_IPV4, PKT_TX_OUTER_IPV4, NULL },
{ PKT_TX_OUTER_IPV6, PKT_TX_OUTER_IPV6, NULL },
- { PKT_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_MASK,
- "PKT_TX_TUNNEL_NONE" },
- { PKT_TX_TUNNEL_GRE, PKT_TX_TUNNEL_MASK,
- "PKT_TX_TUNNEL_NONE" },
- { PKT_TX_TUNNEL_IPIP, PKT_TX_TUNNEL_MASK,
- "PKT_TX_TUNNEL_NONE" },
- { PKT_TX_TUNNEL_GENEVE, PKT_TX_TUNNEL_MASK,
- "PKT_TX_TUNNEL_NONE" },
- { PKT_TX_TUNNEL_MPLSINUDP, PKT_TX_TUNNEL_MASK,
- "PKT_TX_TUNNEL_NONE" },
- { PKT_TX_TUNNEL_VXLAN_GPE, PKT_TX_TUNNEL_MASK,
- "PKT_TX_TUNNEL_NONE" },
- { PKT_TX_TUNNEL_IP, PKT_TX_TUNNEL_MASK,
- "PKT_TX_TUNNEL_NONE" },
- { PKT_TX_TUNNEL_UDP, PKT_TX_TUNNEL_MASK,
- "PKT_TX_TUNNEL_NONE" },
+ { PKT_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_MASK, NULL },
+ { PKT_TX_TUNNEL_GRE, PKT_TX_TUNNEL_MASK, NULL },
+ { PKT_TX_TUNNEL_IPIP, PKT_TX_TUNNEL_MASK, NULL },
+ { PKT_TX_TUNNEL_GENEVE, PKT_TX_TUNNEL_MASK, NULL },
+ { PKT_TX_TUNNEL_MPLSINUDP, PKT_TX_TUNNEL_MASK, NULL },
+ { PKT_TX_TUNNEL_VXLAN_GPE, PKT_TX_TUNNEL_MASK, NULL },
+ { PKT_TX_TUNNEL_IP, PKT_TX_TUNNEL_MASK, NULL },
+ { PKT_TX_TUNNEL_UDP, PKT_TX_TUNNEL_MASK, NULL },
+ { PKT_TX_QINQ, PKT_TX_QINQ, NULL },
{ PKT_TX_MACSEC, PKT_TX_MACSEC, NULL },
{ PKT_TX_SEC_OFFLOAD, PKT_TX_SEC_OFFLOAD, NULL },
+ { PKT_TX_UDP_SEG, PKT_TX_UDP_SEG, NULL },
+ { PKT_TX_OUTER_UDP_CKSUM, PKT_TX_OUTER_UDP_CKSUM, NULL },
+ { PKT_TX_METADATA, PKT_TX_METADATA, NULL },
};
const char *name;
unsigned int i;
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 9ce5d76d..3dbc6695 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -140,7 +140,7 @@ extern "C" {
* The 2 vlans have been stripped by the hardware and their tci are
* saved in mbuf->vlan_tci (inner) and mbuf->vlan_tci_outer (outer).
* This can only happen if vlan stripping is enabled in the RX
- * configuration of the PMD. If this flag is set,
+ * configuration of the PMD.
* When PKT_RX_QINQ_STRIPPED is set, the flags (PKT_RX_VLAN |
* PKT_RX_VLAN_STRIPPED | PKT_RX_QINQ) must also be set.
*/
@@ -170,18 +170,54 @@ extern "C" {
/**
* The RX packet is a double VLAN, and the outer tci has been
- * saved in in mbuf->vlan_tci_outer.
+ * saved in mbuf->vlan_tci_outer. If PKT_RX_QINQ is set, PKT_RX_VLAN
+ * should also be set and the inner tci saved in mbuf->vlan_tci.
* If the flag PKT_RX_QINQ_STRIPPED is also present, both VLANs
* headers have been stripped from mbuf data, else they are still
* present.
*/
#define PKT_RX_QINQ (1ULL << 20)
+/**
+ * Mask of bits used to determine the status of outer RX L4 checksum.
+ * - PKT_RX_OUTER_L4_CKSUM_UNKNOWN: no info about the outer RX L4 checksum
+ * - PKT_RX_OUTER_L4_CKSUM_BAD: the outer L4 checksum in the packet is wrong
+ * - PKT_RX_OUTER_L4_CKSUM_GOOD: the outer L4 checksum in the packet is valid
+ * - PKT_RX_OUTER_L4_CKSUM_INVALID: invalid outer L4 checksum state.
+ *
+ * The detection of PKT_RX_OUTER_L4_CKSUM_GOOD shall be based on the given
+ * HW capability. At a minimum, the PMD should support
+ * PKT_RX_OUTER_L4_CKSUM_UNKNOWN and PKT_RX_OUTER_L4_CKSUM_BAD states
+ * if the DEV_RX_OFFLOAD_OUTER_UDP_CKSUM offload is available.
+ */
+#define PKT_RX_OUTER_L4_CKSUM_MASK ((1ULL << 21) | (1ULL << 22))
+
+#define PKT_RX_OUTER_L4_CKSUM_UNKNOWN 0
+#define PKT_RX_OUTER_L4_CKSUM_BAD (1ULL << 21)
+#define PKT_RX_OUTER_L4_CKSUM_GOOD (1ULL << 22)
+#define PKT_RX_OUTER_L4_CKSUM_INVALID ((1ULL << 21) | (1ULL << 22))
+
/* add new RX flags here */
/* add new TX flags here */
/**
+ * Indicate that the metadata field in the mbuf is in use.
+ */
+#define PKT_TX_METADATA (1ULL << 40)
+
+/**
+ * Outer UDP checksum offload flag. This flag is used for enabling
+ * outer UDP checksum in PMD. To use outer UDP checksum, the user needs to
+ * 1) Enable the following in the mbuf,
+ * a) Fill outer_l2_len and outer_l3_len in mbuf.
+ * b) Set the PKT_TX_OUTER_UDP_CKSUM flag.
+ * c) Set the PKT_TX_OUTER_IPV4 or PKT_TX_OUTER_IPV6 flag.
+ * 2) Configure DEV_TX_OFFLOAD_OUTER_UDP_CKSUM offload flag.
+ */
+#define PKT_TX_OUTER_UDP_CKSUM (1ULL << 41)
+
+/**
* UDP Fragmentation Offload flag. This flag is used for enabling UDP
* fragmentation in SW or in HW. When use UFO, mbuf->tso_segsz is used
* to store the MSS of UDP fragments.
@@ -334,16 +370,23 @@ extern "C" {
* which can be set for packet.
*/
#define PKT_TX_OFFLOAD_MASK ( \
+ PKT_TX_OUTER_IPV6 | \
+ PKT_TX_OUTER_IPV4 | \
+ PKT_TX_OUTER_IP_CKSUM | \
+ PKT_TX_VLAN_PKT | \
+ PKT_TX_IPV6 | \
+ PKT_TX_IPV4 | \
PKT_TX_IP_CKSUM | \
PKT_TX_L4_MASK | \
- PKT_TX_OUTER_IP_CKSUM | \
- PKT_TX_TCP_SEG | \
PKT_TX_IEEE1588_TMST | \
+ PKT_TX_TCP_SEG | \
PKT_TX_QINQ_PKT | \
- PKT_TX_VLAN_PKT | \
PKT_TX_TUNNEL_MASK | \
PKT_TX_MACSEC | \
- PKT_TX_SEC_OFFLOAD)
+ PKT_TX_SEC_OFFLOAD | \
+ PKT_TX_UDP_SEG | \
+ PKT_TX_OUTER_UDP_CKSUM | \
+ PKT_TX_METADATA)
/**
* Mbuf having an external buffer attached. shinfo in mbuf must be filled.
@@ -464,7 +507,9 @@ struct rte_mbuf {
};
uint16_t nb_segs; /**< Number of segments. */
- /** Input port (16 bits to support more than 256 virtual ports). */
+ /** Input port (16 bits to support more than 256 virtual ports).
+ * The event eth Tx adapter uses this field to specify the output port.
+ */
uint16_t port;
uint64_t ol_flags; /**< Offload features. */
@@ -511,28 +556,47 @@ struct rte_mbuf {
/** VLAN TCI (CPU order), valid if PKT_RX_VLAN is set. */
uint16_t vlan_tci;
+ RTE_STD_C11
union {
- uint32_t rss; /**< RSS hash result if RSS enabled */
- struct {
- RTE_STD_C11
- union {
- struct {
- uint16_t hash;
- uint16_t id;
+ union {
+ uint32_t rss; /**< RSS hash result if RSS enabled */
+ struct {
+ union {
+ struct {
+ uint16_t hash;
+ uint16_t id;
+ };
+ uint32_t lo;
+ /**< Second 4 flexible bytes */
};
+ uint32_t hi;
+ /**< First 4 flexible bytes or FD ID, dependent
+ * on PKT_RX_FDIR_* flag in ol_flags.
+ */
+ } fdir; /**< Filter identifier if FDIR enabled */
+ struct {
uint32_t lo;
- /**< Second 4 flexible bytes */
- };
- uint32_t hi;
- /**< First 4 flexible bytes or FD ID, dependent on
- PKT_RX_FDIR_* flag in ol_flags. */
- } fdir; /**< Filter identifier if FDIR enabled */
+ uint32_t hi;
+ /**< The event eth Tx adapter uses this field
+ * to store Tx queue id.
+ * @see rte_event_eth_tx_adapter_txq_set()
+ */
+ } sched; /**< Hierarchical scheduler */
+ /**< User defined tags. See rte_distributor_process() */
+ uint32_t usr;
+ } hash; /**< hash information */
struct {
- uint32_t lo;
- uint32_t hi;
- } sched; /**< Hierarchical scheduler */
- uint32_t usr; /**< User defined tags. See rte_distributor_process() */
- } hash; /**< hash information */
+ /**
+ * Application specific metadata value
+ * for egress flow rule match.
+ * Valid if PKT_TX_METADATA is set.
+ * Located here to allow conjunct use
+ * with hash.sched.hi.
+ */
+ uint32_t tx_metadata;
+ uint32_t reserved;
+ };
+ };
/** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ is set. */
uint16_t vlan_tci_outer;
@@ -1038,14 +1102,6 @@ rte_mbuf_raw_free(struct rte_mbuf *m)
rte_mempool_put(m->pool, m);
}
-/* compat with older versions */
-__rte_deprecated
-static inline void
-__rte_mbuf_raw_free(struct rte_mbuf *m)
-{
- rte_mbuf_raw_free(m);
-}
-
/**
* The packet mbuf constructor.
*
@@ -1658,14 +1714,6 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
return NULL;
}
-/* deprecated, replaced by rte_pktmbuf_prefree_seg() */
-__rte_deprecated
-static inline struct rte_mbuf *
-__rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
-{
- return rte_pktmbuf_prefree_seg(m);
-}
-
/**
* Free a segment of a packet mbuf into its original mempool.
*
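
A short sketch (not part of the patch) of how an application might consume the new RX outer L4 checksum status bits and request the new TX outer UDP checksum offload defined in the rte_mbuf.h hunks above. The helper names are illustrative, and the port is assumed to have been configured with DEV_TX_OFFLOAD_OUTER_UDP_CKSUM; the flags and mbuf fields themselves are the ones introduced or referenced by this patch.

#include <rte_mbuf.h>

static inline int
outer_l4_csum_ok(const struct rte_mbuf *m)
{
	/* the status is a 2-bit field, so mask before comparing */
	return (m->ol_flags & PKT_RX_OUTER_L4_CKSUM_MASK) ==
			PKT_RX_OUTER_L4_CKSUM_GOOD;
}

static inline void
request_outer_udp_csum(struct rte_mbuf *m, uint16_t outer_l2_len,
	uint16_t outer_l3_len)
{
	/* steps 1a)-1c) from the PKT_TX_OUTER_UDP_CKSUM comment */
	m->outer_l2_len = outer_l2_len;
	m->outer_l3_len = outer_l3_len;
	m->ol_flags |= PKT_TX_OUTER_IPV4 | PKT_TX_OUTER_UDP_CKSUM;
}
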
diff --git a/lib/librte_mbuf/rte_mbuf_ptype.c b/lib/librte_mbuf/rte_mbuf_ptype.c
index d7835e28..d6f906b0 100644
--- a/lib/librte_mbuf/rte_mbuf_ptype.c
+++ b/lib/librte_mbuf/rte_mbuf_ptype.c
@@ -19,6 +19,8 @@ const char *rte_get_ptype_l2_name(uint32_t ptype)
case RTE_PTYPE_L2_ETHER_VLAN: return "L2_ETHER_VLAN";
case RTE_PTYPE_L2_ETHER_QINQ: return "L2_ETHER_QINQ";
case RTE_PTYPE_L2_ETHER_PPPOE: return "L2_ETHER_PPPOE";
+ case RTE_PTYPE_L2_ETHER_FCOE: return "L2_ETHER_FCOE";
+ case RTE_PTYPE_L2_ETHER_MPLS: return "L2_ETHER_MPLS";
default: return "L2_UNKNOWN";
}
}
@@ -47,6 +49,7 @@ const char *rte_get_ptype_l4_name(uint32_t ptype)
case RTE_PTYPE_L4_SCTP: return "L4_SCTP";
case RTE_PTYPE_L4_ICMP: return "L4_ICMP";
case RTE_PTYPE_L4_NONFRAG: return "L4_NONFRAG";
+ case RTE_PTYPE_L4_IGMP: return "L4_IGMP";
default: return "L4_UNKNOWN";
}
}
diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h b/lib/librte_mbuf/rte_mbuf_ptype.h
index 01acc66e..23bc635f 100644
--- a/lib/librte_mbuf/rte_mbuf_ptype.h
+++ b/lib/librte_mbuf/rte_mbuf_ptype.h
@@ -131,6 +131,20 @@ extern "C" {
*/
#define RTE_PTYPE_L2_ETHER_PPPOE 0x00000008
/**
+ * FCoE packet type.
+ *
+ * Packet format:
+ * <'ether type'=[0x8906]>
+ */
+#define RTE_PTYPE_L2_ETHER_FCOE 0x00000009
+/**
+ * MPLS packet type.
+ *
+ * Packet format:
+ * <'ether type'=[0x8847|0x8848]>
+ */
+#define RTE_PTYPE_L2_ETHER_MPLS 0x0000000a
+/**
* Mask of layer 2 packet types.
* It is used for outer packet for tunneling cases.
*/
@@ -287,6 +301,14 @@ extern "C" {
*/
#define RTE_PTYPE_L4_NONFRAG 0x00000600
/**
+ * IGMP (Internet Group Management Protocol) packet type.
+ *
+ * Packet format:
+ * <'ether type'=0x0800
+ * | 'version'=4, 'protocol'=2, 'MF'=0, 'frag_offset'=0>
+ */
+#define RTE_PTYPE_L4_IGMP 0x00000700
+/**
* Mask of layer 4 packet types.
* It is used for outer packet for tunneling cases.
*/
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 03e6b5f7..683b216f 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -99,25 +99,44 @@ static unsigned optimize_object_size(unsigned obj_size)
return new_obj_size * RTE_MEMPOOL_ALIGN;
}
+struct pagesz_walk_arg {
+ int socket_id;
+ size_t min;
+};
+
static int
find_min_pagesz(const struct rte_memseg_list *msl, void *arg)
{
- size_t *min = arg;
+ struct pagesz_walk_arg *wa = arg;
+ bool valid;
+
+ /*
+ * we need to only look at page sizes available for a particular socket
+ * ID. so, we either need an exact match on socket ID (can match both
+ * native and external memory), or, if SOCKET_ID_ANY was specified as a
+ * socket ID argument, we must only look at native memory and ignore any
+ * page sizes associated with external memory.
+ */
+ valid = msl->socket_id == wa->socket_id;
+ valid |= wa->socket_id == SOCKET_ID_ANY && msl->external == 0;
- if (msl->page_sz < *min)
- *min = msl->page_sz;
+ if (valid && msl->page_sz < wa->min)
+ wa->min = msl->page_sz;
return 0;
}
static size_t
-get_min_page_size(void)
+get_min_page_size(int socket_id)
{
- size_t min_pagesz = SIZE_MAX;
+ struct pagesz_walk_arg wa;
- rte_memseg_list_walk(find_min_pagesz, &min_pagesz);
+ wa.min = SIZE_MAX;
+ wa.socket_id = socket_id;
- return min_pagesz == SIZE_MAX ? (size_t) getpagesize() : min_pagesz;
+ rte_memseg_list_walk(find_min_pagesz, &wa);
+
+ return wa.min == SIZE_MAX ? (size_t) getpagesize() : wa.min;
}
@@ -409,12 +428,18 @@ rte_mempool_populate_default(struct rte_mempool *mp)
rte_iova_t iova;
unsigned mz_id, n;
int ret;
- bool no_contig, try_contig, no_pageshift;
+ bool no_contig, try_contig, no_pageshift, external;
ret = mempool_ops_alloc_once(mp);
if (ret != 0)
return ret;
+ /* check if we can retrieve a valid socket ID */
+ ret = rte_malloc_heap_socket_is_external(mp->socket_id);
+ if (ret < 0)
+ return -EINVAL;
+ external = ret;
+
/* mempool must not be populated */
if (mp->nb_mem_chunks != 0)
return -EEXIST;
@@ -462,15 +487,25 @@ rte_mempool_populate_default(struct rte_mempool *mp)
* in one contiguous chunk as well (otherwise we might end up wasting a
* 1G page on a 10MB memzone). If we fail to get enough contiguous
* memory, then we'll go and reserve space page-by-page.
+ *
+ * We also have to take into account the fact that memory that we're
+ * going to allocate from can belong to an externally allocated memory
+ * area, in which case the assumption of IOVA as VA mode being
+ * synonymous with IOVA contiguousness will not hold. We should also try
+ * to go for contiguous memory even if we're in no-huge mode, because
+ * external memory may in fact be IOVA-contiguous.
*/
- no_pageshift = no_contig || rte_eal_iova_mode() == RTE_IOVA_VA;
- try_contig = !no_contig && !no_pageshift && rte_eal_has_hugepages();
+ external = rte_malloc_heap_socket_is_external(mp->socket_id) == 1;
+ no_pageshift = no_contig ||
+ (!external && rte_eal_iova_mode() == RTE_IOVA_VA);
+ try_contig = !no_contig && !no_pageshift &&
+ (rte_eal_has_hugepages() || external);
if (no_pageshift) {
pg_sz = 0;
pg_shift = 0;
} else if (try_contig) {
- pg_sz = get_min_page_size();
+ pg_sz = get_min_page_size(mp->socket_id);
pg_shift = rte_bsf32(pg_sz);
} else {
pg_sz = getpagesize();
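
A sketch (not part of the patch) of the case the reworked rte_mempool_populate_default() now handles: a packet pool created on the socket ID of an externally allocated heap. The heap name "ext_heap" and the pool parameters are illustrative, and the heap is assumed to have been created and populated beforehand through the experimental rte_malloc external-heap API.

#include <rte_malloc.h>
#include <rte_mbuf.h>

static struct rte_mempool *
pool_on_external_heap(void)
{
	/* look up the pseudo socket ID assigned to the external heap */
	int socket_id = rte_malloc_heap_get_socket("ext_heap");

	if (socket_id < 0)
		return NULL;

	/* populate_default() will treat this socket as external memory */
	return rte_pktmbuf_pool_create("ext_pool", 8192, 256, 0,
			RTE_MBUF_DEFAULT_BUF_SIZE, socket_id);
}
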
diff --git a/lib/librte_net/Makefile b/lib/librte_net/Makefile
index 85e403f4..c3082069 100644
--- a/lib/librte_net/Makefile
+++ b/lib/librte_net/Makefile
@@ -20,6 +20,6 @@ SRCS-$(CONFIG_RTE_LIBRTE_NET) += rte_arp.c
SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include := rte_ip.h rte_tcp.h rte_udp.h rte_esp.h
SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include += rte_sctp.h rte_icmp.h rte_arp.h
SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include += rte_ether.h rte_gre.h rte_net.h
-SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include += rte_net_crc.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include += rte_net_crc.h rte_mpls.h
include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_net/meson.build b/lib/librte_net/meson.build
index d3ea1feb..7d66f693 100644
--- a/lib/librte_net/meson.build
+++ b/lib/librte_net/meson.build
@@ -13,7 +13,8 @@ headers = files('rte_ip.h',
'rte_ether.h',
'rte_gre.h',
'rte_net.h',
- 'rte_net_crc.h')
+ 'rte_net_crc.h',
+ 'rte_mpls.h')
sources = files('rte_arp.c', 'rte_net.c', 'rte_net_crc.c')
deps += ['mbuf']
diff --git a/lib/librte_net/net_crc_sse.h b/lib/librte_net/net_crc_sse.h
index da815243..1c7b7a54 100644
--- a/lib/librte_net/net_crc_sse.h
+++ b/lib/librte_net/net_crc_sse.h
@@ -21,8 +21,8 @@ struct crc_pclmulqdq_ctx {
__m128i rk7_rk8;
};
-struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
-struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);
+static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
+static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);
/**
* @brief Performs one folding round
*
diff --git a/lib/librte_net/rte_ether.h b/lib/librte_net/rte_ether.h
index bee2b34f..c2c5e249 100644
--- a/lib/librte_net/rte_ether.h
+++ b/lib/librte_net/rte_ether.h
@@ -306,6 +306,8 @@ struct vxlan_hdr {
#define ETHER_TYPE_SLOW 0x8809 /**< Slow protocols (LACP and Marker). */
#define ETHER_TYPE_TEB 0x6558 /**< Transparent Ethernet Bridging. */
#define ETHER_TYPE_LLDP 0x88CC /**< LLDP Protocol. */
+#define ETHER_TYPE_MPLS 0x8847 /**< MPLS ethertype. */
+#define ETHER_TYPE_MPLSM 0x8848 /**< MPLS multicast ethertype. */
#define ETHER_VXLAN_HLEN (sizeof(struct udp_hdr) + sizeof(struct vxlan_hdr))
/**< VXLAN tunnel header length. */
diff --git a/lib/librte_net/rte_mpls.h b/lib/librte_net/rte_mpls.h
new file mode 100644
index 00000000..11d26ba3
--- /dev/null
+++ b/lib/librte_net/rte_mpls.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016 6WIND S.A.
+ */
+
+#ifndef _RTE_MPLS_H_
+#define _RTE_MPLS_H_
+
+/**
+ * @file
+ *
+ * MPLS-related defines
+ */
+
+#include <stdint.h>
+#include <rte_byteorder.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * MPLS header.
+ */
+struct mpls_hdr {
+ uint16_t tag_msb; /**< Label(msb). */
+#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
+ uint8_t tag_lsb:4; /**< Label(lsb). */
+ uint8_t tc:3; /**< Traffic class. */
+ uint8_t bs:1; /**< Bottom of stack. */
+#else
+ uint8_t bs:1; /**< Bottom of stack. */
+ uint8_t tc:3; /**< Traffic class. */
+ uint8_t tag_lsb:4; /**< Label(lsb). */
+#endif
+ uint8_t ttl; /**< Time to live. */
+} __attribute__((__packed__));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RTE_MPLS_H_ */
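
A sketch (not part of the patch) of reassembling the 20-bit MPLS label from the split tag_msb/tag_lsb fields of the new struct mpls_hdr; the helper name is illustrative.

#include <rte_byteorder.h>
#include <rte_mpls.h>

static inline uint32_t
mpls_label(const struct mpls_hdr *mh)
{
	/* tag_msb carries the upper 16 label bits in network order,
	 * tag_lsb the remaining lower 4 bits
	 */
	return ((uint32_t)rte_be_to_cpu_16(mh->tag_msb) << 4) | mh->tag_lsb;
}
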
diff --git a/lib/librte_net/rte_net.c b/lib/librte_net/rte_net.c
index 9eb7c743..378a4126 100644
--- a/lib/librte_net/rte_net.c
+++ b/lib/librte_net/rte_net.c
@@ -13,6 +13,7 @@
#include <rte_udp.h>
#include <rte_sctp.h>
#include <rte_gre.h>
+#include <rte_mpls.h>
#include <rte_net.h>
/* get l3 packet type from ip6 next protocol */
@@ -274,9 +275,27 @@ uint32_t rte_net_get_ptype(const struct rte_mbuf *m,
off += 2 * sizeof(*vh);
hdr_lens->l2_len += 2 * sizeof(*vh);
proto = vh->eth_proto;
+ } else if ((proto == rte_cpu_to_be_16(ETHER_TYPE_MPLS)) ||
+ (proto == rte_cpu_to_be_16(ETHER_TYPE_MPLSM))) {
+ unsigned int i;
+ const struct mpls_hdr *mh;
+ struct mpls_hdr mh_copy;
+
+#define MAX_MPLS_HDR 5
+ for (i = 0; i < MAX_MPLS_HDR; i++) {
+ mh = rte_pktmbuf_read(m, off + (i * sizeof(*mh)),
+ sizeof(*mh), &mh_copy);
+ if (unlikely(mh == NULL))
+ return pkt_type;
+ }
+ if (i == MAX_MPLS_HDR)
+ return pkt_type;
+ pkt_type = RTE_PTYPE_L2_ETHER_MPLS;
+ hdr_lens->l2_len += (sizeof(*mh) * i);
+ return pkt_type;
}
- l3:
+l3:
if ((layers & RTE_PTYPE_L3_MASK) == 0)
return pkt_type;
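
A sketch (not part of the patch) of detecting the new MPLS L2 packet type on a received mbuf with the extended software parser; the helper name is illustrative.

#include <rte_mbuf.h>
#include <rte_mbuf_ptype.h>
#include <rte_net.h>

static inline int
pkt_is_mpls(const struct rte_mbuf *m)
{
	struct rte_net_hdr_lens hdr_lens;
	uint32_t ptype;

	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
	return (ptype & RTE_PTYPE_L2_MASK) == RTE_PTYPE_L2_ETHER_MPLS;
}
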
diff --git a/lib/librte_net/rte_net.h b/lib/librte_net/rte_net.h
index b6ab6e1d..e59760a0 100644
--- a/lib/librte_net/rte_net.h
+++ b/lib/librte_net/rte_net.h
@@ -122,14 +122,16 @@ rte_net_intel_cksum_flags_prepare(struct rte_mbuf *m, uint64_t ol_flags)
(ol_flags & PKT_TX_OUTER_IPV6))
inner_l3_offset += m->outer_l2_len + m->outer_l3_len;
- if ((ol_flags & PKT_TX_UDP_CKSUM) == PKT_TX_UDP_CKSUM) {
- if (ol_flags & PKT_TX_IPV4) {
- ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *,
- inner_l3_offset);
+ if (ol_flags & PKT_TX_IPV4) {
+ ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *,
+ inner_l3_offset);
- if (ol_flags & PKT_TX_IP_CKSUM)
- ipv4_hdr->hdr_checksum = 0;
+ if (ol_flags & PKT_TX_IP_CKSUM)
+ ipv4_hdr->hdr_checksum = 0;
+ }
+ if ((ol_flags & PKT_TX_UDP_CKSUM) == PKT_TX_UDP_CKSUM) {
+ if (ol_flags & PKT_TX_IPV4) {
udp_hdr = (struct udp_hdr *)((char *)ipv4_hdr +
m->l3_len);
udp_hdr->dgram_cksum = rte_ipv4_phdr_cksum(ipv4_hdr,
@@ -146,12 +148,6 @@ rte_net_intel_cksum_flags_prepare(struct rte_mbuf *m, uint64_t ol_flags)
} else if ((ol_flags & PKT_TX_TCP_CKSUM) ||
(ol_flags & PKT_TX_TCP_SEG)) {
if (ol_flags & PKT_TX_IPV4) {
- ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *,
- inner_l3_offset);
-
- if (ol_flags & PKT_TX_IP_CKSUM)
- ipv4_hdr->hdr_checksum = 0;
-
/* non-TSO tcp or TSO */
tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr +
m->l3_len);
diff --git a/lib/librte_pdump/Makefile b/lib/librte_pdump/Makefile
index 0ee0fa1a..b241151d 100644
--- a/lib/librte_pdump/Makefile
+++ b/lib/librte_pdump/Makefile
@@ -8,8 +8,6 @@ LIB = librte_pdump.a
CFLAGS += -DALLOW_EXPERIMENTAL_API
CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
-CFLAGS += -D_GNU_SOURCE
-LDLIBS += -lpthread
LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev
EXPORT_MAP := rte_pdump_version.map
diff --git a/lib/librte_pipeline/Makefile b/lib/librte_pipeline/Makefile
index 84afe98c..cf265503 100644
--- a/lib/librte_pipeline/Makefile
+++ b/lib/librte_pipeline/Makefile
@@ -12,7 +12,7 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
CFLAGS += -O3
CFLAGS += $(WERROR_FLAGS)
LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_table
-LDLIBS += -lrte_port -lrte_meter -lrte_sched
+LDLIBS += -lrte_port -lrte_meter -lrte_sched -lrte_cryptodev
EXPORT_MAP := rte_pipeline_version.map
diff --git a/lib/librte_pipeline/meson.build b/lib/librte_pipeline/meson.build
index dc16ab42..04e5f517 100644
--- a/lib/librte_pipeline/meson.build
+++ b/lib/librte_pipeline/meson.build
@@ -5,4 +5,4 @@ version = 3
allow_experimental_apis = true
sources = files('rte_pipeline.c', 'rte_port_in_action.c', 'rte_table_action.c')
headers = files('rte_pipeline.h', 'rte_port_in_action.h', 'rte_table_action.h')
-deps += ['port', 'table', 'meter', 'sched']
+deps += ['port', 'table', 'meter', 'sched', 'cryptodev']
diff --git a/lib/librte_pipeline/rte_pipeline.c b/lib/librte_pipeline/rte_pipeline.c
index 0cb8b804..2c047a8a 100644
--- a/lib/librte_pipeline/rte_pipeline.c
+++ b/lib/librte_pipeline/rte_pipeline.c
@@ -178,8 +178,7 @@ rte_pipeline_check_params(struct rte_pipeline_params *params)
}
/* socket */
- if ((params->socket_id < 0) ||
- (params->socket_id >= RTE_MAX_NUMA_NODES)) {
+ if (params->socket_id < 0) {
RTE_LOG(ERR, PIPELINE,
"%s: Incorrect value for parameter socket_id\n",
__func__);
diff --git a/lib/librte_pipeline/rte_pipeline_version.map b/lib/librte_pipeline/rte_pipeline_version.map
index d820b22f..420f065d 100644
--- a/lib/librte_pipeline/rte_pipeline_version.map
+++ b/lib/librte_pipeline/rte_pipeline_version.map
@@ -72,4 +72,5 @@ EXPERIMENTAL {
rte_table_action_stats_read;
rte_table_action_time_read;
rte_table_action_ttl_read;
+ rte_table_action_crypto_sym_session_get;
};
diff --git a/lib/librte_pipeline/rte_table_action.c b/lib/librte_pipeline/rte_table_action.c
index 83ffa5de..537e6593 100644
--- a/lib/librte_pipeline/rte_table_action.c
+++ b/lib/librte_pipeline/rte_table_action.c
@@ -1,7 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2018 Intel Corporation
*/
-
#include <stdlib.h>
#include <string.h>
@@ -15,6 +14,8 @@
#include <rte_esp.h>
#include <rte_tcp.h>
#include <rte_udp.h>
+#include <rte_cryptodev.h>
+#include <rte_cryptodev_pmd.h>
#include "rte_table_action.h"
@@ -430,6 +431,7 @@ encap_valid(enum rte_table_action_encap_type encap)
case RTE_TABLE_ACTION_ENCAP_QINQ:
case RTE_TABLE_ACTION_ENCAP_MPLS:
case RTE_TABLE_ACTION_ENCAP_PPPOE:
+ case RTE_TABLE_ACTION_ENCAP_VXLAN:
return 1;
default:
return 0;
@@ -498,6 +500,38 @@ struct encap_pppoe_data {
struct pppoe_ppp_hdr pppoe_ppp;
} __attribute__((__packed__));
+#define IP_PROTO_UDP 17
+
+struct encap_vxlan_ipv4_data {
+ struct ether_hdr ether;
+ struct ipv4_hdr ipv4;
+ struct udp_hdr udp;
+ struct vxlan_hdr vxlan;
+} __attribute__((__packed__));
+
+struct encap_vxlan_ipv4_vlan_data {
+ struct ether_hdr ether;
+ struct vlan_hdr vlan;
+ struct ipv4_hdr ipv4;
+ struct udp_hdr udp;
+ struct vxlan_hdr vxlan;
+} __attribute__((__packed__));
+
+struct encap_vxlan_ipv6_data {
+ struct ether_hdr ether;
+ struct ipv6_hdr ipv6;
+ struct udp_hdr udp;
+ struct vxlan_hdr vxlan;
+} __attribute__((__packed__));
+
+struct encap_vxlan_ipv6_vlan_data {
+ struct ether_hdr ether;
+ struct vlan_hdr vlan;
+ struct ipv6_hdr ipv6;
+ struct udp_hdr udp;
+ struct vxlan_hdr vxlan;
+} __attribute__((__packed__));
+
static size_t
encap_data_size(struct rte_table_action_encap_config *encap)
{
@@ -517,6 +551,18 @@ encap_data_size(struct rte_table_action_encap_config *encap)
case 1LLU << RTE_TABLE_ACTION_ENCAP_PPPOE:
return sizeof(struct encap_pppoe_data);
+ case 1LLU << RTE_TABLE_ACTION_ENCAP_VXLAN:
+ if (encap->vxlan.ip_version)
+ if (encap->vxlan.vlan)
+ return sizeof(struct encap_vxlan_ipv4_vlan_data);
+ else
+ return sizeof(struct encap_vxlan_ipv4_data);
+ else
+ if (encap->vxlan.vlan)
+ return sizeof(struct encap_vxlan_ipv6_vlan_data);
+ else
+ return sizeof(struct encap_vxlan_ipv6_data);
+
default:
return 0;
}
@@ -550,6 +596,9 @@ encap_apply_check(struct rte_table_action_encap_params *p,
case RTE_TABLE_ACTION_ENCAP_PPPOE:
return 0;
+ case RTE_TABLE_ACTION_ENCAP_VXLAN:
+ return 0;
+
default:
return -EINVAL;
}
@@ -679,6 +728,168 @@ encap_pppoe_apply(void *data,
}
static int
+encap_vxlan_apply(void *data,
+ struct rte_table_action_encap_params *p,
+ struct rte_table_action_encap_config *cfg)
+{
+ if ((p->vxlan.vxlan.vni > 0xFFFFFF) ||
+ (cfg->vxlan.ip_version && (p->vxlan.ipv4.dscp > 0x3F)) ||
+ (!cfg->vxlan.ip_version && (p->vxlan.ipv6.flow_label > 0xFFFFF)) ||
+ (!cfg->vxlan.ip_version && (p->vxlan.ipv6.dscp > 0x3F)) ||
+ (cfg->vxlan.vlan && (p->vxlan.vlan.vid > 0xFFF)))
+ return -1;
+
+ if (cfg->vxlan.ip_version)
+ if (cfg->vxlan.vlan) {
+ struct encap_vxlan_ipv4_vlan_data *d = data;
+
+ /* Ethernet */
+ ether_addr_copy(&p->vxlan.ether.da, &d->ether.d_addr);
+ ether_addr_copy(&p->vxlan.ether.sa, &d->ether.s_addr);
+ d->ether.ether_type = rte_htons(ETHER_TYPE_VLAN);
+
+ /* VLAN */
+ d->vlan.vlan_tci = rte_htons(VLAN(p->vxlan.vlan.pcp,
+ p->vxlan.vlan.dei,
+ p->vxlan.vlan.vid));
+ d->vlan.eth_proto = rte_htons(ETHER_TYPE_IPv4);
+
+ /* IPv4*/
+ d->ipv4.version_ihl = 0x45;
+ d->ipv4.type_of_service = p->vxlan.ipv4.dscp << 2;
+ d->ipv4.total_length = 0; /* not pre-computed */
+ d->ipv4.packet_id = 0;
+ d->ipv4.fragment_offset = 0;
+ d->ipv4.time_to_live = p->vxlan.ipv4.ttl;
+ d->ipv4.next_proto_id = IP_PROTO_UDP;
+ d->ipv4.hdr_checksum = 0;
+ d->ipv4.src_addr = rte_htonl(p->vxlan.ipv4.sa);
+ d->ipv4.dst_addr = rte_htonl(p->vxlan.ipv4.da);
+
+ d->ipv4.hdr_checksum = rte_ipv4_cksum(&d->ipv4);
+
+ /* UDP */
+ d->udp.src_port = rte_htons(p->vxlan.udp.sp);
+ d->udp.dst_port = rte_htons(p->vxlan.udp.dp);
+ d->udp.dgram_len = 0; /* not pre-computed */
+ d->udp.dgram_cksum = 0;
+
+ /* VXLAN */
+ d->vxlan.vx_flags = rte_htonl(0x08000000);
+ d->vxlan.vx_vni = rte_htonl(p->vxlan.vxlan.vni << 8);
+
+ return 0;
+ } else {
+ struct encap_vxlan_ipv4_data *d = data;
+
+ /* Ethernet */
+ ether_addr_copy(&p->vxlan.ether.da, &d->ether.d_addr);
+ ether_addr_copy(&p->vxlan.ether.sa, &d->ether.s_addr);
+ d->ether.ether_type = rte_htons(ETHER_TYPE_IPv4);
+
+ /* IPv4*/
+ d->ipv4.version_ihl = 0x45;
+ d->ipv4.type_of_service = p->vxlan.ipv4.dscp << 2;
+ d->ipv4.total_length = 0; /* not pre-computed */
+ d->ipv4.packet_id = 0;
+ d->ipv4.fragment_offset = 0;
+ d->ipv4.time_to_live = p->vxlan.ipv4.ttl;
+ d->ipv4.next_proto_id = IP_PROTO_UDP;
+ d->ipv4.hdr_checksum = 0;
+ d->ipv4.src_addr = rte_htonl(p->vxlan.ipv4.sa);
+ d->ipv4.dst_addr = rte_htonl(p->vxlan.ipv4.da);
+
+ d->ipv4.hdr_checksum = rte_ipv4_cksum(&d->ipv4);
+
+ /* UDP */
+ d->udp.src_port = rte_htons(p->vxlan.udp.sp);
+ d->udp.dst_port = rte_htons(p->vxlan.udp.dp);
+ d->udp.dgram_len = 0; /* not pre-computed */
+ d->udp.dgram_cksum = 0;
+
+ /* VXLAN */
+ d->vxlan.vx_flags = rte_htonl(0x08000000);
+ d->vxlan.vx_vni = rte_htonl(p->vxlan.vxlan.vni << 8);
+
+ return 0;
+ }
+ else
+ if (cfg->vxlan.vlan) {
+ struct encap_vxlan_ipv6_vlan_data *d = data;
+
+ /* Ethernet */
+ ether_addr_copy(&p->vxlan.ether.da, &d->ether.d_addr);
+ ether_addr_copy(&p->vxlan.ether.sa, &d->ether.s_addr);
+ d->ether.ether_type = rte_htons(ETHER_TYPE_VLAN);
+
+ /* VLAN */
+ d->vlan.vlan_tci = rte_htons(VLAN(p->vxlan.vlan.pcp,
+ p->vxlan.vlan.dei,
+ p->vxlan.vlan.vid));
+ d->vlan.eth_proto = rte_htons(ETHER_TYPE_IPv6);
+
+ /* IPv6*/
+ d->ipv6.vtc_flow = rte_htonl((6 << 28) |
+ (p->vxlan.ipv6.dscp << 22) |
+ p->vxlan.ipv6.flow_label);
+ d->ipv6.payload_len = 0; /* not pre-computed */
+ d->ipv6.proto = IP_PROTO_UDP;
+ d->ipv6.hop_limits = p->vxlan.ipv6.hop_limit;
+ memcpy(d->ipv6.src_addr,
+ p->vxlan.ipv6.sa,
+ sizeof(p->vxlan.ipv6.sa));
+ memcpy(d->ipv6.dst_addr,
+ p->vxlan.ipv6.da,
+ sizeof(p->vxlan.ipv6.da));
+
+ /* UDP */
+ d->udp.src_port = rte_htons(p->vxlan.udp.sp);
+ d->udp.dst_port = rte_htons(p->vxlan.udp.dp);
+ d->udp.dgram_len = 0; /* not pre-computed */
+ d->udp.dgram_cksum = 0;
+
+ /* VXLAN */
+ d->vxlan.vx_flags = rte_htonl(0x08000000);
+ d->vxlan.vx_vni = rte_htonl(p->vxlan.vxlan.vni << 8);
+
+ return 0;
+ } else {
+ struct encap_vxlan_ipv6_data *d = data;
+
+ /* Ethernet */
+ ether_addr_copy(&p->vxlan.ether.da, &d->ether.d_addr);
+ ether_addr_copy(&p->vxlan.ether.sa, &d->ether.s_addr);
+ d->ether.ether_type = rte_htons(ETHER_TYPE_IPv6);
+
+ /* IPv6*/
+ d->ipv6.vtc_flow = rte_htonl((6 << 28) |
+ (p->vxlan.ipv6.dscp << 22) |
+ p->vxlan.ipv6.flow_label);
+ d->ipv6.payload_len = 0; /* not pre-computed */
+ d->ipv6.proto = IP_PROTO_UDP;
+ d->ipv6.hop_limits = p->vxlan.ipv6.hop_limit;
+ memcpy(d->ipv6.src_addr,
+ p->vxlan.ipv6.sa,
+ sizeof(p->vxlan.ipv6.sa));
+ memcpy(d->ipv6.dst_addr,
+ p->vxlan.ipv6.da,
+ sizeof(p->vxlan.ipv6.da));
+
+ /* UDP */
+ d->udp.src_port = rte_htons(p->vxlan.udp.sp);
+ d->udp.dst_port = rte_htons(p->vxlan.udp.dp);
+ d->udp.dgram_len = 0; /* not pre-computed */
+ d->udp.dgram_cksum = 0;
+
+ /* VXLAN */
+ d->vxlan.vx_flags = rte_htonl(0x08000000);
+ d->vxlan.vx_vni = rte_htonl(p->vxlan.vxlan.vni << 8);
+
+ return 0;
+ }
+}
+
+static int
encap_apply(void *data,
struct rte_table_action_encap_params *p,
struct rte_table_action_encap_config *cfg,
@@ -707,11 +918,31 @@ encap_apply(void *data,
case RTE_TABLE_ACTION_ENCAP_PPPOE:
return encap_pppoe_apply(data, p);
+ case RTE_TABLE_ACTION_ENCAP_VXLAN:
+ return encap_vxlan_apply(data, p, cfg);
+
default:
return -EINVAL;
}
}
+static __rte_always_inline uint16_t
+encap_vxlan_ipv4_checksum_update(uint16_t cksum0,
+ uint16_t total_length)
+{
+ int32_t cksum1;
+
+ cksum1 = cksum0;
+ cksum1 = ~cksum1 & 0xFFFF;
+
+ /* Add total length (one's complement logic) */
+ cksum1 += total_length;
+ cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16);
+ cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16);
+
+ return (uint16_t)(~cksum1);
+}
+
static __rte_always_inline void *
encap(void *dst, const void *src, size_t n)
{
@@ -720,6 +951,118 @@ encap(void *dst, const void *src, size_t n)
}
static __rte_always_inline void
+pkt_work_encap_vxlan_ipv4(struct rte_mbuf *mbuf,
+ struct encap_vxlan_ipv4_data *vxlan_tbl,
+ struct rte_table_action_encap_config *cfg)
+{
+ uint32_t ether_offset = cfg->vxlan.data_offset;
+ void *ether = RTE_MBUF_METADATA_UINT32_PTR(mbuf, ether_offset);
+ struct encap_vxlan_ipv4_data *vxlan_pkt;
+ uint16_t ether_length, ipv4_total_length, ipv4_hdr_cksum, udp_length;
+
+ ether_length = (uint16_t)mbuf->pkt_len;
+ ipv4_total_length = ether_length +
+ (sizeof(struct vxlan_hdr) +
+ sizeof(struct udp_hdr) +
+ sizeof(struct ipv4_hdr));
+ ipv4_hdr_cksum = encap_vxlan_ipv4_checksum_update(vxlan_tbl->ipv4.hdr_checksum,
+ rte_htons(ipv4_total_length));
+ udp_length = ether_length +
+ (sizeof(struct vxlan_hdr) +
+ sizeof(struct udp_hdr));
+
+ vxlan_pkt = encap(ether, vxlan_tbl, sizeof(*vxlan_tbl));
+ vxlan_pkt->ipv4.total_length = rte_htons(ipv4_total_length);
+ vxlan_pkt->ipv4.hdr_checksum = ipv4_hdr_cksum;
+ vxlan_pkt->udp.dgram_len = rte_htons(udp_length);
+
+ mbuf->data_off = ether_offset - (sizeof(struct rte_mbuf) + sizeof(*vxlan_pkt));
+ mbuf->pkt_len = mbuf->data_len = ether_length + sizeof(*vxlan_pkt);
+}
+
+static __rte_always_inline void
+pkt_work_encap_vxlan_ipv4_vlan(struct rte_mbuf *mbuf,
+ struct encap_vxlan_ipv4_vlan_data *vxlan_tbl,
+ struct rte_table_action_encap_config *cfg)
+{
+ uint32_t ether_offset = cfg->vxlan.data_offset;
+ void *ether = RTE_MBUF_METADATA_UINT32_PTR(mbuf, ether_offset);
+ struct encap_vxlan_ipv4_vlan_data *vxlan_pkt;
+ uint16_t ether_length, ipv4_total_length, ipv4_hdr_cksum, udp_length;
+
+ ether_length = (uint16_t)mbuf->pkt_len;
+ ipv4_total_length = ether_length +
+ (sizeof(struct vxlan_hdr) +
+ sizeof(struct udp_hdr) +
+ sizeof(struct ipv4_hdr));
+ ipv4_hdr_cksum = encap_vxlan_ipv4_checksum_update(vxlan_tbl->ipv4.hdr_checksum,
+ rte_htons(ipv4_total_length));
+ udp_length = ether_length +
+ (sizeof(struct vxlan_hdr) +
+ sizeof(struct udp_hdr));
+
+ vxlan_pkt = encap(ether, vxlan_tbl, sizeof(*vxlan_tbl));
+ vxlan_pkt->ipv4.total_length = rte_htons(ipv4_total_length);
+ vxlan_pkt->ipv4.hdr_checksum = ipv4_hdr_cksum;
+ vxlan_pkt->udp.dgram_len = rte_htons(udp_length);
+
+ mbuf->data_off = ether_offset - (sizeof(struct rte_mbuf) + sizeof(*vxlan_pkt));
+ mbuf->pkt_len = mbuf->data_len = ether_length + sizeof(*vxlan_pkt);
+}
+
+static __rte_always_inline void
+pkt_work_encap_vxlan_ipv6(struct rte_mbuf *mbuf,
+ struct encap_vxlan_ipv6_data *vxlan_tbl,
+ struct rte_table_action_encap_config *cfg)
+{
+ uint32_t ether_offset = cfg->vxlan.data_offset;
+ void *ether = RTE_MBUF_METADATA_UINT32_PTR(mbuf, ether_offset);
+ struct encap_vxlan_ipv6_data *vxlan_pkt;
+ uint16_t ether_length, ipv6_payload_length, udp_length;
+
+ ether_length = (uint16_t)mbuf->pkt_len;
+ ipv6_payload_length = ether_length +
+ (sizeof(struct vxlan_hdr) +
+ sizeof(struct udp_hdr));
+ udp_length = ether_length +
+ (sizeof(struct vxlan_hdr) +
+ sizeof(struct udp_hdr));
+
+ vxlan_pkt = encap(ether, vxlan_tbl, sizeof(*vxlan_tbl));
+ vxlan_pkt->ipv6.payload_len = rte_htons(ipv6_payload_length);
+ vxlan_pkt->udp.dgram_len = rte_htons(udp_length);
+
+ mbuf->data_off = ether_offset - (sizeof(struct rte_mbuf) + sizeof(*vxlan_pkt));
+ mbuf->pkt_len = mbuf->data_len = ether_length + sizeof(*vxlan_pkt);
+}
+
+static __rte_always_inline void
+pkt_work_encap_vxlan_ipv6_vlan(struct rte_mbuf *mbuf,
+ struct encap_vxlan_ipv6_vlan_data *vxlan_tbl,
+ struct rte_table_action_encap_config *cfg)
+{
+ uint32_t ether_offset = cfg->vxlan.data_offset;
+ void *ether = RTE_MBUF_METADATA_UINT32_PTR(mbuf, ether_offset);
+ struct encap_vxlan_ipv6_vlan_data *vxlan_pkt;
+ uint16_t ether_length, ipv6_payload_length, udp_length;
+
+ ether_length = (uint16_t)mbuf->pkt_len;
+ ipv6_payload_length = ether_length +
+ (sizeof(struct vxlan_hdr) +
+ sizeof(struct udp_hdr));
+ udp_length = ether_length +
+ (sizeof(struct vxlan_hdr) +
+ sizeof(struct udp_hdr));
+
+ vxlan_pkt = encap(ether, vxlan_tbl, sizeof(*vxlan_tbl));
+ vxlan_pkt->ipv6.payload_len = rte_htons(ipv6_payload_length);
+ vxlan_pkt->udp.dgram_len = rte_htons(udp_length);
+
+ mbuf->data_off = ether_offset - (sizeof(struct rte_mbuf) + sizeof(*vxlan_pkt));
+ mbuf->pkt_len = mbuf->data_len = ether_length + sizeof(*vxlan_pkt);
+}
+
+static __rte_always_inline void
pkt_work_encap(struct rte_mbuf *mbuf,
void *data,
struct rte_table_action_encap_config *cfg,
@@ -776,6 +1119,20 @@ pkt_work_encap(struct rte_mbuf *mbuf,
break;
}
+ case 1LLU << RTE_TABLE_ACTION_ENCAP_VXLAN:
+ {
+ if (cfg->vxlan.ip_version)
+ if (cfg->vxlan.vlan)
+ pkt_work_encap_vxlan_ipv4_vlan(mbuf, data, cfg);
+ else
+ pkt_work_encap_vxlan_ipv4(mbuf, data, cfg);
+ else
+ if (cfg->vxlan.vlan)
+ pkt_work_encap_vxlan_ipv6_vlan(mbuf, data, cfg);
+ else
+ pkt_work_encap_vxlan_ipv6(mbuf, data, cfg);
+ }
+
default:
break;
}
@@ -1219,6 +1576,562 @@ pkt_work_time(struct time_data *data,
data->time = time;
}
+
+/**
+ * RTE_TABLE_ACTION_CRYPTO
+ */
+
+#define CRYPTO_OP_MASK_CIPHER 0x1
+#define CRYPTO_OP_MASK_AUTH 0x2
+#define CRYPTO_OP_MASK_AEAD 0x4
+
+struct crypto_op_sym_iv_aad {
+ struct rte_crypto_op op;
+ struct rte_crypto_sym_op sym_op;
+ union {
+ struct {
+ uint8_t cipher_iv[
+ RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX];
+ uint8_t auth_iv[
+ RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX];
+ } cipher_auth;
+
+ struct {
+ uint8_t iv[RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX];
+ uint8_t aad[RTE_TABLE_ACTION_SYM_CRYPTO_AAD_SIZE_MAX];
+ } aead_iv_aad;
+
+ } iv_aad;
+};
+
+struct sym_crypto_data {
+
+ union {
+ struct {
+
+ /** Length of cipher iv. */
+ uint16_t cipher_iv_len;
+
+ /** Offset from start of IP header to the cipher iv. */
+ uint16_t cipher_iv_data_offset;
+
+ /** Length of cipher iv to be updated in the mbuf. */
+ uint16_t cipher_iv_update_len;
+
+ /** Offset from start of IP header to the auth iv. */
+ uint16_t auth_iv_data_offset;
+
+ /** Length of auth iv in the mbuf. */
+ uint16_t auth_iv_len;
+
+ /** Length of auth iv to be updated in the mbuf. */
+ uint16_t auth_iv_update_len;
+
+ } cipher_auth;
+ struct {
+
+ /** Length of iv. */
+ uint16_t iv_len;
+
+ /** Offset from start of IP header to the aead iv. */
+ uint16_t iv_data_offset;
+
+ /** Length of iv to be updated in the mbuf. */
+ uint16_t iv_update_len;
+
+ /** Length of aad */
+ uint16_t aad_len;
+
+ /** Offset from start of IP header to the aad. */
+ uint16_t aad_data_offset;
+
+ /** Length of aad to be updated in the mbuf. */
+ uint16_t aad_update_len;
+
+ } aead;
+ };
+
+ /** Offset from start of IP header to the data. */
+ uint16_t data_offset;
+
+ /** Digest length. */
+ uint16_t digest_len;
+
+ /** block size */
+ uint16_t block_size;
+
+ /** Mask of crypto operation */
+ uint16_t op_mask;
+
+ /** Session pointer. */
+ struct rte_cryptodev_sym_session *session;
+
+ /** Direction of crypto, encrypt or decrypt */
+ uint16_t direction;
+
+ /** Private data area used to store cipher iv / aad. */
+ uint8_t iv_aad_data[32];
+
+} __attribute__((__packed__));
+
+static int
+sym_crypto_cfg_check(struct rte_table_action_sym_crypto_config *cfg)
+{
+ if (!rte_cryptodev_pmd_is_valid_dev(cfg->cryptodev_id))
+ return -EINVAL;
+ if (cfg->mp_create == NULL || cfg->mp_init == NULL)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int
+get_block_size(const struct rte_crypto_sym_xform *xform, uint8_t cdev_id)
+{
+ struct rte_cryptodev_info dev_info;
+ const struct rte_cryptodev_capabilities *cap;
+ uint32_t i;
+
+ rte_cryptodev_info_get(cdev_id, &dev_info);
+
+ for (i = 0;; i++) {
+ cap = &dev_info.capabilities[i];
+ if (!cap)
+ break;
+
+ if (cap->sym.xform_type != xform->type)
+ continue;
+
+ if ((xform->type == RTE_CRYPTO_SYM_XFORM_CIPHER) &&
+ (cap->sym.cipher.algo == xform->cipher.algo))
+ return cap->sym.cipher.block_size;
+
+ if ((xform->type == RTE_CRYPTO_SYM_XFORM_AEAD) &&
+ (cap->sym.aead.algo == xform->aead.algo))
+ return cap->sym.aead.block_size;
+
+ if (xform->type == RTE_CRYPTO_SYM_XFORM_NOT_SPECIFIED)
+ break;
+ }
+
+ return -1;
+}
+
+static int
+sym_crypto_apply(struct sym_crypto_data *data,
+ struct rte_table_action_sym_crypto_config *cfg,
+ struct rte_table_action_sym_crypto_params *p)
+{
+ const struct rte_crypto_cipher_xform *cipher_xform = NULL;
+ const struct rte_crypto_auth_xform *auth_xform = NULL;
+ const struct rte_crypto_aead_xform *aead_xform = NULL;
+ struct rte_crypto_sym_xform *xform = p->xform;
+ struct rte_cryptodev_sym_session *session;
+ int ret;
+
+ memset(data, 0, sizeof(*data));
+
+ while (xform) {
+ if (xform->type == RTE_CRYPTO_SYM_XFORM_CIPHER) {
+ cipher_xform = &xform->cipher;
+
+ if (cipher_xform->iv.length >
+ RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX)
+ return -ENOMEM;
+ if (cipher_xform->iv.offset !=
+ RTE_TABLE_ACTION_SYM_CRYPTO_IV_OFFSET)
+ return -EINVAL;
+
+ ret = get_block_size(xform, cfg->cryptodev_id);
+ if (ret < 0)
+ return -1;
+ data->block_size = (uint16_t)ret;
+ data->op_mask |= CRYPTO_OP_MASK_CIPHER;
+
+ data->cipher_auth.cipher_iv_len =
+ cipher_xform->iv.length;
+ data->cipher_auth.cipher_iv_data_offset = (uint16_t)
+ p->cipher_auth.cipher_iv_update.offset;
+ data->cipher_auth.cipher_iv_update_len = (uint16_t)
+ p->cipher_auth.cipher_iv_update.length;
+
+ rte_memcpy(data->iv_aad_data,
+ p->cipher_auth.cipher_iv.val,
+ p->cipher_auth.cipher_iv.length);
+
+ data->direction = cipher_xform->op;
+
+ } else if (xform->type == RTE_CRYPTO_SYM_XFORM_AUTH) {
+ auth_xform = &xform->auth;
+ if (auth_xform->iv.length >
+ RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX)
+ return -ENOMEM;
+ data->op_mask |= CRYPTO_OP_MASK_AUTH;
+
+ data->cipher_auth.auth_iv_len = auth_xform->iv.length;
+ data->cipher_auth.auth_iv_data_offset = (uint16_t)
+ p->cipher_auth.auth_iv_update.offset;
+ data->cipher_auth.auth_iv_update_len = (uint16_t)
+ p->cipher_auth.auth_iv_update.length;
+ data->digest_len = auth_xform->digest_length;
+
+ data->direction = (auth_xform->op ==
+ RTE_CRYPTO_AUTH_OP_GENERATE) ?
+ RTE_CRYPTO_CIPHER_OP_ENCRYPT :
+ RTE_CRYPTO_CIPHER_OP_DECRYPT;
+
+ } else if (xform->type == RTE_CRYPTO_SYM_XFORM_AEAD) {
+ aead_xform = &xform->aead;
+
+ if ((aead_xform->iv.length >
+ RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX) || (
+ aead_xform->aad_length >
+ RTE_TABLE_ACTION_SYM_CRYPTO_AAD_SIZE_MAX))
+ return -EINVAL;
+ if (aead_xform->iv.offset !=
+ RTE_TABLE_ACTION_SYM_CRYPTO_IV_OFFSET)
+ return -EINVAL;
+
+ ret = get_block_size(xform, cfg->cryptodev_id);
+ if (ret < 0)
+ return -1;
+ data->block_size = (uint16_t)ret;
+ data->op_mask |= CRYPTO_OP_MASK_AEAD;
+
+ data->digest_len = aead_xform->digest_length;
+ data->aead.iv_len = aead_xform->iv.length;
+ data->aead.aad_len = aead_xform->aad_length;
+
+ data->aead.iv_data_offset = (uint16_t)
+ p->aead.iv_update.offset;
+ data->aead.iv_update_len = (uint16_t)
+ p->aead.iv_update.length;
+ data->aead.aad_data_offset = (uint16_t)
+ p->aead.aad_update.offset;
+ data->aead.aad_update_len = (uint16_t)
+ p->aead.aad_update.length;
+
+ rte_memcpy(data->iv_aad_data,
+ p->aead.iv.val,
+ p->aead.iv.length);
+
+ rte_memcpy(data->iv_aad_data + p->aead.iv.length,
+ p->aead.aad.val,
+ p->aead.aad.length);
+
+ data->direction = (aead_xform->op ==
+ RTE_CRYPTO_AEAD_OP_ENCRYPT) ?
+ RTE_CRYPTO_CIPHER_OP_ENCRYPT :
+ RTE_CRYPTO_CIPHER_OP_DECRYPT;
+ } else
+ return -EINVAL;
+
+ xform = xform->next;
+ }
+
+ if (auth_xform && auth_xform->iv.length) {
+ if (cipher_xform) {
+ if (auth_xform->iv.offset !=
+ RTE_TABLE_ACTION_SYM_CRYPTO_IV_OFFSET +
+ cipher_xform->iv.length)
+ return -EINVAL;
+
+ rte_memcpy(data->iv_aad_data + cipher_xform->iv.length,
+ p->cipher_auth.auth_iv.val,
+ p->cipher_auth.auth_iv.length);
+ } else {
+ rte_memcpy(data->iv_aad_data,
+ p->cipher_auth.auth_iv.val,
+ p->cipher_auth.auth_iv.length);
+ }
+ }
+
+ session = rte_cryptodev_sym_session_create(cfg->mp_create);
+ if (!session)
+ return -ENOMEM;
+
+ ret = rte_cryptodev_sym_session_init(cfg->cryptodev_id, session,
+ p->xform, cfg->mp_init);
+ if (ret < 0) {
+ rte_cryptodev_sym_session_free(session);
+ return ret;
+ }
+
+ data->data_offset = (uint16_t)p->data_offset;
+ data->session = session;
+
+ return 0;
+}
+
+static __rte_always_inline uint64_t
+pkt_work_sym_crypto(struct rte_mbuf *mbuf, struct sym_crypto_data *data,
+ struct rte_table_action_sym_crypto_config *cfg,
+ uint16_t ip_offset)
+{
+ struct crypto_op_sym_iv_aad *crypto_op = (struct crypto_op_sym_iv_aad *)
+ RTE_MBUF_METADATA_UINT8_PTR(mbuf, cfg->op_offset);
+ struct rte_crypto_op *op = &crypto_op->op;
+ struct rte_crypto_sym_op *sym = op->sym;
+ uint32_t pkt_offset = sizeof(*mbuf) + mbuf->data_off;
+ uint32_t payload_len = pkt_offset + mbuf->data_len - data->data_offset;
+
+ op->type = RTE_CRYPTO_OP_TYPE_SYMMETRIC;
+ op->sess_type = RTE_CRYPTO_OP_WITH_SESSION;
+ op->phys_addr = mbuf->buf_iova + cfg->op_offset - sizeof(*mbuf);
+ op->status = RTE_CRYPTO_OP_STATUS_NOT_PROCESSED;
+ sym->m_src = mbuf;
+ sym->m_dst = NULL;
+ sym->session = data->session;
+
+ /** pad the packet */
+ if (data->direction == RTE_CRYPTO_CIPHER_OP_ENCRYPT) {
+ uint32_t append_len = RTE_ALIGN_CEIL(payload_len,
+ data->block_size) - payload_len;
+
+ if (unlikely(rte_pktmbuf_append(mbuf, append_len +
+ data->digest_len) == NULL))
+ return 1;
+
+ payload_len += append_len;
+ } else
+ payload_len -= data->digest_len;
+
+ if (data->op_mask & CRYPTO_OP_MASK_CIPHER) {
+ /** prepare cipher op */
+ uint8_t *iv = crypto_op->iv_aad.cipher_auth.cipher_iv;
+
+ sym->cipher.data.length = payload_len;
+ sym->cipher.data.offset = data->data_offset - pkt_offset;
+
+ if (data->cipher_auth.cipher_iv_update_len) {
+ uint8_t *pkt_iv = RTE_MBUF_METADATA_UINT8_PTR(mbuf,
+ data->cipher_auth.cipher_iv_data_offset
+ + ip_offset);
+
+ /** For encryption, update the pkt iv field; otherwise
+ * update the iv_aad_data field.
+ */
+ if (data->direction == RTE_CRYPTO_CIPHER_OP_ENCRYPT)
+ rte_memcpy(pkt_iv, data->iv_aad_data,
+ data->cipher_auth.cipher_iv_update_len);
+ else
+ rte_memcpy(data->iv_aad_data, pkt_iv,
+ data->cipher_auth.cipher_iv_update_len);
+ }
+
+ /** write iv */
+ rte_memcpy(iv, data->iv_aad_data,
+ data->cipher_auth.cipher_iv_len);
+ }
+
+ if (data->op_mask & CRYPTO_OP_MASK_AUTH) {
+ /** authentication always starts from the IP header. */
+ sym->auth.data.offset = ip_offset - pkt_offset;
+ sym->auth.data.length = mbuf->data_len - sym->auth.data.offset -
+ data->digest_len;
+ sym->auth.digest.data = rte_pktmbuf_mtod_offset(mbuf,
+ uint8_t *, rte_pktmbuf_pkt_len(mbuf) -
+ data->digest_len);
+ sym->auth.digest.phys_addr = rte_pktmbuf_iova_offset(mbuf,
+ rte_pktmbuf_pkt_len(mbuf) - data->digest_len);
+
+ if (data->cipher_auth.auth_iv_update_len) {
+ uint8_t *pkt_iv = RTE_MBUF_METADATA_UINT8_PTR(mbuf,
+ data->cipher_auth.auth_iv_data_offset
+ + ip_offset);
+ uint8_t *data_iv = data->iv_aad_data +
+ data->cipher_auth.cipher_iv_len;
+
+ if (data->direction == RTE_CRYPTO_CIPHER_OP_ENCRYPT)
+ rte_memcpy(pkt_iv, data_iv,
+ data->cipher_auth.auth_iv_update_len);
+ else
+ rte_memcpy(data_iv, pkt_iv,
+ data->cipher_auth.auth_iv_update_len);
+ }
+
+ if (data->cipher_auth.auth_iv_len) {
+ /** write auth iv */
+ uint8_t *iv = crypto_op->iv_aad.cipher_auth.auth_iv;
+
+ rte_memcpy(iv, data->iv_aad_data +
+ data->cipher_auth.cipher_iv_len,
+ data->cipher_auth.auth_iv_len);
+ }
+ }
+
+ if (data->op_mask & CRYPTO_OP_MASK_AEAD) {
+ uint8_t *iv = crypto_op->iv_aad.aead_iv_aad.iv;
+ uint8_t *aad = crypto_op->iv_aad.aead_iv_aad.aad;
+
+ sym->aead.aad.data = aad;
+ sym->aead.aad.phys_addr = rte_pktmbuf_iova_offset(mbuf,
+ aad - rte_pktmbuf_mtod(mbuf, uint8_t *));
+ sym->aead.digest.data = rte_pktmbuf_mtod_offset(mbuf,
+ uint8_t *, rte_pktmbuf_pkt_len(mbuf) -
+ data->digest_len);
+ sym->aead.digest.phys_addr = rte_pktmbuf_iova_offset(mbuf,
+ rte_pktmbuf_pkt_len(mbuf) - data->digest_len);
+ sym->aead.data.offset = data->data_offset - pkt_offset;
+ sym->aead.data.length = payload_len;
+
+ if (data->aead.iv_update_len) {
+ uint8_t *pkt_iv = RTE_MBUF_METADATA_UINT8_PTR(mbuf,
+ data->aead.iv_data_offset + ip_offset);
+ uint8_t *data_iv = data->iv_aad_data;
+
+ if (data->direction == RTE_CRYPTO_CIPHER_OP_ENCRYPT)
+ rte_memcpy(pkt_iv, data_iv,
+ data->aead.iv_update_len);
+ else
+ rte_memcpy(data_iv, pkt_iv,
+ data->aead.iv_update_len);
+ }
+
+ rte_memcpy(iv, data->iv_aad_data, data->aead.iv_len);
+
+ if (data->aead.aad_update_len) {
+ uint8_t *pkt_aad = RTE_MBUF_METADATA_UINT8_PTR(mbuf,
+ data->aead.aad_data_offset + ip_offset);
+ uint8_t *data_aad = data->iv_aad_data +
+ data->aead.iv_len;
+
+ if (data->direction == RTE_CRYPTO_CIPHER_OP_ENCRYPT)
+ rte_memcpy(pkt_aad, data_aad,
+ data->aead.iv_update_len);
+ else
+ rte_memcpy(data_aad, pkt_aad,
+ data->aead.iv_update_len);
+ }
+
+ rte_memcpy(aad, data->iv_aad_data + data->aead.iv_len,
+ data->aead.aad_len);
+ }
+
+ return 0;
+}
+
+/**
+ * RTE_TABLE_ACTION_TAG
+ */
+struct tag_data {
+ uint32_t tag;
+} __attribute__((__packed__));
+
+static int
+tag_apply(struct tag_data *data,
+ struct rte_table_action_tag_params *p)
+{
+ data->tag = p->tag;
+ return 0;
+}
+
+static __rte_always_inline void
+pkt_work_tag(struct rte_mbuf *mbuf,
+ struct tag_data *data)
+{
+ mbuf->hash.fdir.hi = data->tag;
+ mbuf->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
+}
+
+static __rte_always_inline void
+pkt4_work_tag(struct rte_mbuf *mbuf0,
+ struct rte_mbuf *mbuf1,
+ struct rte_mbuf *mbuf2,
+ struct rte_mbuf *mbuf3,
+ struct tag_data *data0,
+ struct tag_data *data1,
+ struct tag_data *data2,
+ struct tag_data *data3)
+{
+ mbuf0->hash.fdir.hi = data0->tag;
+ mbuf1->hash.fdir.hi = data1->tag;
+ mbuf2->hash.fdir.hi = data2->tag;
+ mbuf3->hash.fdir.hi = data3->tag;
+
+ mbuf0->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
+ mbuf1->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
+ mbuf2->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
+ mbuf3->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
+}
+
+/**
+ * RTE_TABLE_ACTION_DECAP
+ */
+struct decap_data {
+ uint16_t n;
+} __attribute__((__packed__));
+
+static int
+decap_apply(struct decap_data *data,
+ struct rte_table_action_decap_params *p)
+{
+ data->n = p->n;
+ return 0;
+}
+
+static __rte_always_inline void
+pkt_work_decap(struct rte_mbuf *mbuf,
+ struct decap_data *data)
+{
+ uint16_t data_off = mbuf->data_off;
+ uint16_t data_len = mbuf->data_len;
+ uint32_t pkt_len = mbuf->pkt_len;
+ uint16_t n = data->n;
+
+ mbuf->data_off = data_off + n;
+ mbuf->data_len = data_len - n;
+ mbuf->pkt_len = pkt_len - n;
+}
+
+static __rte_always_inline void
+pkt4_work_decap(struct rte_mbuf *mbuf0,
+ struct rte_mbuf *mbuf1,
+ struct rte_mbuf *mbuf2,
+ struct rte_mbuf *mbuf3,
+ struct decap_data *data0,
+ struct decap_data *data1,
+ struct decap_data *data2,
+ struct decap_data *data3)
+{
+ uint16_t data_off0 = mbuf0->data_off;
+ uint16_t data_len0 = mbuf0->data_len;
+ uint32_t pkt_len0 = mbuf0->pkt_len;
+
+ uint16_t data_off1 = mbuf1->data_off;
+ uint16_t data_len1 = mbuf1->data_len;
+ uint32_t pkt_len1 = mbuf1->pkt_len;
+
+ uint16_t data_off2 = mbuf2->data_off;
+ uint16_t data_len2 = mbuf2->data_len;
+ uint32_t pkt_len2 = mbuf2->pkt_len;
+
+ uint16_t data_off3 = mbuf3->data_off;
+ uint16_t data_len3 = mbuf3->data_len;
+ uint32_t pkt_len3 = mbuf3->pkt_len;
+
+ uint16_t n0 = data0->n;
+ uint16_t n1 = data1->n;
+ uint16_t n2 = data2->n;
+ uint16_t n3 = data3->n;
+
+ mbuf0->data_off = data_off0 + n0;
+ mbuf0->data_len = data_len0 - n0;
+ mbuf0->pkt_len = pkt_len0 - n0;
+
+ mbuf1->data_off = data_off1 + n1;
+ mbuf1->data_len = data_len1 - n1;
+ mbuf1->pkt_len = pkt_len1 - n1;
+
+ mbuf2->data_off = data_off2 + n2;
+ mbuf2->data_len = data_len2 - n2;
+ mbuf2->pkt_len = pkt_len2 - n2;
+
+ mbuf3->data_off = data_off3 + n3;
+ mbuf3->data_len = data_len3 - n3;
+ mbuf3->pkt_len = pkt_len3 - n3;
+}
+
/**
* Action profile
*/
@@ -1235,6 +2148,9 @@ action_valid(enum rte_table_action_type action)
case RTE_TABLE_ACTION_TTL:
case RTE_TABLE_ACTION_STATS:
case RTE_TABLE_ACTION_TIME:
+ case RTE_TABLE_ACTION_SYM_CRYPTO:
+ case RTE_TABLE_ACTION_TAG:
+ case RTE_TABLE_ACTION_DECAP:
return 1;
default:
return 0;
@@ -1254,6 +2170,7 @@ struct ap_config {
struct rte_table_action_nat_config nat;
struct rte_table_action_ttl_config ttl;
struct rte_table_action_stats_config stats;
+ struct rte_table_action_sym_crypto_config sym_crypto;
};
static size_t
@@ -1274,6 +2191,8 @@ action_cfg_size(enum rte_table_action_type action)
return sizeof(struct rte_table_action_ttl_config);
case RTE_TABLE_ACTION_STATS:
return sizeof(struct rte_table_action_stats_config);
+ case RTE_TABLE_ACTION_SYM_CRYPTO:
+ return sizeof(struct rte_table_action_sym_crypto_config);
default:
return 0;
}
@@ -1305,6 +2224,8 @@ action_cfg_get(struct ap_config *ap_config,
case RTE_TABLE_ACTION_STATS:
return &ap_config->stats;
+ case RTE_TABLE_ACTION_SYM_CRYPTO:
+ return &ap_config->sym_crypto;
default:
return NULL;
}
@@ -1361,6 +2282,15 @@ action_data_size(enum rte_table_action_type action,
case RTE_TABLE_ACTION_TIME:
return sizeof(struct time_data);
+ case RTE_TABLE_ACTION_SYM_CRYPTO:
+ return (sizeof(struct sym_crypto_data));
+
+ case RTE_TABLE_ACTION_TAG:
+ return sizeof(struct tag_data);
+
+ case RTE_TABLE_ACTION_DECAP:
+ return sizeof(struct decap_data);
+
default:
return 0;
}
@@ -1460,6 +2390,10 @@ rte_table_action_profile_action_register(struct rte_table_action_profile *profil
status = stats_cfg_check(action_config);
break;
+ case RTE_TABLE_ACTION_SYM_CRYPTO:
+ status = sym_crypto_cfg_check(action_config);
+ break;
+
default:
status = 0;
break;
@@ -1609,6 +2543,19 @@ rte_table_action_apply(struct rte_table_action *action,
return time_apply(action_data,
action_params);
+ case RTE_TABLE_ACTION_SYM_CRYPTO:
+ return sym_crypto_apply(action_data,
+ &action->cfg.sym_crypto,
+ action_params);
+
+ case RTE_TABLE_ACTION_TAG:
+ return tag_apply(action_data,
+ action_params);
+
+ case RTE_TABLE_ACTION_DECAP:
+ return decap_apply(action_data,
+ action_params);
+
default:
return -EINVAL;
}
@@ -1861,6 +2808,25 @@ rte_table_action_time_read(struct rte_table_action *action,
return 0;
}
+struct rte_cryptodev_sym_session *
+rte_table_action_crypto_sym_session_get(struct rte_table_action *action,
+ void *data)
+{
+ struct sym_crypto_data *sym_crypto_data;
+
+ /* Check input arguments */
+ if ((action == NULL) ||
+ ((action->cfg.action_mask &
+ (1LLU << RTE_TABLE_ACTION_SYM_CRYPTO)) == 0) ||
+ (data == NULL))
+ return NULL;
+
+ sym_crypto_data = action_data_get(data, action,
+ RTE_TABLE_ACTION_SYM_CRYPTO);
+
+ return sym_crypto_data->session;
+}
+
static __rte_always_inline uint64_t
pkt_work(struct rte_mbuf *mbuf,
struct rte_pipeline_table_entry *table_entry,
@@ -1920,6 +2886,14 @@ pkt_work(struct rte_mbuf *mbuf,
dscp);
}
+ if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_DECAP)) {
+ void *data = action_data_get(table_entry,
+ action,
+ RTE_TABLE_ACTION_DECAP);
+
+ pkt_work_decap(mbuf, data);
+ }
+
if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_ENCAP)) {
void *data =
action_data_get(table_entry, action, RTE_TABLE_ACTION_ENCAP);
@@ -1966,6 +2940,22 @@ pkt_work(struct rte_mbuf *mbuf,
pkt_work_time(data, time);
}
+ if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_SYM_CRYPTO)) {
+ void *data = action_data_get(table_entry, action,
+ RTE_TABLE_ACTION_SYM_CRYPTO);
+
+ drop_mask |= pkt_work_sym_crypto(mbuf, data, &cfg->sym_crypto,
+ ip_offset);
+ }
+
+ if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_TAG)) {
+ void *data = action_data_get(table_entry,
+ action,
+ RTE_TABLE_ACTION_TAG);
+
+ pkt_work_tag(mbuf, data);
+ }
+
return drop_mask;
}
@@ -2137,6 +3127,24 @@ pkt4_work(struct rte_mbuf **mbufs,
dscp3);
}
+ if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_DECAP)) {
+ void *data0 = action_data_get(table_entry0,
+ action,
+ RTE_TABLE_ACTION_DECAP);
+ void *data1 = action_data_get(table_entry1,
+ action,
+ RTE_TABLE_ACTION_DECAP);
+ void *data2 = action_data_get(table_entry2,
+ action,
+ RTE_TABLE_ACTION_DECAP);
+ void *data3 = action_data_get(table_entry3,
+ action,
+ RTE_TABLE_ACTION_DECAP);
+
+ pkt4_work_decap(mbuf0, mbuf1, mbuf2, mbuf3,
+ data0, data1, data2, data3);
+ }
+
if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_ENCAP)) {
void *data0 =
action_data_get(table_entry0, action, RTE_TABLE_ACTION_ENCAP);
@@ -2254,6 +3262,44 @@ pkt4_work(struct rte_mbuf **mbufs,
pkt_work_time(data3, time);
}
+ if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_SYM_CRYPTO)) {
+ void *data0 = action_data_get(table_entry0, action,
+ RTE_TABLE_ACTION_SYM_CRYPTO);
+ void *data1 = action_data_get(table_entry1, action,
+ RTE_TABLE_ACTION_SYM_CRYPTO);
+ void *data2 = action_data_get(table_entry2, action,
+ RTE_TABLE_ACTION_SYM_CRYPTO);
+ void *data3 = action_data_get(table_entry3, action,
+ RTE_TABLE_ACTION_SYM_CRYPTO);
+
+ drop_mask0 |= pkt_work_sym_crypto(mbuf0, data0, &cfg->sym_crypto,
+ ip_offset);
+ drop_mask1 |= pkt_work_sym_crypto(mbuf1, data1, &cfg->sym_crypto,
+ ip_offset);
+ drop_mask2 |= pkt_work_sym_crypto(mbuf2, data2, &cfg->sym_crypto,
+ ip_offset);
+ drop_mask3 |= pkt_work_sym_crypto(mbuf3, data3, &cfg->sym_crypto,
+ ip_offset);
+ }
+
+ if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_TAG)) {
+ void *data0 = action_data_get(table_entry0,
+ action,
+ RTE_TABLE_ACTION_TAG);
+ void *data1 = action_data_get(table_entry1,
+ action,
+ RTE_TABLE_ACTION_TAG);
+ void *data2 = action_data_get(table_entry2,
+ action,
+ RTE_TABLE_ACTION_TAG);
+ void *data3 = action_data_get(table_entry3,
+ action,
+ RTE_TABLE_ACTION_TAG);
+
+ pkt4_work_tag(mbuf0, mbuf1, mbuf2, mbuf3,
+ data0, data1, data2, data3);
+ }
+
return drop_mask0 |
(drop_mask1 << 1) |
(drop_mask2 << 2) |
diff --git a/lib/librte_pipeline/rte_table_action.h b/lib/librte_pipeline/rte_table_action.h
index c7f751aa..c9606129 100644
--- a/lib/librte_pipeline/rte_table_action.h
+++ b/lib/librte_pipeline/rte_table_action.h
@@ -93,6 +93,15 @@ enum rte_table_action_type {
/** Timestamp. */
RTE_TABLE_ACTION_TIME,
+
+ /** Crypto. */
+ RTE_TABLE_ACTION_SYM_CRYPTO,
+
+ /** Tag. */
+ RTE_TABLE_ACTION_TAG,
+
+ /** Packet decapsulation. */
+ RTE_TABLE_ACTION_DECAP,
};
/** Common action configuration (per table action profile). */
@@ -366,6 +375,11 @@ enum rte_table_action_encap_type {
/** IP -> { Ether | PPPoE | PPP | IP } */
RTE_TABLE_ACTION_ENCAP_PPPOE,
+
+ /** Ether -> { Ether | IP | UDP | VXLAN | Ether }
+ * Ether -> { Ether | VLAN | IP | UDP | VXLAN | Ether }
+ */
+ RTE_TABLE_ACTION_ENCAP_VXLAN,
};
/** Pre-computed Ethernet header fields for encapsulation action. */
@@ -393,6 +407,34 @@ struct rte_table_action_pppoe_hdr {
uint16_t session_id; /**< Session ID. */
};
+/** Pre-computed IPv4 header fields for encapsulation action. */
+struct rte_table_action_ipv4_header {
+ uint32_t sa; /**< Source address. */
+ uint32_t da; /**< Destination address. */
+ uint8_t dscp; /**< DiffServ Code Point (DSCP). */
+ uint8_t ttl; /**< Time To Live (TTL). */
+};
+
+/** Pre-computed IPv6 header fields for encapsulation action. */
+struct rte_table_action_ipv6_header {
+ uint8_t sa[16]; /**< Source address. */
+ uint8_t da[16]; /**< Destination address. */
+ uint32_t flow_label; /**< Flow label. */
+ uint8_t dscp; /**< DiffServ Code Point (DSCP). */
+ uint8_t hop_limit; /**< Hop Limit (HL). */
+};
+
+/** Pre-computed UDP header fields for encapsulation action. */
+struct rte_table_action_udp_header {
+ uint16_t sp; /**< Source port. */
+ uint16_t dp; /**< Destination port. */
+};
+
+/** Pre-computed VXLAN header fields for encapsulation action. */
+struct rte_table_action_vxlan_hdr {
+ uint32_t vni; /**< VXLAN Network Identifier (VNI). */
+};
+
/** Ether encap parameters. */
struct rte_table_action_encap_ether_params {
struct rte_table_action_ether_hdr ether; /**< Ethernet header. */
@@ -437,6 +479,21 @@ struct rte_table_action_encap_pppoe_params {
struct rte_table_action_pppoe_hdr pppoe; /**< PPPoE/PPP headers. */
};
+/** VXLAN encap parameters. */
+struct rte_table_action_encap_vxlan_params {
+ struct rte_table_action_ether_hdr ether; /**< Ethernet header. */
+ struct rte_table_action_vlan_hdr vlan; /**< VLAN header. */
+
+ RTE_STD_C11
+ union {
+ struct rte_table_action_ipv4_header ipv4; /**< IPv4 header. */
+ struct rte_table_action_ipv6_header ipv6; /**< IPv6 header. */
+ };
+
+ struct rte_table_action_udp_header udp; /**< UDP header. */
+ struct rte_table_action_vxlan_hdr vxlan; /**< VXLAN header. */
+};
+
/** Encap action configuration (per table action profile). */
struct rte_table_action_encap_config {
/** Bit mask defining the set of packet encapsulations enabled for the
@@ -446,6 +503,30 @@ struct rte_table_action_encap_config {
* @see enum rte_table_action_encap_type
*/
uint64_t encap_mask;
+
+ /** Encapsulation type specific configuration. */
+ RTE_STD_C11
+ union {
+ struct {
+ /** Input packet to be encapsulated: offset within the
+ * input packet buffer to the start of the Ethernet
+ * frame to be encapsulated. Offset 0 points to the
+ * first byte of the MBUF structure.
+ */
+ uint32_t data_offset;
+
+ /** Encapsulation header: non-zero when encapsulation
+ * header includes a VLAN tag, zero otherwise.
+ */
+ int vlan;
+
+ /** Encapsulation header: IP version of the IP header
+ * within the encapsulation header. Non-zero for IPv4,
+ * zero for IPv6.
+ */
+ int ip_version;
+ } vxlan; /**< VXLAN specific configuration. */
+ };
};
/** Encap action parameters (per table rule). */
@@ -469,6 +550,9 @@ struct rte_table_action_encap_params {
/** Only valid when *type* is set to PPPoE. */
struct rte_table_action_encap_pppoe_params pppoe;
+
+ /** Only valid when *type* is set to VXLAN. */
+ struct rte_table_action_encap_vxlan_params vxlan;
};
};
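Usage sketch (illustrative, not part of the patch): filling in the new VXLAN encapsulation configuration and per-rule parameters, assuming the existing table action profile API; the offsets, addresses, ports and VNI below are placeholder values.

#include <rte_mbuf.h>
#include <rte_table_action.h>

static void
vxlan_encap_example(void)
{
	/* Per-profile configuration: VXLAN encap enabled, outer IPv4, no VLAN tag. */
	struct rte_table_action_encap_config encap_cfg = {
		.encap_mask = 1LLU << RTE_TABLE_ACTION_ENCAP_VXLAN,
		.vxlan = {
			.data_offset = sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM,
			.vlan = 0,       /* no VLAN tag in the encapsulation header */
			.ip_version = 1, /* IPv4 outer header */
		},
	};

	/* Per-rule parameters: pre-computed outer headers (values illustrative). */
	struct rte_table_action_encap_params rule_encap = {
		.type = RTE_TABLE_ACTION_ENCAP_VXLAN,
		.vxlan = {
			.ipv4 = { .sa = 0x0a000001, .da = 0x0a000002, .dscp = 0, .ttl = 64 },
			.udp = { .sp = 4789, .dp = 4789 }, /* IANA-assigned VXLAN port */
			.vxlan = { .vni = 100 },
		},
	};

	(void)encap_cfg;
	(void)rule_encap;
}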
@@ -606,6 +690,111 @@ struct rte_table_action_time_params {
};
/**
+ * RTE_TABLE_ACTION_SYM_CRYPTO
+ */
+#ifndef RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX
+#define RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX (16)
+#endif
+
+#ifndef RTE_TABLE_ACTION_SYM_CRYPTO_AAD_SIZE_MAX
+#define RTE_TABLE_ACTION_SYM_CRYPTO_AAD_SIZE_MAX (16)
+#endif
+
+#ifndef RTE_TABLE_ACTION_SYM_CRYPTO_IV_OFFSET
+#define RTE_TABLE_ACTION_SYM_CRYPTO_IV_OFFSET \
+ (sizeof(struct rte_crypto_op) + sizeof(struct rte_crypto_sym_op))
+#endif
+
+/** Common action structure to store the data's value, length, and offset */
+struct rte_table_action_vlo {
+ uint8_t *val;
+ uint32_t length;
+ uint32_t offset;
+};
+
+/** Symmetric crypto action configuration (per table action profile). */
+struct rte_table_action_sym_crypto_config {
+ /** Target Cryptodev ID. */
+ uint8_t cryptodev_id;
+
+ /**
+ * Offset to rte_crypto_op structure within the input packet buffer.
+ * Offset 0 points to the first byte of the MBUF structure.
+ */
+ uint32_t op_offset;
+
+ /** The mempool for creating cryptodev sessions. */
+ struct rte_mempool *mp_create;
+
+ /** The mempool for initializing cryptodev sessions. */
+ struct rte_mempool *mp_init;
+};
+
+/** Symmetric Crypto action parameters (per table rule). */
+struct rte_table_action_sym_crypto_params {
+
+ /** Xform pointer contains all relevant information */
+ struct rte_crypto_sym_xform *xform;
+
+ /**
+ * Offset within the input packet buffer to the first byte of data
+ * to be processed by the crypto unit. Offset 0 points to the first
+ * byte of the MBUF structure.
+ */
+ uint32_t data_offset;
+
+ union {
+ struct {
+ /** Cipher iv data. */
+ struct rte_table_action_vlo cipher_iv;
+
+ /** Cipher iv update data. */
+ struct rte_table_action_vlo cipher_iv_update;
+
+ /** Auth iv data. */
+ struct rte_table_action_vlo auth_iv;
+
+ /** Auth iv update data. */
+ struct rte_table_action_vlo auth_iv_update;
+
+ } cipher_auth;
+
+ struct {
+ /** AEAD AAD data. */
+ struct rte_table_action_vlo aad;
+
+ /** AEAD iv data. */
+ struct rte_table_action_vlo iv;
+
+ /** AEAD AAD update data. */
+ struct rte_table_action_vlo aad_update;
+
+ /** AEAD iv update data. */
+ struct rte_table_action_vlo iv_update;
+
+ } aead;
+ };
+};
+
+/**
+ * RTE_TABLE_ACTION_TAG
+ */
+/** Tag action parameters (per table rule). */
+struct rte_table_action_tag_params {
+ /** Tag to be attached to the input packet. */
+ uint32_t tag;
+};
+
+/**
+ * RTE_TABLE_ACTION_DECAP
+ */
+/** Decap action parameters (per table rule). */
+struct rte_table_action_decap_params {
+ /** Number of bytes to be removed from the start of the packet. */
+ uint16_t n;
+};
+
+/**
* Table action profile.
*/
struct rte_table_action_profile;
@@ -898,6 +1087,20 @@ rte_table_action_time_read(struct rte_table_action *action,
void *data,
uint64_t *timestamp);
+/**
+ * Table action cryptodev symmetric session get.
+ *
+ * @param[in] action
+ * Handle to table action object (needs to be valid).
+ * @param[in] data
+ * Data byte array (typically table rule data) with sym crypto action.
+ * @return
+ * The pointer to the session on success, NULL otherwise.
+ */
+struct rte_cryptodev_sym_session *__rte_experimental
+rte_table_action_crypto_sym_session_get(struct rte_table_action *action,
+ void *data);
+
#ifdef __cplusplus
}
#endif
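A hedged sketch (not part of the patch) of how the new symmetric crypto table action and session getter might be used, assuming the existing profile registration API; the profile, action, mempools and rule data are application-supplied placeholders and the op_offset value is illustrative.

#include <rte_mbuf.h>
#include <rte_table_action.h>

/* Placeholders assumed to be created elsewhere by the application. */
extern struct rte_table_action_profile *profile;
extern struct rte_table_action *action;
extern struct rte_mempool *session_pool;      /* rte_cryptodev_sym_session objects */
extern struct rte_mempool *session_priv_pool; /* driver-private session data */
extern void *rule_data;                       /* data of one table rule */

static void
sym_crypto_action_example(void)
{
	struct rte_table_action_sym_crypto_config cfg = {
		.cryptodev_id = 0,
		/* Illustrative: rte_crypto_op stored at the start of the mbuf private area. */
		.op_offset = sizeof(struct rte_mbuf),
		.mp_create = session_pool,
		.mp_init = session_priv_pool,
	};
	struct rte_cryptodev_sym_session *sess;

	/* Register the action on the profile before freezing it. */
	rte_table_action_profile_action_register(profile,
		RTE_TABLE_ACTION_SYM_CRYPTO, &cfg);

	/* Once a rule using this action exists, its session can be retrieved. */
	sess = rte_table_action_crypto_sym_session_get(action, rule_data);
	(void)sess;
}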
diff --git a/lib/librte_port/Makefile b/lib/librte_port/Makefile
index 8df4864e..1b83f6f2 100644
--- a/lib/librte_port/Makefile
+++ b/lib/librte_port/Makefile
@@ -11,7 +11,7 @@ ifeq ($(CONFIG_RTE_PORT_PCAP),y)
LDLIBS += -lpcap
endif
LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev
-LDLIBS += -lrte_ip_frag -lrte_sched
+LDLIBS += -lrte_ip_frag -lrte_sched -lrte_cryptodev
ifeq ($(CONFIG_RTE_LIBRTE_KNI),y)
LDLIBS += -lrte_kni
endif
@@ -38,6 +38,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_KNI),y)
SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_kni.c
endif
SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_source_sink.c
+SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_sym_crypto.c
# install includes
SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port.h
@@ -53,5 +54,6 @@ ifeq ($(CONFIG_RTE_LIBRTE_KNI),y)
SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_kni.h
endif
SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_source_sink.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_sym_crypto.h
include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_port/meson.build b/lib/librte_port/meson.build
index f3d8b443..0d11456f 100644
--- a/lib/librte_port/meson.build
+++ b/lib/librte_port/meson.build
@@ -9,7 +9,8 @@ sources = files(
'rte_port_ras.c',
'rte_port_ring.c',
'rte_port_sched.c',
- 'rte_port_source_sink.c')
+ 'rte_port_source_sink.c',
+ 'rte_port_sym_crypto.c')
headers = files(
'rte_port_ethdev.h',
'rte_port_fd.h',
@@ -18,8 +19,9 @@ headers = files(
'rte_port.h',
'rte_port_ring.h',
'rte_port_sched.h',
- 'rte_port_source_sink.h')
-deps += ['ethdev', 'sched', 'ip_frag']
+ 'rte_port_source_sink.h',
+ 'rte_port_sym_crypto.h')
+deps += ['ethdev', 'sched', 'ip_frag', 'cryptodev']
if dpdk_conf.has('RTE_LIBRTE_KNI')
sources += files('rte_port_kni.c')
diff --git a/lib/librte_port/rte_port_sym_crypto.c b/lib/librte_port/rte_port_sym_crypto.c
new file mode 100644
index 00000000..295984d0
--- /dev/null
+++ b/lib/librte_port/rte_port_sym_crypto.c
@@ -0,0 +1,552 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_malloc.h>
+
+#include "rte_port_sym_crypto.h"
+
+/*
+ * Port Crypto Reader
+ */
+#ifdef RTE_PORT_STATS_COLLECT
+
+#define RTE_PORT_SYM_CRYPTO_READER_STATS_PKTS_IN_ADD(port, val) \
+ (port)->stats.n_pkts_in += (val)
+#define RTE_PORT_SYM_CRYPTO_READER_STATS_PKTS_DROP_ADD(port, val) \
+ (port)->stats.n_pkts_drop += (val)
+
+#else
+
+#define RTE_PORT_SYM_CRYPTO_READER_STATS_PKTS_IN_ADD(port, val)
+#define RTE_PORT_SYM_CRYPTO_READER_STATS_PKTS_DROP_ADD(port, val)
+
+#endif
+
+struct rte_port_sym_crypto_reader {
+ struct rte_port_in_stats stats;
+
+ uint8_t cryptodev_id;
+ uint16_t queue_id;
+ struct rte_crypto_op *ops[RTE_PORT_IN_BURST_SIZE_MAX];
+ rte_port_sym_crypto_reader_callback_fn f_callback;
+ void *arg_callback;
+};
+
+static void *
+rte_port_sym_crypto_reader_create(void *params, int socket_id)
+{
+ struct rte_port_sym_crypto_reader_params *conf =
+ params;
+ struct rte_port_sym_crypto_reader *port;
+
+ /* Check input parameters */
+ if (conf == NULL) {
+ RTE_LOG(ERR, PORT, "%s: params is NULL\n", __func__);
+ return NULL;
+ }
+
+ /* Memory allocation */
+ port = rte_zmalloc_socket("PORT", sizeof(*port),
+ RTE_CACHE_LINE_SIZE, socket_id);
+ if (port == NULL) {
+ RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__);
+ return NULL;
+ }
+
+ /* Initialization */
+ port->cryptodev_id = conf->cryptodev_id;
+ port->queue_id = conf->queue_id;
+ port->f_callback = conf->f_callback;
+ port->arg_callback = conf->arg_callback;
+
+ return port;
+}
+
+static int
+rte_port_sym_crypto_reader_rx(void *port, struct rte_mbuf **pkts, uint32_t n_pkts)
+{
+ struct rte_port_sym_crypto_reader *p =
+ port;
+ uint16_t rx_ops_cnt, i, n = 0;
+
+ rx_ops_cnt = rte_cryptodev_dequeue_burst(p->cryptodev_id, p->queue_id,
+ p->ops, n_pkts);
+
+ for (i = 0; i < rx_ops_cnt; i++) {
+ struct rte_crypto_op *op = p->ops[i];
+
+ /* Drop packets of failed crypto ops */
+ if (unlikely(op->status != RTE_CRYPTO_OP_STATUS_SUCCESS)) {
+ rte_pktmbuf_free(op->sym->m_src);
+ continue;
+ }
+
+ pkts[n++] = op->sym->m_src;
+ }
+
+ if (p->f_callback)
+ (*p->f_callback)(pkts, n, p->arg_callback);
+
+ RTE_PORT_SYM_CRYPTO_READER_STATS_PKTS_IN_ADD(p, n);
+ RTE_PORT_SYM_CRYPTO_READER_STATS_PKTS_DROP_ADD(p, rx_ops_cnt - n);
+
+ return n;
+}
+
+static int
+rte_port_sym_crypto_reader_free(void *port)
+{
+ if (port == NULL) {
+ RTE_LOG(ERR, PORT, "%s: port is NULL\n", __func__);
+ return -EINVAL;
+ }
+
+ rte_free(port);
+
+ return 0;
+}
+
+static int rte_port_sym_crypto_reader_stats_read(void *port,
+ struct rte_port_in_stats *stats, int clear)
+{
+ struct rte_port_sym_crypto_reader *p =
+ port;
+
+ if (stats != NULL)
+ memcpy(stats, &p->stats, sizeof(p->stats));
+
+ if (clear)
+ memset(&p->stats, 0, sizeof(p->stats));
+
+ return 0;
+}
+
+/*
+ * Port Crypto Writer
+ */
+#ifdef RTE_PORT_STATS_COLLECT
+
+#define RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_IN_ADD(port, val) \
+ (port)->stats.n_pkts_in += (val)
+#define RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_DROP_ADD(port, val) \
+ (port)->stats.n_pkts_drop += (val)
+
+#else
+
+#define RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_IN_ADD(port, val)
+#define RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_DROP_ADD(port, val)
+
+#endif
+
+struct rte_port_sym_crypto_writer {
+ struct rte_port_out_stats stats;
+
+ struct rte_crypto_op *tx_buf[2 * RTE_PORT_IN_BURST_SIZE_MAX];
+
+ uint32_t tx_burst_sz;
+ uint32_t tx_buf_count;
+ uint64_t bsz_mask;
+
+ uint8_t cryptodev_id;
+ uint16_t queue_id;
+ uint16_t crypto_op_offset;
+};
+
+static void *
+rte_port_sym_crypto_writer_create(void *params, int socket_id)
+{
+ struct rte_port_sym_crypto_writer_params *conf =
+ params;
+ struct rte_port_sym_crypto_writer *port;
+
+ /* Check input parameters */
+ if ((conf == NULL) ||
+ (conf->tx_burst_sz == 0) ||
+ (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX) ||
+ (!rte_is_power_of_2(conf->tx_burst_sz))) {
+ RTE_LOG(ERR, PORT, "%s: Invalid input parameters\n", __func__);
+ return NULL;
+ }
+
+ /* Memory allocation */
+ port = rte_zmalloc_socket("PORT", sizeof(*port),
+ RTE_CACHE_LINE_SIZE, socket_id);
+ if (port == NULL) {
+ RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__);
+ return NULL;
+ }
+
+ /* Initialization */
+ port->tx_burst_sz = conf->tx_burst_sz;
+ port->tx_buf_count = 0;
+ port->bsz_mask = 1LLU << (conf->tx_burst_sz - 1);
+
+ port->cryptodev_id = conf->cryptodev_id;
+ port->queue_id = conf->queue_id;
+ port->crypto_op_offset = conf->crypto_op_offset;
+
+ return port;
+}
+
+static inline void
+send_burst(struct rte_port_sym_crypto_writer *p)
+{
+ uint32_t nb_tx;
+
+ nb_tx = rte_cryptodev_enqueue_burst(p->cryptodev_id, p->queue_id,
+ p->tx_buf, p->tx_buf_count);
+
+ RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_DROP_ADD(p, p->tx_buf_count -
+ nb_tx);
+ for (; nb_tx < p->tx_buf_count; nb_tx++)
+ rte_pktmbuf_free(p->tx_buf[nb_tx]->sym->m_src);
+
+ p->tx_buf_count = 0;
+}
+
+static int
+rte_port_sym_crypto_writer_tx(void *port, struct rte_mbuf *pkt)
+{
+ struct rte_port_sym_crypto_writer *p =
+ port;
+
+ p->tx_buf[p->tx_buf_count++] = (struct rte_crypto_op *)
+ RTE_MBUF_METADATA_UINT8_PTR(pkt, p->crypto_op_offset);
+ RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_IN_ADD(p, 1);
+ if (p->tx_buf_count >= p->tx_burst_sz)
+ send_burst(p);
+
+ return 0;
+}
+
+static int
+rte_port_sym_crypto_writer_tx_bulk(void *port,
+ struct rte_mbuf **pkts,
+ uint64_t pkts_mask)
+{
+ struct rte_port_sym_crypto_writer *p =
+ port;
+ uint64_t bsz_mask = p->bsz_mask;
+ uint32_t tx_buf_count = p->tx_buf_count;
+ uint64_t expr = (pkts_mask & (pkts_mask + 1)) |
+ ((pkts_mask & bsz_mask) ^ bsz_mask);
+
+ if (expr == 0) {
+ uint64_t n_pkts = __builtin_popcountll(pkts_mask);
+ uint32_t i;
+
+ RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_IN_ADD(p, n_pkts);
+
+ for (i = 0; i < n_pkts; i++)
+ p->tx_buf[p->tx_buf_count++] = (struct rte_crypto_op *)
+ RTE_MBUF_METADATA_UINT8_PTR(pkts[i],
+ p->crypto_op_offset);
+
+ if (p->tx_buf_count >= p->tx_burst_sz)
+ send_burst(p);
+ } else {
+ for (; pkts_mask;) {
+ uint32_t pkt_index = __builtin_ctzll(pkts_mask);
+ uint64_t pkt_mask = 1LLU << pkt_index;
+ struct rte_mbuf *pkt = pkts[pkt_index];
+
+ p->tx_buf[tx_buf_count++] = (struct rte_crypto_op *)
+ RTE_MBUF_METADATA_UINT8_PTR(pkt,
+ p->crypto_op_offset);
+
+ RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_IN_ADD(p, 1);
+ pkts_mask &= ~pkt_mask;
+ }
+
+ p->tx_buf_count = tx_buf_count;
+ if (tx_buf_count >= p->tx_burst_sz)
+ send_burst(p);
+ }
+
+ return 0;
+}
+
+static int
+rte_port_sym_crypto_writer_flush(void *port)
+{
+ struct rte_port_sym_crypto_writer *p =
+ port;
+
+ if (p->tx_buf_count > 0)
+ send_burst(p);
+
+ return 0;
+}
+
+static int
+rte_port_sym_crypto_writer_free(void *port)
+{
+ if (port == NULL) {
+ RTE_LOG(ERR, PORT, "%s: Port is NULL\n", __func__);
+ return -EINVAL;
+ }
+
+ rte_port_sym_crypto_writer_flush(port);
+ rte_free(port);
+
+ return 0;
+}
+
+static int rte_port_sym_crypto_writer_stats_read(void *port,
+ struct rte_port_out_stats *stats, int clear)
+{
+ struct rte_port_sym_crypto_writer *p =
+ port;
+
+ if (stats != NULL)
+ memcpy(stats, &p->stats, sizeof(p->stats));
+
+ if (clear)
+ memset(&p->stats, 0, sizeof(p->stats));
+
+ return 0;
+}
+
+/*
+ * Port Crypto Writer Nodrop
+ */
+#ifdef RTE_PORT_STATS_COLLECT
+
+#define RTE_PORT_SYM_CRYPTO_WRITER_NODROP_STATS_PKTS_IN_ADD(port, val) \
+ (port)->stats.n_pkts_in += (val)
+#define RTE_PORT_SYM_CRYPTO_WRITER_NODROP_STATS_PKTS_DROP_ADD(port, val) \
+ (port)->stats.n_pkts_drop += (val)
+
+#else
+
+#define RTE_PORT_SYM_CRYPTO_WRITER_NODROP_STATS_PKTS_IN_ADD(port, val)
+#define RTE_PORT_SYM_CRYPTO_WRITER_NODROP_STATS_PKTS_DROP_ADD(port, val)
+
+#endif
+
+struct rte_port_sym_crypto_writer_nodrop {
+ struct rte_port_out_stats stats;
+
+ struct rte_crypto_op *tx_buf[2 * RTE_PORT_IN_BURST_SIZE_MAX];
+ uint32_t tx_burst_sz;
+ uint32_t tx_buf_count;
+ uint64_t bsz_mask;
+ uint64_t n_retries;
+
+ uint8_t cryptodev_id;
+ uint16_t queue_id;
+ uint16_t crypto_op_offset;
+};
+
+static void *
+rte_port_sym_crypto_writer_nodrop_create(void *params, int socket_id)
+{
+ struct rte_port_sym_crypto_writer_nodrop_params *conf =
+ params;
+ struct rte_port_sym_crypto_writer_nodrop *port;
+
+ /* Check input parameters */
+ if ((conf == NULL) ||
+ (conf->tx_burst_sz == 0) ||
+ (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX) ||
+ (!rte_is_power_of_2(conf->tx_burst_sz))) {
+ RTE_LOG(ERR, PORT, "%s: Invalid input parameters\n", __func__);
+ return NULL;
+ }
+
+ /* Memory allocation */
+ port = rte_zmalloc_socket("PORT", sizeof(*port),
+ RTE_CACHE_LINE_SIZE, socket_id);
+ if (port == NULL) {
+ RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__);
+ return NULL;
+ }
+
+ /* Initialization */
+ port->cryptodev_id = conf->cryptodev_id;
+ port->queue_id = conf->queue_id;
+ port->crypto_op_offset = conf->crypto_op_offset;
+ port->tx_burst_sz = conf->tx_burst_sz;
+ port->tx_buf_count = 0;
+ port->bsz_mask = 1LLU << (conf->tx_burst_sz - 1);
+
+ /*
+ * When n_retries is 0, wait for every packet to be sent, no matter
+ * how many retries it takes. To limit the number of branches in the
+ * fast path, we use UINT64_MAX instead of branching.
+ */
+ port->n_retries = (conf->n_retries == 0) ? UINT64_MAX : conf->n_retries;
+
+ return port;
+}
+
+static inline void
+send_burst_nodrop(struct rte_port_sym_crypto_writer_nodrop *p)
+{
+ uint32_t nb_tx = 0, i;
+
+ nb_tx = rte_cryptodev_enqueue_burst(p->cryptodev_id, p->queue_id,
+ p->tx_buf, p->tx_buf_count);
+
+ /* We sent all the packets on the first try */
+ if (nb_tx >= p->tx_buf_count) {
+ p->tx_buf_count = 0;
+ return;
+ }
+
+ for (i = 0; i < p->n_retries; i++) {
+ nb_tx += rte_cryptodev_enqueue_burst(p->cryptodev_id,
+ p->queue_id, p->tx_buf + nb_tx,
+ p->tx_buf_count - nb_tx);
+
+ /* We sent all the packets after retrying */
+ if (nb_tx >= p->tx_buf_count) {
+ p->tx_buf_count = 0;
+ return;
+ }
+ }
+
+ /* We did not send all the packets within the maximum allowed attempts */
+ RTE_PORT_SYM_CRYPTO_WRITER_NODROP_STATS_PKTS_DROP_ADD(p,
+ p->tx_buf_count - nb_tx);
+ for ( ; nb_tx < p->tx_buf_count; nb_tx++)
+ rte_pktmbuf_free(p->tx_buf[nb_tx]->sym->m_src);
+
+ p->tx_buf_count = 0;
+}
+
+static int
+rte_port_sym_crypto_writer_nodrop_tx(void *port, struct rte_mbuf *pkt)
+{
+ struct rte_port_sym_crypto_writer_nodrop *p =
+ port;
+
+ p->tx_buf[p->tx_buf_count++] = (struct rte_crypto_op *)
+ RTE_MBUF_METADATA_UINT8_PTR(pkt, p->crypto_op_offset);
+ RTE_PORT_SYM_CRYPTO_WRITER_STATS_PKTS_IN_ADD(p, 1);
+ if (p->tx_buf_count >= p->tx_burst_sz)
+ send_burst_nodrop(p);
+
+ return 0;
+}
+
+static int
+rte_port_sym_crypto_writer_nodrop_tx_bulk(void *port,
+ struct rte_mbuf **pkts,
+ uint64_t pkts_mask)
+{
+ struct rte_port_sym_crypto_writer_nodrop *p =
+ port;
+
+ uint64_t bsz_mask = p->bsz_mask;
+ uint32_t tx_buf_count = p->tx_buf_count;
+ uint64_t expr = (pkts_mask & (pkts_mask + 1)) |
+ ((pkts_mask & bsz_mask) ^ bsz_mask);
+
+ if (expr == 0) {
+ uint64_t n_pkts = __builtin_popcountll(pkts_mask);
+ uint32_t i;
+
+ RTE_PORT_SYM_CRYPTO_WRITER_NODROP_STATS_PKTS_IN_ADD(p, n_pkts);
+
+ for (i = 0; i < n_pkts; i++)
+ p->tx_buf[p->tx_buf_count++] = (struct rte_crypto_op *)
+ RTE_MBUF_METADATA_UINT8_PTR(pkts[i],
+ p->crypto_op_offset);
+
+ if (p->tx_buf_count >= p->tx_burst_sz)
+ send_burst_nodrop(p);
+ } else {
+ for ( ; pkts_mask; ) {
+ uint32_t pkt_index = __builtin_ctzll(pkts_mask);
+ uint64_t pkt_mask = 1LLU << pkt_index;
+ struct rte_mbuf *pkt = pkts[pkt_index];
+
+ p->tx_buf[tx_buf_count++] = (struct rte_crypto_op *)
+ RTE_MBUF_METADATA_UINT8_PTR(pkt,
+ p->crypto_op_offset);
+ RTE_PORT_SYM_CRYPTO_WRITER_NODROP_STATS_PKTS_IN_ADD(p,
+ 1);
+ pkts_mask &= ~pkt_mask;
+ }
+
+ p->tx_buf_count = tx_buf_count;
+ if (tx_buf_count >= p->tx_burst_sz)
+ send_burst_nodrop(p);
+ }
+
+ return 0;
+}
+
+static int
+rte_port_sym_crypto_writer_nodrop_flush(void *port)
+{
+ struct rte_port_sym_crypto_writer_nodrop *p =
+ port;
+
+ if (p->tx_buf_count > 0)
+ send_burst_nodrop(p);
+
+ return 0;
+}
+
+static int
+rte_port_sym_crypto_writer_nodrop_free(void *port)
+{
+ if (port == NULL) {
+ RTE_LOG(ERR, PORT, "%s: Port is NULL\n", __func__);
+ return -EINVAL;
+ }
+
+ rte_port_sym_crypto_writer_nodrop_flush(port);
+ rte_free(port);
+
+ return 0;
+}
+
+static int rte_port_sym_crypto_writer_nodrop_stats_read(void *port,
+ struct rte_port_out_stats *stats, int clear)
+{
+ struct rte_port_sym_crypto_writer_nodrop *p =
+ port;
+
+ if (stats != NULL)
+ memcpy(stats, &p->stats, sizeof(p->stats));
+
+ if (clear)
+ memset(&p->stats, 0, sizeof(p->stats));
+
+ return 0;
+}
+
+
+/*
+ * Summary of port operations
+ */
+struct rte_port_in_ops rte_port_sym_crypto_reader_ops = {
+ .f_create = rte_port_sym_crypto_reader_create,
+ .f_free = rte_port_sym_crypto_reader_free,
+ .f_rx = rte_port_sym_crypto_reader_rx,
+ .f_stats = rte_port_sym_crypto_reader_stats_read,
+};
+
+struct rte_port_out_ops rte_port_sym_crypto_writer_ops = {
+ .f_create = rte_port_sym_crypto_writer_create,
+ .f_free = rte_port_sym_crypto_writer_free,
+ .f_tx = rte_port_sym_crypto_writer_tx,
+ .f_tx_bulk = rte_port_sym_crypto_writer_tx_bulk,
+ .f_flush = rte_port_sym_crypto_writer_flush,
+ .f_stats = rte_port_sym_crypto_writer_stats_read,
+};
+
+struct rte_port_out_ops rte_port_sym_crypto_writer_nodrop_ops = {
+ .f_create = rte_port_sym_crypto_writer_nodrop_create,
+ .f_free = rte_port_sym_crypto_writer_nodrop_free,
+ .f_tx = rte_port_sym_crypto_writer_nodrop_tx,
+ .f_tx_bulk = rte_port_sym_crypto_writer_nodrop_tx_bulk,
+ .f_flush = rte_port_sym_crypto_writer_nodrop_flush,
+ .f_stats = rte_port_sym_crypto_writer_nodrop_stats_read,
+};
diff --git a/lib/librte_port/rte_port_sym_crypto.h b/lib/librte_port/rte_port_sym_crypto.h
new file mode 100644
index 00000000..181f6ce0
--- /dev/null
+++ b/lib/librte_port/rte_port_sym_crypto.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef __INCLUDE_RTE_PORT_SYM_CRYPTO_H__
+#define __INCLUDE_RTE_PORT_SYM_CRYPTO_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @file
+ * RTE Port sym crypto Interface
+ *
+ * crypto_reader: input port built on top of pre-initialized crypto interface
+ * crypto_writer: output port built on top of pre-initialized crypto interface
+ *
+ **/
+
+#include <stdint.h>
+
+#include <rte_cryptodev.h>
+
+#include "rte_port.h"
+
+/** Function prototype for reader post action. */
+typedef void (*rte_port_sym_crypto_reader_callback_fn)(struct rte_mbuf **pkts,
+ uint16_t n_pkts, void *arg);
+
+/** Crypto_reader port parameters */
+struct rte_port_sym_crypto_reader_params {
+ /** Target cryptodev ID. */
+ uint8_t cryptodev_id;
+
+ /** Target cryptodev Queue Pair ID. */
+ uint16_t queue_id;
+
+ /** Crypto reader post callback function. */
+ rte_port_sym_crypto_reader_callback_fn f_callback;
+
+ /** Crypto reader post callback function arguments. */
+ void *arg_callback;
+};
+
+/** Crypto_reader port operations. */
+extern struct rte_port_in_ops rte_port_sym_crypto_reader_ops;
+
+
+/** Crypto_writer port parameters. */
+struct rte_port_sym_crypto_writer_params {
+ /** Target cryptodev ID. */
+ uint8_t cryptodev_id;
+
+ /** Target cryptodev Queue Pair ID. */
+ uint16_t queue_id;
+
+ /** Offset to rte_crypto_op in the mbufs. */
+ uint16_t crypto_op_offset;
+
+ /** Burst size to crypto interface. */
+ uint32_t tx_burst_sz;
+};
+
+/** Crypto_writer port operations. */
+extern struct rte_port_out_ops rte_port_sym_crypto_writer_ops;
+
+/** Crypto_writer_nodrop port parameters. */
+struct rte_port_sym_crypto_writer_nodrop_params {
+ /** Target cryptodev ID. */
+ uint8_t cryptodev_id;
+
+ /** Target cryptodev queue pair id. */
+ uint16_t queue_id;
+
+ /** Offset to rte_crypto_op in the mbufs. */
+ uint16_t crypto_op_offset;
+
+ /** Burst size to crypto interface. */
+ uint32_t tx_burst_sz;
+
+ /** Maximum number of retries, 0 for no limit. */
+ uint32_t n_retries;
+};
+
+/** Crypto_writer_nodrop port operations. */
+extern struct rte_port_out_ops rte_port_sym_crypto_writer_nodrop_ops;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
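A minimal sketch (not part of the patch) of filling in the new crypto port parameters, assuming a cryptodev and its queue pair are already configured; device ID, queue ID, burst size and metadata offset are placeholder values.

#include <rte_mbuf.h>
#include <rte_port_sym_crypto.h>

static void
sym_crypto_port_example(void)
{
	/* Input port: dequeues completed crypto ops and returns their mbufs. */
	struct rte_port_sym_crypto_reader_params rd = {
		.cryptodev_id = 0,
		.queue_id = 0,
		.f_callback = NULL,  /* optional post-dequeue callback */
		.arg_callback = NULL,
	};

	/* Output port: enqueues the rte_crypto_op stored in each mbuf's metadata. */
	struct rte_port_sym_crypto_writer_params wr = {
		.cryptodev_id = 0,
		.queue_id = 0,
		/* Illustrative: op placed at the start of the mbuf private area. */
		.crypto_op_offset = sizeof(struct rte_mbuf),
		.tx_burst_sz = 32, /* power of 2, at most RTE_PORT_IN_BURST_SIZE_MAX */
	};

	void *in = rte_port_sym_crypto_reader_ops.f_create(&rd, SOCKET_ID_ANY);
	void *out = rte_port_sym_crypto_writer_ops.f_create(&wr, SOCKET_ID_ANY);

	if (out != NULL)
		rte_port_sym_crypto_writer_ops.f_free(out);
	if (in != NULL)
		rte_port_sym_crypto_reader_ops.f_free(in);
}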
diff --git a/lib/librte_port/rte_port_version.map b/lib/librte_port/rte_port_version.map
index 6470629b..609bcec3 100644
--- a/lib/librte_port/rte_port_version.map
+++ b/lib/librte_port/rte_port_version.map
@@ -51,3 +51,12 @@ DPDK_16.11 {
rte_port_fd_writer_nodrop_ops;
} DPDK_16.07;
+
+DPDK_18.11 {
+ global:
+
+ rte_port_sym_crypto_reader_ops;
+ rte_port_sym_crypto_writer_ops;
+ rte_port_sym_crypto_writer_nodrop_ops;
+
+} DPDK_16.11;
diff --git a/lib/librte_power/Makefile b/lib/librte_power/Makefile
index 6f85e885..9bec668d 100644
--- a/lib/librte_power/Makefile
+++ b/lib/librte_power/Makefile
@@ -7,7 +7,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
LIB = librte_power.a
CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing
-LDLIBS += -lrte_eal
+LDLIBS += -lrte_eal -lrte_timer
EXPORT_MAP := rte_power_version.map
@@ -16,8 +16,9 @@ LIBABIVER := 1
# all source are stored in SRCS-y
SRCS-$(CONFIG_RTE_LIBRTE_POWER) := rte_power.c power_acpi_cpufreq.c
SRCS-$(CONFIG_RTE_LIBRTE_POWER) += power_kvm_vm.c guest_channel.c
+SRCS-$(CONFIG_RTE_LIBRTE_POWER) += rte_power_empty_poll.c
# install this header file
-SYMLINK-$(CONFIG_RTE_LIBRTE_POWER)-include := rte_power.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_POWER)-include := rte_power.h rte_power_empty_poll.h
include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_power/channel_commands.h b/lib/librte_power/channel_commands.h
index ee638eef..e7b93a79 100644
--- a/lib/librte_power/channel_commands.h
+++ b/lib/librte_power/channel_commands.h
@@ -19,6 +19,7 @@ extern "C" {
#define CPU_POWER 1
#define CPU_POWER_CONNECT 2
#define PKT_POLICY 3
+#define PKT_POLICY_REMOVE 4
/* CPU Power Command Scaling */
#define CPU_POWER_SCALE_UP 1
@@ -58,6 +59,9 @@ struct traffic {
uint32_t max_max_packet_thresh;
};
+#define CORE_TYPE_VIRTUAL 0
+#define CORE_TYPE_PHYSICAL 1
+
struct channel_packet {
uint64_t resource_id; /**< core_num, device */
uint32_t unit; /**< scale down/up/min/max */
@@ -70,6 +74,7 @@ struct channel_packet {
uint8_t vcpu_to_control[MAX_VCPU_PER_VM];
uint8_t num_vcpu;
struct timer_profile timer_policy;
+ bool core_type;
enum workload workload;
enum policy_to_use policy_to_use;
struct t_boost_status t_boost_status;
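An illustrative use (not part of the patch) of the new core_type field: when a PKT_POLICY packet is built for the VM power manager, it can now state whether the cores listed in the policy are physical or virtual core numbers. The helper name below is a placeholder.

#include "channel_commands.h"

static void
set_policy_core_type(struct channel_packet *pkt)
{
	/* The cores referenced by this policy are physical core numbers. */
	pkt->core_type = CORE_TYPE_PHYSICAL;
}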
diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build
index 253173f2..9ed8b56d 100644
--- a/lib/librte_power/meson.build
+++ b/lib/librte_power/meson.build
@@ -5,5 +5,7 @@ if host_machine.system() != 'linux'
build = false
endif
sources = files('rte_power.c', 'power_acpi_cpufreq.c',
- 'power_kvm_vm.c', 'guest_channel.c')
-headers = files('rte_power.h')
+ 'power_kvm_vm.c', 'guest_channel.c',
+ 'rte_power_empty_poll.c')
+headers = files('rte_power.h','rte_power_empty_poll.h')
+deps += ['timer']
diff --git a/lib/librte_power/rte_power_empty_poll.c b/lib/librte_power/rte_power_empty_poll.c
new file mode 100644
index 00000000..e6145462
--- /dev/null
+++ b/lib/librte_power/rte_power_empty_poll.c
@@ -0,0 +1,545 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation
+ */
+
+#include <string.h>
+
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+#include <rte_atomic.h>
+#include <rte_malloc.h>
+#include <inttypes.h>
+
+#include "rte_power.h"
+#include "rte_power_empty_poll.h"
+
+#define INTERVALS_PER_SECOND 100 /* one interval = 10 ms */
+#define SECONDS_TO_TRAIN_FOR 2
+#define DEFAULT_MED_TO_HIGH_PERCENT_THRESHOLD 70
+#define DEFAULT_HIGH_TO_MED_PERCENT_THRESHOLD 30
+#define DEFAULT_CYCLES_PER_PACKET 800
+
+static struct ep_params *ep_params;
+static uint32_t med_to_high_threshold = DEFAULT_MED_TO_HIGH_PERCENT_THRESHOLD;
+static uint32_t high_to_med_threshold = DEFAULT_HIGH_TO_MED_PERCENT_THRESHOLD;
+
+static uint32_t avail_freqs[RTE_MAX_LCORE][NUM_FREQS];
+
+static uint32_t total_avail_freqs[RTE_MAX_LCORE];
+
+static uint32_t freq_index[NUM_FREQ];
+
+static uint32_t
+get_freq_index(enum freq_val index)
+{
+ return freq_index[index];
+}
+
+
+static int
+set_power_freq(int lcore_id, enum freq_val freq, bool specific_freq)
+{
+ int err = 0;
+ uint32_t power_freq_index;
+ if (!specific_freq)
+ power_freq_index = get_freq_index(freq);
+ else
+ power_freq_index = freq;
+
+ err = rte_power_set_freq(lcore_id, power_freq_index);
+
+ return err;
+}
+
+
+static inline void __attribute__((always_inline))
+exit_training_state(struct priority_worker *poll_stats)
+{
+ RTE_SET_USED(poll_stats);
+}
+
+static inline void __attribute__((always_inline))
+enter_training_state(struct priority_worker *poll_stats)
+{
+ poll_stats->iter_counter = 0;
+ poll_stats->cur_freq = LOW;
+ poll_stats->queue_state = TRAINING;
+}
+
+static inline void __attribute__((always_inline))
+enter_normal_state(struct priority_worker *poll_stats)
+{
+ /* Clear the averages arrays and strs */
+ memset(poll_stats->edpi_av, 0, sizeof(poll_stats->edpi_av));
+ poll_stats->ec = 0;
+ memset(poll_stats->ppi_av, 0, sizeof(poll_stats->ppi_av));
+ poll_stats->pc = 0;
+
+ poll_stats->cur_freq = MED;
+ poll_stats->iter_counter = 0;
+ poll_stats->threshold_ctr = 0;
+ poll_stats->queue_state = MED_NORMAL;
+ RTE_LOG(INFO, POWER, "Set the power freq to MED\n");
+ set_power_freq(poll_stats->lcore_id, MED, false);
+
+ poll_stats->thresh[MED].threshold_percent = med_to_high_threshold;
+ poll_stats->thresh[HGH].threshold_percent = high_to_med_threshold;
+}
+
+static inline void __attribute__((always_inline))
+enter_busy_state(struct priority_worker *poll_stats)
+{
+ memset(poll_stats->edpi_av, 0, sizeof(poll_stats->edpi_av));
+ poll_stats->ec = 0;
+ memset(poll_stats->ppi_av, 0, sizeof(poll_stats->ppi_av));
+ poll_stats->pc = 0;
+
+ poll_stats->cur_freq = HGH;
+ poll_stats->iter_counter = 0;
+ poll_stats->threshold_ctr = 0;
+ poll_stats->queue_state = HGH_BUSY;
+ set_power_freq(poll_stats->lcore_id, HGH, false);
+}
+
+static inline void __attribute__((always_inline))
+enter_purge_state(struct priority_worker *poll_stats)
+{
+ poll_stats->iter_counter = 0;
+ poll_stats->queue_state = LOW_PURGE;
+}
+
+static inline void __attribute__((always_inline))
+set_state(struct priority_worker *poll_stats,
+ enum queue_state new_state)
+{
+ enum queue_state old_state = poll_stats->queue_state;
+ if (old_state != new_state) {
+
+ /* Call any old state exit functions */
+ if (old_state == TRAINING)
+ exit_training_state(poll_stats);
+
+ /* Call any new state entry functions */
+ if (new_state == TRAINING)
+ enter_training_state(poll_stats);
+ if (new_state == MED_NORMAL)
+ enter_normal_state(poll_stats);
+ if (new_state == HGH_BUSY)
+ enter_busy_state(poll_stats);
+ if (new_state == LOW_PURGE)
+ enter_purge_state(poll_stats);
+ }
+}
+
+static inline void __attribute__((always_inline))
+set_policy(struct priority_worker *poll_stats,
+ struct ep_policy *policy)
+{
+ set_state(poll_stats, policy->state);
+
+ if (policy->state == TRAINING)
+ return;
+
+ poll_stats->thresh[MED_NORMAL].base_edpi = policy->med_base_edpi;
+ poll_stats->thresh[HGH_BUSY].base_edpi = policy->hgh_base_edpi;
+
+ poll_stats->thresh[MED_NORMAL].trained = true;
+ poll_stats->thresh[HGH_BUSY].trained = true;
+
+}
+
+static void
+update_training_stats(struct priority_worker *poll_stats,
+ uint32_t freq,
+ bool specific_freq,
+ uint32_t max_train_iter)
+{
+ RTE_SET_USED(specific_freq);
+
+ char pfi_str[32];
+ uint64_t p0_empty_deq;
+
+ sprintf(pfi_str, "%02d", freq);
+
+ if (poll_stats->cur_freq == freq &&
+ poll_stats->thresh[freq].trained == false) {
+ if (poll_stats->thresh[freq].cur_train_iter == 0) {
+
+ set_power_freq(poll_stats->lcore_id,
+ freq, specific_freq);
+
+ poll_stats->empty_dequeues_prev =
+ poll_stats->empty_dequeues;
+
+ poll_stats->thresh[freq].cur_train_iter++;
+
+ return;
+ } else if (poll_stats->thresh[freq].cur_train_iter
+ <= max_train_iter) {
+
+ p0_empty_deq = poll_stats->empty_dequeues -
+ poll_stats->empty_dequeues_prev;
+
+ poll_stats->empty_dequeues_prev =
+ poll_stats->empty_dequeues;
+
+ poll_stats->thresh[freq].base_edpi += p0_empty_deq;
+ poll_stats->thresh[freq].cur_train_iter++;
+
+ } else {
+ if (poll_stats->thresh[freq].trained == false) {
+ poll_stats->thresh[freq].base_edpi =
+ poll_stats->thresh[freq].base_edpi /
+ max_train_iter;
+
+ /* Add on a factor of 0.05%; this should remove
+ * any false negatives when the system is 0% busy.
+ */
+ poll_stats->thresh[freq].base_edpi +=
+ poll_stats->thresh[freq].base_edpi / 2000;
+
+ poll_stats->thresh[freq].trained = true;
+ poll_stats->cur_freq++;
+
+ }
+ }
+ }
+}
+
+static inline uint32_t __attribute__((always_inline))
+update_stats(struct priority_worker *poll_stats)
+{
+ uint64_t tot_edpi = 0, tot_ppi = 0;
+ uint32_t j, percent;
+
+ struct priority_worker *s = poll_stats;
+
+ uint64_t cur_edpi = s->empty_dequeues - s->empty_dequeues_prev;
+
+ s->empty_dequeues_prev = s->empty_dequeues;
+
+ uint64_t ppi = s->num_dequeue_pkts - s->num_dequeue_pkts_prev;
+
+ s->num_dequeue_pkts_prev = s->num_dequeue_pkts;
+
+ if (s->thresh[s->cur_freq].base_edpi < cur_edpi) {
+
+ /* edpi means empty poll counter difference per interval */
+ RTE_LOG(DEBUG, POWER, "cur_edpi is too large "
+ "cur edpi %"PRId64" "
+ "base edpi %"PRId64"\n",
+ cur_edpi,
+ s->thresh[s->cur_freq].base_edpi);
+ /* Return a value the caller treats as failure; see debug log above */
+ return 1000UL;
+ }
+
+ s->edpi_av[s->ec++ % BINS_AV] = cur_edpi;
+ s->ppi_av[s->pc++ % BINS_AV] = ppi;
+
+ for (j = 0; j < BINS_AV; j++) {
+ tot_edpi += s->edpi_av[j];
+ tot_ppi += s->ppi_av[j];
+ }
+
+ tot_edpi = tot_edpi / BINS_AV;
+
+ percent = 100 - (uint32_t)(((float)tot_edpi /
+ (float)s->thresh[s->cur_freq].base_edpi) * 100);
+
+ return (uint32_t)percent;
+}
+
+
+static inline void __attribute__((always_inline))
+update_stats_normal(struct priority_worker *poll_stats)
+{
+ uint32_t percent;
+
+ if (poll_stats->thresh[poll_stats->cur_freq].base_edpi == 0) {
+
+ enum freq_val cur_freq = poll_stats->cur_freq;
+
+ /* edpi means empty poll counter difference per interval */
+ RTE_LOG(DEBUG, POWER, "cur freq is %d, edpi is %"PRIu64"\n",
+ cur_freq,
+ poll_stats->thresh[cur_freq].base_edpi);
+ return;
+ }
+
+ percent = update_stats(poll_stats);
+
+ if (percent > 100) {
+ /* edpi means empty poll counter difference per interval */
+ RTE_LOG(DEBUG, POWER, "Edpi is bigger than threshold\n");
+ return;
+ }
+
+ if (poll_stats->cur_freq == LOW)
+ RTE_LOG(INFO, POWER, "Purge Mode is not currently supported\n");
+ else if (poll_stats->cur_freq == MED) {
+
+ if (percent >
+ poll_stats->thresh[MED].threshold_percent) {
+
+ if (poll_stats->threshold_ctr < INTERVALS_PER_SECOND)
+ poll_stats->threshold_ctr++;
+ else {
+ set_state(poll_stats, HGH_BUSY);
+ RTE_LOG(INFO, POWER, "MOVE to HGH\n");
+ }
+
+ } else {
+ /* reset */
+ poll_stats->threshold_ctr = 0;
+ }
+
+ } else if (poll_stats->cur_freq == HGH) {
+
+ if (percent <
+ poll_stats->thresh[HGH].threshold_percent) {
+
+ if (poll_stats->threshold_ctr < INTERVALS_PER_SECOND)
+ poll_stats->threshold_ctr++;
+ else {
+ set_state(poll_stats, MED_NORMAL);
+ RTE_LOG(INFO, POWER, "MOVE to MED\n");
+ }
+ } else {
+ /* reset */
+ poll_stats->threshold_ctr = 0;
+ }
+
+ }
+}
+
+static int
+empty_poll_training(struct priority_worker *poll_stats,
+ uint32_t max_train_iter)
+{
+
+ if (poll_stats->iter_counter < INTERVALS_PER_SECOND) {
+ poll_stats->iter_counter++;
+ return 0;
+ }
+
+
+ update_training_stats(poll_stats,
+ LOW,
+ false,
+ max_train_iter);
+
+ update_training_stats(poll_stats,
+ MED,
+ false,
+ max_train_iter);
+
+ update_training_stats(poll_stats,
+ HGH,
+ false,
+ max_train_iter);
+
+
+ if (poll_stats->thresh[LOW].trained == true
+ && poll_stats->thresh[MED].trained == true
+ && poll_stats->thresh[HGH].trained == true) {
+
+ set_state(poll_stats, MED_NORMAL);
+
+ RTE_LOG(INFO, POWER, "LOW threshold is %"PRIu64"\n",
+ poll_stats->thresh[LOW].base_edpi);
+
+ RTE_LOG(INFO, POWER, "MED threshold is %"PRIu64"\n",
+ poll_stats->thresh[MED].base_edpi);
+
+
+ RTE_LOG(INFO, POWER, "HIGH threshold is %"PRIu64"\n",
+ poll_stats->thresh[HGH].base_edpi);
+
+ RTE_LOG(INFO, POWER, "Training is Complete for %d\n",
+ poll_stats->lcore_id);
+ }
+
+ return 0;
+}
+
+void __rte_experimental
+rte_empty_poll_detection(struct rte_timer *tim, void *arg)
+{
+
+ uint32_t i;
+
+ struct priority_worker *poll_stats;
+
+ RTE_SET_USED(tim);
+
+ RTE_SET_USED(arg);
+
+ for (i = 0; i < NUM_NODES; i++) {
+
+ poll_stats = &(ep_params->wrk_data.wrk_stats[i]);
+
+ if (rte_lcore_is_enabled(poll_stats->lcore_id) == 0)
+ continue;
+
+ switch (poll_stats->queue_state) {
+ case(TRAINING):
+ empty_poll_training(poll_stats,
+ ep_params->max_train_iter);
+ break;
+
+ case(HGH_BUSY):
+ case(MED_NORMAL):
+ update_stats_normal(poll_stats);
+ break;
+
+ case(LOW_PURGE):
+ break;
+ default:
+ break;
+
+ }
+
+ }
+
+}
+
+int __rte_experimental
+rte_power_empty_poll_stat_init(struct ep_params **eptr, uint8_t *freq_tlb,
+ struct ep_policy *policy)
+{
+ uint32_t i;
+ /* Allocate the ep_params structure */
+ ep_params = rte_zmalloc_socket(NULL,
+ sizeof(struct ep_params),
+ 0,
+ rte_socket_id());
+
+ if (!ep_params)
+ return -1;
+
+ if (freq_tlb == NULL) {
+ freq_index[LOW] = 14;
+ freq_index[MED] = 9;
+ freq_index[HGH] = 1;
+ } else {
+ freq_index[LOW] = freq_tlb[LOW];
+ freq_index[MED] = freq_tlb[MED];
+ freq_index[HGH] = freq_tlb[HGH];
+ }
+
+ RTE_LOG(INFO, POWER, "Initialize the Empty Poll\n");
+
+ /* Train for pre-defined period */
+ ep_params->max_train_iter = INTERVALS_PER_SECOND * SECONDS_TO_TRAIN_FOR;
+
+ struct stats_data *w = &ep_params->wrk_data;
+
+ *eptr = ep_params;
+
+ /* initialize all wrk_stats state */
+ for (i = 0; i < NUM_NODES; i++) {
+
+ if (rte_lcore_is_enabled(i) == 0)
+ continue;
+ /*init the freqs table */
+ total_avail_freqs[i] = rte_power_freqs(i,
+ avail_freqs[i],
+ NUM_FREQS);
+
+ RTE_LOG(INFO, POWER, "total avail freq is %d , lcoreid %d\n",
+ total_avail_freqs[i],
+ i);
+
+ if (get_freq_index(LOW) > total_avail_freqs[i])
+ return -1;
+
+ if (rte_get_master_lcore() != i) {
+ w->wrk_stats[i].lcore_id = i;
+ set_policy(&w->wrk_stats[i], policy);
+ }
+ }
+
+ return 0;
+}
+
+void __rte_experimental
+rte_power_empty_poll_stat_free(void)
+{
+
+ RTE_LOG(INFO, POWER, "Close the Empty Poll\n");
+
+ if (ep_params != NULL)
+ rte_free(ep_params);
+}
+
+int __rte_experimental
+rte_power_empty_poll_stat_update(unsigned int lcore_id)
+{
+ struct priority_worker *poll_stats;
+
+ if (lcore_id >= NUM_NODES)
+ return -1;
+
+ poll_stats = &(ep_params->wrk_data.wrk_stats[lcore_id]);
+
+ if (poll_stats->lcore_id == 0)
+ poll_stats->lcore_id = lcore_id;
+
+ poll_stats->empty_dequeues++;
+
+ return 0;
+}
+
+int __rte_experimental
+rte_power_poll_stat_update(unsigned int lcore_id, uint8_t nb_pkt)
+{
+
+ struct priority_worker *poll_stats;
+
+ if (lcore_id >= NUM_NODES)
+ return -1;
+
+ poll_stats = &(ep_params->wrk_data.wrk_stats[lcore_id]);
+
+ if (poll_stats->lcore_id == 0)
+ poll_stats->lcore_id = lcore_id;
+
+ poll_stats->num_dequeue_pkts += nb_pkt;
+
+ return 0;
+}
+
+
+uint64_t __rte_experimental
+rte_power_empty_poll_stat_fetch(unsigned int lcore_id)
+{
+ struct priority_worker *poll_stats;
+
+ if (lcore_id >= NUM_NODES)
+ return -1;
+
+ poll_stats = &(ep_params->wrk_data.wrk_stats[lcore_id]);
+
+ if (poll_stats->lcore_id == 0)
+ poll_stats->lcore_id = lcore_id;
+
+ return poll_stats->empty_dequeues;
+}
+
+uint64_t __rte_experimental
+rte_power_poll_stat_fetch(unsigned int lcore_id)
+{
+ struct priority_worker *poll_stats;
+
+ if (lcore_id >= NUM_NODES)
+ return -1;
+
+ poll_stats = &(ep_params->wrk_data.wrk_stats[lcore_id]);
+
+ if (poll_stats->lcore_id == 0)
+ poll_stats->lcore_id = lcore_id;
+
+ return poll_stats->num_dequeue_pkts;
+}
diff --git a/lib/librte_power/rte_power_empty_poll.h b/lib/librte_power/rte_power_empty_poll.h
new file mode 100644
index 00000000..c1ad5c24
--- /dev/null
+++ b/lib/librte_power/rte_power_empty_poll.h
@@ -0,0 +1,223 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation
+ */
+
+#ifndef _RTE_EMPTY_POLL_H
+#define _RTE_EMPTY_POLL_H
+
+/**
+ * @file
+ * RTE Power Management
+ */
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <rte_common.h>
+#include <rte_byteorder.h>
+#include <rte_log.h>
+#include <rte_string_fns.h>
+#include <rte_power.h>
+#include <rte_timer.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NUM_FREQS RTE_MAX_LCORE_FREQS
+
+#define BINS_AV 4 /* Has to be a power of 2 */
+
+#define DROP (NUM_DIRECTIONS * NUM_DEVICES)
+
+#define NUM_PRIORITIES 2
+
+#define NUM_NODES 256 /* Max core number */
+
+/* Processor Power State */
+enum freq_val {
+ LOW,
+ MED,
+ HGH,
+ NUM_FREQ = NUM_FREQS
+};
+
+
+/* Queue Polling State */
+enum queue_state {
+ TRAINING, /* NO TRAFFIC */
+ MED_NORMAL, /* MED */
+ HGH_BUSY, /* HIGH */
+ LOW_PURGE, /* LOW */
+};
+
+/* Queue Stats */
+struct freq_threshold {
+
+ uint64_t base_edpi;
+ bool trained;
+ uint32_t threshold_percent;
+ uint32_t cur_train_iter;
+};
+
+/* Per-worker thread empty poll stats */
+struct priority_worker {
+
+ /* Current dequeue and throughput counts */
+ /* These 2 are written to by the worker threads */
+ /* So keep them on their own cache line */
+ uint64_t empty_dequeues;
+ uint64_t num_dequeue_pkts;
+
+ enum queue_state queue_state;
+
+ uint64_t empty_dequeues_prev;
+ uint64_t num_dequeue_pkts_prev;
+
+ /* Used for training only */
+ struct freq_threshold thresh[NUM_FREQ];
+ enum freq_val cur_freq;
+
+ /* bucket arrays to calculate the averages */
+ /* edpi means empty poll counter difference per interval */
+ uint64_t edpi_av[BINS_AV];
+ /* empty poll counter */
+ uint32_t ec;
+ /* ppi means valid poll counter per interval */
+ uint64_t ppi_av[BINS_AV];
+ /* valid poll counter */
+ uint32_t pc;
+
+ uint32_t lcore_id;
+ uint32_t iter_counter;
+ uint32_t threshold_ctr;
+ uint32_t display_ctr;
+ uint8_t dev_id;
+
+} __rte_cache_aligned;
+
+
+struct stats_data {
+
+ struct priority_worker wrk_stats[NUM_NODES];
+
+ /* flag to stop rx threads from processing packets until training is over */
+ bool start_rx;
+
+};
+
+/* Empty Poll Parameters */
+struct ep_params {
+
+ /* Timer related stuff */
+ uint64_t interval_ticks;
+ uint32_t max_train_iter;
+
+ struct rte_timer timer0;
+ struct stats_data wrk_data;
+};
+
+
+/* Sample App Init information */
+struct ep_policy {
+
+ uint64_t med_base_edpi;
+ uint64_t hgh_base_edpi;
+
+ enum queue_state state;
+};
+
+
+
+/**
+ * Initialize the empty poll power management system.
+ *
+ * @param eptr
+ * the structure of empty poll configuration
+ * @param freq_tlb
+ * the power state/frequency mapping table
+ * @param policy
+ * the initialization policy from sample app
+ *
+ * @return
+ * - 0 on success.
+ * - Negative on error.
+ */
+int __rte_experimental
+rte_power_empty_poll_stat_init(struct ep_params **eptr, uint8_t *freq_tlb,
+ struct ep_policy *policy);
+
+/**
+ * Free the resources held by the empty poll power management system.
+ */
+void __rte_experimental
+rte_power_empty_poll_stat_free(void);
+
+/**
+ * Update the empty poll counter of the specified lcore.
+ * Not thread safe.
+ *
+ * @param lcore_id
+ * lcore id
+ *
+ * @return
+ * - 0 on success.
+ * - Negative on error.
+ */
+int __rte_experimental
+rte_power_empty_poll_stat_update(unsigned int lcore_id);
+
+/**
+ * Update the valid poll counter of the specified lcore, not thread safe.
+ *
+ * @param lcore_id
+ * lcore id.
+ * @param nb_pkt
+ *   Number of packets received in one valid poll.
+ *
+ * @return
+ * - 0 on success.
+ * - Negative on error.
+ */
+int __rte_experimental
+rte_power_poll_stat_update(unsigned int lcore_id, uint8_t nb_pkt);
+
+/**
+ * Fetch specific core empty poll counter.
+ *
+ * @param lcore_id
+ * lcore id
+ *
+ * @return
+ * Current lcore empty poll counter value.
+ */
+uint64_t __rte_experimental
+rte_power_empty_poll_stat_fetch(unsigned int lcore_id);
+
+/**
+ * Fetch specific core valid poll counter.
+ *
+ * @param lcore_id
+ * lcore id
+ *
+ * @return
+ * Current lcore valid poll counter value.
+ */
+uint64_t __rte_experimental
+rte_power_poll_stat_fetch(unsigned int lcore_id);
+
+/**
+ * Empty poll state change detection function
+ *
+ * @param tim
+ * The timer structure
+ * @param arg
+ * The customized parameter
+ */
+void __rte_experimental
+rte_empty_poll_detection(struct rte_timer *tim, void *arg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
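A hedged sketch (not part of the patch) of wiring up the new empty poll API: initialize the per-lcore stats in TRAINING mode, run the detection callback from a periodic rte_timer, and feed the counters from the datapath. It assumes the timer subsystem is initialized and rte_timer_manage() is called regularly; the timer period and helper names are illustrative.

#include <rte_cycles.h>
#include <rte_lcore.h>
#include <rte_timer.h>
#include <rte_power_empty_poll.h>

static struct ep_params *ep_ptr;

static void
empty_poll_setup_example(void)
{
	struct ep_policy policy = { .state = TRAINING };

	/* NULL frequency table: use the library's default LOW/MED/HGH indexes. */
	if (rte_power_empty_poll_stat_init(&ep_ptr, NULL, &policy) < 0)
		return;

	/* Run the state-change detection roughly every 10 ms (illustrative). */
	rte_timer_init(&ep_ptr->timer0);
	rte_timer_reset(&ep_ptr->timer0, rte_get_timer_hz() / 100, PERIODICAL,
			rte_lcore_id(), rte_empty_poll_detection, NULL);
}

/* Datapath hook, called on each worker lcore after every rx burst. */
static void
empty_poll_rx_hook_example(uint16_t nb_rx)
{
	if (nb_rx == 0)
		rte_power_empty_poll_stat_update(rte_lcore_id());
	else
		rte_power_poll_stat_update(rte_lcore_id(), (uint8_t)nb_rx);
}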
diff --git a/lib/librte_power/rte_power_version.map b/lib/librte_power/rte_power_version.map
index dd587dfb..17a083b2 100644
--- a/lib/librte_power/rte_power_version.map
+++ b/lib/librte_power/rte_power_version.map
@@ -33,3 +33,16 @@ DPDK_18.08 {
rte_power_get_capabilities;
} DPDK_17.11;
+
+EXPERIMENTAL {
+ global:
+
+ rte_empty_poll_detection;
+ rte_power_empty_poll_stat_fetch;
+ rte_power_empty_poll_stat_free;
+ rte_power_empty_poll_stat_init;
+ rte_power_empty_poll_stat_update;
+ rte_power_poll_stat_fetch;
+ rte_power_poll_stat_update;
+
+};
diff --git a/lib/librte_rawdev/rte_rawdev.c b/lib/librte_rawdev/rte_rawdev.c
index 62b6b97e..9f1e3592 100644
--- a/lib/librte_rawdev/rte_rawdev.c
+++ b/lib/librte_rawdev/rte_rawdev.c
@@ -35,21 +35,19 @@
/* dynamic log identifier */
int librawdev_logtype;
-struct rte_rawdev rte_rawdevices[RTE_RAWDEV_MAX_DEVS];
+static struct rte_rawdev rte_rawdevices[RTE_RAWDEV_MAX_DEVS];
-struct rte_rawdev *rte_rawdevs = &rte_rawdevices[0];
+struct rte_rawdev *rte_rawdevs = rte_rawdevices;
static struct rte_rawdev_global rawdev_globals = {
.nb_devs = 0
};
-struct rte_rawdev_global *rte_rawdev_globals = &rawdev_globals;
-
/* Raw device, northbound API implementation */
uint8_t
rte_rawdev_count(void)
{
- return rte_rawdev_globals->nb_devs;
+ return rawdev_globals.nb_devs;
}
uint16_t
@@ -60,7 +58,7 @@ rte_rawdev_get_dev_id(const char *name)
if (!name)
return -EINVAL;
- for (i = 0; i < rte_rawdev_globals->nb_devs; i++)
+ for (i = 0; i < rawdev_globals.nb_devs; i++)
if ((strcmp(rte_rawdevices[i].name, name)
== 0) &&
(rte_rawdevices[i].attached ==
diff --git a/lib/librte_rawdev/rte_rawdev_pmd.h b/lib/librte_rawdev/rte_rawdev_pmd.h
index bb9bbc35..811e51d0 100644
--- a/lib/librte_rawdev/rte_rawdev_pmd.h
+++ b/lib/librte_rawdev/rte_rawdev_pmd.h
@@ -73,8 +73,6 @@ struct rte_rawdev_global {
uint16_t nb_devs;
};
-extern struct rte_rawdev_global *rte_rawdev_globals;
-/** Pointer to global raw devices data structure. */
extern struct rte_rawdev *rte_rawdevs;
/** The pool of rte_rawdev structures. */
diff --git a/lib/librte_ring/meson.build b/lib/librte_ring/meson.build
index ca8a435e..ab8b0b46 100644
--- a/lib/librte_ring/meson.build
+++ b/lib/librte_ring/meson.build
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright(c) 2017 Intel Corporation
+version = 2
sources = files('rte_ring.c')
headers = files('rte_ring.h',
'rte_ring_c11_mem.h',
diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
index 7a731d07..af5444a9 100644
--- a/lib/librte_ring/rte_ring.h
+++ b/lib/librte_ring/rte_ring.h
@@ -303,11 +303,11 @@ void rte_ring_dump(FILE *f, const struct rte_ring *r);
* There are 2 choices for the users
* 1.use rmb() memory barrier
* 2.use one-direcion load_acquire/store_release barrier,defined by
- * CONFIG_RTE_RING_USE_C11_MEM_MODEL=y
+ * CONFIG_RTE_USE_C11_MEM_MODEL=y
* It depends on performance test results.
* By default, move common functions to rte_ring_generic.h
*/
-#ifdef RTE_RING_USE_C11_MEM_MODEL
+#ifdef RTE_USE_C11_MEM_MODEL
#include "rte_ring_c11_mem.h"
#else
#include "rte_ring_generic.h"
diff --git a/lib/librte_sched/Makefile b/lib/librte_sched/Makefile
index 55d9c698..46c53ed7 100644
--- a/lib/librte_sched/Makefile
+++ b/lib/librte_sched/Makefile
@@ -11,8 +11,6 @@ LIB = librte_sched.a
CFLAGS += -O3
CFLAGS += $(WERROR_FLAGS)
-CFLAGS_rte_red.o := -D_GNU_SOURCE
-
LDLIBS += -lm
LDLIBS += -lrt
LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_net
diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c
index 9269e5c7..587d5e60 100644
--- a/lib/librte_sched/rte_sched.c
+++ b/lib/librte_sched/rte_sched.c
@@ -329,7 +329,7 @@ rte_sched_port_check_params(struct rte_sched_port_params *params)
return -1;
/* socket */
- if ((params->socket < 0) || (params->socket >= RTE_MAX_NUMA_NODES))
+ if (params->socket < 0)
return -3;
/* rate */
@@ -633,7 +633,8 @@ rte_sched_port_config(struct rte_sched_port_params *params)
return NULL;
/* Allocate memory to store the data structures */
- port = rte_zmalloc("qos_params", mem_size, RTE_CACHE_LINE_SIZE);
+ port = rte_zmalloc_socket("qos_params", mem_size, RTE_CACHE_LINE_SIZE,
+ params->socket);
if (port == NULL)
return NULL;
diff --git a/lib/librte_security/rte_security.c b/lib/librte_security/rte_security.c
index 1954960a..c6355de9 100644
--- a/lib/librte_security/rte_security.c
+++ b/lib/librte_security/rte_security.c
@@ -131,6 +131,10 @@ rte_security_capability_get(struct rte_security_ctx *instance,
capability->ipsec.direction ==
idx->ipsec.direction)
return capability;
+ } else if (idx->protocol == RTE_SECURITY_PROTOCOL_PDCP) {
+ if (capability->pdcp.domain ==
+ idx->pdcp.domain)
+ return capability;
}
}
}
diff --git a/lib/librte_security/rte_security.h b/lib/librte_security/rte_security.h
index b0d1b97e..1431b4df 100644
--- a/lib/librte_security/rte_security.h
+++ b/lib/librte_security/rte_security.h
@@ -207,6 +207,64 @@ struct rte_security_macsec_xform {
};
/**
+ * PDCP Mode of session
+ */
+enum rte_security_pdcp_domain {
+ RTE_SECURITY_PDCP_MODE_CONTROL, /**< PDCP control plane */
+ RTE_SECURITY_PDCP_MODE_DATA, /**< PDCP data plane */
+};
+
+/** PDCP Frame direction */
+enum rte_security_pdcp_direction {
+ RTE_SECURITY_PDCP_UPLINK, /**< Uplink */
+ RTE_SECURITY_PDCP_DOWNLINK, /**< Downlink */
+};
+
+/** PDCP Sequence Number Size selectors */
+enum rte_security_pdcp_sn_size {
+ /** PDCP_SN_SIZE_5: 5bit sequence number */
+ RTE_SECURITY_PDCP_SN_SIZE_5 = 5,
+ /** PDCP_SN_SIZE_7: 7bit sequence number */
+ RTE_SECURITY_PDCP_SN_SIZE_7 = 7,
+ /** PDCP_SN_SIZE_12: 12bit sequence number */
+ RTE_SECURITY_PDCP_SN_SIZE_12 = 12,
+ /** PDCP_SN_SIZE_15: 15bit sequence number */
+ RTE_SECURITY_PDCP_SN_SIZE_15 = 15,
+ /** PDCP_SN_SIZE_18: 18bit sequence number */
+ RTE_SECURITY_PDCP_SN_SIZE_18 = 18
+};
+
+/**
+ * PDCP security association configuration data.
+ *
+ * This structure contains data required to create a PDCP security session.
+ */
+struct rte_security_pdcp_xform {
+ int8_t bearer; /**< PDCP bearer ID */
+ /** Enable in-order delivery; this field shall be set only if the
+ * driver/HW is capable. See RTE_SECURITY_PDCP_ORDERING_CAP.
+ */
+ uint8_t en_ordering;
+ /** Notify the driver/HW to detect and remove duplicate packets.
+ * This field should be set only when the driver/HW is capable.
+ * See RTE_SECURITY_PDCP_DUP_DETECT_CAP.
+ */
+ uint8_t remove_duplicates;
+ /** PDCP mode of operation: Control or data */
+ enum rte_security_pdcp_domain domain;
+ /** PDCP Frame Direction 0:UL 1:DL */
+ enum rte_security_pdcp_direction pkt_dir;
+ /** Sequence number size, 5/7/12/15/18 */
+ enum rte_security_pdcp_sn_size sn_size;
+ /** Starting Hyper Frame Number to be used together with the SN
+ * from the PDCP frames
+ */
+ uint32_t hfn;
+ /** HFN Threshold for key renegotiation */
+ uint32_t hfn_threshold;
+};
+
+/**
* Security session action type.
*/
enum rte_security_session_action_type {
@@ -232,6 +290,8 @@ enum rte_security_session_protocol {
/**< IPsec Protocol */
RTE_SECURITY_PROTOCOL_MACSEC,
/**< MACSec Protocol */
+ RTE_SECURITY_PROTOCOL_PDCP,
+ /**< PDCP Protocol */
};
/**
@@ -246,6 +306,7 @@ struct rte_security_session_conf {
union {
struct rte_security_ipsec_xform ipsec;
struct rte_security_macsec_xform macsec;
+ struct rte_security_pdcp_xform pdcp;
};
/**< Configuration parameters for security session */
struct rte_crypto_sym_xform *crypto_xform;
@@ -413,6 +474,10 @@ struct rte_security_ipsec_stats {
};
+struct rte_security_pdcp_stats {
+ uint64_t reserved;
+};
+
struct rte_security_stats {
enum rte_security_session_protocol protocol;
/**< Security protocol to be configured */
@@ -421,6 +486,7 @@ struct rte_security_stats {
union {
struct rte_security_macsec_stats macsec;
struct rte_security_ipsec_stats ipsec;
+ struct rte_security_pdcp_stats pdcp;
};
};
@@ -465,6 +531,13 @@ struct rte_security_capability {
int dummy;
} macsec;
/**< MACsec capability */
+ struct {
+ enum rte_security_pdcp_domain domain;
+ /**< PDCP mode of operation: Control or data */
+ uint32_t capa_flags;
+ /**< Capability flags, see RTE_SECURITY_PDCP_* */
+ } pdcp;
+ /**< PDCP capability */
};
const struct rte_cryptodev_capabilities *crypto_capabilities;
@@ -474,6 +547,19 @@ struct rte_security_capability {
/**< Device offload flags */
};
+/** Underlying hardware/driver supporting PDCP may or may not support
+ * packet ordering. Set RTE_SECURITY_PDCP_ORDERING_CAP if it does.
+ * If it is not set, the driver/HW assumes packets are received in order
+ * and it is the application's responsibility to maintain ordering.
+ */
+#define RTE_SECURITY_PDCP_ORDERING_CAP 0x00000001
+
+/** Underlying hardware/driver supporting PDCP may or may not detect
+ * duplicate packets. Set RTE_SECURITY_PDCP_DUP_DETECT_CAP if it does.
+ * If it is not set, the driver/HW assumes no duplicate packets are received.
+ */
+#define RTE_SECURITY_PDCP_DUP_DETECT_CAP 0x00000002
+
#define RTE_SECURITY_TX_OLOAD_NEED_MDATA 0x00000001
/**< HW needs metadata update, see rte_security_set_pkt_metadata().
*/
@@ -506,6 +592,10 @@ struct rte_security_capability_idx {
enum rte_security_ipsec_sa_mode mode;
enum rte_security_ipsec_sa_direction direction;
} ipsec;
+ struct {
+ enum rte_security_pdcp_domain domain;
+ uint32_t capa_flags;
+ } pdcp;
};
};
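A hedged sketch (not part of the patch) of using the new PDCP additions: query the device for PDCP capability with the extended capability index, then build a session configuration carrying the new pdcp xform. The security instance and crypto transform chain are application-supplied placeholders; numeric values are illustrative.

#include <string.h>
#include <rte_security.h>

extern struct rte_security_ctx *sec_ctx;               /* placeholder instance */
extern struct rte_crypto_sym_xform *pdcp_crypto_xform; /* placeholder cipher/auth chain */

static void
pdcp_session_conf_example(void)
{
	struct rte_security_capability_idx idx = {
		.action = RTE_SECURITY_ACTION_TYPE_LOOKASIDE_PROTOCOL,
		.protocol = RTE_SECURITY_PROTOCOL_PDCP,
		.pdcp = { .domain = RTE_SECURITY_PDCP_MODE_DATA },
	};
	struct rte_security_session_conf conf;

	if (rte_security_capability_get(sec_ctx, &idx) == NULL)
		return; /* PDCP data plane not supported by this device */

	memset(&conf, 0, sizeof(conf));
	conf.action_type = RTE_SECURITY_ACTION_TYPE_LOOKASIDE_PROTOCOL;
	conf.protocol = RTE_SECURITY_PROTOCOL_PDCP;
	conf.pdcp.bearer = 0;
	conf.pdcp.domain = RTE_SECURITY_PDCP_MODE_DATA;
	conf.pdcp.pkt_dir = RTE_SECURITY_PDCP_UPLINK;
	conf.pdcp.sn_size = RTE_SECURITY_PDCP_SN_SIZE_12;
	conf.pdcp.hfn = 0;
	conf.pdcp.hfn_threshold = 0xfffff;
	conf.crypto_xform = pdcp_crypto_xform;
	(void)conf;
}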
diff --git a/lib/librte_table/Makefile b/lib/librte_table/Makefile
index 276d476a..f935678a 100644
--- a/lib/librte_table/Makefile
+++ b/lib/librte_table/Makefile
@@ -46,6 +46,8 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_acl.h
endif
SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_hash.h
SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_hash_cuckoo.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_hash_func.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_hash_func_arm64.h
SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_lru.h
ifeq ($(CONFIG_RTE_ARCH_X86),y)
SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_lru_x86.h
diff --git a/lib/librte_table/meson.build b/lib/librte_table/meson.build
index 8b2f8413..6ae3cd6c 100644
--- a/lib/librte_table/meson.build
+++ b/lib/librte_table/meson.build
@@ -19,6 +19,8 @@ headers = files('rte_table.h',
'rte_table_lpm_ipv6.h',
'rte_table_hash.h',
'rte_table_hash_cuckoo.h',
+ 'rte_table_hash_func.h',
+ 'rte_table_hash_func_arm64.h',
'rte_lru.h',
'rte_table_array.h',
'rte_table_stub.h')
diff --git a/lib/librte_table/rte_table_hash_func.h b/lib/librte_table/rte_table_hash_func.h
new file mode 100644
index 00000000..02296eab
--- /dev/null
+++ b/lib/librte_table/rte_table_hash_func.h
@@ -0,0 +1,245 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation
+ */
+
+#ifndef __INCLUDE_RTE_TABLE_HASH_FUNC_H__
+#define __INCLUDE_RTE_TABLE_HASH_FUNC_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#include <rte_compat.h>
+#include <rte_common.h>
+
+static inline uint64_t __rte_experimental
+rte_crc32_u64_generic(uint64_t crc, uint64_t value)
+{
+ int i;
+
+ crc = (crc & 0xFFFFFFFFLLU) ^ value;
+ for (i = 63; i >= 0; i--) {
+ uint64_t mask;
+
+ mask = -(crc & 1LLU);
+ crc = (crc >> 1LLU) ^ (0x82F63B78LLU & mask);
+ }
+
+ return crc;
+}
+
+#if defined(RTE_ARCH_X86_64)
+
+#include <x86intrin.h>
+
+static inline uint64_t
+rte_crc32_u64(uint64_t crc, uint64_t v)
+{
+ return _mm_crc32_u64(crc, v);
+}
+
+#elif defined(RTE_ARCH_ARM64)
+#include "rte_table_hash_func_arm64.h"
+#else
+
+static inline uint64_t
+rte_crc32_u64(uint64_t crc, uint64_t v)
+{
+ return rte_crc32_u64_generic(crc, v);
+}
+
+#endif
+
+static inline uint64_t __rte_experimental
+rte_table_hash_crc_key8(void *key, void *mask, __rte_unused uint32_t key_size,
+ uint64_t seed)
+{
+ uint64_t *k = key;
+ uint64_t *m = mask;
+ uint64_t crc0;
+
+ crc0 = rte_crc32_u64(seed, k[0] & m[0]);
+
+ return crc0;
+}
+
+static inline uint64_t __rte_experimental
+rte_table_hash_crc_key16(void *key, void *mask, __rte_unused uint32_t key_size,
+ uint64_t seed)
+{
+ uint64_t *k = key;
+ uint64_t *m = mask;
+ uint64_t k0, crc0, crc1;
+
+ k0 = k[0] & m[0];
+
+ crc0 = rte_crc32_u64(k0, seed);
+ crc1 = rte_crc32_u64(k0 >> 32, k[1] & m[1]);
+
+ crc0 ^= crc1;
+
+ return crc0;
+}
+
+static inline uint64_t __rte_experimental
+rte_table_hash_crc_key24(void *key, void *mask, __rte_unused uint32_t key_size,
+ uint64_t seed)
+{
+ uint64_t *k = key;
+ uint64_t *m = mask;
+ uint64_t k0, k2, crc0, crc1;
+
+ k0 = k[0] & m[0];
+ k2 = k[2] & m[2];
+
+ crc0 = rte_crc32_u64(k0, seed);
+ crc1 = rte_crc32_u64(k0 >> 32, k[1] & m[1]);
+
+ crc0 = rte_crc32_u64(crc0, k2);
+
+ crc0 ^= crc1;
+
+ return crc0;
+}
+
+static inline uint64_t __rte_experimental
+rte_table_hash_crc_key32(void *key, void *mask, __rte_unused uint32_t key_size,
+ uint64_t seed)
+{
+ uint64_t *k = key;
+ uint64_t *m = mask;
+ uint64_t k0, k2, crc0, crc1, crc2, crc3;
+
+ k0 = k[0] & m[0];
+ k2 = k[2] & m[2];
+
+ crc0 = rte_crc32_u64(k0, seed);
+ crc1 = rte_crc32_u64(k0 >> 32, k[1] & m[1]);
+
+ crc2 = rte_crc32_u64(k2, k[3] & m[3]);
+ crc3 = k2 >> 32;
+
+ crc0 = rte_crc32_u64(crc0, crc1);
+ crc1 = rte_crc32_u64(crc2, crc3);
+
+ crc0 ^= crc1;
+
+ return crc0;
+}
+
+static inline uint64_t __rte_experimental
+rte_table_hash_crc_key40(void *key, void *mask, __rte_unused uint32_t key_size,
+ uint64_t seed)
+{
+ uint64_t *k = key;
+ uint64_t *m = mask;
+ uint64_t k0, k2, crc0, crc1, crc2, crc3;
+
+ k0 = k[0] & m[0];
+ k2 = k[2] & m[2];
+
+ crc0 = rte_crc32_u64(k0, seed);
+ crc1 = rte_crc32_u64(k0 >> 32, k[1] & m[1]);
+
+ crc2 = rte_crc32_u64(k2, k[3] & m[3]);
+ crc3 = rte_crc32_u64(k2 >> 32, k[4] & m[4]);
+
+ crc0 = rte_crc32_u64(crc0, crc1);
+ crc1 = rte_crc32_u64(crc2, crc3);
+
+ crc0 ^= crc1;
+
+ return crc0;
+}
+
+static inline uint64_t __rte_experimental
+rte_table_hash_crc_key48(void *key, void *mask, __rte_unused uint32_t key_size,
+ uint64_t seed)
+{
+ uint64_t *k = key;
+ uint64_t *m = mask;
+ uint64_t k0, k2, k5, crc0, crc1, crc2, crc3;
+
+ k0 = k[0] & m[0];
+ k2 = k[2] & m[2];
+ k5 = k[5] & m[5];
+
+ crc0 = rte_crc32_u64(k0, seed);
+ crc1 = rte_crc32_u64(k0 >> 32, k[1] & m[1]);
+
+ crc2 = rte_crc32_u64(k2, k[3] & m[3]);
+ crc3 = rte_crc32_u64(k2 >> 32, k[4] & m[4]);
+
+ crc0 = rte_crc32_u64(crc0, (crc1 << 32) ^ crc2);
+ crc1 = rte_crc32_u64(crc3, k5);
+
+ crc0 ^= crc1;
+
+ return crc0;
+}
+
+static inline uint64_t __rte_experimental
+rte_table_hash_crc_key56(void *key, void *mask, __rte_unused uint32_t key_size,
+ uint64_t seed)
+{
+ uint64_t *k = key;
+ uint64_t *m = mask;
+ uint64_t k0, k2, k5, crc0, crc1, crc2, crc3, crc4, crc5;
+
+ k0 = k[0] & m[0];
+ k2 = k[2] & m[2];
+ k5 = k[5] & m[5];
+
+ crc0 = rte_crc32_u64(k0, seed);
+ crc1 = rte_crc32_u64(k0 >> 32, k[1] & m[1]);
+
+ crc2 = rte_crc32_u64(k2, k[3] & m[3]);
+ crc3 = rte_crc32_u64(k2 >> 32, k[4] & m[4]);
+
+ crc4 = rte_crc32_u64(k5, k[6] & m[6]);
+ crc5 = k5 >> 32;
+
+ crc0 = rte_crc32_u64(crc0, (crc1 << 32) ^ crc2);
+ crc1 = rte_crc32_u64(crc3, (crc4 << 32) ^ crc5);
+
+ crc0 ^= crc1;
+
+ return crc0;
+}
+
+static inline uint64_t __rte_experimental
+rte_table_hash_crc_key64(void *key, void *mask, __rte_unused uint32_t key_size,
+ uint64_t seed)
+{
+ uint64_t *k = key;
+ uint64_t *m = mask;
+ uint64_t k0, k2, k5, crc0, crc1, crc2, crc3, crc4, crc5;
+
+ k0 = k[0] & m[0];
+ k2 = k[2] & m[2];
+ k5 = k[5] & m[5];
+
+ crc0 = rte_crc32_u64(k0, seed);
+ crc1 = rte_crc32_u64(k0 >> 32, k[1] & m[1]);
+
+ crc2 = rte_crc32_u64(k2, k[3] & m[3]);
+ crc3 = rte_crc32_u64(k2 >> 32, k[4] & m[4]);
+
+ crc4 = rte_crc32_u64(k5, k[6] & m[6]);
+ crc5 = rte_crc32_u64(k5 >> 32, k[7] & m[7]);
+
+ crc0 = rte_crc32_u64(crc0, (crc1 << 32) ^ crc2);
+ crc1 = rte_crc32_u64(crc3, (crc4 << 32) ^ crc5);
+
+ crc0 ^= crc1;
+
+ return crc0;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
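
Editor's note: each rte_table_hash_crc_keyN() helper above hashes an N-byte key after ANDing it with a per-64-bit-word mask, folding the lanes through the CRC32-C instruction (or the generic fallback). A hypothetical standalone caller, built with -DALLOW_EXPERIMENTAL_API since these functions are marked __rte_experimental, might look like the sketch below; the key layout is illustrative only.

    #include <stdint.h>
    #include <string.h>
    #include <rte_table_hash_func.h>

    /* Hash a 16-byte key with a mask that keeps only the first 8 bytes. */
    static uint64_t
    hash_flow_key(const uint8_t key_bytes[16])
    {
            uint64_t key[2], mask[2] = { UINT64_MAX, 0 };
            uint64_t seed = 0;

            memcpy(key, key_bytes, sizeof(key));
            /* key_size is accepted but unused by these helpers. */
            return rte_table_hash_crc_key16(key, mask, sizeof(key), seed);
    }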
diff --git a/lib/librte_table/rte_table_hash_func_arm64.h b/lib/librte_table/rte_table_hash_func_arm64.h
new file mode 100644
index 00000000..eb04c1ff
--- /dev/null
+++ b/lib/librte_table/rte_table_hash_func_arm64.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Linaro Limited
+ */
+
+#ifndef __INCLUDE_RTE_TABLE_HASH_FUNC_ARM64_H__
+#define __INCLUDE_RTE_TABLE_HASH_FUNC_ARM64_H__
+
+#define _CRC32CX(crc, val) \
+ __asm__("crc32cx %w[c], %w[c], %x[v]":[c] "+r" (crc):[v] "r" (val))
+
+static inline uint64_t
+rte_crc32_u64(uint64_t crc, uint64_t v)
+{
+ uint32_t crc32 = crc;
+
+ _CRC32CX(crc32, v);
+
+ return crc32;
+}
+
+#endif
diff --git a/lib/librte_telemetry/Makefile b/lib/librte_telemetry/Makefile
new file mode 100644
index 00000000..1a050691
--- /dev/null
+++ b/lib/librte_telemetry/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_telemetry.a
+
+CFLAGS += -O3
+CFLAGS += -I$(SRCDIR)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
+LDLIBS += -lrte_eal -lrte_ethdev
+LDLIBS += -lrte_metrics
+LDLIBS += -lpthread
+LDLIBS += -ljansson
+
+EXPORT_MAP := rte_telemetry_version.map
+
+LIBABIVER := 1
+
+# library source files
+SRCS-$(CONFIG_RTE_LIBRTE_TELEMETRY) := rte_telemetry.c
+SRCS-$(CONFIG_RTE_LIBRTE_TELEMETRY) += rte_telemetry_parser.c
+SRCS-$(CONFIG_RTE_LIBRTE_TELEMETRY) += rte_telemetry_parser_test.c
+
+# export include files
+SYMLINK-$(CONFIG_RTE_LIBRTE_TELEMETRY)-include := rte_telemetry.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_telemetry/meson.build b/lib/librte_telemetry/meson.build
new file mode 100644
index 00000000..9492f544
--- /dev/null
+++ b/lib/librte_telemetry/meson.build
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+sources = files('rte_telemetry.c', 'rte_telemetry_parser.c', 'rte_telemetry_parser_test.c')
+headers = files('rte_telemetry.h', 'rte_telemetry_internal.h', 'rte_telemetry_parser.h', 'rte_telemetry_parser_test.h')
+deps += ['metrics', 'ethdev']
+cflags += '-DALLOW_EXPERIMENTAL_API'
+
+jansson = cc.find_library('jansson', required: false)
+if jansson.found()
+ ext_deps += jansson
+ dpdk_app_link_libraries += ['telemetry']
+else
+ build = false
+endif
diff --git a/lib/librte_telemetry/rte_telemetry.c b/lib/librte_telemetry/rte_telemetry.c
new file mode 100644
index 00000000..016431f1
--- /dev/null
+++ b/lib/librte_telemetry/rte_telemetry.c
@@ -0,0 +1,1813 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <jansson.h>
+
+#include <rte_eal.h>
+#include <rte_ethdev.h>
+#include <rte_metrics.h>
+#include <rte_option.h>
+#include <rte_string_fns.h>
+
+#include "rte_telemetry.h"
+#include "rte_telemetry_internal.h"
+#include "rte_telemetry_parser.h"
+#include "rte_telemetry_parser_test.h"
+#include "rte_telemetry_socket_tests.h"
+
+#define BUF_SIZE 1024
+#define ACTION_POST 1
+#define SLEEP_TIME 10
+
+#define SELFTEST_VALID_CLIENT "/var/run/dpdk/valid_client"
+#define SELFTEST_INVALID_CLIENT "/var/run/dpdk/invalid_client"
+#define SOCKET_TEST_CLIENT_PATH "/var/run/dpdk/client"
+
+static telemetry_impl *static_telemetry;
+
+struct telemetry_message_test {
+ char *test_name;
+ int (*test_func_ptr)(struct telemetry_impl *telemetry, int fd);
+};
+
+struct json_data {
+ char *status_code;
+ char *data;
+ int port;
+ char *stat_name;
+ int stat_value;
+};
+
+static void
+rte_telemetry_get_runtime_dir(char *socket_path, size_t size)
+{
+ snprintf(socket_path, size, "%s/telemetry", rte_eal_get_runtime_dir());
+}
+
+int32_t
+rte_telemetry_is_port_active(int port_id)
+{
+ int ret;
+
+ ret = rte_eth_find_next(port_id);
+ if (ret == port_id)
+ return 1;
+
+ TELEMETRY_LOG_ERR("port_id: %d is invalid, not active",
+ port_id);
+
+ return 0;
+}
+
+static int32_t
+rte_telemetry_update_metrics_ethdev(struct telemetry_impl *telemetry,
+ uint16_t port_id, int reg_start_index)
+{
+ int ret, num_xstats, i;
+ struct rte_eth_xstat *eth_xstats;
+
+ if (!rte_eth_dev_is_valid_port(port_id)) {
+ TELEMETRY_LOG_ERR("port_id: %d is invalid", port_id);
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+ }
+
+ ret = rte_telemetry_is_port_active(port_id);
+ if (ret < 1) {
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+ }
+
+ num_xstats = rte_eth_xstats_get(port_id, NULL, 0);
+ if (num_xstats < 0) {
+ TELEMETRY_LOG_ERR("rte_eth_xstats_get(%u) failed: %d", port_id,
+ num_xstats);
+ ret = rte_telemetry_send_error_response(telemetry, -EPERM);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+ }
+
+ eth_xstats = malloc(sizeof(struct rte_eth_xstat) * num_xstats);
+ if (eth_xstats == NULL) {
+ TELEMETRY_LOG_ERR("Failed to malloc memory for xstats");
+ ret = rte_telemetry_send_error_response(telemetry, -ENOMEM);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+ }
+
+ ret = rte_eth_xstats_get(port_id, eth_xstats, num_xstats);
+ if (ret < 0 || ret > num_xstats) {
+ free(eth_xstats);
+ TELEMETRY_LOG_ERR("rte_eth_xstats_get(%u) len%i failed: %d",
+ port_id, num_xstats, ret);
+ ret = rte_telemetry_send_error_response(telemetry, -EPERM);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+ }
+
+ uint64_t xstats_values[num_xstats];
+ for (i = 0; i < num_xstats; i++)
+ xstats_values[i] = eth_xstats[i].value;
+
+ ret = rte_metrics_update_values(port_id, reg_start_index, xstats_values,
+ num_xstats);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not update metrics values");
+ ret = rte_telemetry_send_error_response(telemetry, -EPERM);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ free(eth_xstats);
+ return -1;
+ }
+
+ free(eth_xstats);
+ return 0;
+}
+
+int32_t
+rte_telemetry_write_to_socket(struct telemetry_impl *telemetry,
+ const char *json_string)
+{
+ int ret;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Could not initialise TELEMETRY_API");
+ return -1;
+ }
+
+ if (telemetry->request_client == NULL) {
+ TELEMETRY_LOG_ERR("No client has been chosen to write to");
+ return -1;
+ }
+
+ if (json_string == NULL) {
+ TELEMETRY_LOG_ERR("Invalid JSON string!");
+ return -1;
+ }
+
+ ret = send(telemetry->request_client->fd,
+ json_string, strlen(json_string), 0);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Failed to write to socket for client: %s",
+ telemetry->request_client->file_path);
+ return -1;
+ }
+
+ return 0;
+}
+
+int32_t
+rte_telemetry_send_error_response(struct telemetry_impl *telemetry,
+ int error_type)
+{
+ int ret;
+ const char *status_code, *json_buffer;
+ json_t *root;
+
+ if (error_type == -EPERM)
+ status_code = "Status Error: Unknown";
+ else if (error_type == -EINVAL)
+ status_code = "Status Error: Invalid Argument 404";
+ else if (error_type == -ENOMEM)
+ status_code = "Status Error: Memory Allocation Error";
+ else {
+ TELEMETRY_LOG_ERR("Invalid error type");
+ return -EINVAL;
+ }
+
+ root = json_object();
+
+ if (root == NULL) {
+ TELEMETRY_LOG_ERR("Could not create root JSON object");
+ return -EPERM;
+ }
+
+ ret = json_object_set_new(root, "status_code", json_string(status_code));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Status code field cannot be set");
+ json_decref(root);
+ return -EPERM;
+ }
+
+ ret = json_object_set_new(root, "data", json_null());
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Data field cannot be set");
+ json_decref(root);
+ return -EPERM;
+ }
+
+ json_buffer = json_dumps(root, 0);
+ json_decref(root);
+
+ ret = rte_telemetry_write_to_socket(telemetry, json_buffer);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not write to socket");
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+static int
+rte_telemetry_get_metrics(struct telemetry_impl *telemetry, uint32_t port_id,
+ struct rte_metric_value *metrics, struct rte_metric_name *names,
+ int num_metrics)
+{
+ int ret, num_values;
+
+ if (num_metrics < 0) {
+ TELEMETRY_LOG_ERR("Invalid metrics count");
+ goto einval_fail;
+ } else if (num_metrics == 0) {
+ TELEMETRY_LOG_ERR("No metrics to display (none have been registered)");
+ goto eperm_fail;
+ }
+
+ if (metrics == NULL) {
+ TELEMETRY_LOG_ERR("Metrics must be initialised.");
+ goto einval_fail;
+ }
+
+ if (names == NULL) {
+ TELEMETRY_LOG_ERR("Names must be initialised.");
+ goto einval_fail;
+ }
+
+ ret = rte_metrics_get_names(names, num_metrics);
+ if (ret < 0 || ret > num_metrics) {
+ TELEMETRY_LOG_ERR("Cannot get metrics names");
+ goto eperm_fail;
+ }
+
+ num_values = rte_metrics_get_values(port_id, NULL, 0);
+ ret = rte_metrics_get_values(port_id, metrics, num_values);
+ if (ret < 0 || ret > num_values) {
+ TELEMETRY_LOG_ERR("Cannot get metrics values");
+ goto eperm_fail;
+ }
+
+ return 0;
+
+eperm_fail:
+ ret = rte_telemetry_send_error_response(telemetry, -EPERM);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+
+einval_fail:
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+
+}
+
+static int32_t
+rte_telemetry_json_format_stat(struct telemetry_impl *telemetry, json_t *stats,
+ const char *metric_name, uint64_t metric_value)
+{
+ int ret;
+ json_t *stat = json_object();
+
+ if (stat == NULL) {
+ TELEMETRY_LOG_ERR("Could not create stat JSON object");
+ goto eperm_fail;
+ }
+
+ ret = json_object_set_new(stat, "name", json_string(metric_name));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Stat Name field cannot be set");
+ goto eperm_fail;
+ }
+
+ ret = json_object_set_new(stat, "value", json_integer(metric_value));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Stat Value field cannot be set");
+ goto eperm_fail;
+ }
+
+ ret = json_array_append_new(stats, stat);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Stat cannot be added to stats json array");
+ goto eperm_fail;
+ }
+
+ return 0;
+
+eperm_fail:
+ ret = rte_telemetry_send_error_response(telemetry, -EPERM);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+
+}
+
+static int32_t
+rte_telemetry_json_format_port(struct telemetry_impl *telemetry,
+ uint32_t port_id, json_t *ports, uint32_t *metric_ids,
+ uint32_t num_metric_ids)
+{
+ struct rte_metric_value *metrics = 0;
+ struct rte_metric_name *names = 0;
+ int num_metrics, ret, err_ret;
+ json_t *port, *stats;
+ uint32_t i;
+
+ num_metrics = rte_metrics_get_names(NULL, 0);
+ if (num_metrics < 0) {
+ TELEMETRY_LOG_ERR("Cannot get metrics count");
+ goto einval_fail;
+ } else if (num_metrics == 0) {
+ TELEMETRY_LOG_ERR("No metrics to display (none have been registered)");
+ goto eperm_fail;
+ }
+
+ metrics = malloc(sizeof(struct rte_metric_value) * num_metrics);
+ names = malloc(sizeof(struct rte_metric_name) * num_metrics);
+ if (metrics == NULL || names == NULL) {
+ TELEMETRY_LOG_ERR("Cannot allocate memory");
+ free(metrics);
+ free(names);
+
+ err_ret = rte_telemetry_send_error_response(telemetry, -ENOMEM);
+ if (err_ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+ }
+
+ ret = rte_telemetry_get_metrics(telemetry, port_id, metrics, names,
+ num_metrics);
+ if (ret < 0) {
+ free(metrics);
+ free(names);
+ TELEMETRY_LOG_ERR("rte_telemetry_get_metrics failed");
+ return -1;
+ }
+
+ port = json_object();
+ stats = json_array();
+ if (port == NULL || stats == NULL) {
+ TELEMETRY_LOG_ERR("Could not create port/stats JSON objects");
+ goto eperm_fail;
+ }
+
+ ret = json_object_set_new(port, "port", json_integer(port_id));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Port field cannot be set");
+ goto eperm_fail;
+ }
+
+ for (i = 0; i < num_metric_ids; i++) {
+ int metric_id = metric_ids[i];
+ int metric_index = -1;
+ int metric_name_key = -1;
+ int32_t j;
+ uint64_t metric_value;
+
+ if (metric_id >= num_metrics) {
+ TELEMETRY_LOG_ERR("Metric_id: %d is not valid",
+ metric_id);
+ goto einval_fail;
+ }
+
+ for (j = 0; j < num_metrics; j++) {
+ if (metrics[j].key == metric_id) {
+ metric_name_key = metrics[j].key;
+ metric_index = j;
+ break;
+ }
+ }
+
+		if (metric_name_key < 0 || metric_index < 0) {
+			TELEMETRY_LOG_ERR("Could not get metric name/index");
+			goto eperm_fail;
+		}
+
+		/* Dereference the arrays only once the indexes are known valid. */
+		const char *metric_name = names[metric_name_key].name;
+		metric_value = metrics[metric_index].value;
+
+ ret = rte_telemetry_json_format_stat(telemetry, stats,
+ metric_name, metric_value);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Format stat with id: %u failed",
+ metric_id);
+ free(metrics);
+ free(names);
+ return -1;
+ }
+ }
+
+ if (json_array_size(stats) == 0)
+ ret = json_object_set_new(port, "stats", json_null());
+ else
+ ret = json_object_set_new(port, "stats", stats);
+
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Stats object cannot be set");
+ goto eperm_fail;
+ }
+
+ ret = json_array_append_new(ports, port);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Port object cannot be added to ports array");
+ goto eperm_fail;
+ }
+
+ free(metrics);
+ free(names);
+ return 0;
+
+eperm_fail:
+ free(metrics);
+ free(names);
+ ret = rte_telemetry_send_error_response(telemetry, -EPERM);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+
+einval_fail:
+ free(metrics);
+ free(names);
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+}
+
+static int32_t
+rte_telemetry_encode_json_format(struct telemetry_impl *telemetry,
+ uint32_t *port_ids, uint32_t num_port_ids, uint32_t *metric_ids,
+ uint32_t num_metric_ids, char **json_buffer)
+{
+ int ret;
+ json_t *root, *ports;
+ uint32_t i;
+
+ if (num_port_ids <= 0 || num_metric_ids <= 0) {
+ TELEMETRY_LOG_ERR("Please provide port and metric ids to query");
+ goto einval_fail;
+ }
+
+ ports = json_array();
+ if (ports == NULL) {
+ TELEMETRY_LOG_ERR("Could not create ports JSON array");
+ goto eperm_fail;
+ }
+
+ for (i = 0; i < num_port_ids; i++) {
+ if (!rte_eth_dev_is_valid_port(port_ids[i])) {
+ TELEMETRY_LOG_ERR("Port: %d invalid", port_ids[i]);
+ goto einval_fail;
+ }
+ }
+
+ for (i = 0; i < num_port_ids; i++) {
+ ret = rte_telemetry_json_format_port(telemetry, port_ids[i],
+ ports, metric_ids, num_metric_ids);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Format port in JSON failed");
+ return -1;
+ }
+ }
+
+ root = json_object();
+ if (root == NULL) {
+ TELEMETRY_LOG_ERR("Could not create root JSON object");
+ goto eperm_fail;
+ }
+
+ ret = json_object_set_new(root, "status_code",
+ json_string("Status OK: 200"));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Status code field cannot be set");
+ goto eperm_fail;
+ }
+
+ ret = json_object_set_new(root, "data", ports);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Data field cannot be set");
+ goto eperm_fail;
+ }
+
+ *json_buffer = json_dumps(root, JSON_INDENT(2));
+ json_decref(root);
+ return 0;
+
+eperm_fail:
+ ret = rte_telemetry_send_error_response(telemetry, -EPERM);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+
+einval_fail:
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+}
+
+int32_t
+rte_telemetry_send_ports_stats_values(uint32_t *metric_ids, int num_metric_ids,
+ uint32_t *port_ids, int num_port_ids, struct telemetry_impl *telemetry)
+{
+ int ret, i;
+ char *json_buffer = NULL;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Invalid telemetry argument");
+ return -1;
+ }
+
+ if (metric_ids == NULL) {
+ TELEMETRY_LOG_ERR("Invalid metric_ids array");
+ goto einval_fail;
+ }
+
+ if (num_metric_ids < 0) {
+ TELEMETRY_LOG_ERR("Invalid num_metric_ids, must be positive");
+ goto einval_fail;
+ }
+
+ if (port_ids == NULL) {
+ TELEMETRY_LOG_ERR("Invalid port_ids array");
+ goto einval_fail;
+ }
+
+ if (num_port_ids < 0) {
+ TELEMETRY_LOG_ERR("Invalid num_port_ids, must be positive");
+ goto einval_fail;
+ }
+
+ for (i = 0; i < num_port_ids; i++) {
+ if (!rte_eth_dev_is_valid_port(port_ids[i])) {
+ TELEMETRY_LOG_ERR("Port: %d invalid", port_ids[i]);
+ goto einval_fail;
+ }
+
+ ret = rte_telemetry_update_metrics_ethdev(telemetry,
+ port_ids[i], telemetry->reg_index);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Failed to update ethdev metrics");
+ return -1;
+ }
+ }
+
+ ret = rte_telemetry_encode_json_format(telemetry, port_ids,
+ num_port_ids, metric_ids, num_metric_ids, &json_buffer);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("JSON encode function failed");
+ return -1;
+ }
+
+ ret = rte_telemetry_write_to_socket(telemetry, json_buffer);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not write to socket");
+ return -1;
+ }
+
+ return 0;
+
+einval_fail:
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+}
+
+
+static int32_t
+rte_telemetry_reg_ethdev_to_metrics(uint16_t port_id)
+{
+ int ret, num_xstats, ret_val, i;
+ struct rte_eth_xstat *eth_xstats = NULL;
+ struct rte_eth_xstat_name *eth_xstats_names = NULL;
+
+ if (!rte_eth_dev_is_valid_port(port_id)) {
+ TELEMETRY_LOG_ERR("port_id: %d is invalid", port_id);
+ return -EINVAL;
+ }
+
+ num_xstats = rte_eth_xstats_get(port_id, NULL, 0);
+ if (num_xstats < 0) {
+ TELEMETRY_LOG_ERR("rte_eth_xstats_get(%u) failed: %d",
+ port_id, num_xstats);
+ return -EPERM;
+ }
+
+ eth_xstats = malloc(sizeof(struct rte_eth_xstat) * num_xstats);
+ if (eth_xstats == NULL) {
+ TELEMETRY_LOG_ERR("Failed to malloc memory for xstats");
+ return -ENOMEM;
+ }
+
+ ret = rte_eth_xstats_get(port_id, eth_xstats, num_xstats);
+ const char *xstats_names[num_xstats];
+ eth_xstats_names = malloc(sizeof(struct rte_eth_xstat_name) * num_xstats);
+ if (ret < 0 || ret > num_xstats) {
+ TELEMETRY_LOG_ERR("rte_eth_xstats_get(%u) len%i failed: %d",
+ port_id, num_xstats, ret);
+ ret_val = -EPERM;
+ goto free_xstats;
+ }
+
+ if (eth_xstats_names == NULL) {
+ TELEMETRY_LOG_ERR("Failed to malloc memory for xstats_names");
+ ret_val = -ENOMEM;
+ goto free_xstats;
+ }
+
+ ret = rte_eth_xstats_get_names(port_id, eth_xstats_names, num_xstats);
+ if (ret < 0 || ret > num_xstats) {
+ TELEMETRY_LOG_ERR("rte_eth_xstats_get_names(%u) len%i failed: %d",
+ port_id, num_xstats, ret);
+ ret_val = -EPERM;
+ goto free_xstats;
+ }
+
+ for (i = 0; i < num_xstats; i++)
+ xstats_names[i] = eth_xstats_names[eth_xstats[i].id].name;
+
+ ret_val = rte_metrics_reg_names(xstats_names, num_xstats);
+ if (ret_val < 0) {
+ TELEMETRY_LOG_ERR("rte_metrics_reg_names failed - metrics may already be registered");
+ ret_val = -1;
+ goto free_xstats;
+ }
+
+ goto free_xstats;
+
+free_xstats:
+ free(eth_xstats);
+ free(eth_xstats_names);
+ return ret_val;
+}
+
+static int32_t
+rte_telemetry_initial_accept(struct telemetry_impl *telemetry)
+{
+ uint16_t pid;
+ int ret;
+ int selftest = 0;
+
+ RTE_ETH_FOREACH_DEV(pid) {
+ telemetry->reg_index = rte_telemetry_reg_ethdev_to_metrics(pid);
+ break;
+ }
+
+ if (telemetry->reg_index < 0) {
+ TELEMETRY_LOG_ERR("Failed to register ethdev metrics");
+ return -1;
+ }
+
+ telemetry->metrics_register_done = 1;
+ if (selftest) {
+ ret = rte_telemetry_socket_messaging_testing(telemetry->reg_index,
+ telemetry->server_fd);
+ if (ret < 0)
+ return -1;
+
+ ret = rte_telemetry_parser_test(telemetry);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Parser Tests Failed");
+ return -1;
+ }
+
+ TELEMETRY_LOG_INFO("Success - All Parser Tests Passed");
+ }
+
+ return 0;
+}
+
+static int32_t
+rte_telemetry_read_client(struct telemetry_impl *telemetry)
+{
+ char buf[BUF_SIZE];
+ int ret, buffer_read;
+
+ buffer_read = read(telemetry->accept_fd, buf, BUF_SIZE-1);
+
+ if (buffer_read == -1) {
+ TELEMETRY_LOG_ERR("Read error");
+ return -1;
+ } else if (buffer_read == 0) {
+ goto close_socket;
+ } else {
+ buf[buffer_read] = '\0';
+ ret = rte_telemetry_parse_client_message(telemetry, buf);
+ if (ret < 0)
+ TELEMETRY_LOG_WARN("Parse message failed");
+ goto close_socket;
+ }
+
+close_socket:
+ if (close(telemetry->accept_fd) < 0) {
+ TELEMETRY_LOG_ERR("Close TELEMETRY socket failed");
+ free(telemetry);
+ return -EPERM;
+ }
+ telemetry->accept_fd = 0;
+
+ return 0;
+}
+
+static int32_t
+rte_telemetry_accept_new_client(struct telemetry_impl *telemetry)
+{
+ int ret;
+
+ if (telemetry->accept_fd <= 0) {
+ ret = listen(telemetry->server_fd, 1);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Listening error with server fd");
+ return -1;
+ }
+
+ telemetry->accept_fd = accept(telemetry->server_fd, NULL, NULL);
+ if (telemetry->accept_fd >= 0 &&
+ telemetry->metrics_register_done == 0) {
+ ret = rte_telemetry_initial_accept(telemetry);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Failed to run initial configurations/tests");
+ return -1;
+ }
+ }
+ } else {
+ ret = rte_telemetry_read_client(telemetry);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Failed to read socket buffer");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int32_t
+rte_telemetry_read_client_sockets(struct telemetry_impl *telemetry)
+{
+ int ret;
+ telemetry_client *client;
+ char client_buf[BUF_SIZE];
+ int bytes;
+
+ TAILQ_FOREACH(client, &telemetry->client_list_head, client_list) {
+ bytes = read(client->fd, client_buf, BUF_SIZE-1);
+
+ if (bytes > 0) {
+ client_buf[bytes] = '\0';
+ telemetry->request_client = client;
+ ret = rte_telemetry_parse(telemetry, client_buf);
+ if (ret < 0) {
+ TELEMETRY_LOG_WARN("Parse socket input failed: %i",
+ ret);
+ return -1;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int32_t
+rte_telemetry_run(void *userdata)
+{
+ int ret;
+ struct telemetry_impl *telemetry = userdata;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_WARN("TELEMETRY could not be initialised");
+ return -1;
+ }
+
+ ret = rte_telemetry_accept_new_client(telemetry);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Accept and read new client failed");
+ return -1;
+ }
+
+ ret = rte_telemetry_read_client_sockets(telemetry);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Client socket read failed");
+ return -1;
+ }
+
+ return 0;
+}
+
+static void
+*rte_telemetry_run_thread_func(void *userdata)
+{
+ int ret;
+ struct telemetry_impl *telemetry = userdata;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("%s passed a NULL instance", __func__);
+ pthread_exit(0);
+ }
+
+ while (telemetry->thread_status) {
+ rte_telemetry_run(telemetry);
+ ret = usleep(SLEEP_TIME);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Calling thread could not be put to sleep");
+ }
+ pthread_exit(0);
+}
+
+static int32_t
+rte_telemetry_set_socket_nonblock(int fd)
+{
+ int flags;
+
+ if (fd < 0) {
+ TELEMETRY_LOG_ERR("Invalid fd provided");
+ return -1;
+ }
+
+ flags = fcntl(fd, F_GETFL, 0);
+ if (flags < 0)
+ flags = 0;
+
+ return fcntl(fd, F_SETFL, flags | O_NONBLOCK);
+}
+
+static int32_t
+rte_telemetry_create_socket(struct telemetry_impl *telemetry)
+{
+ int ret;
+ struct sockaddr_un addr;
+ char socket_path[BUF_SIZE];
+
+ if (telemetry == NULL)
+ return -1;
+
+ telemetry->server_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+ if (telemetry->server_fd == -1) {
+ TELEMETRY_LOG_ERR("Failed to open socket");
+ return -1;
+ }
+
+ ret = rte_telemetry_set_socket_nonblock(telemetry->server_fd);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not set socket to NONBLOCK");
+ goto close_socket;
+ }
+
+ addr.sun_family = AF_UNIX;
+ rte_telemetry_get_runtime_dir(socket_path, sizeof(socket_path));
+ strlcpy(addr.sun_path, socket_path, sizeof(addr.sun_path));
+ unlink(socket_path);
+
+ if (bind(telemetry->server_fd, (struct sockaddr *)&addr,
+ sizeof(addr)) < 0) {
+ TELEMETRY_LOG_ERR("Socket binding error");
+ goto close_socket;
+ }
+
+ return 0;
+
+close_socket:
+ if (close(telemetry->server_fd) < 0) {
+ TELEMETRY_LOG_ERR("Close TELEMETRY socket failed");
+ return -EPERM;
+ }
+
+ return -1;
+}
+
+int32_t __rte_experimental
+rte_telemetry_init()
+{
+ int ret;
+ pthread_attr_t attr;
+ const char *telemetry_ctrl_thread = "telemetry";
+
+ if (static_telemetry) {
+ TELEMETRY_LOG_WARN("TELEMETRY structure already initialised");
+ return -EALREADY;
+ }
+
+ static_telemetry = calloc(1, sizeof(struct telemetry_impl));
+ if (static_telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Memory could not be allocated");
+ return -ENOMEM;
+ }
+
+ static_telemetry->socket_id = rte_socket_id();
+ rte_metrics_init(static_telemetry->socket_id);
+
+ ret = pthread_attr_init(&attr);
+ if (ret != 0) {
+ TELEMETRY_LOG_ERR("Pthread attribute init failed");
+ return -EPERM;
+ }
+
+ ret = rte_telemetry_create_socket(static_telemetry);
+ if (ret < 0) {
+ ret = rte_telemetry_cleanup();
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("TELEMETRY cleanup failed");
+ return -EPERM;
+ }
+ TAILQ_INIT(&static_telemetry->client_list_head);
+
+ ret = rte_ctrl_thread_create(&static_telemetry->thread_id,
+ telemetry_ctrl_thread, &attr, rte_telemetry_run_thread_func,
+ (void *)static_telemetry);
+ static_telemetry->thread_status = 1;
+
+ if (ret < 0) {
+ ret = rte_telemetry_cleanup();
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("TELEMETRY cleanup failed");
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+static int32_t
+rte_telemetry_client_cleanup(struct telemetry_client *client)
+{
+ int ret;
+
+ ret = close(client->fd);
+ free(client->file_path);
+ free(client);
+
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Close client socket failed");
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+int32_t __rte_experimental
+rte_telemetry_cleanup(void)
+{
+ int ret;
+ struct telemetry_impl *telemetry = static_telemetry;
+ telemetry_client *client, *temp_client;
+
+ TAILQ_FOREACH_SAFE(client, &telemetry->client_list_head, client_list,
+ temp_client) {
+ TAILQ_REMOVE(&telemetry->client_list_head, client, client_list);
+ ret = rte_telemetry_client_cleanup(client);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Client cleanup failed");
+ return -EPERM;
+ }
+ }
+
+ ret = close(telemetry->server_fd);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Close TELEMETRY socket failed");
+ free(telemetry);
+ return -EPERM;
+ }
+
+ telemetry->thread_status = 0;
+ pthread_join(telemetry->thread_id, NULL);
+ free(telemetry);
+ static_telemetry = NULL;
+
+ return 0;
+}
+
+int32_t
+rte_telemetry_unregister_client(struct telemetry_impl *telemetry,
+ const char *client_path)
+{
+ int ret;
+ telemetry_client *client, *temp_client;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_WARN("TELEMETRY is not initialised");
+ return -ENODEV;
+ }
+
+ if (client_path == NULL) {
+ TELEMETRY_LOG_ERR("Invalid client path");
+ goto einval_fail;
+ }
+
+ if (TAILQ_EMPTY(&telemetry->client_list_head)) {
+ TELEMETRY_LOG_ERR("There are no clients currently registered");
+ return -EPERM;
+ }
+
+ TAILQ_FOREACH_SAFE(client, &telemetry->client_list_head, client_list,
+ temp_client) {
+ if (strcmp(client_path, client->file_path) == 0) {
+ TAILQ_REMOVE(&telemetry->client_list_head, client,
+ client_list);
+ ret = rte_telemetry_client_cleanup(client);
+
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Client cleanup failed");
+ return -EPERM;
+ }
+
+ return 0;
+ }
+ }
+
+ TELEMETRY_LOG_WARN("Couldn't find client, possibly not registered yet.");
+ return -1;
+
+einval_fail:
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -EINVAL;
+}
+
+int32_t
+rte_telemetry_register_client(struct telemetry_impl *telemetry,
+ const char *client_path)
+{
+ int ret, fd;
+ struct sockaddr_un addrs;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Could not initialize TELEMETRY API");
+ return -ENODEV;
+ }
+
+ if (client_path == NULL) {
+ TELEMETRY_LOG_ERR("Invalid client path");
+ return -EINVAL;
+ }
+
+ telemetry_client *client;
+ TAILQ_FOREACH(client, &telemetry->client_list_head, client_list) {
+ if (strcmp(client_path, client->file_path) == 0) {
+ TELEMETRY_LOG_WARN("'%s' already registered",
+ client_path);
+ return -EINVAL;
+ }
+ }
+
+ fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+ if (fd == -1) {
+ TELEMETRY_LOG_ERR("Client socket error");
+ return -EACCES;
+ }
+
+	ret = rte_telemetry_set_socket_nonblock(fd);
+	if (ret < 0) {
+		TELEMETRY_LOG_ERR("Could not set socket to NONBLOCK");
+		close(fd);
+		return -EPERM;
+	}
+
+	addrs.sun_family = AF_UNIX;
+	strlcpy(addrs.sun_path, client_path, sizeof(addrs.sun_path));
+	telemetry_client *new_client = malloc(sizeof(telemetry_client));
+	if (new_client == NULL) {
+		TELEMETRY_LOG_ERR("Failed to malloc client");
+		close(fd);
+		return -ENOMEM;
+	}
+	new_client->file_path = strdup(client_path);
+	new_client->fd = fd;
+
+ if (connect(fd, (struct sockaddr *)&addrs, sizeof(addrs)) == -1) {
+ TELEMETRY_LOG_ERR("TELEMETRY client connect to %s didn't work",
+ client_path);
+ ret = rte_telemetry_client_cleanup(new_client);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Client cleanup failed");
+ return -EPERM;
+ }
+ return -EINVAL;
+ }
+
+ TAILQ_INSERT_HEAD(&telemetry->client_list_head, new_client, client_list);
+
+ return 0;
+}
+
+int32_t
+rte_telemetry_parse_client_message(struct telemetry_impl *telemetry, char *buf)
+{
+ int ret, action_int;
+ json_error_t error;
+ json_t *root = json_loads(buf, 0, &error);
+
+ if (root == NULL) {
+ TELEMETRY_LOG_WARN("Could not load JSON object from data passed in : %s",
+ error.text);
+ goto fail;
+ } else if (!json_is_object(root)) {
+ TELEMETRY_LOG_WARN("JSON Request is not a JSON object");
+ goto fail;
+ }
+
+ json_t *action = json_object_get(root, "action");
+ if (action == NULL) {
+ TELEMETRY_LOG_WARN("Request does not have action field");
+ goto fail;
+ } else if (!json_is_integer(action)) {
+ TELEMETRY_LOG_WARN("Action value is not an integer");
+ goto fail;
+ }
+
+ json_t *command = json_object_get(root, "command");
+ if (command == NULL) {
+ TELEMETRY_LOG_WARN("Request does not have command field");
+ goto fail;
+ } else if (!json_is_string(command)) {
+ TELEMETRY_LOG_WARN("Command value is not a string");
+ goto fail;
+ }
+
+ action_int = json_integer_value(action);
+ if (action_int != ACTION_POST) {
+ TELEMETRY_LOG_WARN("Invalid action code");
+ goto fail;
+ }
+
+ if (strcmp(json_string_value(command), "clients") != 0) {
+ TELEMETRY_LOG_WARN("Invalid command");
+ goto fail;
+ }
+
+ json_t *data = json_object_get(root, "data");
+ if (data == NULL) {
+ TELEMETRY_LOG_WARN("Request does not have data field");
+ goto fail;
+ }
+
+ json_t *client_path = json_object_get(data, "client_path");
+ if (client_path == NULL) {
+ TELEMETRY_LOG_WARN("Request does not have client_path field");
+ goto fail;
+ }
+
+ if (!json_is_string(client_path)) {
+ TELEMETRY_LOG_WARN("Client_path value is not a string");
+ goto fail;
+ }
+
+ ret = rte_telemetry_register_client(telemetry,
+ json_string_value(client_path));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not register client");
+ telemetry->register_fail_count++;
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ TELEMETRY_LOG_WARN("Client attempted to register with invalid message");
+ json_decref(root);
+ return -1;
+}
+
+int32_t
+rte_telemetry_dummy_client_socket(const char *valid_client_path)
+{
+ int sockfd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+ struct sockaddr_un addr = {0};
+
+ if (sockfd < 0) {
+ TELEMETRY_LOG_ERR("Test socket creation failure");
+ return -1;
+ }
+
+ addr.sun_family = AF_UNIX;
+ strlcpy(addr.sun_path, valid_client_path, sizeof(addr.sun_path));
+ unlink(valid_client_path);
+
+ if (bind(sockfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+ TELEMETRY_LOG_ERR("Test socket binding failure");
+ return -1;
+ }
+
+ if (listen(sockfd, 1) < 0) {
+ TELEMETRY_LOG_ERR("Listen failure");
+ return -1;
+ }
+
+ return sockfd;
+}
+
+int32_t __rte_experimental
+rte_telemetry_selftest(void)
+{
+ const char *invalid_client_path = SELFTEST_INVALID_CLIENT;
+ const char *valid_client_path = SELFTEST_VALID_CLIENT;
+ int ret, sockfd;
+
+ TELEMETRY_LOG_INFO("Selftest");
+
+ ret = rte_telemetry_init();
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Valid initialisation test failed");
+ return -1;
+ }
+
+ TELEMETRY_LOG_INFO("Success - Valid initialisation test passed");
+
+ ret = rte_telemetry_init();
+ if (ret != -EALREADY) {
+ TELEMETRY_LOG_ERR("Invalid initialisation test failed");
+ return -1;
+ }
+
+ TELEMETRY_LOG_INFO("Success - Invalid initialisation test passed");
+
+ ret = rte_telemetry_unregister_client(static_telemetry,
+ invalid_client_path);
+ if (ret != -EPERM) {
+ TELEMETRY_LOG_ERR("Invalid unregister test failed");
+ return -1;
+ }
+
+ TELEMETRY_LOG_INFO("Success - Invalid unregister test passed");
+
+ sockfd = rte_telemetry_dummy_client_socket(valid_client_path);
+ if (sockfd < 0) {
+ TELEMETRY_LOG_ERR("Test socket creation failed");
+ return -1;
+ }
+
+ ret = rte_telemetry_register_client(static_telemetry, valid_client_path);
+ if (ret != 0) {
+ TELEMETRY_LOG_ERR("Valid register test failed: %i", ret);
+ return -1;
+ }
+
+ accept(sockfd, NULL, NULL);
+ TELEMETRY_LOG_INFO("Success - Valid register test passed");
+
+ ret = rte_telemetry_register_client(static_telemetry, valid_client_path);
+ if (ret != -EINVAL) {
+ TELEMETRY_LOG_ERR("Invalid register test failed: %i", ret);
+ return -1;
+ }
+
+ TELEMETRY_LOG_INFO("Success - Invalid register test passed");
+
+ ret = rte_telemetry_unregister_client(static_telemetry,
+ invalid_client_path);
+ if (ret != -1) {
+ TELEMETRY_LOG_ERR("Invalid unregister test failed: %i", ret);
+ return -1;
+ }
+
+ TELEMETRY_LOG_INFO("Success - Invalid unregister test passed");
+
+ ret = rte_telemetry_unregister_client(static_telemetry, valid_client_path);
+ if (ret != 0) {
+ TELEMETRY_LOG_ERR("Valid unregister test failed: %i", ret);
+ return -1;
+ }
+
+ TELEMETRY_LOG_INFO("Success - Valid unregister test passed");
+
+ ret = rte_telemetry_cleanup();
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Cleanup test failed");
+ return -1;
+ }
+
+ TELEMETRY_LOG_INFO("Success - Valid cleanup test passed");
+
+ return 0;
+}
+
+int32_t
+rte_telemetry_socket_messaging_testing(int index, int socket)
+{
+ struct telemetry_impl *telemetry = calloc(1, sizeof(telemetry_impl));
+ int fd, bad_send_fd, send_fd, bad_fd, bad_recv_fd, recv_fd, ret;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Could not initialize Telemetry API");
+ return -1;
+ }
+
+ telemetry->server_fd = socket;
+ telemetry->reg_index = index;
+ TELEMETRY_LOG_INFO("Beginning Telemetry socket message Selftest");
+ rte_telemetry_socket_test_setup(telemetry, &send_fd, &recv_fd);
+ TELEMETRY_LOG_INFO("Register valid client test");
+
+ ret = rte_telemetry_socket_register_test(telemetry, &fd, send_fd,
+ recv_fd);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Register valid client test failed!");
+ free(telemetry);
+ return -1;
+ }
+
+ TELEMETRY_LOG_INFO("Success - Register valid client test passed!");
+
+ TELEMETRY_LOG_INFO("Register invalid/same client test");
+ ret = rte_telemetry_socket_test_setup(telemetry, &bad_send_fd,
+ &bad_recv_fd);
+ ret = rte_telemetry_socket_register_test(telemetry, &bad_fd,
+ bad_send_fd, bad_recv_fd);
+ if (!ret) {
+ TELEMETRY_LOG_ERR("Register invalid/same client test failed!");
+ free(telemetry);
+ return -1;
+ }
+
+ TELEMETRY_LOG_INFO("Success - Register invalid/same client test passed!");
+
+ ret = rte_telemetry_json_socket_message_test(telemetry, fd);
+ if (ret < 0) {
+ free(telemetry);
+ return -1;
+ }
+
+ free(telemetry);
+ return 0;
+}
+
+int32_t
+rte_telemetry_socket_register_test(struct telemetry_impl *telemetry, int *fd,
+ int send_fd, int recv_fd)
+{
+ int ret;
+ char good_req_string[BUF_SIZE];
+
+ snprintf(good_req_string, sizeof(good_req_string),
+ "{\"action\":1,\"command\":\"clients\",\"data\":{\"client_path\""
+ ":\"%s\"}}", SOCKET_TEST_CLIENT_PATH);
+
+ listen(recv_fd, 1);
+
+ ret = send(send_fd, good_req_string, strlen(good_req_string), 0);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not send message over socket");
+ return -1;
+ }
+
+ rte_telemetry_run(telemetry);
+
+ if (telemetry->register_fail_count != 0)
+ return -1;
+
+ *fd = accept(recv_fd, NULL, NULL);
+
+ return 0;
+}
+
+int32_t
+rte_telemetry_socket_test_setup(struct telemetry_impl *telemetry, int *send_fd,
+ int *recv_fd)
+{
+ int ret;
+ const char *client_path = SOCKET_TEST_CLIENT_PATH;
+ char socket_path[BUF_SIZE];
+ struct sockaddr_un addr = {0};
+ struct sockaddr_un addrs = {0};
+ *send_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+ *recv_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+
+ listen(telemetry->server_fd, 5);
+ addr.sun_family = AF_UNIX;
+ rte_telemetry_get_runtime_dir(socket_path, sizeof(socket_path));
+ strlcpy(addr.sun_path, socket_path, sizeof(addr.sun_path));
+
+ ret = connect(*send_fd, (struct sockaddr *) &addr, sizeof(addr));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not connect socket");
+ return -1;
+ }
+
+ telemetry->accept_fd = accept(telemetry->server_fd, NULL, NULL);
+
+ addrs.sun_family = AF_UNIX;
+ strlcpy(addrs.sun_path, client_path, sizeof(addrs.sun_path));
+ unlink(client_path);
+
+ ret = bind(*recv_fd, (struct sockaddr *)&addrs, sizeof(addrs));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not bind socket");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int32_t
+rte_telemetry_stat_parse(char *buf, struct json_data *json_data_struct)
+{
+ json_error_t error;
+ json_t *root = json_loads(buf, 0, &error);
+ int arraylen, i;
+ json_t *status, *dataArray, *port, *stats, *name, *value, *dataArrayObj,
+ *statsArrayObj;
+
+ stats = NULL;
+ port = NULL;
+ name = NULL;
+
+ if (buf == NULL) {
+ TELEMETRY_LOG_ERR("JSON message is NULL");
+ return -EINVAL;
+ }
+
+ if (root == NULL) {
+ TELEMETRY_LOG_ERR("Could not load JSON object from data passed in : %s",
+ error.text);
+ return -EPERM;
+ } else if (!json_is_object(root)) {
+ TELEMETRY_LOG_ERR("JSON Request is not a JSON object");
+ json_decref(root);
+ return -EINVAL;
+ }
+
+ status = json_object_get(root, "status_code");
+ if (!status) {
+ TELEMETRY_LOG_ERR("Request does not have status field");
+ return -EINVAL;
+ } else if (!json_is_string(status)) {
+ TELEMETRY_LOG_ERR("Status value is not a string");
+ return -EINVAL;
+ }
+
+ json_data_struct->status_code = strdup(json_string_value(status));
+
+ dataArray = json_object_get(root, "data");
+ if (dataArray == NULL) {
+ TELEMETRY_LOG_ERR("Request does not have data field");
+ return -EINVAL;
+ }
+
+ arraylen = json_array_size(dataArray);
+ if (arraylen == 0) {
+ json_data_struct->data = "null";
+ return -EINVAL;
+ }
+
+ for (i = 0; i < arraylen; i++) {
+ dataArrayObj = json_array_get(dataArray, i);
+ port = json_object_get(dataArrayObj, "port");
+ stats = json_object_get(dataArrayObj, "stats");
+ }
+
+ if (port == NULL) {
+ TELEMETRY_LOG_ERR("Request does not have port field");
+ return -EINVAL;
+ }
+
+ if (!json_is_integer(port)) {
+ TELEMETRY_LOG_ERR("Port value is not an integer");
+ return -EINVAL;
+ }
+
+ json_data_struct->port = json_integer_value(port);
+
+ if (stats == NULL) {
+ TELEMETRY_LOG_ERR("Request does not have stats field");
+ return -EINVAL;
+ }
+
+ arraylen = json_array_size(stats);
+ for (i = 0; i < arraylen; i++) {
+ statsArrayObj = json_array_get(stats, i);
+ name = json_object_get(statsArrayObj, "name");
+ value = json_object_get(statsArrayObj, "value");
+ }
+
+ if (name == NULL) {
+ TELEMETRY_LOG_ERR("Request does not have name field");
+ return -EINVAL;
+ }
+
+ if (!json_is_string(name)) {
+ TELEMETRY_LOG_ERR("Stat name value is not a string");
+ return -EINVAL;
+ }
+
+ json_data_struct->stat_name = strdup(json_string_value(name));
+
+ if (value == NULL) {
+ TELEMETRY_LOG_ERR("Request does not have value field");
+ return -EINVAL;
+ }
+
+ if (!json_is_integer(value)) {
+ TELEMETRY_LOG_ERR("Stat value is not an integer");
+ return -EINVAL;
+ }
+
+ json_data_struct->stat_value = json_integer_value(value);
+
+ return 0;
+}
+
+static void
+rte_telemetry_free_test_data(struct json_data *data)
+{
+ free(data->status_code);
+ free(data->stat_name);
+ free(data);
+}
+
+int32_t
+rte_telemetry_valid_json_test(struct telemetry_impl *telemetry, int fd)
+{
+ int ret;
+ int port = 0;
+ int value = 0;
+ int fail_count = 0;
+ int buffer_read = 0;
+ char buf[BUF_SIZE];
+ struct json_data *data_struct;
+ errno = 0;
+ const char *status = "Status OK: 200";
+ const char *name = "rx_good_packets";
+ const char *valid_json_message = "{\"action\":0,\"command\":"
+ "\"ports_stats_values_by_name\",\"data\":{\"ports\""
+ ":[0],\"stats\":[\"rx_good_packets\"]}}";
+
+ ret = send(fd, valid_json_message, strlen(valid_json_message), 0);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not send message over socket");
+ return -1;
+ }
+
+ rte_telemetry_run(telemetry);
+ buffer_read = recv(fd, buf, BUF_SIZE-1, 0);
+
+ if (buffer_read == -1) {
+ TELEMETRY_LOG_ERR("Read error");
+ return -1;
+ }
+
+ buf[buffer_read] = '\0';
+ data_struct = calloc(1, sizeof(struct json_data));
+ ret = rte_telemetry_stat_parse(buf, data_struct);
+
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not parse stats");
+ fail_count++;
+ }
+
+ if (strcmp(data_struct->status_code, status) != 0) {
+ TELEMETRY_LOG_ERR("Status code is invalid");
+ fail_count++;
+ }
+
+ if (data_struct->port != port) {
+ TELEMETRY_LOG_ERR("Port is invalid");
+ fail_count++;
+ }
+
+ if (strcmp(data_struct->stat_name, name) != 0) {
+ TELEMETRY_LOG_ERR("Stat name is invalid");
+ fail_count++;
+ }
+
+ if (data_struct->stat_value != value) {
+ TELEMETRY_LOG_ERR("Stat value is invalid");
+ fail_count++;
+ }
+
+ rte_telemetry_free_test_data(data_struct);
+ if (fail_count > 0)
+ return -1;
+
+ TELEMETRY_LOG_INFO("Success - Passed valid JSON message test passed");
+
+ return 0;
+}
+
+int32_t
+rte_telemetry_invalid_json_test(struct telemetry_impl *telemetry, int fd)
+{
+ int ret;
+ char buf[BUF_SIZE];
+ int fail_count = 0;
+ const char *invalid_json = "{]";
+ const char *status = "Status Error: Unknown";
+ const char *data = "null";
+ struct json_data *data_struct;
+ int buffer_read = 0;
+ errno = 0;
+
+ ret = send(fd, invalid_json, strlen(invalid_json), 0);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not send message over socket");
+ return -1;
+ }
+
+ rte_telemetry_run(telemetry);
+ buffer_read = recv(fd, buf, BUF_SIZE-1, 0);
+
+ if (buffer_read == -1) {
+ TELEMETRY_LOG_ERR("Read error");
+ return -1;
+ }
+
+ buf[buffer_read] = '\0';
+
+ data_struct = calloc(1, sizeof(struct json_data));
+ ret = rte_telemetry_stat_parse(buf, data_struct);
+
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not parse stats");
+
+ if (strcmp(data_struct->status_code, status) != 0) {
+ TELEMETRY_LOG_ERR("Status code is invalid");
+ fail_count++;
+ }
+
+ if (strcmp(data_struct->data, data) != 0) {
+ TELEMETRY_LOG_ERR("Data status is invalid");
+ fail_count++;
+ }
+
+ rte_telemetry_free_test_data(data_struct);
+ if (fail_count > 0)
+ return -1;
+
+ TELEMETRY_LOG_INFO("Success - Passed invalid JSON message test");
+
+ return 0;
+}
+
+int32_t
+rte_telemetry_json_contents_test(struct telemetry_impl *telemetry, int fd)
+{
+ int ret;
+ char buf[BUF_SIZE];
+ int fail_count = 0;
+	const char *status = "Status Error: Invalid Argument 404";
+	const char *data = "null";
+ struct json_data *data_struct;
+ const char *invalid_contents = "{\"action\":0,\"command\":"
+ "\"ports_stats_values_by_name\",\"data\":{\"ports\""
+ ":[0],\"stats\":[\"some_invalid_param\","
+ "\"another_invalid_param\"]}}";
+ int buffer_read = 0;
+ errno = 0;
+
+ ret = send(fd, invalid_contents, strlen(invalid_contents), 0);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not send message over socket");
+ return -1;
+ }
+
+ rte_telemetry_run(telemetry);
+ buffer_read = recv(fd, buf, BUF_SIZE-1, 0);
+
+ if (buffer_read == -1) {
+ TELEMETRY_LOG_ERR("Read error");
+ return -1;
+ }
+
+ buf[buffer_read] = '\0';
+ data_struct = calloc(1, sizeof(struct json_data));
+ ret = rte_telemetry_stat_parse(buf, data_struct);
+
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not parse stats");
+
+ if (strcmp(data_struct->status_code, status) != 0) {
+ TELEMETRY_LOG_ERR("Status code is invalid");
+ fail_count++;
+ }
+
+ if (strcmp(data_struct->data, data) != 0) {
+ TELEMETRY_LOG_ERR("Data status is invalid");
+ fail_count++;
+ }
+
+ rte_telemetry_free_test_data(data_struct);
+ if (fail_count > 0)
+ return -1;
+
+ TELEMETRY_LOG_INFO("Success - Passed invalid JSON content test");
+
+ return 0;
+}
+
+int32_t
+rte_telemetry_json_empty_test(struct telemetry_impl *telemetry, int fd)
+{
+ int ret;
+ char buf[BUF_SIZE];
+ int fail_count = 0;
+ const char *status = "Status Error: Invalid Argument 404";
+	const char *data = "null";
+ struct json_data *data_struct;
+ const char *empty_json = "{}";
+ int buffer_read = 0;
+ errno = 0;
+
+ ret = (send(fd, empty_json, strlen(empty_json), 0));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not send message over socket");
+ return -1;
+ }
+
+ rte_telemetry_run(telemetry);
+ buffer_read = recv(fd, buf, BUF_SIZE-1, 0);
+
+ if (buffer_read == -1) {
+ TELEMETRY_LOG_ERR("Read error");
+ return -1;
+ }
+
+ buf[buffer_read] = '\0';
+ data_struct = calloc(1, sizeof(struct json_data));
+ ret = rte_telemetry_stat_parse(buf, data_struct);
+
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not parse stats");
+
+ if (strcmp(data_struct->status_code, status) != 0) {
+ TELEMETRY_LOG_ERR("Status code is invalid");
+ fail_count++;
+ }
+
+ if (strcmp(data_struct->data, data) != 0) {
+ TELEMETRY_LOG_ERR("Data status is invalid");
+ fail_count++;
+ }
+
+ rte_telemetry_free_test_data(data_struct);
+
+ if (fail_count > 0)
+ return -1;
+
+ TELEMETRY_LOG_INFO("Success - Passed JSON empty message test");
+
+ return 0;
+}
+
+int32_t
+rte_telemetry_json_socket_message_test(struct telemetry_impl *telemetry, int fd)
+{
+ uint16_t i;
+ int ret, fail_count;
+
+ fail_count = 0;
+ struct telemetry_message_test socket_json_tests[] = {
+ {.test_name = "Invalid JSON test",
+ .test_func_ptr = rte_telemetry_invalid_json_test},
+ {.test_name = "Valid JSON test",
+ .test_func_ptr = rte_telemetry_valid_json_test},
+ {.test_name = "JSON contents test",
+ .test_func_ptr = rte_telemetry_json_contents_test},
+ {.test_name = "JSON empty tests",
+ .test_func_ptr = rte_telemetry_json_empty_test}
+ };
+
+#define NUM_TESTS RTE_DIM(socket_json_tests)
+
+ for (i = 0; i < NUM_TESTS; i++) {
+ TELEMETRY_LOG_INFO("%s", socket_json_tests[i].test_name);
+ ret = (socket_json_tests[i].test_func_ptr)
+ (telemetry, fd);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("%s failed",
+ socket_json_tests[i].test_name);
+ fail_count++;
+ }
+ }
+
+ if (fail_count > 0) {
+ TELEMETRY_LOG_ERR("Failed %i JSON socket message test(s)",
+ fail_count);
+ return -1;
+ }
+
+ TELEMETRY_LOG_INFO("Success - All JSON tests passed");
+
+ return 0;
+}
+
+int telemetry_log_level;
+
+static struct rte_option option = {
+ .opt_str = "--telemetry",
+ .cb = &rte_telemetry_init,
+ .enabled = 0
+};
+
+RTE_INIT(rte_telemetry_register)
+{
+ telemetry_log_level = rte_log_register("lib.telemetry");
+ if (telemetry_log_level >= 0)
+ rte_log_set_level(telemetry_log_level, RTE_LOG_ERR);
+
+ rte_option_register(&option);
+}
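
Editor's note: to make the registration flow above concrete, the following standalone sketch (not part of the patch) shows what an external client could do: bind its own SEQPACKET socket, connect to the DPDK telemetry socket under the runtime directory, and send the "clients" registration message in the same JSON shape the parser expects. The socket paths are illustrative only.

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <sys/un.h>

    int main(void)
    {
            const char *server_path = "/var/run/dpdk/rte/telemetry"; /* example */
            const char *client_path = "/var/run/dpdk/my_client";     /* example */
            struct sockaddr_un srv = { .sun_family = AF_UNIX };
            struct sockaddr_un cli = { .sun_family = AF_UNIX };
            char req[256];
            int listen_fd, srv_fd, data_fd;

            /* Socket DPDK will connect back to with telemetry responses. */
            listen_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
            strncpy(cli.sun_path, client_path, sizeof(cli.sun_path) - 1);
            unlink(client_path);
            if (bind(listen_fd, (struct sockaddr *)&cli, sizeof(cli)) < 0 ||
                            listen(listen_fd, 1) < 0)
                    return 1;

            /* Connect to the DPDK telemetry server and register ourselves. */
            srv_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
            strncpy(srv.sun_path, server_path, sizeof(srv.sun_path) - 1);
            if (connect(srv_fd, (struct sockaddr *)&srv, sizeof(srv)) < 0)
                    return 1;
            snprintf(req, sizeof(req),
                    "{\"action\":1,\"command\":\"clients\","
                    "\"data\":{\"client_path\":\"%s\"}}", client_path);
            send(srv_fd, req, strlen(req), 0);

            /* DPDK connects back here; further queries and replies use data_fd. */
            data_fd = accept(listen_fd, NULL, NULL);
            close(data_fd);
            close(srv_fd);
            close(listen_fd);
            return 0;
    }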
diff --git a/lib/librte_telemetry/rte_telemetry.h b/lib/librte_telemetry/rte_telemetry.h
new file mode 100644
index 00000000..119db16f
--- /dev/null
+++ b/lib/librte_telemetry/rte_telemetry.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdint.h>
+
+#ifndef _RTE_TELEMETRY_H_
+#define _RTE_TELEMETRY_H_
+
+/**
+ * @file
+ * RTE Telemetry
+ *
+ * The telemetry library provides a method to retrieve statistics from
+ * DPDK by sending a JSON encoded message over a socket. DPDK will send
+ * a JSON encoded response containing telemetry data.
+ ***/
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Initialize Telemetry
+ *
+ * @return
+ * 0 on successful initialisation.
+ * @return
+ * -ENOMEM on memory allocation error
+ * @return
+ *  -EPERM on an unknown failure
+ * @return
+ * -EALREADY if Telemetry is already initialised.
+ */
+int32_t __rte_experimental
+rte_telemetry_init(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Clean up and free memory.
+ *
+ * @return
+ * 0 on success
+ * @return
+ * -EPERM on failure
+ */
+int32_t __rte_experimental
+rte_telemetry_cleanup(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Runs various tests to ensure telemetry initialisation and register/unregister
+ * functions are working correctly.
+ *
+ * @return
+ * 0 on success when all tests have passed
+ * @return
+ * -1 on failure when the test has failed
+ */
+int32_t __rte_experimental
+rte_telemetry_selftest(void);
+
+#endif
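
Editor's note: a minimal sketch of how an application might drive this API, assuming the library is built with jansson available; alternatively, the --telemetry EAL option registered by the library enables it at startup without any application code change.

    #include <rte_eal.h>
    #include <rte_telemetry.h>

    int main(int argc, char **argv)
    {
            if (rte_eal_init(argc, argv) < 0)
                    return -1;

            /* Start the telemetry control thread. */
            if (rte_telemetry_init() != 0)
                    return -1;

            /* ... application main loop ... */

            rte_telemetry_cleanup();
            return 0;
    }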
diff --git a/lib/librte_telemetry/rte_telemetry_internal.h b/lib/librte_telemetry/rte_telemetry_internal.h
new file mode 100644
index 00000000..de7afda3
--- /dev/null
+++ b/lib/librte_telemetry/rte_telemetry_internal.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <rte_log.h>
+#include <rte_tailq.h>
+
+#ifndef _RTE_TELEMETRY_INTERNAL_H_
+#define _RTE_TELEMETRY_INTERNAL_H_
+
+/* Logging Macros */
+extern int telemetry_log_level;
+
+#define TELEMETRY_LOG(level, fmt, args...) \
+ rte_log(RTE_LOG_ ##level, telemetry_log_level, "%s(): "fmt "\n", \
+ __func__, ##args)
+
+#define TELEMETRY_LOG_ERR(fmt, args...) \
+ TELEMETRY_LOG(ERR, fmt, ## args)
+
+#define TELEMETRY_LOG_WARN(fmt, args...) \
+ TELEMETRY_LOG(WARNING, fmt, ## args)
+
+#define TELEMETRY_LOG_INFO(fmt, args...) \
+ TELEMETRY_LOG(INFO, fmt, ## args)
+
+typedef struct telemetry_client {
+ char *file_path;
+ int fd;
+ TAILQ_ENTRY(telemetry_client) client_list;
+} telemetry_client;
+
+typedef struct telemetry_impl {
+ int accept_fd;
+ int server_fd;
+ pthread_t thread_id;
+ int thread_status;
+ uint32_t socket_id;
+ int reg_index;
+ int metrics_register_done;
+ TAILQ_HEAD(, telemetry_client) client_list_head;
+ struct telemetry_client *request_client;
+ int register_fail_count;
+} telemetry_impl;
+
+enum rte_telemetry_parser_actions {
+ ACTION_GET = 0,
+ ACTION_DELETE = 2
+};
+
+int32_t
+rte_telemetry_parse_client_message(struct telemetry_impl *telemetry, char *buf);
+
+int32_t
+rte_telemetry_send_error_response(struct telemetry_impl *telemetry,
+ int error_type);
+
+int32_t
+rte_telemetry_register_client(struct telemetry_impl *telemetry,
+ const char *client_path);
+
+int32_t
+rte_telemetry_unregister_client(struct telemetry_impl *telemetry,
+ const char *client_path);
+
+/**
+ * This is a wrapper for the ethdev API rte_eth_find_next().
+ * If rte_eth_find_next() returns the same port id that we passed it,
+ * then that port is active.
+ */
+int32_t
+rte_telemetry_is_port_active(int port_id);
+
+int32_t
+rte_telemetry_send_ports_stats_values(uint32_t *metric_ids, int num_metric_ids,
+ uint32_t *port_ids, int num_port_ids, struct telemetry_impl *telemetry);
+
+int32_t
+rte_telemetry_socket_messaging_testing(int index, int socket);
+
+#endif
diff --git a/lib/librte_telemetry/rte_telemetry_parser.c b/lib/librte_telemetry/rte_telemetry_parser.c
new file mode 100644
index 00000000..03a58a2f
--- /dev/null
+++ b/lib/librte_telemetry/rte_telemetry_parser.c
@@ -0,0 +1,586 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+#include <jansson.h>
+
+#include <rte_metrics.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+
+#include "rte_telemetry_internal.h"
+
+typedef int (*command_func)(struct telemetry_impl *, int, json_t *);
+
+struct rte_telemetry_command {
+ char *text;
+ command_func fn;
+} command;
+
+static int32_t
+rte_telemetry_command_clients(struct telemetry_impl *telemetry, int action,
+ json_t *data)
+{
+ int ret;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Invalid telemetry argument");
+ return -1;
+ }
+
+ if (action != ACTION_DELETE) {
+ TELEMETRY_LOG_WARN("Invalid action for this command");
+ goto einval_fail;
+ }
+
+ if (!json_is_object(data)) {
+ TELEMETRY_LOG_WARN("Invalid data provided for this command");
+ goto einval_fail;
+ }
+
+ json_t *client_path = json_object_get(data, "client_path");
+ if (!json_is_string(client_path)) {
+ TELEMETRY_LOG_WARN("Command value is not a string");
+ goto einval_fail;
+ }
+
+ ret = rte_telemetry_unregister_client(telemetry,
+ json_string_value(client_path));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not unregister client");
+ goto einval_fail;
+ }
+
+ return 0;
+
+einval_fail:
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+}
+
+static int32_t
+rte_telemetry_command_ports(struct telemetry_impl *telemetry, int action,
+ json_t *data)
+{
+ int ret;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Invalid telemetry argument");
+ return -1;
+ }
+
+ if (!json_is_null(data)) {
+ TELEMETRY_LOG_WARN("Data should be NULL JSON object for 'ports' command");
+ goto einval_fail;
+ }
+
+ if (action != ACTION_GET) {
+ TELEMETRY_LOG_WARN("Invalid action for this command");
+ goto einval_fail;
+ }
+
+ return 0;
+
+einval_fail:
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+}
+
+static int32_t
+rte_telemetry_command_ports_details(struct telemetry_impl *telemetry,
+ int action, json_t *data)
+{
+ json_t *value, *port_ids_json = json_object_get(data, "ports");
+ uint64_t num_port_ids = json_array_size(port_ids_json);
+ int ret, port_ids[num_port_ids];
+ RTE_SET_USED(port_ids);
+ size_t index;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Invalid telemetry argument");
+ return -1;
+ }
+
+ if (action != ACTION_GET) {
+ TELEMETRY_LOG_WARN("Invalid action for this command");
+ goto einval_fail;
+ }
+
+ if (!json_is_object(data)) {
+ TELEMETRY_LOG_WARN("Invalid data provided for this command");
+ goto einval_fail;
+ }
+
+ if (!json_is_array(port_ids_json)) {
+ TELEMETRY_LOG_WARN("Invalid Port ID array");
+ goto einval_fail;
+ }
+
+ json_array_foreach(port_ids_json, index, value) {
+ if (!json_is_integer(value)) {
+ TELEMETRY_LOG_WARN("Port ID given is invalid");
+ goto einval_fail;
+ }
+ port_ids[index] = json_integer_value(value);
+ }
+
+ return 0;
+
+einval_fail:
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+}
+
+static int32_t
+rte_telemetry_command_port_stats(struct telemetry_impl *telemetry, int action,
+ json_t *data)
+{
+ int ret;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Invalid telemetry argument");
+ return -1;
+ }
+
+ if (!json_is_null(data)) {
+ TELEMETRY_LOG_WARN("Data should be NULL JSON object for 'port_stats' command");
+ goto einval_fail;
+ }
+
+ if (action != ACTION_GET) {
+ TELEMETRY_LOG_WARN("Invalid action for this command");
+ goto einval_fail;
+ }
+
+ return 0;
+
+einval_fail:
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+}
+
+static int32_t
+rte_telemetry_stat_names_to_ids(struct telemetry_impl *telemetry,
+ const char * const *stat_names, uint32_t *stat_ids,
+ uint64_t num_stat_names)
+{
+ struct rte_metric_name *names;
+ int ret, num_metrics;
+ uint32_t i, k;
+
+ if (stat_names == NULL) {
+ TELEMETRY_LOG_WARN("Invalid stat_names argument");
+ goto einval_fail;
+ }
+
+ if (num_stat_names <= 0) {
+ TELEMETRY_LOG_WARN("Invalid num_stat_names argument");
+ goto einval_fail;
+ }
+
+ num_metrics = rte_metrics_get_names(NULL, 0);
+ if (num_metrics < 0) {
+ TELEMETRY_LOG_ERR("Cannot get metrics count");
+ goto eperm_fail;
+ } else if (num_metrics == 0) {
+ TELEMETRY_LOG_WARN("No metrics have been registered");
+ goto eperm_fail;
+ }
+
+ names = malloc(sizeof(struct rte_metric_name) * num_metrics);
+ if (names == NULL) {
+ TELEMETRY_LOG_ERR("Cannot allocate memory for names");
+
+ ret = rte_telemetry_send_error_response(telemetry, -ENOMEM);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+
+ return -1;
+ }
+
+ ret = rte_metrics_get_names(names, num_metrics);
+ if (ret < 0 || ret > num_metrics) {
+ TELEMETRY_LOG_ERR("Cannot get metrics names");
+ free(names);
+ goto eperm_fail;
+ }
+
+ k = 0;
+ for (i = 0; i < (uint32_t)num_stat_names; i++) {
+ uint32_t j;
+ for (j = 0; j < (uint32_t)num_metrics; j++) {
+ if (strcmp(stat_names[i], names[j].name) == 0) {
+ stat_ids[k] = j;
+ k++;
+ break;
+ }
+ }
+ }
+
+ if (k != num_stat_names) {
+ TELEMETRY_LOG_WARN("Invalid stat names provided");
+ free(names);
+ goto einval_fail;
+ }
+
+ free(names);
+ return 0;
+
+einval_fail:
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+
+eperm_fail:
+ ret = rte_telemetry_send_error_response(telemetry, -EPERM);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+}
+
+int32_t
+rte_telemetry_command_ports_all_stat_values(struct telemetry_impl *telemetry,
+ int action, json_t *data)
+{
+ int ret, num_metrics, i, p;
+ struct rte_metric_name *names;
+ uint64_t num_port_ids = 0;
+ uint32_t port_ids[RTE_MAX_ETHPORTS];
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Invalid telemetry argument");
+ return -1;
+ }
+
+ if (action != ACTION_GET) {
+ TELEMETRY_LOG_WARN("Invalid action for this command");
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+ }
+
+ if (json_is_object(data)) {
+ TELEMETRY_LOG_WARN("Invalid data provided for this command");
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+ }
+
+ num_metrics = rte_metrics_get_names(NULL, 0);
+ if (num_metrics < 0) {
+ TELEMETRY_LOG_ERR("Cannot get metrics count");
+
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+
+ return -1;
+ } else if (num_metrics == 0) {
+ TELEMETRY_LOG_ERR("No metrics to display (none have been registered)");
+
+ ret = rte_telemetry_send_error_response(telemetry, -EPERM);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+
+ return -1;
+ }
+
+ names = malloc(sizeof(struct rte_metric_name) * num_metrics);
+ if (names == NULL) {
+ TELEMETRY_LOG_ERR("Cannot allocate memory");
+ ret = rte_telemetry_send_error_response(telemetry,
+ -ENOMEM);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+ }
+
+ const char *stat_names[num_metrics];
+ uint32_t stat_ids[num_metrics];
+
+ RTE_ETH_FOREACH_DEV(p) {
+ port_ids[num_port_ids] = p;
+ num_port_ids++;
+ }
+
+ if (!num_port_ids) {
+ TELEMETRY_LOG_WARN("No active ports");
+
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+
+ goto fail;
+ }
+
+ ret = rte_metrics_get_names(names, num_metrics);
+ for (i = 0; i < num_metrics; i++)
+ stat_names[i] = names[i].name;
+
+ ret = rte_telemetry_stat_names_to_ids(telemetry, stat_names, stat_ids,
+ num_metrics);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not convert stat names to IDs");
+ goto fail;
+ }
+
+ ret = rte_telemetry_send_ports_stats_values(stat_ids, num_metrics,
+ port_ids, num_port_ids, telemetry);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Sending ports stats values failed");
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ free(names);
+ return -1;
+}
+
+int32_t
+rte_telemetry_command_ports_stats_values_by_name(struct telemetry_impl
+ *telemetry, int action, json_t *data)
+{
+ int ret;
+ json_t *port_ids_json = json_object_get(data, "ports");
+ json_t *stat_names_json = json_object_get(data, "stats");
+ uint64_t num_port_ids = json_array_size(port_ids_json);
+ uint64_t num_stat_names = json_array_size(stat_names_json);
+ const char *stat_names[num_stat_names];
+ uint32_t port_ids[num_port_ids], stat_ids[num_stat_names];
+ size_t index;
+ json_t *value;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Invalid telemetry argument");
+ return -1;
+ }
+
+ if (action != ACTION_GET) {
+ TELEMETRY_LOG_WARN("Invalid action for this command");
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+ }
+
+ if (!json_is_object(data)) {
+ TELEMETRY_LOG_WARN("Invalid data provided for this command");
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+ }
+
+ if (!json_is_array(port_ids_json) ||
+ !json_is_array(stat_names_json)) {
+ TELEMETRY_LOG_WARN("Invalid input data array(s)");
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+ }
+
+ json_array_foreach(port_ids_json, index, value) {
+ if (!json_is_integer(value)) {
+ TELEMETRY_LOG_WARN("Port ID given is not valid");
+ ret = rte_telemetry_send_error_response(telemetry,
+ -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+ }
+ port_ids[index] = json_integer_value(value);
+ ret = rte_telemetry_is_port_active(port_ids[index]);
+ if (ret < 1) {
+ ret = rte_telemetry_send_error_response(telemetry,
+ -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -1;
+ }
+ }
+
+ json_array_foreach(stat_names_json, index, value) {
+ if (!json_is_string(value)) {
+ TELEMETRY_LOG_WARN("Stat Name given is not a string");
+
+ ret = rte_telemetry_send_error_response(telemetry,
+ -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+
+ return -1;
+ }
+ stat_names[index] = json_string_value(value);
+ }
+
+ ret = rte_telemetry_stat_names_to_ids(telemetry, stat_names, stat_ids,
+ num_stat_names);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not convert stat names to IDs");
+ return -1;
+ }
+
+ ret = rte_telemetry_send_ports_stats_values(stat_ids, num_stat_names,
+ port_ids, num_port_ids, telemetry);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Sending ports stats values failed");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int32_t
+rte_telemetry_parse_command(struct telemetry_impl *telemetry, int action,
+ const char *command, json_t *data)
+{
+ int ret;
+ uint32_t i;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Invalid telemetry argument");
+ return -1;
+ }
+
+ struct rte_telemetry_command commands[] = {
+ {
+ .text = "clients",
+ .fn = &rte_telemetry_command_clients
+ },
+ {
+ .text = "ports",
+ .fn = &rte_telemetry_command_ports
+ },
+ {
+ .text = "ports_details",
+ .fn = &rte_telemetry_command_ports_details
+ },
+ {
+ .text = "port_stats",
+ .fn = &rte_telemetry_command_port_stats
+ },
+ {
+ .text = "ports_stats_values_by_name",
+ .fn = &rte_telemetry_command_ports_stats_values_by_name
+ },
+ {
+ .text = "ports_all_stat_values",
+ .fn = &rte_telemetry_command_ports_all_stat_values
+ }
+ };
+
+ const uint32_t num_commands = RTE_DIM(commands);
+
+ for (i = 0; i < num_commands; i++) {
+ if (strcmp(command, commands[i].text) == 0) {
+ ret = commands[i].fn(telemetry, action, data);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Command Function for %s failed",
+ commands[i].text);
+ return -1;
+ }
+ return 0;
+ }
+ }
+
+ TELEMETRY_LOG_WARN("\"%s\" command not found", command);
+
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+
+ return -1;
+}
+
+int32_t __rte_experimental
+rte_telemetry_parse(struct telemetry_impl *telemetry, char *socket_rx_data)
+{
+ int ret, action_int;
+ json_error_t error;
+ json_t *root, *action, *command, *data;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Invalid telemetry argument");
+ return -1;
+ }
+
+ root = json_loads(socket_rx_data, 0, &error);
+ if (root == NULL) {
+ TELEMETRY_LOG_WARN("Could not load JSON object from data passed in : %s",
+ error.text);
+ ret = rte_telemetry_send_error_response(telemetry, -EPERM);
+ if (ret < 0)
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -EPERM;
+ } else if (!json_is_object(root)) {
+ TELEMETRY_LOG_WARN("JSON Request is not a JSON object");
+ json_decref(root);
+ goto einval_fail;
+ }
+
+ action = json_object_get(root, "action");
+ if (action == NULL) {
+ TELEMETRY_LOG_WARN("Request does not have action field");
+ goto einval_fail;
+ } else if (!json_is_integer(action)) {
+ TELEMETRY_LOG_WARN("Action value is not an integer");
+ goto einval_fail;
+ }
+
+ command = json_object_get(root, "command");
+ if (command == NULL) {
+ TELEMETRY_LOG_WARN("Request does not have command field");
+ goto einval_fail;
+ } else if (!json_is_string(command)) {
+ TELEMETRY_LOG_WARN("Command value is not a string");
+ goto einval_fail;
+ }
+
+ action_int = json_integer_value(action);
+ if (action_int != ACTION_GET && action_int != ACTION_DELETE) {
+ TELEMETRY_LOG_WARN("Invalid action code");
+ goto einval_fail;
+ }
+
+ const char *command_string = json_string_value(command);
+ data = json_object_get(root, "data");
+ if (data == NULL) {
+ TELEMETRY_LOG_WARN("Request does not have data field");
+ goto einval_fail;
+ }
+
+ ret = rte_telemetry_parse_command(telemetry, action_int, command_string,
+ data);
+ if (ret < 0) {
+ TELEMETRY_LOG_WARN("Could not parse command");
+ return -EINVAL;
+ }
+
+ return 0;
+
+einval_fail:
+ ret = rte_telemetry_send_error_response(telemetry, -EINVAL);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not send error");
+ return -EPERM;
+ }
+ return -EINVAL;
+}
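
For reference, the requests this parser accepts are JSON objects with an integer "action" (0 = GET, 2 = DELETE), a string "command" and a "data" member. The examples below are reconstructed from the checks above, not taken verbatim from the patch, shown as C string literals:

    /* GET per-port stats by name: "data" carries port and stat arrays. */
    const char *req_stats =
        "{\"action\": 0, \"command\": \"ports_stats_values_by_name\","
        " \"data\": {\"ports\": [0, 1],"
        " \"stats\": [\"rx_good_packets\", \"tx_good_packets\"]}}";

    /* GET the port list: "data" must be JSON null for this command. */
    const char *req_ports =
        "{\"action\": 0, \"command\": \"ports\", \"data\": null}";

    /* DELETE (unregister) a client: "data" carries its socket path. */
    const char *req_unreg =
        "{\"action\": 2, \"command\": \"clients\","
        " \"data\": {\"client_path\": \"/var/run/some_client\"}}";
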
diff --git a/lib/librte_telemetry/rte_telemetry_parser.h b/lib/librte_telemetry/rte_telemetry_parser.h
new file mode 100644
index 00000000..b7051945
--- /dev/null
+++ b/lib/librte_telemetry/rte_telemetry_parser.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include "rte_telemetry_internal.h"
+#include "rte_compat.h"
+
+#ifndef _RTE_TELEMETRY_PARSER_H_
+#define _RTE_TELEMETRY_PARSER_H_
+
+int32_t __rte_experimental
+rte_telemetry_parse(struct telemetry_impl *telemetry, char *socket_rx_data);
+
+#endif
diff --git a/lib/librte_telemetry/rte_telemetry_parser_test.c b/lib/librte_telemetry/rte_telemetry_parser_test.c
new file mode 100644
index 00000000..5fe93fa6
--- /dev/null
+++ b/lib/librte_telemetry/rte_telemetry_parser_test.c
@@ -0,0 +1,534 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+#include <jansson.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <rte_common.h>
+#include <rte_tailq.h>
+#include <rte_string_fns.h>
+
+#include "rte_telemetry_parser.h"
+
+enum choices {
+ INV_ACTION_VAL,
+ INV_COMMAND_VAL,
+ INV_DATA_VAL,
+ INV_ACTION_FIELD,
+ INV_COMMAND_FIELD,
+ INV_DATA_FIELD,
+ INV_JSON_FORMAT,
+ VALID_REQ
+};
+
+
+#define TEST_CLIENT "/var/run/dpdk/test_client"
+
+int32_t
+rte_telemetry_create_test_socket(struct telemetry_impl *telemetry,
+ const char *test_client_path)
+{
+ int ret, sockfd;
+ struct sockaddr_un addr = {0};
+ struct telemetry_client *client;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Telemetry argument has not been initialised");
+ return -EINVAL;
+ }
+
+ sockfd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+ if (sockfd < 0) {
+ TELEMETRY_LOG_ERR("Test socket creation failure");
+ return -1;
+ }
+
+ addr.sun_family = AF_UNIX;
+ strlcpy(addr.sun_path, test_client_path, sizeof(addr.sun_path));
+ unlink(test_client_path);
+
+ if (bind(sockfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+ TELEMETRY_LOG_ERR("Test socket binding failure");
+ return -1;
+ }
+
+ if (listen(sockfd, 1) < 0) {
+ TELEMETRY_LOG_ERR("Listen failure");
+ return -1;
+ }
+
+ ret = rte_telemetry_register_client(telemetry, test_client_path);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Register dummy client failed: %i", ret);
+ return -1;
+ }
+
+ ret = accept(sockfd, NULL, NULL);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Socket accept failed");
+ return -1;
+ }
+
+ TAILQ_FOREACH(client, &telemetry->client_list_head, client_list)
+ telemetry->request_client = client;
+
+ return 0;
+}
+
+int32_t
+rte_telemetry_format_port_stat_ids(int *port_ids, int num_port_ids,
+ const char * const *stat_names, int num_stat_names, json_t **data)
+{
+
+ int ret;
+ json_t *stat_names_json_array = NULL;
+ json_t *port_ids_json_array = NULL;
+ uint32_t i;
+
+ if (num_port_ids < 0) {
+ TELEMETRY_LOG_ERR("Port Ids Count invalid");
+ goto fail;
+ }
+
+ *data = json_object();
+ if (*data == NULL) {
+ TELEMETRY_LOG_ERR("Data json object creation failed");
+ goto fail;
+ }
+
+ port_ids_json_array = json_array();
+ if (port_ids_json_array == NULL) {
+ TELEMETRY_LOG_ERR("port_ids_json_array creation failed");
+ goto fail;
+ }
+
+ for (i = 0; i < (uint32_t)num_port_ids; i++) {
+ ret = json_array_append(port_ids_json_array,
+ json_integer(port_ids[i]));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("JSON array creation failed");
+ goto fail;
+ }
+ }
+
+ ret = json_object_set_new(*data, "ports", port_ids_json_array);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Setting 'ports' value in data object failed");
+ goto fail;
+ }
+
+ if (stat_names) {
+ if (num_stat_names < 0) {
+ TELEMETRY_LOG_ERR("Stat Names Count invalid");
+ goto fail;
+ }
+
+ stat_names_json_array = json_array();
+ if (stat_names_json_array == NULL) {
+ TELEMETRY_LOG_ERR("stat_names_json_array creation failed");
+ goto fail;
+ }
+
+ uint32_t i;
+ for (i = 0; i < (uint32_t)num_stat_names; i++) {
+ ret = json_array_append(stat_names_json_array,
+ json_string(stat_names[i]));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("JSON array creation failed");
+ goto fail;
+ }
+ }
+
+ ret = json_object_set_new(*data, "stats", stat_names_json_array);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Setting 'stats' value in data object failed");
+ goto fail;
+ }
+ }
+
+ return 0;
+
+fail:
+ if (*data)
+ json_decref(*data);
+ if (stat_names_json_array)
+ json_decref(stat_names_json_array);
+ if (port_ids_json_array)
+ json_decref(port_ids_json_array);
+ return -1;
+}
+
+int32_t
+rte_telemetry_create_json_request(int action, char *command,
+ const char *client_path, int *port_ids, int num_port_ids,
+ const char * const *stat_names, int num_stat_names, char **request,
+ int inv_choice)
+{
+ int ret;
+ json_t *root = json_object();
+ json_t *data;
+
+ if (root == NULL) {
+ TELEMETRY_LOG_ERR("Could not create root json object");
+ goto fail;
+ }
+
+ if (inv_choice == INV_ACTION_FIELD) {
+ ret = json_object_set_new(root, "ac--on", json_integer(action));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Setting invalid action field in root object failed");
+ goto fail;
+ }
+ } else {
+ ret = json_object_set_new(root, "action", json_integer(action));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Setting valid action field in root object failed");
+ goto fail;
+ }
+ }
+
+ if (inv_choice == INV_COMMAND_FIELD) {
+ ret = json_object_set_new(root, "co---nd", json_string(command));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Setting invalid command field in root object failed");
+ goto fail;
+ }
+ } else {
+ ret = json_object_set_new(root, "command", json_string(command));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Setting valid command field in root object failed");
+ goto fail;
+ }
+ }
+
+ data = json_null();
+ if (client_path) {
+ data = json_object();
+ if (data == NULL) {
+ TELEMETRY_LOG_ERR("Data json object creation failed");
+ goto fail;
+ }
+
+ ret = json_object_set_new(data, "client_path",
+ json_string(client_path));
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Setting valid client_path field in data object failed");
+ goto fail;
+ }
+
+ } else if (port_ids) {
+ ret = rte_telemetry_format_port_stat_ids(port_ids, num_port_ids,
+ stat_names, num_stat_names, &data);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Formatting Port/Stat arrays failed");
+ goto fail;
+ }
+
+ }
+
+ if (inv_choice == INV_DATA_FIELD) {
+ ret = json_object_set_new(root, "d--a", data);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Setting invalid data field in data object failed");
+ goto fail;
+ }
+ } else {
+ ret = json_object_set_new(root, "data", data);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Setting valid data field in data object failed");
+ goto fail;
+ }
+ }
+
+ *request = json_dumps(root, 0);
+ if (*request == NULL) {
+ TELEMETRY_LOG_ERR("Converting JSON root object to char* failed");
+ goto fail;
+ }
+
+ json_decref(root);
+ return 0;
+
+fail:
+ if (root)
+ json_decref(root);
+ return -1;
+}
+
+int32_t
+rte_telemetry_send_get_ports_and_stats_request(struct telemetry_impl *telemetry,
+ int action_choice, char *command_choice, int inv_choice)
+{
+ int ret;
+ char *request;
+ char *client_path_data = NULL;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Telemetry argument has not been initialised");
+ return -EINVAL;
+ }
+
+
+ if (inv_choice == INV_ACTION_VAL)
+ action_choice = -1;
+ else if (inv_choice == INV_COMMAND_VAL)
+ command_choice = "INVALID_COMMAND";
+ else if (inv_choice == INV_DATA_VAL)
+ client_path_data = "INVALID_DATA";
+
+ ret = rte_telemetry_create_json_request(action_choice, command_choice,
+ client_path_data, NULL, -1, NULL, -1, &request, inv_choice);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not create JSON Request");
+ return -1;
+ }
+
+ if (inv_choice == INV_JSON_FORMAT)
+ request++;
+
+ ret = rte_telemetry_parse(telemetry, request);
+ if (ret < 0) {
+ TELEMETRY_LOG_WARN("Could not parse JSON Request");
+ return -1;
+ }
+
+ return 0;
+}
+
+int32_t
+rte_telemetry_send_get_ports_details_request(struct telemetry_impl *telemetry,
+ int action_choice, int *port_ids, int num_port_ids, int inv_choice)
+{
+ int ret;
+ char *request;
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Telemetry argument has not been initialised");
+ return -EINVAL;
+ }
+
+ char *command = "ports_details";
+
+ if (inv_choice == INV_ACTION_VAL)
+ action_choice = -1;
+ else if (inv_choice == INV_COMMAND_VAL)
+ command = "INVALID_COMMAND";
+ else if (inv_choice == INV_DATA_VAL)
+ port_ids = NULL;
+
+
+ ret = rte_telemetry_create_json_request(action_choice, command, NULL,
+ port_ids, num_port_ids, NULL, -1, &request, inv_choice);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not create JSON Request");
+ return -1;
+ }
+
+ if (inv_choice == INV_JSON_FORMAT)
+ request++;
+
+ ret = rte_telemetry_parse(telemetry, request);
+ if (ret < 0) {
+ TELEMETRY_LOG_WARN("Could not parse JSON Request");
+ return -1;
+ }
+
+ return 0;
+}
+
+int32_t
+rte_telemetry_send_stats_values_by_name_request(struct telemetry_impl
+ *telemetry, int action_choice, int *port_ids, int num_port_ids,
+ const char * const *stat_names, int num_stat_names,
+ int inv_choice)
+{
+ int ret;
+ char *request;
+ char *command = "ports_stats_values_by_name";
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Telemetry argument has not been initialised");
+ return -EINVAL;
+ }
+
+ if (inv_choice == INV_ACTION_VAL)
+ action_choice = -1;
+ else if (inv_choice == INV_COMMAND_VAL)
+ command = "INVALID_COMMAND";
+ else if (inv_choice == INV_DATA_VAL) {
+ port_ids = NULL;
+ stat_names = NULL;
+ }
+
+ ret = rte_telemetry_create_json_request(action_choice, command, NULL,
+ port_ids, num_port_ids, stat_names, num_stat_names, &request,
+ inv_choice);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not create JSON Request");
+ return -1;
+ }
+
+ if (inv_choice == INV_JSON_FORMAT)
+ request++;
+
+ ret = rte_telemetry_parse(telemetry, request);
+ if (ret < 0) {
+ TELEMETRY_LOG_WARN("Could not parse JSON Request");
+ return -1;
+ }
+
+ return 0;
+}
+
+int32_t
+rte_telemetry_send_unreg_request(struct telemetry_impl *telemetry,
+ int action_choice, const char *client_path, int inv_choice)
+{
+ int ret;
+ char *request;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Telemetry argument has not been initialised");
+ return -EINVAL;
+ }
+
+ char *command = "clients";
+
+ if (inv_choice == INV_ACTION_VAL)
+ action_choice = -1;
+ else if (inv_choice == INV_COMMAND_VAL)
+ command = "INVALID_COMMAND";
+ else if (inv_choice == INV_DATA_VAL)
+ client_path = NULL;
+
+ ret = rte_telemetry_create_json_request(action_choice, command,
+ client_path, NULL, -1, NULL, -1, &request, inv_choice);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not create JSON Request");
+ return -1;
+ }
+
+ if (inv_choice == INV_JSON_FORMAT)
+ request++;
+
+ ret = rte_telemetry_parse(telemetry, request);
+ if (ret < 0) {
+ TELEMETRY_LOG_WARN("Could not parse JSON Request");
+ return -1;
+ }
+
+ return 0;
+}
+
+int32_t
+rte_telemetry_parser_test(struct telemetry_impl *telemetry)
+{
+ int ret;
+ const char *client_path = TEST_CLIENT;
+
+ if (telemetry == NULL) {
+ TELEMETRY_LOG_ERR("Telemetry argument has not been initialised");
+ return -EINVAL;
+ }
+
+ ret = rte_telemetry_create_test_socket(telemetry, client_path);
+ if (ret < 0) {
+ TELEMETRY_LOG_ERR("Could not create test request client socket");
+ return -1;
+ }
+
+ int port_ids[] = {0, 1};
+ int num_port_ids = RTE_DIM(port_ids);
+
+ static const char * const stat_names[] = {"tx_good_packets",
+ "rx_good_packets"};
+ int num_stat_names = RTE_DIM(stat_names);
+
+ static const char * const test_types[] = {
+ "INVALID ACTION VALUE TESTS",
+ "INVALID COMMAND VALUE TESTS",
+ "INVALID DATA VALUE TESTS",
+ "INVALID ACTION FIELD TESTS",
+ "INVALID COMMAND FIELD TESTS",
+ "INVALID DATA FIELD TESTS",
+ "INVALID JSON FORMAT TESTS",
+ "VALID TESTS"
+ };
+
+
+#define NUM_TEST_TYPES (sizeof(test_types)/sizeof(const char * const))
+
+ uint32_t i;
+ for (i = 0; i < NUM_TEST_TYPES; i++) {
+ TELEMETRY_LOG_INFO("%s", test_types[i]);
+
+ ret = rte_telemetry_send_get_ports_and_stats_request(telemetry,
+ ACTION_GET, "ports", i);
+ if (ret != 0 && i == VALID_REQ) {
+ TELEMETRY_LOG_ERR("Get ports valid test failed");
+ return -EPERM;
+ } else if (ret != -1 && i != VALID_REQ) {
+ TELEMETRY_LOG_ERR("Get ports invalid test failed");
+ return -EPERM;
+ }
+
+ TELEMETRY_LOG_INFO("Success - Get ports test passed");
+
+ ret = rte_telemetry_send_get_ports_details_request(telemetry,
+ ACTION_GET, port_ids, num_port_ids, i);
+ if (ret != 0 && i == VALID_REQ) {
+ TELEMETRY_LOG_ERR("Get ports details valid");
+ return -EPERM;
+ } else if (ret != -1 && i != VALID_REQ) {
+ TELEMETRY_LOG_ERR("Get ports details invalid");
+ return -EPERM;
+ }
+
+ TELEMETRY_LOG_INFO("Success - Get ports details test passed");
+
+ ret = rte_telemetry_send_get_ports_and_stats_request(telemetry,
+ ACTION_GET, "port_stats", i);
+ if (ret != 0 && i == VALID_REQ) {
+ TELEMETRY_LOG_ERR("Get port stats valid test");
+ return -EPERM;
+ } else if (ret != -1 && i != VALID_REQ) {
+ TELEMETRY_LOG_ERR("Get ports stats invalid test failed");
+ return -EPERM;
+ }
+
+ TELEMETRY_LOG_INFO("Success - Get ports stats test passed");
+
+ ret = rte_telemetry_send_stats_values_by_name_request(telemetry,
+ ACTION_GET, port_ids, num_port_ids, stat_names,
+ num_stat_names, i);
+ if (ret != 0 && i == VALID_REQ) {
+ TELEMETRY_LOG_ERR("Get ports stats values by name valid test failed");
+ return -EPERM;
+ } else if (ret != -1 && i != VALID_REQ) {
+ TELEMETRY_LOG_ERR("Get ports stats values by name invalid test failed");
+ return -EPERM;
+ }
+
+ TELEMETRY_LOG_INFO("Success - Get ports stats values by name test passed");
+
+ ret = rte_telemetry_send_unreg_request(telemetry, ACTION_DELETE,
+ client_path, i);
+ if (ret != 0 && i == VALID_REQ) {
+ TELEMETRY_LOG_ERR("Deregister valid test failed");
+ return -EPERM;
+ } else if (ret != -1 && i != VALID_REQ) {
+ TELEMETRY_LOG_ERR("Deregister invalid test failed");
+ return -EPERM;
+ }
+
+ TELEMETRY_LOG_INFO("Success - Deregister test passed");
+ }
+
+ return 0;
+}
diff --git a/lib/librte_telemetry/rte_telemetry_parser_test.h b/lib/librte_telemetry/rte_telemetry_parser_test.h
new file mode 100644
index 00000000..6ada8527
--- /dev/null
+++ b/lib/librte_telemetry/rte_telemetry_parser_test.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _RTE_TELEMETRY_PARSER_TEST_H_
+#define _RTE_TELEMETRY_PARSER_TEST_H_
+
+int32_t
+rte_telemetry_parser_test(struct telemetry_impl *telemetry);
+
+int32_t
+rte_telemetry_format_port_stat_ids(int *port_ids, int num_port_ids,
+	const char * const *stat_names, int num_stat_names, json_t **data);
+
+int32_t
+rte_telemetry_create_json_request(int action, char *command,
+ const char *client_path, int *port_ids, int num_port_ids,
+	const char * const *stat_names, int num_stat_names, char **request,
+ int inv_choice);
+
+int32_t
+rte_telemetry_send_get_ports_and_stats_request(struct telemetry_impl *telemetry,
+ int action_choice, char *command_choice, int inv_choice);
+
+int32_t
+rte_telemetry_send_get_ports_details_request(struct telemetry_impl *telemetry,
+ int action_choice, int *port_ids, int num_port_ids, int inv_choice);
+
+int32_t
+rte_telemetry_send_stats_values_by_name_request(struct telemetry_impl
+ *telemetry, int action_choice, int *port_ids, int num_port_ids,
+	const char * const *stat_names, int num_stat_names,
+ int inv_choice);
+
+int32_t
+rte_telemetry_send_unreg_request(struct telemetry_impl *telemetry,
+	int action_choice, const char *client_path, int inv_choice);
+
+#endif
diff --git a/lib/librte_telemetry/rte_telemetry_socket_tests.h b/lib/librte_telemetry/rte_telemetry_socket_tests.h
new file mode 100644
index 00000000..db9167c5
--- /dev/null
+++ b/lib/librte_telemetry/rte_telemetry_socket_tests.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdbool.h>
+
+#include "rte_telemetry_internal.h"
+
+#ifndef _RTE_TELEMETRY_SOCKET_TESTING_H_
+#define _RTE_TELEMETRY_SOCKET_TESTING_H_
+
+int32_t
+rte_telemetry_json_socket_message_test(struct telemetry_impl *telemetry,
+ int fd);
+
+int32_t
+rte_telemetry_invalid_json_test(struct telemetry_impl *telemetry, int fd);
+
+int32_t
+rte_telemetry_valid_json_test(struct telemetry_impl *telemetry, int fd);
+
+int32_t
+rte_telemetry_json_contents_test(struct telemetry_impl *telemetry, int fd);
+
+int32_t
+rte_telemetry_json_empty_test(struct telemetry_impl *telemetry, int fd);
+
+int32_t
+rte_telemetry_socket_register_test(struct telemetry_impl *telemetry, int *fd,
+ int send_fd, int recv_fd);
+
+int32_t
+rte_telemetry_socket_test_setup(struct telemetry_impl *telemetry, int *send_fd,
+ int *recv_fd);
+
+#endif
diff --git a/lib/librte_telemetry/rte_telemetry_version.map b/lib/librte_telemetry/rte_telemetry_version.map
new file mode 100644
index 00000000..fa62d771
--- /dev/null
+++ b/lib/librte_telemetry/rte_telemetry_version.map
@@ -0,0 +1,10 @@
+EXPERIMENTAL {
+ global:
+
+ rte_telemetry_cleanup;
+ rte_telemetry_init;
+ rte_telemetry_parse;
+ rte_telemetry_selftest;
+
+ local: *;
+};
diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
index de431fbb..5dd31898 100644
--- a/lib/librte_vhost/Makefile
+++ b/lib/librte_vhost/Makefile
@@ -13,13 +13,13 @@ LIBABIVER := 4
CFLAGS += -DALLOW_EXPERIMENTAL_API
CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -D_FILE_OFFSET_BITS=64
CFLAGS += -I vhost_user
+CFLAGS += -fno-strict-aliasing
LDLIBS += -lpthread
ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y)
LDLIBS += -lnuma
endif
-LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev -lrte_net \
- -lrte_cryptodev -lrte_hash
+LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev -lrte_net
# all source are stored in SRCS-y
SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
@@ -30,6 +30,7 @@ SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h
# only compile vhost crypto when cryptodev is enabled
ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y)
+LDLIBS += -lrte_cryptodev -lrte_hash
SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost_crypto.c
SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost_crypto.h
endif
diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build
index bd62e0e3..e33e6fc1 100644
--- a/lib/librte_vhost/meson.build
+++ b/lib/librte_vhost/meson.build
@@ -7,8 +7,11 @@ endif
if has_libnuma == 1
dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true)
endif
+dpdk_conf.set('RTE_LIBRTE_VHOST_POSTCOPY',
+ cc.has_header('linux/userfaultfd.h'))
version = 4
allow_experimental_apis = true
+cflags += '-fno-strict-aliasing'
sources = files('fd_man.c', 'iotlb.c', 'socket.c', 'vdpa.c',
'vhost.c', 'vhost_user.c',
'virtio_net.c', 'vhost_crypto.c')
diff --git a/lib/librte_vhost/rte_vdpa.h b/lib/librte_vhost/rte_vdpa.h
index 90465ca2..a418da47 100644
--- a/lib/librte_vhost/rte_vdpa.h
+++ b/lib/librte_vhost/rte_vdpa.h
@@ -21,67 +21,138 @@ enum vdpa_addr_type {
VDPA_ADDR_MAX
};
+/**
+ * vdpa device address
+ */
struct rte_vdpa_dev_addr {
+ /** vdpa address type */
enum vdpa_addr_type type;
+
+ /** vdpa pci address */
union {
uint8_t __dummy[64];
struct rte_pci_addr pci_addr;
};
};
+/**
+ * vdpa device operations
+ */
struct rte_vdpa_dev_ops {
- /* Get capabilities of this device */
+ /** Get capabilities of this device */
int (*get_queue_num)(int did, uint32_t *queue_num);
+
+ /** Get supported features of this device */
int (*get_features)(int did, uint64_t *features);
+
+ /** Get supported protocol features of this device */
int (*get_protocol_features)(int did, uint64_t *protocol_features);
- /* Driver configure/close the device */
+ /** Driver configure/close the device */
int (*dev_conf)(int vid);
int (*dev_close)(int vid);
- /* Enable/disable this vring */
+ /** Enable/disable this vring */
int (*set_vring_state)(int vid, int vring, int state);
- /* Set features when changed */
+ /** Set features when changed */
int (*set_features)(int vid);
- /* Destination operations when migration done */
+ /** Destination operations when migration done */
int (*migration_done)(int vid);
- /* Get the vfio group fd */
+ /** Get the vfio group fd */
int (*get_vfio_group_fd)(int vid);
- /* Get the vfio device fd */
+ /** Get the vfio device fd */
int (*get_vfio_device_fd)(int vid);
- /* Get the notify area info of the queue */
+ /** Get the notify area info of the queue */
int (*get_notify_area)(int vid, int qid,
uint64_t *offset, uint64_t *size);
- /* Reserved for future extension */
+ /** Reserved for future extension */
void *reserved[5];
};
+/**
+ * vdpa device structure includes device address and device operations.
+ */
struct rte_vdpa_device {
+ /** vdpa device address */
struct rte_vdpa_dev_addr addr;
+ /** vdpa device operations */
struct rte_vdpa_dev_ops *ops;
} __rte_cache_aligned;
-/* Register a vdpa device, return did if successful, -1 on failure */
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Register a vdpa device
+ *
+ * @param addr
+ * the vdpa device address
+ * @param ops
+ * the vdpa device operations
+ * @return
+ * device id on success, -1 on failure
+ */
int __rte_experimental
rte_vdpa_register_device(struct rte_vdpa_dev_addr *addr,
struct rte_vdpa_dev_ops *ops);
-/* Unregister a vdpa device, return -1 on failure */
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Unregister a vdpa device
+ *
+ * @param did
+ * vdpa device id
+ * @return
+ * device id on success, -1 on failure
+ */
int __rte_experimental
rte_vdpa_unregister_device(int did);
-/* Find did of a vdpa device, return -1 on failure */
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Find the device id of a vdpa device
+ *
+ * @param addr
+ * the vdpa device address
+ * @return
+ * device id on success, -1 on failure
+ */
int __rte_experimental
rte_vdpa_find_device_id(struct rte_vdpa_dev_addr *addr);
-/* Find a vdpa device based on did */
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Find a vdpa device based on device id
+ *
+ * @param did
+ * device id
+ * @return
+ * rte_vdpa_device on success, NULL on failure
+ */
struct rte_vdpa_device * __rte_experimental
rte_vdpa_get_device(int did);
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Get the number of currently available vdpa devices
+ *
+ * @return
+ *   the number of available vdpa devices
+ */
+int __rte_experimental
+rte_vdpa_get_device_num(void);
#endif /* _RTE_VDPA_H_ */
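
A hedged sketch of how a vdpa driver might use this API; my_vdpa_ops, my_vdpa_probe and the PCI_ADDR enumerator (from enum vdpa_addr_type, whose members are not shown in this hunk) are placeholders for illustration:

    /* Callbacks omitted for brevity; a real driver fills these in. */
    static struct rte_vdpa_dev_ops my_vdpa_ops;

    static int
    my_vdpa_probe(const struct rte_pci_addr *pci_addr)
    {
        struct rte_vdpa_dev_addr addr = {
            .type = PCI_ADDR,   /* assumed enumerator of vdpa_addr_type */
        };
        int did, num;

        addr.pci_addr = *pci_addr;

        did = rte_vdpa_register_device(&addr, &my_vdpa_ops);
        if (did < 0)
            return -1;

        /* The device can later be looked up by address or by id. */
        did = rte_vdpa_find_device_id(&addr);
        num = rte_vdpa_get_device_num();
        RTE_SET_USED(rte_vdpa_get_device(did));
        RTE_SET_USED(num);

        return did;
    }
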
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index b02673d4..d280ac42 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -28,6 +28,7 @@ extern "C" {
#define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1)
#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2)
#define RTE_VHOST_USER_IOMMU_SUPPORT (1ULL << 3)
+#define RTE_VHOST_USER_POSTCOPY_SUPPORT (1ULL << 4)
/** Protocol features. */
#ifndef VHOST_USER_PROTOCOL_F_MQ
@@ -58,6 +59,10 @@ extern "C" {
#define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
#endif
+#ifndef VHOST_USER_PROTOCOL_F_PAGEFAULT
+#define VHOST_USER_PROTOCOL_F_PAGEFAULT 8
+#endif
+
#ifndef VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD
#define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10
#endif
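
Applications opt in to postcopy live-migration support per socket with the new flag; a minimal sketch (the socket path is illustrative). Note that socket.c below rejects the flag when the library was built without RTE_LIBRTE_VHOST_POSTCOPY, and disables it (with an INFO log) when dequeue zero copy is also requested:

    /* Inside the application's vhost setup code: */
    uint64_t flags = RTE_VHOST_USER_POSTCOPY_SUPPORT;

    if (rte_vhost_driver_register("/tmp/vhost-user0.sock", flags) < 0) {
        RTE_LOG(ERR, USER1, "failed to register vhost-user socket\n");
        return -1;
    }
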
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index da220dd0..ae39b6e2 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -67,6 +67,7 @@ EXPERIMENTAL {
rte_vdpa_unregister_device;
rte_vdpa_find_device_id;
rte_vdpa_get_device;
+ rte_vdpa_get_device_num;
rte_vhost_driver_attach_vdpa_device;
rte_vhost_driver_detach_vdpa_device;
rte_vhost_driver_get_vdpa_device_id;
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index d6303174..01b60ff9 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -51,6 +51,8 @@ struct vhost_user_socket {
uint64_t supported_features;
uint64_t features;
+ uint64_t protocol_features;
+
/*
* Device id to identify a specific backend device.
* It's set to -1 for the default software implementation.
@@ -94,18 +96,23 @@ static struct vhost_user vhost_user = {
.mutex = PTHREAD_MUTEX_INITIALIZER,
};
-/* return bytes# of read on success or negative val on failure. */
+/*
+ * Return the number of bytes read on success, or a negative value on failure.
+ * Update fd_num with the number of fds read.
+ */
int
-read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
+ int *fd_num)
{
struct iovec iov;
struct msghdr msgh;
- size_t fdsize = fd_num * sizeof(int);
- char control[CMSG_SPACE(fdsize)];
+ char control[CMSG_SPACE(max_fds * sizeof(int))];
struct cmsghdr *cmsg;
int got_fds = 0;
int ret;
+ *fd_num = 0;
+
memset(&msgh, 0, sizeof(msgh));
iov.iov_base = buf;
iov.iov_len = buflen;
@@ -131,13 +138,14 @@ read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
if ((cmsg->cmsg_level == SOL_SOCKET) &&
(cmsg->cmsg_type == SCM_RIGHTS)) {
got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+ *fd_num = got_fds;
memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int));
break;
}
}
/* Clear out unused file descriptors */
- while (got_fds < fd_num)
+ while (got_fds < max_fds)
fds[got_fds++] = -1;
return ret;
@@ -720,7 +728,7 @@ rte_vhost_driver_get_protocol_features(const char *path,
did = vsocket->vdpa_dev_id;
vdpa_dev = rte_vdpa_get_device(did);
if (!vdpa_dev || !vdpa_dev->ops->get_protocol_features) {
- *protocol_features = VHOST_USER_PROTOCOL_FEATURES;
+ *protocol_features = vsocket->protocol_features;
goto unlock_exit;
}
@@ -733,7 +741,7 @@ rte_vhost_driver_get_protocol_features(const char *path,
goto unlock_exit;
}
- *protocol_features = VHOST_USER_PROTOCOL_FEATURES
+ *protocol_features = vsocket->protocol_features
& vdpa_protocol_features;
unlock_exit:
@@ -852,11 +860,21 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
vsocket->use_builtin_virtio_net = true;
vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES;
vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES;
+ vsocket->protocol_features = VHOST_USER_PROTOCOL_FEATURES;
- /* Dequeue zero copy can't assure descriptors returned in order */
+ /*
+	 * Dequeue zero copy can't assure that descriptors are returned in order.
+ * Also, it requires that the guest memory is populated, which is
+ * not compatible with postcopy.
+ */
if (vsocket->dequeue_zero_copy) {
vsocket->supported_features &= ~(1ULL << VIRTIO_F_IN_ORDER);
vsocket->features &= ~(1ULL << VIRTIO_F_IN_ORDER);
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "Dequeue zero copy requested, disabling postcopy support\n");
+ vsocket->protocol_features &=
+ ~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
}
if (!(flags & RTE_VHOST_USER_IOMMU_SUPPORT)) {
@@ -864,6 +882,18 @@ rte_vhost_driver_register(const char *path, uint64_t flags)
vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
}
+ if (!(flags & RTE_VHOST_USER_POSTCOPY_SUPPORT)) {
+ vsocket->protocol_features &=
+ ~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
+ } else {
+#ifndef RTE_LIBRTE_VHOST_POSTCOPY
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Postcopy requested but not compiled\n");
+ ret = -1;
+ goto out_mutex;
+#endif
+ }
+
if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
if (vsocket->reconnect && reconn_tid == 0) {
diff --git a/lib/librte_vhost/vdpa.c b/lib/librte_vhost/vdpa.c
index c82fd437..c2c5dff1 100644
--- a/lib/librte_vhost/vdpa.c
+++ b/lib/librte_vhost/vdpa.c
@@ -113,3 +113,9 @@ rte_vdpa_get_device(int did)
return vdpa_devices[did];
}
+
+int
+rte_vdpa_get_device_num(void)
+{
+ return vdpa_device_num;
+}
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 3c9be10a..70ac6bc9 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -8,6 +8,7 @@
#include <stdint.h>
#include <stdlib.h>
#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numa.h>
#include <numaif.h>
#endif
@@ -343,6 +344,7 @@ vhost_new_device(void)
dev->flags = VIRTIO_DEV_BUILTIN_VIRTIO_NET;
dev->slave_req_fd = -1;
dev->vdpa_dev_id = -1;
+ dev->postcopy_ufd = -1;
rte_spinlock_init(&dev->slave_req_lock);
return i;
@@ -480,7 +482,7 @@ rte_vhost_get_numa_node(int vid)
int numa_node;
int ret;
- if (dev == NULL)
+ if (dev == NULL || numa_available() != 0)
return -1;
ret = get_mempolicy(&numa_node, NULL, 0, dev,
@@ -646,12 +648,18 @@ rte_vhost_avail_entries(int vid, uint16_t queue_id)
}
static inline void
-vhost_enable_notify_split(struct vhost_virtqueue *vq, int enable)
+vhost_enable_notify_split(struct virtio_net *dev,
+ struct vhost_virtqueue *vq, int enable)
{
- if (enable)
- vq->used->flags &= ~VRING_USED_F_NO_NOTIFY;
- else
- vq->used->flags |= VRING_USED_F_NO_NOTIFY;
+ if (!(dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))) {
+ if (enable)
+ vq->used->flags &= ~VRING_USED_F_NO_NOTIFY;
+ else
+ vq->used->flags |= VRING_USED_F_NO_NOTIFY;
+ } else {
+ if (enable)
+ vhost_avail_event(vq) = vq->last_avail_idx;
+ }
}
static inline void
@@ -660,8 +668,10 @@ vhost_enable_notify_packed(struct virtio_net *dev,
{
uint16_t flags;
- if (!enable)
+ if (!enable) {
vq->device_event->flags = VRING_EVENT_F_DISABLE;
+ return;
+ }
flags = VRING_EVENT_F_ENABLE;
if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
@@ -689,7 +699,7 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
if (vq_is_packed(dev))
vhost_enable_notify_packed(dev, vq, enable);
else
- vhost_enable_notify_split(vq, enable);
+ vhost_enable_notify_split(dev, vq, enable);
return 0;
}
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 760a09c0..b4abad30 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -284,6 +284,16 @@ struct guest_page {
uint64_t size;
};
+/* The possible results of a message handling function */
+enum vh_result {
+ /* Message handling failed */
+ VH_RESULT_ERR = -1,
+ /* Message handling successful */
+ VH_RESULT_OK = 0,
+ /* Message handling successful and reply prepared */
+ VH_RESULT_REPLY = 1,
+};
+
/**
* function prototype for the vhost backend to handler specific vhost user
* messages prior to the master message handling
@@ -292,17 +302,15 @@ struct guest_page {
* vhost device id
* @param msg
* Message pointer.
- * @param require_reply
- * If the handler requires sending a reply, this varaible shall be written 1,
- * otherwise 0.
* @param skip_master
* If the handler requires skipping the master message handling, this variable
* shall be written 1, otherwise 0.
* @return
- * 0 on success, -1 on failure
+ * VH_RESULT_OK on success, VH_RESULT_REPLY on success with reply,
+ * VH_RESULT_ERR on failure
*/
-typedef int (*vhost_msg_pre_handle)(int vid, void *msg,
- uint32_t *require_reply, uint32_t *skip_master);
+typedef enum vh_result (*vhost_msg_pre_handle)(int vid, void *msg,
+ uint32_t *skip_master);
/**
* function prototype for the vhost backend to handler specific vhost user
@@ -312,14 +320,11 @@ typedef int (*vhost_msg_pre_handle)(int vid, void *msg,
* vhost device id
* @param msg
* Message pointer.
- * @param require_reply
- * If the handler requires sending a reply, this varaible shall be written 1,
- * otherwise 0.
* @return
- * 0 on success, -1 on failure
+ * VH_RESULT_OK on success, VH_RESULT_REPLY on success with reply,
+ * VH_RESULT_ERR on failure
*/
-typedef int (*vhost_msg_post_handle)(int vid, void *msg,
- uint32_t *require_reply);
+typedef enum vh_result (*vhost_msg_post_handle)(int vid, void *msg);
/**
* pre and post vhost user message handlers
@@ -363,6 +368,9 @@ struct virtio_net {
int slave_req_fd;
rte_spinlock_t slave_req_lock;
+ int postcopy_ufd;
+ int postcopy_listening;
+
/*
* Device id to identify a specific backend device.
* It's set to -1 for the default software implementation.
@@ -648,6 +656,8 @@ vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
return __vhost_iova_to_vva(dev, vq, iova, len, perm);
}
+#define vhost_avail_event(vr) \
+ (*(volatile uint16_t*)&(vr)->used->ring[(vr)->size])
#define vhost_used_event(vr) \
(*(volatile uint16_t*)&(vr)->avail->ring[(vr)->size])
diff --git a/lib/librte_vhost/vhost_crypto.c b/lib/librte_vhost/vhost_crypto.c
index 57341ef8..9811a232 100644
--- a/lib/librte_vhost/vhost_crypto.c
+++ b/lib/librte_vhost/vhost_crypto.c
@@ -425,35 +425,34 @@ vhost_crypto_close_sess(struct vhost_crypto *vcrypto, uint64_t session_id)
return 0;
}
-static int
-vhost_crypto_msg_post_handler(int vid, void *msg, uint32_t *require_reply)
+static enum vh_result
+vhost_crypto_msg_post_handler(int vid, void *msg)
{
struct virtio_net *dev = get_device(vid);
struct vhost_crypto *vcrypto;
VhostUserMsg *vmsg = msg;
- int ret = 0;
+ enum vh_result ret = VH_RESULT_OK;
- if (dev == NULL || require_reply == NULL) {
+ if (dev == NULL) {
VC_LOG_ERR("Invalid vid %i", vid);
- return -EINVAL;
+ return VH_RESULT_ERR;
}
vcrypto = dev->extern_data;
if (vcrypto == NULL) {
VC_LOG_ERR("Cannot find required data, is it initialized?");
- return -ENOENT;
+ return VH_RESULT_ERR;
}
- *require_reply = 0;
-
if (vmsg->request.master == VHOST_USER_CRYPTO_CREATE_SESS) {
vhost_crypto_create_sess(vcrypto,
&vmsg->payload.crypto_session);
- *require_reply = 1;
- } else if (vmsg->request.master == VHOST_USER_CRYPTO_CLOSE_SESS)
- ret = vhost_crypto_close_sess(vcrypto, vmsg->payload.u64);
- else
- ret = -EINVAL;
+ vmsg->fd_num = 0;
+ ret = VH_RESULT_REPLY;
+ } else if (vmsg->request.master == VHOST_USER_CRYPTO_CLOSE_SESS) {
+ if (vhost_crypto_close_sess(vcrypto, vmsg->payload.u64))
+ ret = VH_RESULT_ERR;
+ }
return ret;
}
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index a2d4c9ff..508228a3 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -24,13 +24,19 @@
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
+#include <sys/syscall.h>
#include <assert.h>
#ifdef RTE_LIBRTE_VHOST_NUMA
#include <numaif.h>
#endif
+#ifdef RTE_LIBRTE_VHOST_POSTCOPY
+#include <linux/userfaultfd.h>
+#endif
#include <rte_common.h>
#include <rte_malloc.h>
@@ -69,8 +75,14 @@ static const char *vhost_message_str[VHOST_USER_MAX] = {
[VHOST_USER_IOTLB_MSG] = "VHOST_USER_IOTLB_MSG",
[VHOST_USER_CRYPTO_CREATE_SESS] = "VHOST_USER_CRYPTO_CREATE_SESS",
[VHOST_USER_CRYPTO_CLOSE_SESS] = "VHOST_USER_CRYPTO_CLOSE_SESS",
+ [VHOST_USER_POSTCOPY_ADVISE] = "VHOST_USER_POSTCOPY_ADVISE",
+ [VHOST_USER_POSTCOPY_LISTEN] = "VHOST_USER_POSTCOPY_LISTEN",
+ [VHOST_USER_POSTCOPY_END] = "VHOST_USER_POSTCOPY_END",
};
+static int send_vhost_reply(int sockfd, struct VhostUserMsg *msg);
+static int read_vhost_message(int sockfd, struct VhostUserMsg *msg);
+
static uint64_t
get_blk_size(int fd)
{
@@ -120,6 +132,13 @@ vhost_backend_cleanup(struct virtio_net *dev)
close(dev->slave_req_fd);
dev->slave_req_fd = -1;
}
+
+ if (dev->postcopy_ufd >= 0) {
+ close(dev->postcopy_ufd);
+ dev->postcopy_ufd = -1;
+ }
+
+ dev->postcopy_listening = 0;
}
/*
@@ -127,51 +146,73 @@ vhost_backend_cleanup(struct virtio_net *dev)
* the device hasn't been initialised.
*/
static int
-vhost_user_set_owner(void)
+vhost_user_set_owner(struct virtio_net **pdev __rte_unused,
+ struct VhostUserMsg *msg __rte_unused,
+ int main_fd __rte_unused)
{
- return 0;
+ return VH_RESULT_OK;
}
static int
-vhost_user_reset_owner(struct virtio_net *dev)
+vhost_user_reset_owner(struct virtio_net **pdev,
+ struct VhostUserMsg *msg __rte_unused,
+ int main_fd __rte_unused)
{
+ struct virtio_net *dev = *pdev;
vhost_destroy_device_notify(dev);
cleanup_device(dev, 0);
reset_device(dev);
- return 0;
+ return VH_RESULT_OK;
}
/*
* The features that we support are requested.
*/
-static uint64_t
-vhost_user_get_features(struct virtio_net *dev)
+static int
+vhost_user_get_features(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
+ struct virtio_net *dev = *pdev;
uint64_t features = 0;
rte_vhost_driver_get_features(dev->ifname, &features);
- return features;
+
+ msg->payload.u64 = features;
+ msg->size = sizeof(msg->payload.u64);
+ msg->fd_num = 0;
+
+ return VH_RESULT_REPLY;
}
/*
* The queue number that we support are requested.
*/
-static uint32_t
-vhost_user_get_queue_num(struct virtio_net *dev)
+static int
+vhost_user_get_queue_num(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
+ struct virtio_net *dev = *pdev;
uint32_t queue_num = 0;
rte_vhost_driver_get_queue_num(dev->ifname, &queue_num);
- return queue_num;
+
+ msg->payload.u64 = (uint64_t)queue_num;
+ msg->size = sizeof(msg->payload.u64);
+ msg->fd_num = 0;
+
+ return VH_RESULT_REPLY;
}
/*
* We receive the negotiated features supported by us and the virtio device.
*/
static int
-vhost_user_set_features(struct virtio_net *dev, uint64_t features)
+vhost_user_set_features(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
+ struct virtio_net *dev = *pdev;
+ uint64_t features = msg->payload.u64;
uint64_t vhost_features = 0;
struct rte_vdpa_device *vdpa_dev;
int did = -1;
@@ -181,12 +222,12 @@ vhost_user_set_features(struct virtio_net *dev, uint64_t features)
RTE_LOG(ERR, VHOST_CONFIG,
"(%d) received invalid negotiated features.\n",
dev->vid);
- return -1;
+ return VH_RESULT_ERR;
}
if (dev->flags & VIRTIO_DEV_RUNNING) {
if (dev->features == features)
- return 0;
+ return VH_RESULT_OK;
/*
* Error out if master tries to change features while device is
@@ -197,7 +238,7 @@ vhost_user_set_features(struct virtio_net *dev, uint64_t features)
RTE_LOG(ERR, VHOST_CONFIG,
"(%d) features changed while device is running.\n",
dev->vid);
- return -1;
+ return VH_RESULT_ERR;
}
if (dev->notify_ops->features_changed)
@@ -242,16 +283,18 @@ vhost_user_set_features(struct virtio_net *dev, uint64_t features)
if (vdpa_dev && vdpa_dev->ops->set_features)
vdpa_dev->ops->set_features(dev->vid);
- return 0;
+ return VH_RESULT_OK;
}
/*
* The virtio device sends us the size of the descriptor ring.
*/
static int
-vhost_user_set_vring_num(struct virtio_net *dev,
- VhostUserMsg *msg)
+vhost_user_set_vring_num(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
+ struct virtio_net *dev = *pdev;
struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
vq->size = msg->payload.state.num;
@@ -264,7 +307,7 @@ vhost_user_set_vring_num(struct virtio_net *dev,
if ((vq->size & (vq->size - 1)) || vq->size > 32768) {
RTE_LOG(ERR, VHOST_CONFIG,
"invalid virtqueue size %u\n", vq->size);
- return -1;
+ return VH_RESULT_ERR;
}
if (dev->dequeue_zero_copy) {
@@ -290,7 +333,7 @@ vhost_user_set_vring_num(struct virtio_net *dev,
if (!vq->shadow_used_packed) {
RTE_LOG(ERR, VHOST_CONFIG,
"failed to allocate memory for shadow used ring.\n");
- return -1;
+ return VH_RESULT_ERR;
}
} else {
@@ -300,7 +343,7 @@ vhost_user_set_vring_num(struct virtio_net *dev,
if (!vq->shadow_used_split) {
RTE_LOG(ERR, VHOST_CONFIG,
"failed to allocate memory for shadow used ring.\n");
- return -1;
+ return VH_RESULT_ERR;
}
}
@@ -310,10 +353,10 @@ vhost_user_set_vring_num(struct virtio_net *dev,
if (!vq->batch_copy_elems) {
RTE_LOG(ERR, VHOST_CONFIG,
"failed to allocate memory for batching copy.\n");
- return -1;
+ return VH_RESULT_ERR;
}
- return 0;
+ return VH_RESULT_OK;
}
/*
@@ -357,11 +400,13 @@ numa_realloc(struct virtio_net *dev, int index)
memcpy(vq, old_vq, sizeof(*vq));
TAILQ_INIT(&vq->zmbuf_list);
- new_zmbuf = rte_malloc_socket(NULL, vq->zmbuf_size *
- sizeof(struct zcopy_mbuf), 0, newnode);
- if (new_zmbuf) {
- rte_free(vq->zmbufs);
- vq->zmbufs = new_zmbuf;
+ if (dev->dequeue_zero_copy) {
+ new_zmbuf = rte_malloc_socket(NULL, vq->zmbuf_size *
+ sizeof(struct zcopy_mbuf), 0, newnode);
+ if (new_zmbuf) {
+ rte_free(vq->zmbufs);
+ vq->zmbufs = new_zmbuf;
+ }
}
if (vq_is_packed(dev)) {
@@ -609,14 +654,15 @@ translate_ring_addresses(struct virtio_net *dev, int vq_index)
* This function then converts these to our address space.
*/
static int
-vhost_user_set_vring_addr(struct virtio_net **pdev, VhostUserMsg *msg)
+vhost_user_set_vring_addr(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
+ struct virtio_net *dev = *pdev;
struct vhost_virtqueue *vq;
struct vhost_vring_addr *addr = &msg->payload.addr;
- struct virtio_net *dev = *pdev;
if (dev->mem == NULL)
- return -1;
+ return VH_RESULT_ERR;
/* addr->index refers to the queue index. The txq 1, rxq is 0. */
vq = dev->virtqueue[msg->payload.addr.index];
@@ -633,27 +679,29 @@ vhost_user_set_vring_addr(struct virtio_net **pdev, VhostUserMsg *msg)
(1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
dev = translate_ring_addresses(dev, msg->payload.addr.index);
if (!dev)
- return -1;
+ return VH_RESULT_ERR;
*pdev = dev;
}
- return 0;
+ return VH_RESULT_OK;
}
/*
* The virtio device sends us the available ring last used index.
*/
static int
-vhost_user_set_vring_base(struct virtio_net *dev,
- VhostUserMsg *msg)
+vhost_user_set_vring_base(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
+ struct virtio_net *dev = *pdev;
dev->virtqueue[msg->payload.state.index]->last_used_idx =
msg->payload.state.num;
dev->virtqueue[msg->payload.state.index]->last_avail_idx =
msg->payload.state.num;
- return 0;
+ return VH_RESULT_OK;
}
static int
@@ -778,10 +826,11 @@ vhost_memory_changed(struct VhostUserMemory *new,
}
static int
-vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *pmsg)
+vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd)
{
struct virtio_net *dev = *pdev;
- struct VhostUserMemory memory = pmsg->payload.memory;
+ struct VhostUserMemory *memory = &msg->payload.memory;
struct rte_vhost_mem_region *reg;
void *mmap_addr;
uint64_t mmap_size;
@@ -791,20 +840,20 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *pmsg)
int populate;
int fd;
- if (memory.nregions > VHOST_MEMORY_MAX_NREGIONS) {
+ if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) {
RTE_LOG(ERR, VHOST_CONFIG,
- "too many memory regions (%u)\n", memory.nregions);
- return -1;
+ "too many memory regions (%u)\n", memory->nregions);
+ return VH_RESULT_ERR;
}
- if (dev->mem && !vhost_memory_changed(&memory, dev->mem)) {
+ if (dev->mem && !vhost_memory_changed(memory, dev->mem)) {
RTE_LOG(INFO, VHOST_CONFIG,
"(%d) memory regions not changed\n", dev->vid);
- for (i = 0; i < memory.nregions; i++)
- close(pmsg->fds[i]);
+ for (i = 0; i < memory->nregions; i++)
+ close(msg->fds[i]);
- return 0;
+ return VH_RESULT_OK;
}
if (dev->mem) {
@@ -828,30 +877,30 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *pmsg)
"(%d) failed to allocate memory "
"for dev->guest_pages\n",
dev->vid);
- return -1;
+ return VH_RESULT_ERR;
}
}
dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) +
- sizeof(struct rte_vhost_mem_region) * memory.nregions, 0);
+ sizeof(struct rte_vhost_mem_region) * memory->nregions, 0);
if (dev->mem == NULL) {
RTE_LOG(ERR, VHOST_CONFIG,
"(%d) failed to allocate memory for dev->mem\n",
dev->vid);
- return -1;
+ return VH_RESULT_ERR;
}
- dev->mem->nregions = memory.nregions;
+ dev->mem->nregions = memory->nregions;
- for (i = 0; i < memory.nregions; i++) {
- fd = pmsg->fds[i];
+ for (i = 0; i < memory->nregions; i++) {
+ fd = msg->fds[i];
reg = &dev->mem->regions[i];
- reg->guest_phys_addr = memory.regions[i].guest_phys_addr;
- reg->guest_user_addr = memory.regions[i].userspace_addr;
- reg->size = memory.regions[i].memory_size;
+ reg->guest_phys_addr = memory->regions[i].guest_phys_addr;
+ reg->guest_user_addr = memory->regions[i].userspace_addr;
+ reg->size = memory->regions[i].memory_size;
reg->fd = fd;
- mmap_offset = memory.regions[i].mmap_offset;
+ mmap_offset = memory->regions[i].mmap_offset;
/* Check for memory_size + mmap_offset overflow */
if (mmap_offset >= -reg->size) {
@@ -920,6 +969,70 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *pmsg)
mmap_size,
alignment,
mmap_offset);
+
+ if (dev->postcopy_listening) {
+ /*
+ * We haven't a better way right now than sharing
+ * DPDK's virtual address with Qemu, so that Qemu can
+ * retrieve the region offset when handling userfaults.
+ */
+ memory->regions[i].userspace_addr =
+ reg->host_user_addr;
+ }
+ }
+ if (dev->postcopy_listening) {
+ /* Send the addresses back to qemu */
+ msg->fd_num = 0;
+ send_vhost_reply(main_fd, msg);
+
+		/* Wait for qemu to acknowledge that it got the addresses;
+		 * we have to wait before we're allowed to generate faults.
+		 */
+ VhostUserMsg ack_msg;
+ if (read_vhost_message(main_fd, &ack_msg) <= 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to read qemu ack on postcopy set-mem-table\n");
+ goto err_mmap;
+ }
+ if (ack_msg.request.master != VHOST_USER_SET_MEM_TABLE) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Bad qemu ack on postcopy set-mem-table (%d)\n",
+ ack_msg.request.master);
+ goto err_mmap;
+ }
+
+		/* Now register the regions with userfaultfd so the memory can be used */
+ for (i = 0; i < memory->nregions; i++) {
+#ifdef RTE_LIBRTE_VHOST_POSTCOPY
+ reg = &dev->mem->regions[i];
+ struct uffdio_register reg_struct;
+
+ /*
+ * Let's register all the mmap'ed area to ensure
+ * alignment on page boundary.
+ */
+ reg_struct.range.start =
+ (uint64_t)(uintptr_t)reg->mmap_addr;
+ reg_struct.range.len = reg->mmap_size;
+ reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+ if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
+ &reg_struct)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to register ufd for region %d: (ufd = %d) %s\n",
+ i, dev->postcopy_ufd,
+ strerror(errno));
+ goto err_mmap;
+ }
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "\t userfaultfd registered for range : %llx - %llx\n",
+ reg_struct.range.start,
+ reg_struct.range.start +
+ reg_struct.range.len - 1);
+#else
+ goto err_mmap;
+#endif
+ }
}
for (i = 0; i < dev->nr_vring; i++) {
@@ -934,8 +1047,10 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *pmsg)
vring_invalidate(dev, vq);
dev = translate_ring_addresses(dev, i);
- if (!dev)
- return -1;
+ if (!dev) {
+ dev = *pdev;
+ goto err_mmap;
+ }
*pdev = dev;
}
@@ -943,13 +1058,13 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *pmsg)
dump_guest_pages(dev);
- return 0;
+ return VH_RESULT_OK;
err_mmap:
free_mem_region(dev);
rte_free(dev->mem);
dev->mem = NULL;
- return -1;
+ return VH_RESULT_ERR;
}
static bool
@@ -991,17 +1106,19 @@ virtio_is_ready(struct virtio_net *dev)
return 1;
}
-static void
-vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg)
+static int
+vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
+ struct virtio_net *dev = *pdev;
struct vhost_vring_file file;
struct vhost_virtqueue *vq;
- file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
- if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+ file.index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ if (msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
file.fd = VIRTIO_INVALID_EVENTFD;
else
- file.fd = pmsg->fds[0];
+ file.fd = msg->fds[0];
RTE_LOG(INFO, VHOST_CONFIG,
"vring call idx:%d file:%d\n", file.index, file.fd);
@@ -1010,27 +1127,41 @@ vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg)
close(vq->callfd);
vq->callfd = file.fd;
+
+ return VH_RESULT_OK;
}
-static void
-vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *pmsg)
+static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
+ if (!(msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK))
+ close(msg->fds[0]);
+ RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
+
+ return VH_RESULT_OK;
+}
+
+static int
+vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
struct vhost_vring_file file;
struct vhost_virtqueue *vq;
- struct virtio_net *dev = *pdev;
- file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
- if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+ file.index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ if (msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
file.fd = VIRTIO_INVALID_EVENTFD;
else
- file.fd = pmsg->fds[0];
+ file.fd = msg->fds[0];
RTE_LOG(INFO, VHOST_CONFIG,
"vring kick idx:%d file:%d\n", file.index, file.fd);
/* Interpret ring addresses only when ring is started. */
dev = translate_ring_addresses(dev, file.index);
if (!dev)
- return;
+ return VH_RESULT_ERR;
*pdev = dev;
@@ -1047,6 +1178,8 @@ vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *pmsg)
if (vq->kickfd >= 0)
close(vq->kickfd);
vq->kickfd = file.fd;
+
+ return VH_RESULT_OK;
}
static void
@@ -1069,9 +1202,11 @@ free_zmbufs(struct vhost_virtqueue *vq)
* when virtio is stopped, qemu will send us the GET_VRING_BASE message.
*/
static int
-vhost_user_get_vring_base(struct virtio_net *dev,
- VhostUserMsg *msg)
+vhost_user_get_vring_base(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
+ struct virtio_net *dev = *pdev;
struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
/* We have to stop the queue (virtio) if it is running. */
@@ -1114,7 +1249,10 @@ vhost_user_get_vring_base(struct virtio_net *dev,
rte_free(vq->batch_copy_elems);
vq->batch_copy_elems = NULL;
- return 0;
+ msg->size = sizeof(msg->payload.state);
+ msg->fd_num = 0;
+
+ return VH_RESULT_REPLY;
}
/*
@@ -1122,9 +1260,11 @@ vhost_user_get_vring_base(struct virtio_net *dev,
* enable the virtio queue pair.
*/
static int
-vhost_user_set_vring_enable(struct virtio_net *dev,
- VhostUserMsg *msg)
+vhost_user_set_vring_enable(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
+ struct virtio_net *dev = *pdev;
int enable = (int)msg->payload.state.num;
int index = (int)msg->payload.state.index;
struct rte_vdpa_device *vdpa_dev;
@@ -1145,13 +1285,15 @@ vhost_user_set_vring_enable(struct virtio_net *dev,
dev->virtqueue[index]->enabled = enable;
- return 0;
+ return VH_RESULT_OK;
}
-static void
-vhost_user_get_protocol_features(struct virtio_net *dev,
- struct VhostUserMsg *msg)
+static int
+vhost_user_get_protocol_features(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
+ struct virtio_net *dev = *pdev;
uint64_t features, protocol_features;
rte_vhost_driver_get_features(dev->ifname, &features);
@@ -1168,35 +1310,53 @@ vhost_user_get_protocol_features(struct virtio_net *dev,
msg->payload.u64 = protocol_features;
msg->size = sizeof(msg->payload.u64);
+ msg->fd_num = 0;
+
+ return VH_RESULT_REPLY;
}
-static void
-vhost_user_set_protocol_features(struct virtio_net *dev,
- uint64_t protocol_features)
+static int
+vhost_user_set_protocol_features(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
- if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
- return;
+ struct virtio_net *dev = *pdev;
+ uint64_t protocol_features = msg->payload.u64;
+ uint64_t slave_protocol_features = 0;
+
+ rte_vhost_driver_get_protocol_features(dev->ifname,
+ &slave_protocol_features);
+ if (protocol_features & ~slave_protocol_features) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) received invalid protocol features.\n",
+ dev->vid);
+ return VH_RESULT_ERR;
+ }
dev->protocol_features = protocol_features;
+
+ return VH_RESULT_OK;
}
static int
-vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg)
+vhost_user_set_log_base(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
+ struct virtio_net *dev = *pdev;
int fd = msg->fds[0];
uint64_t size, off;
void *addr;
if (fd < 0) {
RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
- return -1;
+ return VH_RESULT_ERR;
}
if (msg->size != sizeof(VhostUserLog)) {
RTE_LOG(ERR, VHOST_CONFIG,
"invalid log base msg size: %"PRId32" != %d\n",
msg->size, (int)sizeof(VhostUserLog));
- return -1;
+ return VH_RESULT_ERR;
}
size = msg->payload.log.mmap_size;
@@ -1207,7 +1367,7 @@ vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg)
RTE_LOG(ERR, VHOST_CONFIG,
"log offset %#"PRIx64" exceeds log size %#"PRIx64"\n",
off, size);
- return -1;
+ return VH_RESULT_ERR;
}
RTE_LOG(INFO, VHOST_CONFIG,
@@ -1222,7 +1382,7 @@ vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg)
close(fd);
if (addr == MAP_FAILED) {
RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
- return -1;
+ return VH_RESULT_ERR;
}
/*
@@ -1236,7 +1396,24 @@ vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg)
dev->log_base = dev->log_addr + off;
dev->log_size = size;
- return 0;
+ /*
+ * The spec is not clear about it (yet), but QEMU doesn't expect
+ * any payload in the reply.
+ */
+ msg->size = 0;
+ msg->fd_num = 0;
+
+ return VH_RESULT_REPLY;
+}
+
+static int vhost_user_set_log_fd(struct virtio_net **pdev __rte_unused,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ close(msg->fds[0]);
+ RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
+
+ return VH_RESULT_OK;
}
/*
@@ -1248,8 +1425,10 @@ vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg)
* a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
*/
static int
-vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg)
+vhost_user_send_rarp(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
+ struct virtio_net *dev = *pdev;
uint8_t *mac = (uint8_t *)&msg->payload.u64;
struct rte_vdpa_device *vdpa_dev;
int did = -1;
@@ -1273,40 +1452,44 @@ vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg)
if (vdpa_dev && vdpa_dev->ops->migration_done)
vdpa_dev->ops->migration_done(dev->vid);
- return 0;
+ return VH_RESULT_OK;
}
static int
-vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg)
+vhost_user_net_set_mtu(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
+ struct virtio_net *dev = *pdev;
if (msg->payload.u64 < VIRTIO_MIN_MTU ||
msg->payload.u64 > VIRTIO_MAX_MTU) {
RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n",
msg->payload.u64);
- return -1;
+ return VH_RESULT_ERR;
}
dev->mtu = msg->payload.u64;
- return 0;
+ return VH_RESULT_OK;
}
static int
-vhost_user_set_req_fd(struct virtio_net *dev, struct VhostUserMsg *msg)
+vhost_user_set_req_fd(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
+ struct virtio_net *dev = *pdev;
int fd = msg->fds[0];
if (fd < 0) {
RTE_LOG(ERR, VHOST_CONFIG,
"Invalid file descriptor for slave channel (%d)\n",
fd);
- return -1;
+ return VH_RESULT_ERR;
}
dev->slave_req_fd = fd;
- return 0;
+ return VH_RESULT_OK;
}
static int
@@ -1359,7 +1542,8 @@ is_vring_iotlb_invalidate(struct vhost_virtqueue *vq,
}
static int
-vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg)
+vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
{
struct virtio_net *dev = *pdev;
struct vhost_iotlb_msg *imsg = &msg->payload.iotlb;
@@ -1371,7 +1555,7 @@ vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg)
len = imsg->size;
vva = qva_to_vva(dev, imsg->uaddr, &len);
if (!vva)
- return -1;
+ return VH_RESULT_ERR;
for (i = 0; i < dev->nr_vring; i++) {
struct vhost_virtqueue *vq = dev->virtqueue[i];
@@ -1397,12 +1581,118 @@ vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg)
default:
RTE_LOG(ERR, VHOST_CONFIG, "Invalid IOTLB message type (%d)\n",
imsg->type);
- return -1;
+ return VH_RESULT_ERR;
}
- return 0;
+ return VH_RESULT_OK;
}
+static int
+vhost_user_set_postcopy_advise(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+#ifdef RTE_LIBRTE_VHOST_POSTCOPY
+ struct uffdio_api api_struct;
+
+ dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+
+ if (dev->postcopy_ufd == -1) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Userfaultfd not available: %s\n",
+ strerror(errno));
+ return VH_RESULT_ERR;
+ }
+ api_struct.api = UFFD_API;
+ api_struct.features = 0;
+ if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) {
+ RTE_LOG(ERR, VHOST_CONFIG, "UFFDIO_API ioctl failure: %s\n",
+ strerror(errno));
+ close(dev->postcopy_ufd);
+ dev->postcopy_ufd = -1;
+ return VH_RESULT_ERR;
+ }
+ msg->fds[0] = dev->postcopy_ufd;
+ msg->fd_num = 1;
+
+ return VH_RESULT_REPLY;
+#else
+ dev->postcopy_ufd = -1;
+ msg->fd_num = 0;
+
+ return VH_RESULT_ERR;
+#endif
+}
+
+static int
+vhost_user_set_postcopy_listen(struct virtio_net **pdev,
+ struct VhostUserMsg *msg __rte_unused,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+
+ if (dev->mem && dev->mem->nregions) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Regions already registered at postcopy-listen\n");
+ return VH_RESULT_ERR;
+ }
+ dev->postcopy_listening = 1;
+
+ return VH_RESULT_OK;
+}
+
+static int
+vhost_user_postcopy_end(struct virtio_net **pdev, struct VhostUserMsg *msg,
+ int main_fd __rte_unused)
+{
+ struct virtio_net *dev = *pdev;
+
+ dev->postcopy_listening = 0;
+ if (dev->postcopy_ufd >= 0) {
+ close(dev->postcopy_ufd);
+ dev->postcopy_ufd = -1;
+ }
+
+ msg->payload.u64 = 0;
+ msg->size = sizeof(msg->payload.u64);
+ msg->fd_num = 0;
+
+ return VH_RESULT_REPLY;
+}
+
+typedef int (*vhost_message_handler_t)(struct virtio_net **pdev,
+ struct VhostUserMsg *msg,
+ int main_fd);
+static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
+ [VHOST_USER_NONE] = NULL,
+ [VHOST_USER_GET_FEATURES] = vhost_user_get_features,
+ [VHOST_USER_SET_FEATURES] = vhost_user_set_features,
+ [VHOST_USER_SET_OWNER] = vhost_user_set_owner,
+ [VHOST_USER_RESET_OWNER] = vhost_user_reset_owner,
+ [VHOST_USER_SET_MEM_TABLE] = vhost_user_set_mem_table,
+ [VHOST_USER_SET_LOG_BASE] = vhost_user_set_log_base,
+ [VHOST_USER_SET_LOG_FD] = vhost_user_set_log_fd,
+ [VHOST_USER_SET_VRING_NUM] = vhost_user_set_vring_num,
+ [VHOST_USER_SET_VRING_ADDR] = vhost_user_set_vring_addr,
+ [VHOST_USER_SET_VRING_BASE] = vhost_user_set_vring_base,
+ [VHOST_USER_GET_VRING_BASE] = vhost_user_get_vring_base,
+ [VHOST_USER_SET_VRING_KICK] = vhost_user_set_vring_kick,
+ [VHOST_USER_SET_VRING_CALL] = vhost_user_set_vring_call,
+ [VHOST_USER_SET_VRING_ERR] = vhost_user_set_vring_err,
+ [VHOST_USER_GET_PROTOCOL_FEATURES] = vhost_user_get_protocol_features,
+ [VHOST_USER_SET_PROTOCOL_FEATURES] = vhost_user_set_protocol_features,
+ [VHOST_USER_GET_QUEUE_NUM] = vhost_user_get_queue_num,
+ [VHOST_USER_SET_VRING_ENABLE] = vhost_user_set_vring_enable,
+ [VHOST_USER_SEND_RARP] = vhost_user_send_rarp,
+ [VHOST_USER_NET_SET_MTU] = vhost_user_net_set_mtu,
+ [VHOST_USER_SET_SLAVE_REQ_FD] = vhost_user_set_req_fd,
+ [VHOST_USER_IOTLB_MSG] = vhost_user_iotlb_msg,
+ [VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
+ [VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
+ [VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end,
+};
+
+
/* Return the number of bytes read on success, or a negative value on failure. */
static int
read_vhost_message(int sockfd, struct VhostUserMsg *msg)
@@ -1410,7 +1700,7 @@ read_vhost_message(int sockfd, struct VhostUserMsg *msg)
int ret;
ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
- msg->fds, VHOST_MEMORY_MAX_NREGIONS);
+ msg->fds, VHOST_MEMORY_MAX_NREGIONS, &msg->fd_num);
if (ret <= 0)
return ret;
@@ -1434,13 +1724,13 @@ read_vhost_message(int sockfd, struct VhostUserMsg *msg)
}
static int
-send_vhost_message(int sockfd, struct VhostUserMsg *msg, int *fds, int fd_num)
+send_vhost_message(int sockfd, struct VhostUserMsg *msg)
{
if (!msg)
return 0;
return send_fd_message(sockfd, (char *)msg,
- VHOST_USER_HDR_SIZE + msg->size, fds, fd_num);
+ VHOST_USER_HDR_SIZE + msg->size, msg->fds, msg->fd_num);
}
static int
@@ -1454,19 +1744,18 @@ send_vhost_reply(int sockfd, struct VhostUserMsg *msg)
msg->flags |= VHOST_USER_VERSION;
msg->flags |= VHOST_USER_REPLY_MASK;
- return send_vhost_message(sockfd, msg, NULL, 0);
+ return send_vhost_message(sockfd, msg);
}
static int
-send_vhost_slave_message(struct virtio_net *dev, struct VhostUserMsg *msg,
- int *fds, int fd_num)
+send_vhost_slave_message(struct virtio_net *dev, struct VhostUserMsg *msg)
{
int ret;
if (msg->flags & VHOST_USER_NEED_REPLY)
rte_spinlock_lock(&dev->slave_req_lock);
- ret = send_vhost_message(dev->slave_req_fd, msg, fds, fd_num);
+ ret = send_vhost_message(dev->slave_req_fd, msg);
if (ret < 0 && (msg->flags & VHOST_USER_NEED_REPLY))
rte_spinlock_unlock(&dev->slave_req_lock);
@@ -1477,7 +1766,8 @@ send_vhost_slave_message(struct virtio_net *dev, struct VhostUserMsg *msg,
* Allocate a queue pair if it hasn't been allocated yet
*/
static int
-vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg)
+vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev,
+ struct VhostUserMsg *msg)
{
uint16_t vring_idx;
@@ -1555,6 +1845,7 @@ vhost_user_msg_handler(int vid, int fd)
int ret;
int unlock_required = 0;
uint32_t skip_master = 0;
+ int request;
dev = get_device(vid);
if (dev == NULL)
@@ -1633,132 +1924,54 @@ vhost_user_msg_handler(int vid, int fd)
}
if (dev->extern_ops.pre_msg_handle) {
- uint32_t need_reply;
-
ret = (*dev->extern_ops.pre_msg_handle)(dev->vid,
- (void *)&msg, &need_reply, &skip_master);
- if (ret < 0)
+ (void *)&msg, &skip_master);
+ if (ret == VH_RESULT_ERR)
goto skip_to_reply;
-
- if (need_reply)
+ else if (ret == VH_RESULT_REPLY)
send_vhost_reply(fd, &msg);
if (skip_master)
goto skip_to_post_handle;
}
- switch (msg.request.master) {
- case VHOST_USER_GET_FEATURES:
- msg.payload.u64 = vhost_user_get_features(dev);
- msg.size = sizeof(msg.payload.u64);
- send_vhost_reply(fd, &msg);
- break;
- case VHOST_USER_SET_FEATURES:
- ret = vhost_user_set_features(dev, msg.payload.u64);
- if (ret)
- return -1;
- break;
-
- case VHOST_USER_GET_PROTOCOL_FEATURES:
- vhost_user_get_protocol_features(dev, &msg);
- send_vhost_reply(fd, &msg);
- break;
- case VHOST_USER_SET_PROTOCOL_FEATURES:
- vhost_user_set_protocol_features(dev, msg.payload.u64);
- break;
-
- case VHOST_USER_SET_OWNER:
- vhost_user_set_owner();
- break;
- case VHOST_USER_RESET_OWNER:
- vhost_user_reset_owner(dev);
- break;
-
- case VHOST_USER_SET_MEM_TABLE:
- ret = vhost_user_set_mem_table(&dev, &msg);
- break;
-
- case VHOST_USER_SET_LOG_BASE:
- vhost_user_set_log_base(dev, &msg);
-
- /* it needs a reply */
- msg.size = sizeof(msg.payload.u64);
- send_vhost_reply(fd, &msg);
- break;
- case VHOST_USER_SET_LOG_FD:
- close(msg.fds[0]);
- RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
- break;
-
- case VHOST_USER_SET_VRING_NUM:
- vhost_user_set_vring_num(dev, &msg);
- break;
- case VHOST_USER_SET_VRING_ADDR:
- vhost_user_set_vring_addr(&dev, &msg);
- break;
- case VHOST_USER_SET_VRING_BASE:
- vhost_user_set_vring_base(dev, &msg);
- break;
-
- case VHOST_USER_GET_VRING_BASE:
- vhost_user_get_vring_base(dev, &msg);
- msg.size = sizeof(msg.payload.state);
- send_vhost_reply(fd, &msg);
- break;
-
- case VHOST_USER_SET_VRING_KICK:
- vhost_user_set_vring_kick(&dev, &msg);
- break;
- case VHOST_USER_SET_VRING_CALL:
- vhost_user_set_vring_call(dev, &msg);
- break;
-
- case VHOST_USER_SET_VRING_ERR:
- if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
- close(msg.fds[0]);
- RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
- break;
-
- case VHOST_USER_GET_QUEUE_NUM:
- msg.payload.u64 = (uint64_t)vhost_user_get_queue_num(dev);
- msg.size = sizeof(msg.payload.u64);
- send_vhost_reply(fd, &msg);
- break;
-
- case VHOST_USER_SET_VRING_ENABLE:
- vhost_user_set_vring_enable(dev, &msg);
- break;
- case VHOST_USER_SEND_RARP:
- vhost_user_send_rarp(dev, &msg);
- break;
-
- case VHOST_USER_NET_SET_MTU:
- ret = vhost_user_net_set_mtu(dev, &msg);
- break;
-
- case VHOST_USER_SET_SLAVE_REQ_FD:
- ret = vhost_user_set_req_fd(dev, &msg);
- break;
-
- case VHOST_USER_IOTLB_MSG:
- ret = vhost_user_iotlb_msg(&dev, &msg);
- break;
+ request = msg.request.master;
+ if (request > VHOST_USER_NONE && request < VHOST_USER_MAX) {
+ if (!vhost_message_handlers[request])
+ goto skip_to_post_handle;
+ ret = vhost_message_handlers[request](&dev, &msg, fd);
- default:
- ret = -1;
- break;
+ switch (ret) {
+ case VH_RESULT_ERR:
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Processing %s failed.\n",
+ vhost_message_str[request]);
+ break;
+ case VH_RESULT_OK:
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ "Processing %s succeeded.\n",
+ vhost_message_str[request]);
+ break;
+ case VH_RESULT_REPLY:
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ "Processing %s succeeded and needs reply.\n",
+ vhost_message_str[request]);
+ send_vhost_reply(fd, &msg);
+ break;
+ }
+ } else {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Requested invalid message type %d.\n", request);
+ ret = VH_RESULT_ERR;
}
skip_to_post_handle:
- if (dev->extern_ops.post_msg_handle) {
- uint32_t need_reply;
-
+ if (ret != VH_RESULT_ERR && dev->extern_ops.post_msg_handle) {
ret = (*dev->extern_ops.post_msg_handle)(
- dev->vid, (void *)&msg, &need_reply);
- if (ret < 0)
+ dev->vid, (void *)&msg);
+ if (ret == VH_RESULT_ERR)
goto skip_to_reply;
-
- if (need_reply)
+ else if (ret == VH_RESULT_REPLY)
send_vhost_reply(fd, &msg);
}
@@ -1766,10 +1979,20 @@ skip_to_reply:
if (unlock_required)
vhost_user_unlock_all_queue_pairs(dev);
+ /*
+ * If the request required a reply that was already sent,
+ * this optional reply-ack won't be sent as the
+ * VHOST_USER_NEED_REPLY was cleared in send_vhost_reply().
+ */
if (msg.flags & VHOST_USER_NEED_REPLY) {
- msg.payload.u64 = !!ret;
+ msg.payload.u64 = ret == VH_RESULT_ERR;
msg.size = sizeof(msg.payload.u64);
+ msg.fd_num = 0;
send_vhost_reply(fd, &msg);
+ } else if (ret == VH_RESULT_ERR) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "vhost message handling failed.\n");
+ return -1;
}
if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) {
@@ -1805,9 +2028,9 @@ skip_to_reply:
}
static int process_slave_message_reply(struct virtio_net *dev,
- const VhostUserMsg *msg)
+ const struct VhostUserMsg *msg)
{
- VhostUserMsg msg_reply;
+ struct VhostUserMsg msg_reply;
int ret;
if ((msg->flags & VHOST_USER_NEED_REPLY) == 0)
@@ -1848,7 +2071,7 @@ vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm)
},
};
- ret = send_vhost_message(dev->slave_req_fd, &msg, NULL, 0);
+ ret = send_vhost_message(dev->slave_req_fd, &msg);
if (ret < 0) {
RTE_LOG(ERR, VHOST_CONFIG,
"Failed to send IOTLB miss message (%d)\n",
@@ -1864,8 +2087,6 @@ static int vhost_user_slave_set_vring_host_notifier(struct virtio_net *dev,
uint64_t offset,
uint64_t size)
{
- int *fdp = NULL;
- size_t fd_num = 0;
int ret;
struct VhostUserMsg msg = {
.request.slave = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG,
@@ -1881,11 +2102,11 @@ static int vhost_user_slave_set_vring_host_notifier(struct virtio_net *dev,
if (fd < 0)
msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK;
else {
- fdp = &fd;
- fd_num = 1;
+ msg.fds[0] = fd;
+ msg.fd_num = 1;
}
- ret = send_vhost_slave_message(dev, &msg, fdp, fd_num);
+ ret = send_vhost_slave_message(dev, &msg);
if (ret < 0) {
RTE_LOG(ERR, VHOST_CONFIG,
"Failed to set host notifier (%d)\n", ret);
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
index 42166adf..dc97be84 100644
--- a/lib/librte_vhost/vhost_user.h
+++ b/lib/librte_vhost/vhost_user.h
@@ -22,7 +22,8 @@
(1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \
(1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \
(1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
- (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER))
+ (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT))
typedef enum VhostUserRequest {
VHOST_USER_NONE = 0,
@@ -50,7 +51,10 @@ typedef enum VhostUserRequest {
VHOST_USER_IOTLB_MSG = 22,
VHOST_USER_CRYPTO_CREATE_SESS = 26,
VHOST_USER_CRYPTO_CLOSE_SESS = 27,
- VHOST_USER_MAX = 28
+ VHOST_USER_POSTCOPY_ADVISE = 28,
+ VHOST_USER_POSTCOPY_LISTEN = 29,
+ VHOST_USER_POSTCOPY_END = 30,
+ VHOST_USER_MAX = 31
} VhostUserRequest;
typedef enum VhostUserSlaveRequest {
@@ -132,6 +136,7 @@ typedef struct VhostUserMsg {
VhostUserVringArea area;
} payload;
int fds[VHOST_MEMORY_MAX_NREGIONS];
+ int fd_num;
} __attribute((packed)) VhostUserMsg;
#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
@@ -146,7 +151,8 @@ int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm);
int vhost_user_host_notifier_ctrl(int vid, bool enable);
/* socket.c */
-int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
+int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
+ int *fd_num);
int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
#endif
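
The prototype change to read_fd_message() above means a caller now passes the capacity of the fds array and receives the number of descriptors actually read back through fd_num, instead of assuming a fixed count. A minimal, hedged usage fragment, assuming a connected sockfd and the VhostUserMsg layout from this header, meant to sit inside a read helper such as read_vhost_message():

	struct VhostUserMsg m;
	int ret;

	ret = read_fd_message(sockfd, (char *)&m, VHOST_USER_HDR_SIZE,
			m.fds, VHOST_MEMORY_MAX_NREGIONS, &m.fd_num);
	if (ret <= 0)
		return ret;	/* peer hung up or the read failed */

	/* Only the first m.fd_num entries of m.fds hold valid ancillary fds. */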
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 99c7afc8..8ad30c94 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -122,7 +122,7 @@ flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
static __rte_always_inline void
update_shadow_used_ring_split(struct vhost_virtqueue *vq,
- uint16_t desc_idx, uint16_t len)
+ uint16_t desc_idx, uint32_t len)
{
uint16_t i = vq->shadow_used_idx++;
@@ -186,7 +186,7 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
static __rte_always_inline void
update_shadow_used_ring_packed(struct vhost_virtqueue *vq,
- uint16_t desc_idx, uint16_t len, uint16_t count)
+ uint16_t desc_idx, uint32_t len, uint16_t count)
{
uint16_t i = vq->shadow_used_idx++;
@@ -329,7 +329,7 @@ static __rte_always_inline int
fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint32_t avail_idx, uint16_t *vec_idx,
struct buf_vector *buf_vec, uint16_t *desc_chain_head,
- uint16_t *desc_chain_len, uint8_t perm)
+ uint32_t *desc_chain_len, uint8_t perm)
{
uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
uint16_t vec_id = *vec_idx;
@@ -409,7 +409,7 @@ reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint16_t max_tries, tries = 0;
uint16_t head_idx = 0;
- uint16_t len = 0;
+ uint32_t len = 0;
*num_buffers = 0;
cur_idx = vq->last_avail_idx;
@@ -452,7 +452,7 @@ static __rte_always_inline int
fill_vec_buf_packed_indirect(struct virtio_net *dev,
struct vhost_virtqueue *vq,
struct vring_packed_desc *desc, uint16_t *vec_idx,
- struct buf_vector *buf_vec, uint16_t *len, uint8_t perm)
+ struct buf_vector *buf_vec, uint32_t *len, uint8_t perm)
{
uint16_t i;
uint32_t nr_descs;
@@ -508,7 +508,7 @@ static __rte_always_inline int
fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint16_t avail_idx, uint16_t *desc_count,
struct buf_vector *buf_vec, uint16_t *vec_idx,
- uint16_t *buf_id, uint16_t *len, uint8_t perm)
+ uint16_t *buf_id, uint32_t *len, uint8_t perm)
{
bool wrap_counter = vq->avail_wrap_counter;
struct vring_packed_desc *descs = vq->desc_packed;
@@ -521,6 +521,7 @@ fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
return -1;
*desc_count = 0;
+ *len = 0;
while (1) {
if (unlikely(vec_id >= BUF_VECTOR_MAX))
@@ -573,7 +574,7 @@ reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint16_t max_tries, tries = 0;
uint16_t buf_id = 0;
- uint16_t len = 0;
+ uint32_t len = 0;
uint16_t desc_count;
*num_buffers = 0;
@@ -888,6 +889,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
struct rte_mbuf **pkts, uint32_t count)
{
struct vhost_virtqueue *vq;
+ uint32_t nb_tx = 0;
VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
@@ -915,9 +917,9 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
goto out;
if (vq_is_packed(dev))
- count = virtio_dev_rx_packed(dev, vq, pkts, count);
+ nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
else
- count = virtio_dev_rx_split(dev, vq, pkts, count);
+ nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);
out:
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
@@ -926,7 +928,7 @@ out:
out_access_unlock:
rte_spinlock_unlock(&vq->access_lock);
- return count;
+ return nb_tx;
}
uint16_t
@@ -1358,8 +1360,10 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
}
}
- flush_shadow_used_ring_split(dev, vq);
- vhost_vring_call_split(dev, vq);
+ if (likely(vq->shadow_used_idx)) {
+ flush_shadow_used_ring_split(dev, vq);
+ vhost_vring_call_split(dev, vq);
+ }
}
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
@@ -1378,7 +1382,8 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
for (i = 0; i < count; i++) {
struct buf_vector buf_vec[BUF_VECTOR_MAX];
- uint16_t head_idx, dummy_len;
+ uint16_t head_idx;
+ uint32_t dummy_len;
uint16_t nr_vec = 0;
int err;
@@ -1437,8 +1442,10 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
do_data_copy_dequeue(vq);
if (unlikely(i < count))
vq->shadow_used_idx = i;
- flush_shadow_used_ring_split(dev, vq);
- vhost_vring_call_split(dev, vq);
+ if (likely(vq->shadow_used_idx)) {
+ flush_shadow_used_ring_split(dev, vq);
+ vhost_vring_call_split(dev, vq);
+ }
}
return i;
@@ -1473,8 +1480,10 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
}
}
- flush_shadow_used_ring_packed(dev, vq);
- vhost_vring_call_packed(dev, vq);
+ if (likely(vq->shadow_used_idx)) {
+ flush_shadow_used_ring_packed(dev, vq);
+ vhost_vring_call_packed(dev, vq);
+ }
}
VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
@@ -1485,7 +1494,8 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
for (i = 0; i < count; i++) {
struct buf_vector buf_vec[BUF_VECTOR_MAX];
- uint16_t buf_id, dummy_len;
+ uint16_t buf_id;
+ uint32_t dummy_len;
uint16_t desc_count, nr_vec = 0;
int err;
@@ -1551,8 +1561,10 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
do_data_copy_dequeue(vq);
if (unlikely(i < count))
vq->shadow_used_idx = i;
- flush_shadow_used_ring_packed(dev, vq);
- vhost_vring_call_packed(dev, vq);
+ if (likely(vq->shadow_used_idx)) {
+ flush_shadow_used_ring_packed(dev, vq);
+ vhost_vring_call_packed(dev, vq);
+ }
}
return i;
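
The uint16_t to uint32_t widening of the descriptor length variables in the hunks above avoids silent truncation when a buffer chain exceeds 64KiB. A small standalone illustration of the hazard (the length value is made up; only the wrap-around behaviour matters):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t desc_len = 70000;		/* chain longer than UINT16_MAX */
		uint16_t len16 = (uint16_t)desc_len;	/* wraps to 70000 - 65536 = 4464 */
		uint32_t len32 = desc_len;		/* keeps the full length */

		printf("uint16_t: %u, uint32_t: %u\n", len16, len32);
		return 0;
	}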
diff --git a/lib/meson.build b/lib/meson.build
index eb91f100..bb7f443f 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -9,13 +9,14 @@
# given as a dep, no need to mention ring. This is especially true for the
# core libs which are widely reused, so their deps are kept to a minimum.
libraries = [ 'compat', # just a header, used for versioning
- 'kvargs',
+ 'cmdline', # ethdev depends on cmdline for parsing functions
+ 'kvargs', # eal depends on kvargs
'eal', 'ring', 'mempool', 'mbuf', 'net', 'ethdev', 'pci', # core
'metrics', # bitrate/latency stats depends on this
'hash', # efd depends on this
'timer', # eventdev depends on this
'acl', 'bbdev', 'bitratestats', 'cfgfile',
- 'cmdline', 'compressdev', 'cryptodev',
+ 'compressdev', 'cryptodev',
'distributor', 'efd', 'eventdev',
'gro', 'gso', 'ip_frag', 'jobstats',
'kni', 'latencystats', 'lpm', 'member',
@@ -24,12 +25,18 @@ libraries = [ 'compat', # just a header, used for versioning
# add pkt framework libs which use other libs from above
'port', 'table', 'pipeline',
# flow_classify lib depends on pkt framework table lib
- 'flow_classify', 'bpf']
+ 'flow_classify', 'bpf', 'telemetry']
default_cflags = machine_args
if cc.has_argument('-Wno-format-truncation')
default_cflags += '-Wno-format-truncation'
endif
+
+enabled_libs = [] # used to print summary at the end
+
+# -D_GNU_SOURCE unconditionally
+default_cflags += '-D_GNU_SOURCE'
+
foreach l:libraries
build = true
name = l
@@ -45,18 +52,17 @@ foreach l:libraries
# use "deps" for internal DPDK dependencies, and "ext_deps" for
# external package/library requirements
ext_deps = []
- deps = ['eal'] # eal is standard dependency except for itself
- if l == 'kvargs'
- deps = []
- endif
- if l == 'eal'
- deps = ['kvargs']
+ deps = []
+ # eal is standard dependency once built
+ if dpdk_conf.has('RTE_LIBRTE_EAL')
+ deps += ['eal']
endif
dir_name = 'librte_' + l
subdir(dir_name)
if build
+ enabled_libs += name
dpdk_conf.set('RTE_LIBRTE_' + name.to_upper(), 1)
install_headers(headers)
@@ -87,10 +93,8 @@ foreach l:libraries
lib_version = '@0@.1'.format(version)
so_version = '@0@'.format(version)
else
- prj_ver = meson.project_version().split('.')
- lib_version = '@0@.@1@'.format(
- prj_ver.get(0), prj_ver.get(1))
- so_version = lib_version
+ lib_version = major_version
+ so_version = major_version
endif
# first build static lib
@@ -126,6 +130,7 @@ foreach l:libraries
dependencies: shared_deps)
dpdk_libraries = [shared_lib] + dpdk_libraries
+ dpdk_static_libraries = [static_lib] + dpdk_static_libraries
endif # sources.length() > 0
set_variable('shared_' + libname, shared_dep)